Bring the Keras API in contrib.

Change: 150237453
author: Francois Chollet <fchollet@google.com> 2017-03-15 12:53:51 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-03-15 14:12:41 -0700
commit: f49f801276154d0f693c5d57db6977a7eb32f017 (patch)
tree: c9344fa01dde810067b72f0ed8d95a28889bc5e9
parent: d2bf01f51781611294babe1dee38e2e56de70809 (diff)
138 files changed, 34378 insertions, 0 deletions
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index db6d42e1bc..f8fe394938 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -180,6 +180,7 @@ filegroup(
         "//tensorflow/contrib/input_pipeline:all_files",
         "//tensorflow/contrib/input_pipeline/kernels:all_files",
         "//tensorflow/contrib/integrate:all_files",
+        "//tensorflow/contrib/keras:all_files",
         "//tensorflow/contrib/labeled_tensor:all_files",
         "//tensorflow/contrib/layers:all_files",
         "//tensorflow/contrib/layers/kernels:all_files",
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 29d60ae241..c97e283a2f 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -33,6 +33,7 @@ py_library(
         "//tensorflow/contrib/imperative",
         "//tensorflow/contrib/input_pipeline:input_pipeline_py",
         "//tensorflow/contrib/integrate:integrate_py",
+        "//tensorflow/contrib/keras",
         "//tensorflow/contrib/labeled_tensor",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 7c0d1da8a6..d4ddd1cf6a 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -34,6 +34,7 @@ from tensorflow.contrib import grid_rnn
 from tensorflow.contrib import image
 from tensorflow.contrib import input_pipeline
 from tensorflow.contrib import integrate
+from tensorflow.contrib import keras
 from tensorflow.contrib import labeled_tensor
 from tensorflow.contrib import layers
 from tensorflow.contrib import learn
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index e58b672347..8319cced28 100644
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -296,6 +296,49 @@ add_python_module("tensorflow/contrib/ios_examples/camera/en.lproj")
 add_python_module("tensorflow/contrib/ios_examples/simple")
 add_python_module("tensorflow/contrib/ios_examples/simple/data")
 add_python_module("tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj")
+add_python_module("tensorflow/contrib/keras")
+add_python_module("tensorflow/contrib/keras/api")
+add_python_module("tensorflow/contrib/keras/api/keras")
+add_python_module("tensorflow/contrib/keras/api/keras/activations")
+add_python_module("tensorflow/contrib/keras/api/keras/applications")
+add_python_module("tensorflow/contrib/keras/api/keras/applications/inception_v3")
+add_python_module("tensorflow/contrib/keras/api/keras/applications/resnet50")
+add_python_module("tensorflow/contrib/keras/api/keras/applications/vgg16")
+add_python_module("tensorflow/contrib/keras/api/keras/applications/vgg19")
+add_python_module("tensorflow/contrib/keras/api/keras/applications/xception")
+add_python_module("tensorflow/contrib/keras/api/keras/backend")
+add_python_module("tensorflow/contrib/keras/api/keras/callbacks")
+add_python_module("tensorflow/contrib/keras/api/keras/constraints")
+add_python_module("tensorflow/contrib/keras/api/keras/datasets")
+add_python_module("tensorflow/contrib/keras/api/keras/datasets/boston_housing")
+add_python_module("tensorflow/contrib/keras/api/keras/datasets/cifar10")
+add_python_module("tensorflow/contrib/keras/api/keras/datasets/cifar100")
+add_python_module("tensorflow/contrib/keras/api/keras/datasets/imdb")
+add_python_module("tensorflow/contrib/keras/api/keras/datasets/mnist")
+add_python_module("tensorflow/contrib/keras/api/keras/datasets/reuters")
+add_python_module("tensorflow/contrib/keras/api/keras/initializers")
+add_python_module("tensorflow/contrib/keras/api/keras/layers")
+add_python_module("tensorflow/contrib/keras/api/keras/losses")
+add_python_module("tensorflow/contrib/keras/api/keras/metrics")
+add_python_module("tensorflow/contrib/keras/api/keras/models")
+add_python_module("tensorflow/contrib/keras/api/keras/optimizers")
+add_python_module("tensorflow/contrib/keras/api/keras/preprocessing")
+add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/image")
+add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/sequence")
+add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/text")
+add_python_module("tensorflow/contrib/keras/api/keras/regularizers")
+add_python_module("tensorflow/contrib/keras/api/keras/utils")
+add_python_module("tensorflow/contrib/keras/api/keras/wrappers")
+add_python_module("tensorflow/contrib/keras/api/keras/wrappers/scikit_learn")
+add_python_module("tensorflow/contrib/keras/python")
+add_python_module("tensorflow/contrib/keras/python/keras")
+add_python_module("tensorflow/contrib/keras/python/keras/applications")
+add_python_module("tensorflow/contrib/keras/python/keras/datasets")
+add_python_module("tensorflow/contrib/keras/python/keras/engine")
+add_python_module("tensorflow/contrib/keras/python/keras/layers")
+add_python_module("tensorflow/contrib/keras/python/keras/preprocessing")
+add_python_module("tensorflow/contrib/keras/python/keras/utils")
+add_python_module("tensorflow/contrib/keras/python/keras/wrappers")
 add_python_module("tensorflow/contrib/labeled_tensor")
 add_python_module("tensorflow/contrib/labeled_tensor/python")
 add_python_module("tensorflow/contrib/labeled_tensor/python/ops")
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index fc3363189d..18f9d22244 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -124,6 +124,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/python/training/*_test.py"
     "${tensorflow_source_dir}/tensorflow/tensorboard/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/integration_test.py"
     # NOTE: tensor_forest tests in tensor_forest/hybrid/... still don't pass.
     "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/client/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/*_test.py"
@@ -138,6 +139,8 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/benchmark_test.py"
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/resource_variable_ops_test.py"
     "${tensorflow_source_dir}/tensorflow/python/saved_model/saved_model_test.py"
+    # requires scipy
+    "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/preprocessing/*_test.py"
   )
   if (WIN32)
     set(tf_test_src_py_exclude
diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD
new file mode 100644
index 0000000000..449b0a3f50
--- /dev/null
+++ b/tensorflow/contrib/keras/BUILD
@@ -0,0 +1,581 @@
+# Description:
+#   Contains the Keras API (internal TensorFlow version).
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "keras",
+    srcs = [
+        "__init__.py",
+        "api/__init__.py",
+        "api/keras/__init__.py",
+        "api/keras/activations/__init__.py",
+        "api/keras/applications/__init__.py",
+        "api/keras/applications/inception_v3/__init__.py",
+        "api/keras/applications/resnet50/__init__.py",
+        "api/keras/applications/vgg16/__init__.py",
+        "api/keras/applications/vgg19/__init__.py",
+        "api/keras/applications/xception/__init__.py",
+        "api/keras/backend/__init__.py",
+        "api/keras/callbacks/__init__.py",
+        "api/keras/constraints/__init__.py",
+        "api/keras/datasets/__init__.py",
+        "api/keras/datasets/boston_housing/__init__.py",
+        "api/keras/datasets/cifar10/__init__.py",
+        "api/keras/datasets/cifar100/__init__.py",
+        "api/keras/datasets/imdb/__init__.py",
+        "api/keras/datasets/mnist/__init__.py",
+        "api/keras/datasets/reuters/__init__.py",
+        "api/keras/initializers/__init__.py",
+        "api/keras/layers/__init__.py",
+        "api/keras/losses/__init__.py",
+        "api/keras/metrics/__init__.py",
+        "api/keras/models/__init__.py",
+        "api/keras/optimizers/__init__.py",
+        "api/keras/preprocessing/__init__.py",
+        "api/keras/preprocessing/image/__init__.py",
+        "api/keras/preprocessing/sequence/__init__.py",
+        "api/keras/preprocessing/text/__init__.py",
+        "api/keras/regularizers/__init__.py",
+        "api/keras/utils/__init__.py",
+        "api/keras/wrappers/__init__.py",
+        "api/keras/wrappers/scikit_learn/__init__.py",
+        "python/keras/__init__.py",
+        "python/keras/activations.py",
+        "python/keras/applications/__init__.py",
+        "python/keras/applications/imagenet_utils.py",
+        "python/keras/applications/inception_v3.py",
+        "python/keras/applications/resnet50.py",
+        "python/keras/applications/vgg16.py",
+        "python/keras/applications/vgg19.py",
+        "python/keras/applications/xception.py",
+        "python/keras/backend.py",
+        "python/keras/callbacks.py",
+        "python/keras/constraints.py",
+        "python/keras/datasets/__init__.py",
+        "python/keras/datasets/boston_housing.py",
+        "python/keras/datasets/cifar.py",
+        "python/keras/datasets/cifar10.py",
+        "python/keras/datasets/cifar100.py",
+        "python/keras/datasets/imdb.py",
+        "python/keras/datasets/mnist.py",
+        "python/keras/datasets/reuters.py",
+        "python/keras/engine/__init__.py",
+        "python/keras/engine/topology.py",
+        "python/keras/engine/training.py",
+        "python/keras/initializers.py",
+        "python/keras/layers/__init__.py",
+        "python/keras/layers/advanced_activations.py",
+        "python/keras/layers/convolutional.py",
+        "python/keras/layers/convolutional_recurrent.py",
+        "python/keras/layers/core.py",
+        "python/keras/layers/embeddings.py",
+        "python/keras/layers/local.py",
+        "python/keras/layers/merge.py",
+        "python/keras/layers/noise.py",
+        "python/keras/layers/normalization.py",
+        "python/keras/layers/pooling.py",
+        "python/keras/layers/recurrent.py",
+        "python/keras/layers/serialization.py",
+        "python/keras/layers/wrappers.py",
+        "python/keras/losses.py",
+        "python/keras/metrics.py",
+        "python/keras/models.py",
+        "python/keras/optimizers.py",
+        "python/keras/preprocessing/__init__.py",
+        "python/keras/preprocessing/image.py",
+        "python/keras/preprocessing/sequence.py",
+        "python/keras/preprocessing/text.py",
+        "python/keras/regularizers.py",
+        "python/keras/testing_utils.py",
+        "python/keras/utils/__init__.py",
+        "python/keras/utils/conv_utils.py",
+        "python/keras/utils/data_utils.py",
+        "python/keras/utils/generic_utils.py",
+        "python/keras/utils/io_utils.py",
+        "python/keras/utils/layer_utils.py",
+        "python/keras/utils/np_utils.py",
+        "python/keras/utils/vis_utils.py",
+        "python/keras/wrappers/__init__.py",
+        "python/keras/wrappers/scikit_learn.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:tensor_array_grad",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_test(
+    name = "integration_test",
+    size = "small",
+    srcs = ["python/keras/integration_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "activations_test",
+    size = "small",
+    srcs = ["python/keras/activations_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "constraints_test",
+    size = "small",
+    srcs = ["python/keras/constraints_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "initializers_test",
+    size = "small",
+    srcs = ["python/keras/initializers_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "regularizers_test",
+    size = "small",
+    srcs = ["python/keras/regularizers_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "optimizers_test",
+    size = "medium",
+    srcs = ["python/keras/optimizers_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "losses_test",
+    size = "small",
+    srcs = ["python/keras/losses_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "metrics_test",
+    size = "small",
+    srcs = ["python/keras/metrics_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "inception_v3_test",
+    size = "medium",
+    srcs = ["python/keras/applications/inception_v3_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "resnet50_test",
+    size = "small",
+    srcs = ["python/keras/applications/resnet50_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "vgg16_test",
+    size = "small",
+    srcs = ["python/keras/applications/vgg16_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "vgg19_test",
+    size = "small",
+    srcs = ["python/keras/applications/vgg19_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "xception_test",
+    size = "medium",
+    srcs = ["python/keras/applications/xception_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "advanced_activations_test",
+    size = "small",
+    srcs = ["python/keras/layers/advanced_activations_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "convolutional_recurrent_test",
+    size = "medium",
+    srcs = ["python/keras/layers/convolutional_recurrent_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "convolutional_test",
+    size = "medium",
+    srcs = ["python/keras/layers/convolutional_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "pooling_test",
+    size = "small",
+    srcs = ["python/keras/layers/pooling_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "core_test",
+    size = "small",
+    srcs = ["python/keras/layers/core_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "embeddings_test",
+    size = "small",
+    srcs = ["python/keras/layers/embeddings_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "local_test",
+    size = "medium",
+    srcs = ["python/keras/layers/local_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "merge_test",
+    size = "small",
+    srcs = ["python/keras/layers/merge_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "noise_test",
+    size = "small",
+    srcs = ["python/keras/layers/noise_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "normalization_test",
+    size = "small",
+    srcs = ["python/keras/layers/normalization_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "simplernn_test",
+    size = "medium",
+    srcs = ["python/keras/layers/simplernn_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "gru_test",
+    size = "medium",
+    srcs = ["python/keras/layers/gru_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "lstm_test",
+    size = "medium",
+    srcs = ["python/keras/layers/lstm_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "serialization_test",
+    size = "small",
+    srcs = ["python/keras/layers/serialization_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "wrappers_test",
+    size = "small",
+    srcs = ["python/keras/layers/wrappers_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "scikit_learn_test",
+    size = "small",
+    srcs = ["python/keras/wrappers/scikit_learn_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "image_test",
+    size = "medium",
+    srcs = ["python/keras/preprocessing/image_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "sequence_test",
+    size = "small",
+    srcs = ["python/keras/preprocessing/sequence_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "text_test",
+    size = "small",
+    srcs = ["python/keras/preprocessing/text_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "callbacks_test",
+    size = "small",
+    srcs = ["python/keras/callbacks_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "training_test",
+    size = "small",
+    srcs = ["python/keras/engine/training_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        ":testing_utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "topology_test",
+    size = "small",
+    srcs = ["python/keras/engine/topology_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "models_test",
+    size = "small",
+    srcs = ["python/keras/models_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "backend_test",
+    size = "small",
+    srcs = ["python/keras/backend_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_library(
+    name = "testing_utils",
+    srcs = [
+        "python/keras/testing_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//third_party/py/numpy",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/keras/README.md b/tensorflow/contrib/keras/README.md
new file mode 100644
index 0000000000..db2556fe42
--- /dev/null
+++ b/tensorflow/contrib/keras/README.md
@@ -0,0 +1,6 @@
+Keras is an object-oriented API for defining and training neural networks.
+
+This module contains a pure-TensorFlow implementation of the Keras API,
+allowing for deep integration with TensorFlow functionality.
+
+See [keras.io](https://keras.io) for complete documentation and user guides.
diff --git a/tensorflow/contrib/keras/__init__.py b/tensorflow/contrib/keras/__init__.py
new file mode 100644
index 0000000000..86eae6ddda
--- /dev/null
+++ b/tensorflow/contrib/keras/__init__.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of the Keras API meant to be a high-level API for TensorFlow.
+
+Detailed documentation and user guides are available at
+[keras.io](https://keras.io).
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.keras.api.keras import *
+
+try:
+  from tensorflow.contrib.keras import python  # pylint: disable=g-import-not-at-top
+  del python
+except ImportError:
+  pass
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/__init__.py b/tensorflow/contrib/keras/api/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/tensorflow/contrib/keras/api/__init__.py
diff --git a/tensorflow/contrib/keras/api/keras/__init__.py b/tensorflow/contrib/keras/api/keras/__init__.py
new file mode 100644
index 0000000000..53fb4a30c9
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/__init__.py
@@ -0,0 +1,44 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of the Keras API meant to be a high-level API for TensorFlow.
+
+Detailed documentation and user guides are available at
+[keras.io](https://keras.io).
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.api.keras import activations
+from tensorflow.contrib.keras.api.keras import applications
+from tensorflow.contrib.keras.api.keras import backend
+from tensorflow.contrib.keras.api.keras import callbacks
+from tensorflow.contrib.keras.api.keras import constraints
+from tensorflow.contrib.keras.api.keras import datasets
+from tensorflow.contrib.keras.api.keras import initializers
+from tensorflow.contrib.keras.api.keras import layers
+from tensorflow.contrib.keras.api.keras import losses
+from tensorflow.contrib.keras.api.keras import metrics
+from tensorflow.contrib.keras.api.keras import models
+from tensorflow.contrib.keras.api.keras import optimizers
+from tensorflow.contrib.keras.api.keras import preprocessing
+from tensorflow.contrib.keras.api.keras import regularizers
+from tensorflow.contrib.keras.api.keras import utils
+from tensorflow.contrib.keras.api.keras import wrappers
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/activations/__init__.py b/tensorflow/contrib/keras/api/keras/activations/__init__.py
new file mode 100644
index 0000000000..e4d4b1e42c
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/activations/__init__.py
@@ -0,0 +1,40 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras built-in activation functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Activation functions.
+from tensorflow.contrib.keras.python.keras.activations import elu
+from tensorflow.contrib.keras.python.keras.activations import hard_sigmoid
+from tensorflow.contrib.keras.python.keras.activations import linear
+from tensorflow.contrib.keras.python.keras.activations import relu
+from tensorflow.contrib.keras.python.keras.activations import sigmoid
+from tensorflow.contrib.keras.python.keras.activations import softmax
+from tensorflow.contrib.keras.python.keras.activations import softplus
+from tensorflow.contrib.keras.python.keras.activations import softsign
+from tensorflow.contrib.keras.python.keras.activations import tanh
+
+# Auxiliary utils.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.keras.python.keras.activations import deserialize
+from tensorflow.contrib.keras.python.keras.activations import serialize
+from tensorflow.contrib.keras.python.keras.activations import get
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/applications/__init__.py b/tensorflow/contrib/keras/api/keras/applications/__init__.py
new file mode 100644
index 0000000000..fee5b7103a
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/applications/__init__.py
@@ -0,0 +1,34 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras Applications are canned architectures with pre-trained weights."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.api.keras.applications import inception_v3
+from tensorflow.contrib.keras.api.keras.applications import resnet50
+from tensorflow.contrib.keras.api.keras.applications import vgg16
+from tensorflow.contrib.keras.api.keras.applications import vgg19
+from tensorflow.contrib.keras.api.keras.applications import xception
+from tensorflow.contrib.keras.api.keras.applications.inception_v3 import InceptionV3
+from tensorflow.contrib.keras.api.keras.applications.resnet50 import ResNet50
+from tensorflow.contrib.keras.api.keras.applications.vgg16 import VGG16
+from tensorflow.contrib.keras.api.keras.applications.vgg19 import VGG19
+from tensorflow.contrib.keras.api.keras.applications.xception import Xception
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py b/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py
new file mode 100644
index 0000000000..d8ca73fb97
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inception V3 Keras application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.applications.inception_v3 import decode_predictions
+from tensorflow.contrib.keras.python.keras.applications.inception_v3 import InceptionV3
+from tensorflow.contrib.keras.python.keras.applications.inception_v3 import preprocess_input
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py b/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py
new file mode 100644
index 0000000000..e9b25b66d5
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ResNet50 Keras application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.applications.resnet50 import decode_predictions
+from tensorflow.contrib.keras.python.keras.applications.resnet50 import preprocess_input
+from tensorflow.contrib.keras.python.keras.applications.resnet50 import ResNet50
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py b/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py
new file mode 100644
index 0000000000..2a1f789cc5
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""VGG16 Keras application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.applications.vgg16 import decode_predictions
+from tensorflow.contrib.keras.python.keras.applications.vgg16 import preprocess_input
+from tensorflow.contrib.keras.python.keras.applications.vgg16 import VGG16
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py b/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py
new file mode 100644
index 0000000000..22b5e7c8e4
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""VGG19 Keras application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.applications.vgg19 import decode_predictions
+from tensorflow.contrib.keras.python.keras.applications.vgg19 import preprocess_input
+from tensorflow.contrib.keras.python.keras.applications.vgg19 import VGG19
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py b/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py
new file mode 100644
index 0000000000..23d1b6a0b3
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Xception Keras application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.applications.xception import decode_predictions
+from tensorflow.contrib.keras.python.keras.applications.xception import preprocess_input
+from tensorflow.contrib.keras.python.keras.applications.xception import Xception
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/backend/__init__.py b/tensorflow/contrib/keras/api/keras/backend/__init__.py
new file mode 100644
index 0000000000..f3721a8dcb
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/backend/__init__.py
@@ -0,0 +1,163 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras backend API."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=redefined-builtin
+from tensorflow.contrib.keras.python.keras.backend import abs
+from tensorflow.contrib.keras.python.keras.backend import all
+from tensorflow.contrib.keras.python.keras.backend import any
+from tensorflow.contrib.keras.python.keras.backend import arange
+from tensorflow.contrib.keras.python.keras.backend import argmax
+from tensorflow.contrib.keras.python.keras.backend import argmin
+from tensorflow.contrib.keras.python.keras.backend import backend
+from tensorflow.contrib.keras.python.keras.backend import batch_dot
+from tensorflow.contrib.keras.python.keras.backend import batch_flatten
+from tensorflow.contrib.keras.python.keras.backend import batch_get_value
+from tensorflow.contrib.keras.python.keras.backend import batch_normalization
+from tensorflow.contrib.keras.python.keras.backend import batch_set_value
+from tensorflow.contrib.keras.python.keras.backend import bias_add
+from tensorflow.contrib.keras.python.keras.backend import binary_crossentropy
+from tensorflow.contrib.keras.python.keras.backend import cast
+from tensorflow.contrib.keras.python.keras.backend import cast_to_floatx
+from tensorflow.contrib.keras.python.keras.backend import categorical_crossentropy
+from tensorflow.contrib.keras.python.keras.backend import clear_session
+from tensorflow.contrib.keras.python.keras.backend import clip
+from tensorflow.contrib.keras.python.keras.backend import concatenate
+from tensorflow.contrib.keras.python.keras.backend import constant
+from tensorflow.contrib.keras.python.keras.backend import conv1d
+from tensorflow.contrib.keras.python.keras.backend import conv2d
+from tensorflow.contrib.keras.python.keras.backend import conv2d_transpose
+from tensorflow.contrib.keras.python.keras.backend import conv3d
+from tensorflow.contrib.keras.python.keras.backend import cos
+from tensorflow.contrib.keras.python.keras.backend import count_params
+from tensorflow.contrib.keras.python.keras.backend import ctc_batch_cost
+from tensorflow.contrib.keras.python.keras.backend import ctc_decode
+from tensorflow.contrib.keras.python.keras.backend import ctc_label_dense_to_sparse
+from tensorflow.contrib.keras.python.keras.backend import dot
+from tensorflow.contrib.keras.python.keras.backend import dropout
+from tensorflow.contrib.keras.python.keras.backend import dtype
+from tensorflow.contrib.keras.python.keras.backend import elu
+from tensorflow.contrib.keras.python.keras.backend import epsilon
+from tensorflow.contrib.keras.python.keras.backend import equal
+from tensorflow.contrib.keras.python.keras.backend import eval
+from tensorflow.contrib.keras.python.keras.backend import exp
+from tensorflow.contrib.keras.python.keras.backend import expand_dims
+from tensorflow.contrib.keras.python.keras.backend import eye
+from tensorflow.contrib.keras.python.keras.backend import flatten
+from tensorflow.contrib.keras.python.keras.backend import floatx
+from tensorflow.contrib.keras.python.keras.backend import foldl
+from tensorflow.contrib.keras.python.keras.backend import foldr
+from tensorflow.contrib.keras.python.keras.backend import function
+from tensorflow.contrib.keras.python.keras.backend import gather
+from tensorflow.contrib.keras.python.keras.backend import get_session
+from tensorflow.contrib.keras.python.keras.backend import get_uid
+from tensorflow.contrib.keras.python.keras.backend import get_value
+from tensorflow.contrib.keras.python.keras.backend import gradients
+from tensorflow.contrib.keras.python.keras.backend import greater
+from tensorflow.contrib.keras.python.keras.backend import greater_equal
+from tensorflow.contrib.keras.python.keras.backend import hard_sigmoid
+from tensorflow.contrib.keras.python.keras.backend import image_data_format
+from tensorflow.contrib.keras.python.keras.backend import in_test_phase
+from tensorflow.contrib.keras.python.keras.backend import in_top_k
+from tensorflow.contrib.keras.python.keras.backend import in_train_phase
+from tensorflow.contrib.keras.python.keras.backend import int_shape
+from tensorflow.contrib.keras.python.keras.backend import is_sparse
+from tensorflow.contrib.keras.python.keras.backend import l2_normalize
+from tensorflow.contrib.keras.python.keras.backend import learning_phase
+from tensorflow.contrib.keras.python.keras.backend import less
+from tensorflow.contrib.keras.python.keras.backend import less_equal
+from tensorflow.contrib.keras.python.keras.backend import log
+from tensorflow.contrib.keras.python.keras.backend import manual_variable_initialization
+from tensorflow.contrib.keras.python.keras.backend import map_fn
+from tensorflow.contrib.keras.python.keras.backend import max
+from tensorflow.contrib.keras.python.keras.backend import maximum
+from tensorflow.contrib.keras.python.keras.backend import mean
+from tensorflow.contrib.keras.python.keras.backend import min
+from tensorflow.contrib.keras.python.keras.backend import minimum
+from tensorflow.contrib.keras.python.keras.backend import moving_average_update
+from tensorflow.contrib.keras.python.keras.backend import name_scope
+from tensorflow.contrib.keras.python.keras.backend import ndim
+from tensorflow.contrib.keras.python.keras.backend import normalize_batch_in_training
+from tensorflow.contrib.keras.python.keras.backend import not_equal
+from tensorflow.contrib.keras.python.keras.backend import one_hot
+from tensorflow.contrib.keras.python.keras.backend import ones
+from tensorflow.contrib.keras.python.keras.backend import ones_like
+from tensorflow.contrib.keras.python.keras.backend import permute_dimensions
+from tensorflow.contrib.keras.python.keras.backend import placeholder
+from tensorflow.contrib.keras.python.keras.backend import pool2d
+from tensorflow.contrib.keras.python.keras.backend import pool3d
+from tensorflow.contrib.keras.python.keras.backend import pow
+from tensorflow.contrib.keras.python.keras.backend import print_tensor
+from tensorflow.contrib.keras.python.keras.backend import prod
+from tensorflow.contrib.keras.python.keras.backend import random_binomial
+from tensorflow.contrib.keras.python.keras.backend import random_normal
+from tensorflow.contrib.keras.python.keras.backend import random_normal_variable
+from tensorflow.contrib.keras.python.keras.backend import random_uniform
+from tensorflow.contrib.keras.python.keras.backend import random_uniform_variable
+from tensorflow.contrib.keras.python.keras.backend import relu
+from tensorflow.contrib.keras.python.keras.backend import repeat
+from tensorflow.contrib.keras.python.keras.backend import repeat_elements
+from tensorflow.contrib.keras.python.keras.backend import reset_uids
+from tensorflow.contrib.keras.python.keras.backend import reshape
+from tensorflow.contrib.keras.python.keras.backend import resize_images
+from tensorflow.contrib.keras.python.keras.backend import resize_volumes
+from tensorflow.contrib.keras.python.keras.backend import reverse
+from tensorflow.contrib.keras.python.keras.backend import rnn
+from tensorflow.contrib.keras.python.keras.backend import round
+from tensorflow.contrib.keras.python.keras.backend import separable_conv2d
+from tensorflow.contrib.keras.python.keras.backend import set_epsilon
+from tensorflow.contrib.keras.python.keras.backend import set_floatx
+from tensorflow.contrib.keras.python.keras.backend import set_image_data_format
+from tensorflow.contrib.keras.python.keras.backend import set_learning_phase
+from tensorflow.contrib.keras.python.keras.backend import set_session
+from tensorflow.contrib.keras.python.keras.backend import set_value
+from tensorflow.contrib.keras.python.keras.backend import shape
+from tensorflow.contrib.keras.python.keras.backend import sigmoid
+from tensorflow.contrib.keras.python.keras.backend import sign
+from tensorflow.contrib.keras.python.keras.backend import sin
+from tensorflow.contrib.keras.python.keras.backend import softmax
+from tensorflow.contrib.keras.python.keras.backend import softplus
+from tensorflow.contrib.keras.python.keras.backend import softsign
+from tensorflow.contrib.keras.python.keras.backend import sparse_categorical_crossentropy
+from tensorflow.contrib.keras.python.keras.backend import spatial_2d_padding
+from tensorflow.contrib.keras.python.keras.backend import spatial_3d_padding
+from tensorflow.contrib.keras.python.keras.backend import sqrt
+from tensorflow.contrib.keras.python.keras.backend import square
+from tensorflow.contrib.keras.python.keras.backend import squeeze
+from tensorflow.contrib.keras.python.keras.backend import stack
+from tensorflow.contrib.keras.python.keras.backend import std
+from tensorflow.contrib.keras.python.keras.backend import stop_gradient
+from tensorflow.contrib.keras.python.keras.backend import sum
+from tensorflow.contrib.keras.python.keras.backend import switch
+from tensorflow.contrib.keras.python.keras.backend import tanh
+from tensorflow.contrib.keras.python.keras.backend import temporal_padding
+from tensorflow.contrib.keras.python.keras.backend import to_dense
+from tensorflow.contrib.keras.python.keras.backend import transpose
+from tensorflow.contrib.keras.python.keras.backend import truncated_normal
+from tensorflow.contrib.keras.python.keras.backend import update
+from tensorflow.contrib.keras.python.keras.backend import update_add
+from tensorflow.contrib.keras.python.keras.backend import update_sub
+from tensorflow.contrib.keras.python.keras.backend import var
+from tensorflow.contrib.keras.python.keras.backend import variable
+from tensorflow.contrib.keras.python.keras.backend import zeros
+from tensorflow.contrib.keras.python.keras.backend import zeros_like
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
new file mode 100644
index 0000000000..2f579f2d28
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras callback classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.callbacks import BaseLogger
+from tensorflow.contrib.keras.python.keras.callbacks import Callback
+from tensorflow.contrib.keras.python.keras.callbacks import CSVLogger
+from tensorflow.contrib.keras.python.keras.callbacks import EarlyStopping
+from tensorflow.contrib.keras.python.keras.callbacks import History
+from tensorflow.contrib.keras.python.keras.callbacks import LambdaCallback
+from tensorflow.contrib.keras.python.keras.callbacks import LearningRateScheduler
+from tensorflow.contrib.keras.python.keras.callbacks import ModelCheckpoint
+from tensorflow.contrib.keras.python.keras.callbacks import ProgbarLogger
+from tensorflow.contrib.keras.python.keras.callbacks import ReduceLROnPlateau
+from tensorflow.contrib.keras.python.keras.callbacks import RemoteMonitor
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/constraints/__init__.py b/tensorflow/contrib/keras/api/keras/constraints/__init__.py
new file mode 100644
index 0000000000..6b9e3bf46e
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/constraints/__init__.py
@@ -0,0 +1,40 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras built-in constraints functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Constraints functions / callable classes.
+from tensorflow.contrib.keras.python.keras.constraints import Constraint
+from tensorflow.contrib.keras.python.keras.constraints import max_norm
+from tensorflow.contrib.keras.python.keras.constraints import MaxNorm
+from tensorflow.contrib.keras.python.keras.constraints import min_max_norm
+from tensorflow.contrib.keras.python.keras.constraints import MinMaxNorm
+from tensorflow.contrib.keras.python.keras.constraints import non_neg
+from tensorflow.contrib.keras.python.keras.constraints import NonNeg
+from tensorflow.contrib.keras.python.keras.constraints import unit_norm
+from tensorflow.contrib.keras.python.keras.constraints import UnitNorm
+
+# Auxiliary utils.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.keras.python.keras.constraints import deserialize
+from tensorflow.contrib.keras.python.keras.constraints import serialize
+from tensorflow.contrib.keras.python.keras.constraints import get
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/datasets/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/__init__.py
new file mode 100644
index 0000000000..4513231bb4
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/datasets/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras built-in datasets."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.api.keras.datasets import boston_housing
+from tensorflow.contrib.keras.api.keras.datasets import cifar10
+from tensorflow.contrib.keras.api.keras.datasets import cifar100
+from tensorflow.contrib.keras.api.keras.datasets import imdb
+from tensorflow.contrib.keras.api.keras.datasets import mnist
+from tensorflow.contrib.keras.api.keras.datasets import reuters
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py
new file mode 100644
index 0000000000..0bfd3df540
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Boston housing price regression dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.datasets.boston_housing import load_data
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py
new file mode 100644
index 0000000000..f5fac6982a
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""CIFAR10 small image classification dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.datasets.cifar10 import load_data
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py
new file mode 100644
index 0000000000..a7e6996136
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""CIFAR100 small image classification dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.datasets.cifar100 import load_data
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py
new file mode 100644
index 0000000000..f141c8a8e9
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""IMDB movie review sentiment classification dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.datasets.imdb import get_word_index
+from tensorflow.contrib.keras.python.keras.datasets.imdb import load_data
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py
new file mode 100644
index 0000000000..50b74f149c
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""MNIST handwritten digits classification dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.datasets.mnist import load_data
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py
new file mode 100644
index 0000000000..fc7f1235a3
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Reuters newswire topic classification dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.datasets.reuters import get_word_index
+from tensorflow.contrib.keras.python.keras.datasets.reuters import load_data
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/initializers/__init__.py b/tensorflow/contrib/keras/api/keras/initializers/__init__.py
new file mode 100644
index 0000000000..f0c1540d9a
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/initializers/__init__.py
@@ -0,0 +1,48 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras built-in initializers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Initializer functions / callable classes.
+from tensorflow.contrib.keras.python.keras.initializers import Constant
+from tensorflow.contrib.keras.python.keras.initializers import Identity
+from tensorflow.contrib.keras.python.keras.initializers import Initializer
+from tensorflow.contrib.keras.python.keras.initializers import Ones
+from tensorflow.contrib.keras.python.keras.initializers import Orthogonal
+from tensorflow.contrib.keras.python.keras.initializers import RandomNormal
+from tensorflow.contrib.keras.python.keras.initializers import RandomUniform
+from tensorflow.contrib.keras.python.keras.initializers import TruncatedNormal
+from tensorflow.contrib.keras.python.keras.initializers import VarianceScaling
+from tensorflow.contrib.keras.python.keras.initializers import Zeros
+
+# Functional interface.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.keras.python.keras.initializers import glorot_normal
+from tensorflow.contrib.keras.python.keras.initializers import glorot_uniform
+from tensorflow.contrib.keras.python.keras.initializers import he_normal
+from tensorflow.contrib.keras.python.keras.initializers import he_uniform
+from tensorflow.contrib.keras.python.keras.initializers import lecun_uniform
+
+# Auxiliary utils.
+from tensorflow.contrib.keras.python.keras.initializers import deserialize
+from tensorflow.contrib.keras.python.keras.initializers import serialize
+from tensorflow.contrib.keras.python.keras.initializers import get
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/layers/__init__.py b/tensorflow/contrib/keras/api/keras/layers/__init__.py
new file mode 100644
index 0000000000..8f266df0ad
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/layers/__init__.py
@@ -0,0 +1,140 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras layers API."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Generic layers.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.keras.python.keras.engine import Input
+from tensorflow.contrib.keras.python.keras.engine import InputLayer
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine import Layer
+
+# Advanced activations.
+from tensorflow.contrib.keras.python.keras.layers.advanced_activations import LeakyReLU
+from tensorflow.contrib.keras.python.keras.layers.advanced_activations import PReLU
+from tensorflow.contrib.keras.python.keras.layers.advanced_activations import ELU
+from tensorflow.contrib.keras.python.keras.layers.advanced_activations import ThresholdedReLU
+
+# Convolution layers.
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Conv1D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Conv2D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Conv3D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Conv2DTranspose
+from tensorflow.contrib.keras.python.keras.layers.convolutional import SeparableConv2D
+
+# Convolution layer aliases.
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Convolution1D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Convolution2D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Convolution3D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Convolution2DTranspose
+from tensorflow.contrib.keras.python.keras.layers.convolutional import SeparableConvolution2D
+
+# Image processing layers.
+from tensorflow.contrib.keras.python.keras.layers.convolutional import UpSampling1D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import UpSampling2D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import UpSampling3D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import ZeroPadding1D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import ZeroPadding2D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import ZeroPadding3D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Cropping1D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Cropping2D
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Cropping3D
+
+# Convolutional-recurrent layers.
+from tensorflow.contrib.keras.python.keras.layers.convolutional_recurrent import ConvLSTM2D
+
+# Core layers.
+from tensorflow.contrib.keras.python.keras.layers.core import Masking
+from tensorflow.contrib.keras.python.keras.layers.core import Dropout
+from tensorflow.contrib.keras.python.keras.layers.core import SpatialDropout1D
+from tensorflow.contrib.keras.python.keras.layers.core import SpatialDropout2D
+from tensorflow.contrib.keras.python.keras.layers.core import SpatialDropout3D
+from tensorflow.contrib.keras.python.keras.layers.core import Activation
+from tensorflow.contrib.keras.python.keras.layers.core import Reshape
+from tensorflow.contrib.keras.python.keras.layers.core import Permute
+from tensorflow.contrib.keras.python.keras.layers.core import Flatten
+from tensorflow.contrib.keras.python.keras.layers.core import RepeatVector
+from tensorflow.contrib.keras.python.keras.layers.core import Lambda
+from tensorflow.contrib.keras.python.keras.layers.core import Dense
+from tensorflow.contrib.keras.python.keras.layers.core import ActivityRegularization
+
+# Embedding layers.
+from tensorflow.contrib.keras.python.keras.layers.embeddings import Embedding
+
+# Locally-connected layers.
+from tensorflow.contrib.keras.python.keras.layers.local import LocallyConnected1D
+from tensorflow.contrib.keras.python.keras.layers.local import LocallyConnected2D
+
+# Merge layers.
+from tensorflow.contrib.keras.python.keras.layers.merge import Add
+from tensorflow.contrib.keras.python.keras.layers.merge import Multiply
+from tensorflow.contrib.keras.python.keras.layers.merge import Average
+from tensorflow.contrib.keras.python.keras.layers.merge import Maximum
+from tensorflow.contrib.keras.python.keras.layers.merge import Concatenate
+from tensorflow.contrib.keras.python.keras.layers.merge import Dot
+from tensorflow.contrib.keras.python.keras.layers.merge import add
+from tensorflow.contrib.keras.python.keras.layers.merge import multiply
+from tensorflow.contrib.keras.python.keras.layers.merge import average
+from tensorflow.contrib.keras.python.keras.layers.merge import maximum
+from tensorflow.contrib.keras.python.keras.layers.merge import concatenate
+from tensorflow.contrib.keras.python.keras.layers.merge import dot
+
+# Noise layers.
+from tensorflow.contrib.keras.python.keras.layers.noise import GaussianNoise
+from tensorflow.contrib.keras.python.keras.layers.noise import GaussianDropout
+
+# Normalization layers.
+from tensorflow.contrib.keras.python.keras.layers.normalization import BatchNormalization
+
+# Pooling layers.
+from tensorflow.contrib.keras.python.keras.layers.pooling import MaxPooling1D
+from tensorflow.contrib.keras.python.keras.layers.pooling import MaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers.pooling import MaxPooling3D
+from tensorflow.contrib.keras.python.keras.layers.pooling import AveragePooling1D
+from tensorflow.contrib.keras.python.keras.layers.pooling import AveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers.pooling import AveragePooling3D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalAveragePooling1D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalAveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalAveragePooling3D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalMaxPooling1D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalMaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalMaxPooling3D
+
+# Pooling layer aliases.
+from tensorflow.contrib.keras.python.keras.layers.pooling import MaxPool1D
+from tensorflow.contrib.keras.python.keras.layers.pooling import MaxPool2D
+from tensorflow.contrib.keras.python.keras.layers.pooling import MaxPool3D
+from tensorflow.contrib.keras.python.keras.layers.pooling import AvgPool1D
+from tensorflow.contrib.keras.python.keras.layers.pooling import AvgPool2D
+from tensorflow.contrib.keras.python.keras.layers.pooling import AvgPool3D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalAvgPool1D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalAvgPool2D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalAvgPool3D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalMaxPool1D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalMaxPool2D
+from tensorflow.contrib.keras.python.keras.layers.pooling import GlobalMaxPool3D
+
+# Recurrent layers.
+from tensorflow.contrib.keras.python.keras.layers.recurrent import SimpleRNN
+from tensorflow.contrib.keras.python.keras.layers.recurrent import GRU
+from tensorflow.contrib.keras.python.keras.layers.recurrent import LSTM
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/losses/__init__.py b/tensorflow/contrib/keras/api/keras/losses/__init__.py
new file mode 100644
index 0000000000..2d2fee2698
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/losses/__init__.py
@@ -0,0 +1,43 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras built-in loss functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Loss functions.
+from tensorflow.contrib.keras.python.keras.losses import binary_crossentropy
+from tensorflow.contrib.keras.python.keras.losses import categorical_crossentropy
+from tensorflow.contrib.keras.python.keras.losses import cosine_proximity
+from tensorflow.contrib.keras.python.keras.losses import hinge
+from tensorflow.contrib.keras.python.keras.losses import kullback_leibler_divergence
+from tensorflow.contrib.keras.python.keras.losses import mean_absolute_error
+from tensorflow.contrib.keras.python.keras.losses import mean_absolute_percentage_error
+from tensorflow.contrib.keras.python.keras.losses import mean_squared_error
+from tensorflow.contrib.keras.python.keras.losses import mean_squared_logarithmic_error
+from tensorflow.contrib.keras.python.keras.losses import poisson
+from tensorflow.contrib.keras.python.keras.losses import sparse_categorical_crossentropy
+from tensorflow.contrib.keras.python.keras.losses import squared_hinge
+
+# Auxiliary utils.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.keras.python.keras.losses import deserialize
+from tensorflow.contrib.keras.python.keras.losses import serialize
+from tensorflow.contrib.keras.python.keras.losses import get
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/metrics/__init__.py b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
new file mode 100644
index 0000000000..ba43ffece8
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
@@ -0,0 +1,46 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras built-in metrics functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Metrics functions.
+from tensorflow.contrib.keras.python.keras.metrics import binary_accuracy
+from tensorflow.contrib.keras.python.keras.metrics import binary_crossentropy
+from tensorflow.contrib.keras.python.keras.metrics import categorical_accuracy
+from tensorflow.contrib.keras.python.keras.metrics import categorical_crossentropy
+from tensorflow.contrib.keras.python.keras.metrics import cosine_proximity
+from tensorflow.contrib.keras.python.keras.metrics import hinge
+from tensorflow.contrib.keras.python.keras.metrics import kullback_leibler_divergence
+from tensorflow.contrib.keras.python.keras.metrics import mean_absolute_error
+from tensorflow.contrib.keras.python.keras.metrics import mean_absolute_percentage_error
+from tensorflow.contrib.keras.python.keras.metrics import mean_squared_error
+from tensorflow.contrib.keras.python.keras.metrics import mean_squared_logarithmic_error
+from tensorflow.contrib.keras.python.keras.metrics import poisson
+from tensorflow.contrib.keras.python.keras.metrics import sparse_categorical_crossentropy
+from tensorflow.contrib.keras.python.keras.metrics import squared_hinge
+from tensorflow.contrib.keras.python.keras.metrics import top_k_categorical_accuracy
+
+# Auxiliary utils.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.keras.python.keras.metrics import deserialize
+from tensorflow.contrib.keras.python.keras.metrics import serialize
+from tensorflow.contrib.keras.python.keras.metrics import get
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/models/__init__.py b/tensorflow/contrib/keras/api/keras/models/__init__.py
new file mode 100644
index 0000000000..4e5b2a1ed0
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/models/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras models API."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.models import load_model
+from tensorflow.contrib.keras.python.keras.models import Model
+from tensorflow.contrib.keras.python.keras.models import model_from_config
+from tensorflow.contrib.keras.python.keras.models import model_from_json
+from tensorflow.contrib.keras.python.keras.models import model_from_yaml
+from tensorflow.contrib.keras.python.keras.models import save_model
+from tensorflow.contrib.keras.python.keras.models import Sequential
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/optimizers/__init__.py b/tensorflow/contrib/keras/api/keras/optimizers/__init__.py
new file mode 100644
index 0000000000..b3531d7933
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/optimizers/__init__.py
@@ -0,0 +1,39 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras built-in optimizers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Optimizer classes.
+from tensorflow.contrib.keras.python.keras.optimizers import Adadelta
+from tensorflow.contrib.keras.python.keras.optimizers import Adagrad
+from tensorflow.contrib.keras.python.keras.optimizers import Adam
+from tensorflow.contrib.keras.python.keras.optimizers import Adamax
+from tensorflow.contrib.keras.python.keras.optimizers import Nadam
+from tensorflow.contrib.keras.python.keras.optimizers import Optimizer
+from tensorflow.contrib.keras.python.keras.optimizers import RMSprop
+from tensorflow.contrib.keras.python.keras.optimizers import SGD
+
+# Auxiliary utils.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.keras.python.keras.optimizers import deserialize
+from tensorflow.contrib.keras.python.keras.optimizers import serialize
+from tensorflow.contrib.keras.python.keras.optimizers import get
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/__init__.py
new file mode 100644
index 0000000000..4a200e3f58
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/preprocessing/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras data preprocessing utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.api.keras.preprocessing import image
+from tensorflow.contrib.keras.api.keras.preprocessing import sequence
+from tensorflow.contrib.keras.api.keras.preprocessing import text
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py
new file mode 100644
index 0000000000..18ce1becc2
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py
@@ -0,0 +1,38 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras data preprocessing utils for image data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.preprocessing.image import apply_transform
+from tensorflow.contrib.keras.python.keras.preprocessing.image import array_to_img
+from tensorflow.contrib.keras.python.keras.preprocessing.image import DirectoryIterator
+from tensorflow.contrib.keras.python.keras.preprocessing.image import flip_axis
+from tensorflow.contrib.keras.python.keras.preprocessing.image import ImageDataGenerator
+from tensorflow.contrib.keras.python.keras.preprocessing.image import img_to_array
+from tensorflow.contrib.keras.python.keras.preprocessing.image import Iterator
+from tensorflow.contrib.keras.python.keras.preprocessing.image import load_img
+from tensorflow.contrib.keras.python.keras.preprocessing.image import NumpyArrayIterator
+from tensorflow.contrib.keras.python.keras.preprocessing.image import random_channel_shift
+from tensorflow.contrib.keras.python.keras.preprocessing.image import random_rotation
+from tensorflow.contrib.keras.python.keras.preprocessing.image import random_shear
+from tensorflow.contrib.keras.python.keras.preprocessing.image import random_shift
+from tensorflow.contrib.keras.python.keras.preprocessing.image import random_zoom
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py
new file mode 100644
index 0000000000..2621e9bf53
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras data preprocessing utils for sequence data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.preprocessing.sequence import make_sampling_table
+from tensorflow.contrib.keras.python.keras.preprocessing.sequence import pad_sequences
+from tensorflow.contrib.keras.python.keras.preprocessing.sequence import skipgrams
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py
new file mode 100644
index 0000000000..a6b68c3ba6
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras data preprocessing utils for text data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.preprocessing.text import one_hot
+from tensorflow.contrib.keras.python.keras.preprocessing.text import text_to_word_sequence
+from tensorflow.contrib.keras.python.keras.preprocessing.text import Tokenizer
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/regularizers/__init__.py b/tensorflow/contrib/keras/api/keras/regularizers/__init__.py
new file mode 100644
index 0000000000..a3b0062d5c
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/regularizers/__init__.py
@@ -0,0 +1,38 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras built-in regularizers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Regularizer functions / callable classes.
+from tensorflow.contrib.keras.python.keras.regularizers import L1L2
+from tensorflow.contrib.keras.python.keras.regularizers import Regularizer
+
+# Functional interface.
+# pylint: disable=g-bad-import-order
+from tensorflow.contrib.keras.python.keras.regularizers import l1
+from tensorflow.contrib.keras.python.keras.regularizers import l2
+from tensorflow.contrib.keras.python.keras.regularizers import l1_l2
+
+# Auxiliary utils.
+from tensorflow.contrib.keras.python.keras.regularizers import deserialize
+from tensorflow.contrib.keras.python.keras.regularizers import serialize
+from tensorflow.contrib.keras.python.keras.regularizers import get
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/utils/__init__.py b/tensorflow/contrib/keras/api/keras/utils/__init__.py
new file mode 100644
index 0000000000..7f14fa2065
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/utils/__init__.py
@@ -0,0 +1,36 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import custom_object_scope
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import get_custom_objects
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.contrib.keras.python.keras.utils.io_utils import HDF5Matrix
+from tensorflow.contrib.keras.python.keras.utils.layer_utils import convert_all_kernels_in_model
+from tensorflow.contrib.keras.python.keras.utils.np_utils import normalize
+from tensorflow.contrib.keras.python.keras.utils.np_utils import to_categorical
+from tensorflow.contrib.keras.python.keras.utils.vis_utils import plot_model
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/wrappers/__init__.py b/tensorflow/contrib/keras/api/keras/wrappers/__init__.py
new file mode 100644
index 0000000000..d2c7c4bf14
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/wrappers/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Wrappers for Keras models, providing compatibility with other frameworks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.api.keras.wrappers import scikit_learn
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py b/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py
new file mode 100644
index 0000000000..ba1d28c5c6
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras scikit-learn API wrapper."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.wrappers.scikit_learn import KerasClassifier
+from tensorflow.contrib.keras.python.keras.wrappers.scikit_learn import KerasRegressor
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/python/keras/__init__.py b/tensorflow/contrib/keras/python/keras/__init__.py
new file mode 100644
index 0000000000..cdfc40dff1
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/__init__.py
@@ -0,0 +1,40 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Keras API.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras import activations
+from tensorflow.contrib.keras.python.keras import applications
+from tensorflow.contrib.keras.python.keras import backend
+from tensorflow.contrib.keras.python.keras import callbacks
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import datasets
+from tensorflow.contrib.keras.python.keras import engine
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import layers
+from tensorflow.contrib.keras.python.keras import losses
+from tensorflow.contrib.keras.python.keras import metrics
+from tensorflow.contrib.keras.python.keras import models
+from tensorflow.contrib.keras.python.keras import optimizers
+from tensorflow.contrib.keras.python.keras import preprocessing
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras import utils
+from tensorflow.contrib.keras.python.keras import wrappers
+
+
+__version__ = '2.0.0-tf'
diff --git a/tensorflow/contrib/keras/python/keras/activations.py b/tensorflow/contrib/keras/python/keras/activations.py
new file mode 100644
index 0000000000..1eac52dfad
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/activations.py
@@ -0,0 +1,95 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras built-in activation functions.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+
+
+def softmax(x):
+  ndim = K.ndim(x)
+  if ndim == 2:
+    return K.softmax(x)
+  elif ndim == 3:
+    e = K.exp(x - K.max(x, axis=-1, keepdims=True))
+    s = K.sum(e, axis=-1, keepdims=True)
+    return e / s
+  else:
+    raise ValueError('Cannot apply softmax to a tensor '
+                     'that is not 2D or 3D. '
+                     'Here, ndim=' + str(ndim))
+
+
+def elu(x, alpha=1.0):
+  return K.elu(x, alpha)
+
+
+def softplus(x):
+  return K.softplus(x)
+
+
+def softsign(x):
+  return K.softsign(x)
+
+
+def relu(x, alpha=0., max_value=None):
+  return K.relu(x, alpha=alpha, max_value=max_value)
+
+
+def tanh(x):
+  return K.tanh(x)
+
+
+def sigmoid(x):
+  return K.sigmoid(x)
+
+
+def hard_sigmoid(x):
+  return K.hard_sigmoid(x)
+
+
+def linear(x):
+  return x
+
+
+def serialize(activation):
+  return activation.__name__
+
+
+def deserialize(name, custom_objects=None):
+  return deserialize_keras_object(
+      name,
+      module_objects=globals(),
+      custom_objects=custom_objects,
+      printable_module_name='activation function')
+
+
+def get(identifier):
+  if identifier is None:
+    return linear
+  if isinstance(identifier, six.string_types):
+    identifier = str(identifier)
+    return deserialize(identifier)
+  elif callable(identifier):
+    return identifier
+  else:
+    raise ValueError('Could not interpret '
+                     'activation function identifier:', identifier)
diff --git a/tensorflow/contrib/keras/python/keras/activations_test.py b/tensorflow/contrib/keras/python/keras/activations_test.py
new file mode 100644
index 0000000000..eec4d257f2
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/activations_test.py
@@ -0,0 +1,157 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras activation functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+def _ref_softmax(values):
+  m = np.max(values)
+  e = np.exp(values - m)
+  return e / np.sum(e)
+
+
+class KerasActivationsTest(test.TestCase):
+
+  def test_serialization(self):
+    all_activations = ['softmax', 'relu', 'elu', 'tanh',
+                       'sigmoid', 'hard_sigmoid', 'linear',
+                       'softplus', 'softsign']
+    for name in all_activations:
+      fn = keras.activations.get(name)
+      ref_fn = getattr(keras.activations, name)
+      assert fn == ref_fn
+      config = keras.activations.serialize(fn)
+      fn = keras.activations.deserialize(config)
+      assert fn == ref_fn
+
+  def test_softmax(self):
+    with self.test_session():
+      x = keras.backend.placeholder(ndim=2)
+      f = keras.backend.function([x], [keras.activations.softmax(x)])
+      test_values = np.random.random((2, 5))
+
+      result = f([test_values])[0]
+    expected = _ref_softmax(test_values[0])
+    self.assertAllClose(result[0], expected, rtol=1e-05)
+
+  def test_temporal_softmax(self):
+    with self.test_session():
+      x = keras.backend.placeholder(shape=(2, 2, 3))
+      f = keras.backend.function([x], [keras.activations.softmax(x)])
+      test_values = np.random.random((2, 2, 3)) * 10
+      result = f([test_values])[0]
+    expected = _ref_softmax(test_values[0, 0])
+    self.assertAllClose(result[0, 0], expected, rtol=1e-05)
+
+  def test_softplus(self):
+    def softplus(x):
+      return np.log(np.ones_like(x) + np.exp(x))
+
+    with self.test_session():
+      x = keras.backend.placeholder(ndim=2)
+      f = keras.backend.function([x], [keras.activations.softplus(x)])
+      test_values = np.random.random((2, 5))
+      result = f([test_values])[0]
+    expected = softplus(test_values)
+    self.assertAllClose(result, expected, rtol=1e-05)
+
+  def test_softsign(self):
+    def softsign(x):
+      return np.divide(x, np.ones_like(x) + np.absolute(x))
+
+    with self.test_session():
+      x = keras.backend.placeholder(ndim=2)
+      f = keras.backend.function([x], [keras.activations.softsign(x)])
+      test_values = np.random.random((2, 5))
+      result = f([test_values])[0]
+    expected = softsign(test_values)
+    self.assertAllClose(result, expected, rtol=1e-05)
+
+  def test_sigmoid(self):
+    def ref_sigmoid(x):
+      if x >= 0:
+        return 1 / (1 + np.exp(-x))
+      else:
+        z = np.exp(x)
+        return z / (1 + z)
+    sigmoid = np.vectorize(ref_sigmoid)
+
+    with self.test_session():
+      x = keras.backend.placeholder(ndim=2)
+      f = keras.backend.function([x], [keras.activations.sigmoid(x)])
+      test_values = np.random.random((2, 5))
+      result = f([test_values])[0]
+    expected = sigmoid(test_values)
+    self.assertAllClose(result, expected, rtol=1e-05)
+
+  def test_hard_sigmoid(self):
+    def ref_hard_sigmoid(x):
+      x = (x * 0.2) + 0.5
+      z = 0.0 if x <= 0 else (1.0 if x >= 1 else x)
+      return z
+    hard_sigmoid = np.vectorize(ref_hard_sigmoid)
+    with self.test_session():
+      x = keras.backend.placeholder(ndim=2)
+      f = keras.backend.function([x], [keras.activations.hard_sigmoid(x)])
+      test_values = np.random.random((2, 5))
+      result = f([test_values])[0]
+    expected = hard_sigmoid(test_values)
+    self.assertAllClose(result, expected, rtol=1e-05)
+
+  def test_relu(self):
+    with self.test_session():
+      x = keras.backend.placeholder(ndim=2)
+      f = keras.backend.function([x], [keras.activations.relu(x)])
+      test_values = np.random.random((2, 5))
+      result = f([test_values])[0]
+    # No negative values in test values...
+    self.assertAllClose(result, test_values, rtol=1e-05)
+
+  def test_elu(self):
+    with self.test_session():
+      x = keras.backend.placeholder(ndim=2)
+      f = keras.backend.function([x], [keras.activations.elu(x, 0.5)])
+      test_values = np.random.random((2, 5))
+      result = f([test_values])[0]
+      self.assertAllClose(result, test_values, rtol=1e-05)
+      negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
+      result = f([negative_values])[0]
+      true_result = (np.exp(negative_values) - 1) / 2
+    self.assertAllClose(result, true_result)
+
+  def test_tanh(self):
+    with self.test_session():
+      test_values = np.random.random((2, 5))
+      x = keras.backend.placeholder(ndim=2)
+      exp = keras.activations.tanh(x)
+      f = keras.backend.function([x], [exp])
+      result = f([test_values])[0]
+    expected = np.tanh(test_values)
+    self.assertAllClose(result, expected, rtol=1e-05)
+
+  def test_linear(self):
+    x = np.random.random((10, 5))
+    self.assertAllClose(x, keras.activations.linear(x))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/applications/__init__.py b/tensorflow/contrib/keras/python/keras/applications/__init__.py
new file mode 100644
index 0000000000..c6af9ea9f1
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras Applications: models with automatic loading of pre-trained weights.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.applications.inception_v3 import InceptionV3
+from tensorflow.contrib.keras.python.keras.applications.resnet50 import ResNet50
+from tensorflow.contrib.keras.python.keras.applications.vgg16 import VGG16
+from tensorflow.contrib.keras.python.keras.applications.vgg19 import VGG19
+from tensorflow.contrib.keras.python.keras.applications.xception import Xception
+
diff --git a/tensorflow/contrib/keras/python/keras/applications/imagenet_utils.py b/tensorflow/contrib/keras/python/keras/applications/imagenet_utils.py
new file mode 100644
index 0000000000..a64021ae49
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/imagenet_utils.py
@@ -0,0 +1,155 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities used by models pre-trained on ImageNet.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+CLASS_INDEX = None
+CLASS_INDEX_PATH = 'https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json'
+
+
+def preprocess_input(x, data_format=None):
+  """Preprocesses a tensor encoding a batch of images.
+
+  Arguments:
+      x: input Numpy tensor, 4D.
+      data_format: data format of the image tensor.
+
+  Returns:
+      Preprocessed tensor.
+  """
+  if data_format is None:
+    data_format = K.image_data_format()
+  assert data_format in {'channels_last', 'channels_first'}
+
+  if data_format == 'channels_first':
+    # 'RGB'->'BGR'
+    x = x[:, ::-1, :, :]
+    # Zero-center by mean pixel
+    x[:, 0, :, :] -= 103.939
+    x[:, 1, :, :] -= 116.779
+    x[:, 2, :, :] -= 123.68
+  else:
+    # 'RGB'->'BGR'
+    x = x[:, :, :, ::-1]
+    # Zero-center by mean pixel
+    x[:, :, :, 0] -= 103.939
+    x[:, :, :, 1] -= 116.779
+    x[:, :, :, 2] -= 123.68
+  return x
+
+
+def decode_predictions(preds, top=5):
+  """Decodes the prediction of an ImageNet model.
+
+  Arguments:
+      preds: Numpy tensor encoding a batch of predictions.
+      top: integer, how many top-guesses to return.
+
+  Returns:
+      A list of lists of top class prediction tuples
+      `(class_name, class_description, score)`.
+      One list of tuples per sample in batch input.
+
+  Raises:
+      ValueError: in case of invalid shape of the `pred` array
+          (must be 2D).
+  """
+  global CLASS_INDEX
+  if len(preds.shape) != 2 or preds.shape[1] != 1000:
+    raise ValueError('`decode_predictions` expects '
+                     'a batch of predictions '
+                     '(i.e. a 2D array of shape (samples, 1000)). '
+                     'Found array with shape: ' + str(preds.shape))
+  if CLASS_INDEX is None:
+    fpath = get_file(
+        'imagenet_class_index.json', CLASS_INDEX_PATH, cache_subdir='models')
+    CLASS_INDEX = json.load(open(fpath))
+  results = []
+  for pred in preds:
+    top_indices = pred.argsort()[-top:][::-1]
+    result = [tuple(CLASS_INDEX[str(i)]) + (pred[i],) for i in top_indices]
+    result.sort(key=lambda x: x[2], reverse=True)
+    results.append(result)
+  return results
+
+
+def _obtain_input_shape(input_shape, default_size, min_size, data_format,
+                        include_top):
+  """Internal utility to compute/validate an ImageNet model's input shape.
+
+  Arguments:
+      input_shape: either None (will return the default network input shape),
+          or a user-provided shape to be validated.
+      default_size: default input width/height for the model.
+      min_size: minimum input width/height accepted by the model.
+      data_format: image data format to use.
+      include_top: whether the model is expected to
+          be linked to a classifier via a Flatten layer.
+
+  Returns:
+      An integer shape tuple (may include None entries).
+
+  Raises:
+      ValueError: in case of invalid argument values.
+  """
+  if data_format == 'channels_first':
+    default_shape = (3, default_size, default_size)
+  else:
+    default_shape = (default_size, default_size, 3)
+  if include_top:
+    if input_shape is not None:
+      if input_shape != default_shape:
+        raise ValueError('When setting`include_top=True`, '
+                         '`input_shape` should be ' + str(default_shape) + '.')
+    input_shape = default_shape
+  else:
+    if data_format == 'channels_first':
+      if input_shape is not None:
+        if len(input_shape) != 3:
+          raise ValueError('`input_shape` must be a tuple of three integers.')
+        if input_shape[0] != 3:
+          raise ValueError('The input must have 3 channels; got '
+                           '`input_shape=' + str(input_shape) + '`')
+        if ((input_shape[1] is not None and input_shape[1] < min_size) or
+            (input_shape[2] is not None and input_shape[2] < min_size)):
+          raise ValueError('Input size must be at least ' + str(min_size) + 'x'
+                           + str(min_size) + ', got '
+                           '`input_shape=' + str(input_shape) + '`')
+      else:
+        input_shape = (3, None, None)
+    else:
+      if input_shape is not None:
+        if len(input_shape) != 3:
+          raise ValueError('`input_shape` must be a tuple of three integers.')
+        if input_shape[-1] != 3:
+          raise ValueError('The input must have 3 channels; got '
+                           '`input_shape=' + str(input_shape) + '`')
+        if ((input_shape[0] is not None and input_shape[0] < min_size) or
+            (input_shape[1] is not None and input_shape[1] < min_size)):
+          raise ValueError('Input size must be at least ' + str(min_size) + 'x'
+                           + str(min_size) + ', got '
+                           '`input_shape=' + str(input_shape) + '`')
+      else:
+        input_shape = (None, None, 3)
+  return input_shape
diff --git a/tensorflow/contrib/keras/python/keras/applications/inception_v3.py b/tensorflow/contrib/keras/python/keras/applications/inception_v3.py
new file mode 100644
index 0000000000..3fc16c88ca
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/inception_v3.py
@@ -0,0 +1,406 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=invalid-name
+"""Inception V3 model for Keras.
+
+Note that the input image format for this model is different than for
+the VGG16 and ResNet models (299x299 instead of 224x224),
+and that the input preprocessing function is also different (same as Xception).
+
+# Reference
+
+- [Rethinking the Inception Architecture for Computer
+Vision](http://arxiv.org/abs/1512.00567)
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import layers
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.engine.topology import get_source_inputs
+from tensorflow.contrib.keras.python.keras.layers import Activation
+from tensorflow.contrib.keras.python.keras.layers import AveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers import BatchNormalization
+from tensorflow.contrib.keras.python.keras.layers import Conv2D
+from tensorflow.contrib.keras.python.keras.layers import Dense
+from tensorflow.contrib.keras.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers import Input
+from tensorflow.contrib.keras.python.keras.layers import MaxPooling2D
+from tensorflow.contrib.keras.python.keras.models import Model
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+from tensorflow.contrib.keras.python.keras.utils.layer_utils import convert_all_kernels_in_model
+
+
+WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels.h5'
+WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'
+
+
+def conv2d_bn(x,
+              filters,
+              num_row,
+              num_col,
+              padding='same',
+              strides=(1, 1),
+              name=None):
+  """Utility function to apply conv + BN.
+
+  Arguments:
+      x: input tensor.
+      filters: filters in `Conv2D`.
+      num_row: height of the convolution kernel.
+      num_col: width of the convolution kernel.
+      padding: padding mode in `Conv2D`.
+      strides: strides in `Conv2D`.
+      name: name of the ops; will become `name + '_conv'`
+          for the convolution and `name + '_bn'` for the
+          batch norm layer.
+
+  Returns:
+      Output tensor after applying `Conv2D` and `BatchNormalization`.
+  """
+  if name is not None:
+    bn_name = name + '_bn'
+    conv_name = name + '_conv'
+  else:
+    bn_name = None
+    conv_name = None
+  if K.image_data_format() == 'channels_first':
+    bn_axis = 1
+  else:
+    bn_axis = 3
+  x = Conv2D(
+      filters, (num_row, num_col),
+      strides=strides,
+      padding=padding,
+      use_bias=False,
+      name=conv_name)(x)
+  x = BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x)
+  x = Activation('relu', name=name)(x)
+  return x
+
+
+def InceptionV3(include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                input_shape=None,
+                pooling=None,
+                classes=1000):
+  """Instantiates the Inception v3 architecture.
+
+  Optionally loads weights pre-trained
+  on ImageNet. Note that when using TensorFlow,
+  for best performance you should set
+  `image_data_format="channels_last"` in your Keras config
+  at ~/.keras/keras.json.
+  The model and the weights are compatible with both
+  TensorFlow and Theano. The data format
+  convention used by the model is the one
+  specified in your Keras config file.
+  Note that the default input image size for this model is 299x299.
+
+  Arguments:
+      include_top: whether to include the fully-connected
+          layer at the top of the network.
+      weights: one of `None` (random initialization)
+          or "imagenet" (pre-training on ImageNet).
+      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+          to use as image input for the model.
+      input_shape: optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(299, 299, 3)` (with `channels_last` data format)
+          or `(3, 299, 299)` (with `channels_first` data format).
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 139.
+          E.g. `(150, 150, 3)` would be one valid value.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model will be
+              the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a 2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+  """
+  if weights not in {'imagenet', None}:
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization) or `imagenet` '
+                     '(pre-training on ImageNet).')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as imagenet with `include_top`'
+                     ' as true, `classes` should be 1000')
+
+  # Determine proper input shape
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=299,
+      min_size=139,
+      data_format=K.image_data_format(),
+      include_top=include_top)
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    img_input = Input(tensor=input_tensor, shape=input_shape)
+
+  if K.image_data_format() == 'channels_first':
+    channel_axis = 1
+  else:
+    channel_axis = 3
+
+  x = conv2d_bn(img_input, 32, 3, 3, strides=(2, 2), padding='valid')
+  x = conv2d_bn(x, 32, 3, 3, padding='valid')
+  x = conv2d_bn(x, 64, 3, 3)
+  x = MaxPooling2D((3, 3), strides=(2, 2))(x)
+
+  x = conv2d_bn(x, 80, 1, 1, padding='valid')
+  x = conv2d_bn(x, 192, 3, 3, padding='valid')
+  x = MaxPooling2D((3, 3), strides=(2, 2))(x)
+
+  # mixed 0, 1, 2: 35 x 35 x 256
+  branch1x1 = conv2d_bn(x, 64, 1, 1)
+
+  branch5x5 = conv2d_bn(x, 48, 1, 1)
+  branch5x5 = conv2d_bn(branch5x5, 64, 5, 5)
+
+  branch3x3dbl = conv2d_bn(x, 64, 1, 1)
+  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
+  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
+
+  branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
+  branch_pool = conv2d_bn(branch_pool, 32, 1, 1)
+  x = layers.concatenate(
+      [branch1x1, branch5x5, branch3x3dbl, branch_pool],
+      axis=channel_axis,
+      name='mixed0')
+
+  # mixed 1: 35 x 35 x 256
+  branch1x1 = conv2d_bn(x, 64, 1, 1)
+
+  branch5x5 = conv2d_bn(x, 48, 1, 1)
+  branch5x5 = conv2d_bn(branch5x5, 64, 5, 5)
+
+  branch3x3dbl = conv2d_bn(x, 64, 1, 1)
+  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
+  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
+
+  branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
+  branch_pool = conv2d_bn(branch_pool, 64, 1, 1)
+  x = layers.concatenate(
+      [branch1x1, branch5x5, branch3x3dbl, branch_pool],
+      axis=channel_axis,
+      name='mixed1')
+
+  # mixed 2: 35 x 35 x 256
+  branch1x1 = conv2d_bn(x, 64, 1, 1)
+
+  branch5x5 = conv2d_bn(x, 48, 1, 1)
+  branch5x5 = conv2d_bn(branch5x5, 64, 5, 5)
+
+  branch3x3dbl = conv2d_bn(x, 64, 1, 1)
+  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
+  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
+
+  branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
+  branch_pool = conv2d_bn(branch_pool, 64, 1, 1)
+  x = layers.concatenate(
+      [branch1x1, branch5x5, branch3x3dbl, branch_pool],
+      axis=channel_axis,
+      name='mixed2')
+
+  # mixed 3: 17 x 17 x 768
+  branch3x3 = conv2d_bn(x, 384, 3, 3, strides=(2, 2), padding='valid')
+
+  branch3x3dbl = conv2d_bn(x, 64, 1, 1)
+  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
+  branch3x3dbl = conv2d_bn(
+      branch3x3dbl, 96, 3, 3, strides=(2, 2), padding='valid')
+
+  branch_pool = MaxPooling2D((3, 3), strides=(2, 2))(x)
+  x = layers.concatenate(
+      [branch3x3, branch3x3dbl, branch_pool], axis=channel_axis, name='mixed3')
+
+  # mixed 4: 17 x 17 x 768
+  branch1x1 = conv2d_bn(x, 192, 1, 1)
+
+  branch7x7 = conv2d_bn(x, 128, 1, 1)
+  branch7x7 = conv2d_bn(branch7x7, 128, 1, 7)
+  branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
+
+  branch7x7dbl = conv2d_bn(x, 128, 1, 1)
+  branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1)
+  branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7)
+  branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1)
+  branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
+
+  branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
+  branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
+  x = layers.concatenate(
+      [branch1x1, branch7x7, branch7x7dbl, branch_pool],
+      axis=channel_axis,
+      name='mixed4')
+
+  # mixed 5, 6: 17 x 17 x 768
+  for i in range(2):
+    branch1x1 = conv2d_bn(x, 192, 1, 1)
+
+    branch7x7 = conv2d_bn(x, 160, 1, 1)
+    branch7x7 = conv2d_bn(branch7x7, 160, 1, 7)
+    branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
+
+    branch7x7dbl = conv2d_bn(x, 160, 1, 1)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1)
+    branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
+
+    branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
+    branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
+    x = layers.concatenate(
+        [branch1x1, branch7x7, branch7x7dbl, branch_pool],
+        axis=channel_axis,
+        name='mixed' + str(5 + i))
+
+  # mixed 7: 17 x 17 x 768
+  branch1x1 = conv2d_bn(x, 192, 1, 1)
+
+  branch7x7 = conv2d_bn(x, 192, 1, 1)
+  branch7x7 = conv2d_bn(branch7x7, 192, 1, 7)
+  branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
+
+  branch7x7dbl = conv2d_bn(x, 192, 1, 1)
+  branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1)
+  branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
+  branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1)
+  branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
+
+  branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
+  branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
+  x = layers.concatenate(
+      [branch1x1, branch7x7, branch7x7dbl, branch_pool],
+      axis=channel_axis,
+      name='mixed7')
+
+  # mixed 8: 8 x 8 x 1280
+  branch3x3 = conv2d_bn(x, 192, 1, 1)
+  branch3x3 = conv2d_bn(branch3x3, 320, 3, 3, strides=(2, 2), padding='valid')
+
+  branch7x7x3 = conv2d_bn(x, 192, 1, 1)
+  branch7x7x3 = conv2d_bn(branch7x7x3, 192, 1, 7)
+  branch7x7x3 = conv2d_bn(branch7x7x3, 192, 7, 1)
+  branch7x7x3 = conv2d_bn(
+      branch7x7x3, 192, 3, 3, strides=(2, 2), padding='valid')
+
+  branch_pool = MaxPooling2D((3, 3), strides=(2, 2))(x)
+  x = layers.concatenate(
+      [branch3x3, branch7x7x3, branch_pool], axis=channel_axis, name='mixed8')
+
+  # mixed 9: 8 x 8 x 2048
+  for i in range(2):
+    branch1x1 = conv2d_bn(x, 320, 1, 1)
+
+    branch3x3 = conv2d_bn(x, 384, 1, 1)
+    branch3x3_1 = conv2d_bn(branch3x3, 384, 1, 3)
+    branch3x3_2 = conv2d_bn(branch3x3, 384, 3, 1)
+    branch3x3 = layers.concatenate(
+        [branch3x3_1, branch3x3_2], axis=channel_axis, name='mixed9_' + str(i))
+
+    branch3x3dbl = conv2d_bn(x, 448, 1, 1)
+    branch3x3dbl = conv2d_bn(branch3x3dbl, 384, 3, 3)
+    branch3x3dbl_1 = conv2d_bn(branch3x3dbl, 384, 1, 3)
+    branch3x3dbl_2 = conv2d_bn(branch3x3dbl, 384, 3, 1)
+    branch3x3dbl = layers.concatenate(
+        [branch3x3dbl_1, branch3x3dbl_2], axis=channel_axis)
+
+    branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
+    branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
+    x = layers.concatenate(
+        [branch1x1, branch3x3, branch3x3dbl, branch_pool],
+        axis=channel_axis,
+        name='mixed' + str(9 + i))
+  if include_top:
+    # Classification block
+    x = GlobalAveragePooling2D(name='avg_pool')(x)
+    x = Dense(classes, activation='softmax', name='predictions')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D()(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D()(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+  # Create model.
+  model = Model(inputs, x, name='inception_v3')
+
+  # load weights
+  if weights == 'imagenet':
+    if K.image_data_format() == 'channels_first':
+      if K.backend() == 'tensorflow':
+        warnings.warn('You are using the TensorFlow backend, yet you '
+                      'are using the Theano '
+                      'image data format convention '
+                      '(`image_data_format="channels_first"`). '
+                      'For best performance, set '
+                      '`image_data_format="channels_last"` in '
+                      'your Keras config '
+                      'at ~/.keras/keras.json.')
+    if include_top:
+      weights_path = get_file(
+          'inception_v3_weights_tf_dim_ordering_tf_kernels.h5',
+          WEIGHTS_PATH,
+          cache_subdir='models',
+          md5_hash='9a0d58056eeedaa3f26cb7ebd46da564')
+    else:
+      weights_path = get_file(
+          'inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5',
+          WEIGHTS_PATH_NO_TOP,
+          cache_subdir='models',
+          md5_hash='bcbd6486424b2319ff4ef7d526e38f63')
+    model.load_weights(weights_path)
+    if K.backend() == 'theano':
+      convert_all_kernels_in_model(model)
+  return model
+
+
+def preprocess_input(x):
+  x /= 255.
+  x -= 0.5
+  x *= 2.
+  return x
diff --git a/tensorflow/contrib/keras/python/keras/applications/inception_v3_test.py b/tensorflow/contrib/keras/python/keras/applications/inception_v3_test.py
new file mode 100644
index 0000000000..586f0da270
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/inception_v3_test.py
@@ -0,0 +1,42 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Inception V3 application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class InceptionV3Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.InceptionV3(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.InceptionV3(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 2048))
+
+  def test_with_pooling(self):
+    model = keras.applications.InceptionV3(weights=None,
+                                           include_top=False,
+                                           pooling='avg')
+    self.assertEqual(model.output_shape, (None, 2048))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/applications/resnet50.py b/tensorflow/contrib/keras/python/keras/applications/resnet50.py
new file mode 100644
index 0000000000..546fcb9433
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/resnet50.py
@@ -0,0 +1,309 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=invalid-name
+"""ResNet50 model for Keras.
+
+# Reference:
+
+- [Deep Residual Learning for Image
+Recognition](https://arxiv.org/abs/1512.03385)
+
+Adapted from code contributed by BigMoyan.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import layers
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.engine.topology import get_source_inputs
+from tensorflow.contrib.keras.python.keras.layers import Activation
+from tensorflow.contrib.keras.python.keras.layers import AveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers import BatchNormalization
+from tensorflow.contrib.keras.python.keras.layers import Conv2D
+from tensorflow.contrib.keras.python.keras.layers import Dense
+from tensorflow.contrib.keras.python.keras.layers import Flatten
+from tensorflow.contrib.keras.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers import Input
+from tensorflow.contrib.keras.python.keras.layers import MaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers import ZeroPadding2D
+from tensorflow.contrib.keras.python.keras.models import Model
+from tensorflow.contrib.keras.python.keras.utils import layer_utils
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5'
+WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
+
+
+def identity_block(input_tensor, kernel_size, filters, stage, block):
+  """The identity block is the block that has no conv layer at shortcut.
+
+  Arguments:
+      input_tensor: input tensor
+      kernel_size: defualt 3, the kernel size of middle conv layer at main path
+      filters: list of integers, the filterss of 3 conv layer at main path
+      stage: integer, current stage label, used for generating layer names
+      block: 'a','b'..., current block label, used for generating layer names
+
+  Returns:
+      Output tensor for the block.
+  """
+  filters1, filters2, filters3 = filters
+  if K.image_data_format() == 'channels_last':
+    bn_axis = 3
+  else:
+    bn_axis = 1
+  conv_name_base = 'res' + str(stage) + block + '_branch'
+  bn_name_base = 'bn' + str(stage) + block + '_branch'
+
+  x = Conv2D(filters1, (1, 1), name=conv_name_base + '2a')(input_tensor)
+  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
+  x = Activation('relu')(x)
+
+  x = Conv2D(
+      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(x)
+  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
+  x = Activation('relu')(x)
+
+  x = Conv2D(filters3, (1, 1), name=conv_name_base + '2c')(x)
+  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
+
+  x = layers.add([x, input_tensor])
+  x = Activation('relu')(x)
+  return x
+
+
+def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
+                                                                          2)):
+  """conv_block is the block that has a conv layer at shortcut.
+
+  Arguments:
+      input_tensor: input tensor
+      kernel_size: defualt 3, the kernel size of middle conv layer at main path
+      filters: list of integers, the filterss of 3 conv layer at main path
+      stage: integer, current stage label, used for generating layer names
+      block: 'a','b'..., current block label, used for generating layer names
+      strides: Tuple of integers.
+
+  Returns:
+      Output tensor for the block.
+
+  Note that from stage 3, the first conv layer at main path is with
+  strides=(2,2)
+  And the shortcut should have strides=(2,2) as well
+  """
+  filters1, filters2, filters3 = filters
+  if K.image_data_format() == 'channels_last':
+    bn_axis = 3
+  else:
+    bn_axis = 1
+  conv_name_base = 'res' + str(stage) + block + '_branch'
+  bn_name_base = 'bn' + str(stage) + block + '_branch'
+
+  x = Conv2D(
+      filters1, (1, 1), strides=strides,
+      name=conv_name_base + '2a')(input_tensor)
+  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
+  x = Activation('relu')(x)
+
+  x = Conv2D(
+      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(x)
+  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
+  x = Activation('relu')(x)
+
+  x = Conv2D(filters3, (1, 1), name=conv_name_base + '2c')(x)
+  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
+
+  shortcut = Conv2D(
+      filters3, (1, 1), strides=strides,
+      name=conv_name_base + '1')(input_tensor)
+  shortcut = BatchNormalization(axis=bn_axis, name=bn_name_base + '1')(shortcut)
+
+  x = layers.add([x, shortcut])
+  x = Activation('relu')(x)
+  return x
+
+
+def ResNet50(include_top=True,
+             weights='imagenet',
+             input_tensor=None,
+             input_shape=None,
+             pooling=None,
+             classes=1000):
+  """Instantiates the ResNet50 architecture.
+
+  Optionally loads weights pre-trained
+  on ImageNet. Note that when using TensorFlow,
+  for best performance you should set
+  `image_data_format="channels_last"` in your Keras config
+  at ~/.keras/keras.json.
+
+  The model and the weights are compatible with both
+  TensorFlow and Theano. The data format
+  convention used by the model is the one
+  specified in your Keras config file.
+
+  Arguments:
+      include_top: whether to include the 3 fully-connected
+          layers at the top of the network.
+      weights: one of `None` (random initialization)
+          or "imagenet" (pre-training on ImageNet).
+      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+          to use as image input for the model.
+      input_shape: optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(224, 224, 3)` (with `channels_last` data format)
+          or `(3, 224, 244)` (with `channels_first` data format).
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 197.
+          E.g. `(200, 200, 3)` would be one valid value.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model will be
+              the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a 2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+  """
+  if weights not in {'imagenet', None}:
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization) or `imagenet` '
+                     '(pre-training on ImageNet).')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as imagenet with `include_top`'
+                     ' as true, `classes` should be 1000')
+
+  # Determine proper input shape
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=224,
+      min_size=197,
+      data_format=K.image_data_format(),
+      include_top=include_top)
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    img_input = Input(tensor=input_tensor, shape=input_shape)
+
+  if K.image_data_format() == 'channels_last':
+    bn_axis = 3
+  else:
+    bn_axis = 1
+
+  x = ZeroPadding2D((3, 3))(img_input)
+  x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1')(x)
+  x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
+  x = Activation('relu')(x)
+  x = MaxPooling2D((3, 3), strides=(2, 2))(x)
+
+  x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
+  x = identity_block(x, 3, [64, 64, 256], stage=2, block='b')
+  x = identity_block(x, 3, [64, 64, 256], stage=2, block='c')
+
+  x = conv_block(x, 3, [128, 128, 512], stage=3, block='a')
+  x = identity_block(x, 3, [128, 128, 512], stage=3, block='b')
+  x = identity_block(x, 3, [128, 128, 512], stage=3, block='c')
+  x = identity_block(x, 3, [128, 128, 512], stage=3, block='d')
+
+  x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
+  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b')
+  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c')
+  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d')
+  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e')
+  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f')
+
+  x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
+  x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
+  x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
+
+  x = AveragePooling2D((7, 7), name='avg_pool')(x)
+
+  if include_top:
+    x = Flatten()(x)
+    x = Dense(classes, activation='softmax', name='fc1000')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D()(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D()(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+  # Create model.
+  model = Model(inputs, x, name='resnet50')
+
+  # load weights
+  if weights == 'imagenet':
+    if include_top:
+      weights_path = get_file(
+          'resnet50_weights_tf_dim_ordering_tf_kernels.h5',
+          WEIGHTS_PATH,
+          cache_subdir='models',
+          md5_hash='a7b3fe01876f51b976af0dea6bc144eb')
+    else:
+      weights_path = get_file(
+          'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
+          WEIGHTS_PATH_NO_TOP,
+          cache_subdir='models',
+          md5_hash='a268eb855778b3df3c7506639542a6af')
+    model.load_weights(weights_path)
+    if K.backend() == 'theano':
+      layer_utils.convert_all_kernels_in_model(model)
+
+    if K.image_data_format() == 'channels_first':
+      if include_top:
+        maxpool = model.get_layer(name='avg_pool')
+        shape = maxpool.output_shape[1:]
+        dense = model.get_layer(name='fc1000')
+        layer_utils.convert_dense_weights_data_format(dense, shape,
+                                                      'channels_first')
+
+      if K.backend() == 'tensorflow':
+        warnings.warn('You are using the TensorFlow backend, yet you '
+                      'are using the Theano '
+                      'image data format convention '
+                      '(`image_data_format="channels_first"`). '
+                      'For best performance, set '
+                      '`image_data_format="channels_last"` in '
+                      'your Keras config '
+                      'at ~/.keras/keras.json.')
+  return model
diff --git a/tensorflow/contrib/keras/python/keras/applications/resnet50_test.py b/tensorflow/contrib/keras/python/keras/applications/resnet50_test.py
new file mode 100644
index 0000000000..0ef701af93
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/resnet50_test.py
@@ -0,0 +1,42 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ResNet50 application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class ResNet50Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.ResNet50(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.ResNet50(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 2048))
+
+  def test_with_pooling(self):
+    model = keras.applications.ResNet50(weights=None,
+                                        include_top=False,
+                                        pooling='avg')
+    self.assertEqual(model.output_shape, (None, 2048))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/applications/vgg16.py b/tensorflow/contrib/keras/python/keras/applications/vgg16.py
new file mode 100644
index 0000000000..7fc393055f
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/vgg16.py
@@ -0,0 +1,223 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=invalid-name
+"""VGG16 model for Keras.
+
+# Reference
+
+- [Very Deep Convolutional Networks for Large-Scale Image
+Recognition](https://arxiv.org/abs/1409.1556)
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.engine.topology import get_source_inputs
+from tensorflow.contrib.keras.python.keras.layers import Conv2D
+from tensorflow.contrib.keras.python.keras.layers import Dense
+from tensorflow.contrib.keras.python.keras.layers import Flatten
+from tensorflow.contrib.keras.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers import Input
+from tensorflow.contrib.keras.python.keras.layers import MaxPooling2D
+from tensorflow.contrib.keras.python.keras.models import Model
+from tensorflow.contrib.keras.python.keras.utils import layer_utils
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5'
+WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5'
+
+
+def VGG16(include_top=True,
+          weights='imagenet',
+          input_tensor=None,
+          input_shape=None,
+          pooling=None,
+          classes=1000):
+  """Instantiates the VGG16 architecture.
+
+  Optionally loads weights pre-trained
+  on ImageNet. Note that when using TensorFlow,
+  for best performance you should set
+  `image_data_format="channels_last"` in your Keras config
+  at ~/.keras/keras.json.
+
+  The model and the weights are compatible with both
+  TensorFlow and Theano. The data format
+  convention used by the model is the one
+  specified in your Keras config file.
+
+  Arguments:
+      include_top: whether to include the 3 fully-connected
+          layers at the top of the network.
+      weights: one of `None` (random initialization)
+          or "imagenet" (pre-training on ImageNet).
+      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+          to use as image input for the model.
+      input_shape: optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(224, 224, 3)` (with `channels_last` data format)
+          or `(3, 224, 244)` (with `channels_first` data format).
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 48.
+          E.g. `(200, 200, 3)` would be one valid value.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model will be
+              the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a 2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+  """
+  if weights not in {'imagenet', None}:
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization) or `imagenet` '
+                     '(pre-training on ImageNet).')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as imagenet with `include_top`'
+                     ' as true, `classes` should be 1000')
+  # Determine proper input shape
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=224,
+      min_size=48,
+      data_format=K.image_data_format(),
+      include_top=include_top)
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    img_input = Input(tensor=input_tensor, shape=input_shape)
+
+  # Block 1
+  x = Conv2D(
+      64, (3, 3), activation='relu', padding='same',
+      name='block1_conv1')(img_input)
+  x = Conv2D(
+      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)
+  x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
+
+  # Block 2
+  x = Conv2D(
+      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
+  x = Conv2D(
+      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)
+  x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
+
+  # Block 3
+  x = Conv2D(
+      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
+  x = Conv2D(
+      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)
+  x = Conv2D(
+      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
+  x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
+
+  # Block 4
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
+  x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
+
+  # Block 5
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x)
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)
+  x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
+
+  if include_top:
+    # Classification block
+    x = Flatten(name='flatten')(x)
+    x = Dense(4096, activation='relu', name='fc1')(x)
+    x = Dense(4096, activation='relu', name='fc2')(x)
+    x = Dense(classes, activation='softmax', name='predictions')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D()(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D()(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+  # Create model.
+  model = Model(inputs, x, name='vgg16')
+
+  # load weights
+  if weights == 'imagenet':
+    if include_top:
+      weights_path = get_file(
+          'vgg16_weights_tf_dim_ordering_tf_kernels.h5',
+          WEIGHTS_PATH,
+          cache_subdir='models')
+    else:
+      weights_path = get_file(
+          'vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5',
+          WEIGHTS_PATH_NO_TOP,
+          cache_subdir='models')
+    model.load_weights(weights_path)
+    if K.backend() == 'theano':
+      layer_utils.convert_all_kernels_in_model(model)
+
+    if K.image_data_format() == 'channels_first':
+      if include_top:
+        maxpool = model.get_layer(name='block5_pool')
+        shape = maxpool.output_shape[1:]
+        dense = model.get_layer(name='fc1')
+        layer_utils.convert_dense_weights_data_format(dense, shape,
+                                                      'channels_first')
+
+      if K.backend() == 'tensorflow':
+        warnings.warn('You are using the TensorFlow backend, yet you '
+                      'are using the Theano '
+                      'image data format convention '
+                      '(`image_data_format="channels_first"`). '
+                      'For best performance, set '
+                      '`image_data_format="channels_last"` in '
+                      'your Keras config '
+                      'at ~/.keras/keras.json.')
+  return model
diff --git a/tensorflow/contrib/keras/python/keras/applications/vgg16_test.py b/tensorflow/contrib/keras/python/keras/applications/vgg16_test.py
new file mode 100644
index 0000000000..d0e707d675
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/vgg16_test.py
@@ -0,0 +1,42 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for VGG16 application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class VGG16Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.VGG16(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.VGG16(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 512))
+
+  def test_with_pooling(self):
+    model = keras.applications.VGG16(weights=None,
+                                     include_top=False,
+                                     pooling='avg')
+    self.assertEqual(model.output_shape, (None, 512))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/applications/vgg19.py b/tensorflow/contrib/keras/python/keras/applications/vgg19.py
new file mode 100644
index 0000000000..f7c2921b5c
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/vgg19.py
@@ -0,0 +1,229 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=invalid-name
+"""VGG19 model for Keras.
+
+# Reference
+
+- [Very Deep Convolutional Networks for Large-Scale Image
+Recognition](https://arxiv.org/abs/1409.1556)
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.engine.topology import get_source_inputs
+from tensorflow.contrib.keras.python.keras.layers import Conv2D
+from tensorflow.contrib.keras.python.keras.layers import Dense
+from tensorflow.contrib.keras.python.keras.layers import Flatten
+from tensorflow.contrib.keras.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers import Input
+from tensorflow.contrib.keras.python.keras.layers import MaxPooling2D
+from tensorflow.contrib.keras.python.keras.models import Model
+from tensorflow.contrib.keras.python.keras.utils import layer_utils
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels.h5'
+WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5'
+
+
+def VGG19(include_top=True,
+          weights='imagenet',
+          input_tensor=None,
+          input_shape=None,
+          pooling=None,
+          classes=1000):
+  """Instantiates the VGG19 architecture.
+
+  Optionally loads weights pre-trained
+  on ImageNet. Note that when using TensorFlow,
+  for best performance you should set
+  `image_data_format="channels_last"` in your Keras config
+  at ~/.keras/keras.json.
+
+  The model and the weights are compatible with both
+  TensorFlow and Theano. The data format
+  convention used by the model is the one
+  specified in your Keras config file.
+
+  Arguments:
+      include_top: whether to include the 3 fully-connected
+          layers at the top of the network.
+      weights: one of `None` (random initialization)
+          or "imagenet" (pre-training on ImageNet).
+      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+          to use as image input for the model.
+      input_shape: optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(224, 224, 3)` (with `channels_last` data format)
+          or `(3, 224, 244)` (with `channels_first` data format).
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 48.
+          E.g. `(200, 200, 3)` would be one valid value.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model will be
+              the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a 2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+  """
+  if weights not in {'imagenet', None}:
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization) or `imagenet` '
+                     '(pre-training on ImageNet).')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as imagenet with `include_top`'
+                     ' as true, `classes` should be 1000')
+  # Determine proper input shape
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=224,
+      min_size=48,
+      data_format=K.image_data_format(),
+      include_top=include_top)
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    img_input = Input(tensor=input_tensor, shape=input_shape)
+
+  # Block 1
+  x = Conv2D(
+      64, (3, 3), activation='relu', padding='same',
+      name='block1_conv1')(img_input)
+  x = Conv2D(
+      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)
+  x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
+
+  # Block 2
+  x = Conv2D(
+      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
+  x = Conv2D(
+      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)
+  x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
+
+  # Block 3
+  x = Conv2D(
+      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
+  x = Conv2D(
+      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)
+  x = Conv2D(
+      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
+  x = Conv2D(
+      256, (3, 3), activation='relu', padding='same', name='block3_conv4')(x)
+  x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
+
+  # Block 4
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block4_conv4')(x)
+  x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
+
+  # Block 5
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x)
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)
+  x = Conv2D(
+      512, (3, 3), activation='relu', padding='same', name='block5_conv4')(x)
+  x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
+
+  if include_top:
+    # Classification block
+    x = Flatten(name='flatten')(x)
+    x = Dense(4096, activation='relu', name='fc1')(x)
+    x = Dense(4096, activation='relu', name='fc2')(x)
+    x = Dense(classes, activation='softmax', name='predictions')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D()(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D()(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+  # Create model.
+  model = Model(inputs, x, name='vgg19')
+
+  # load weights
+  if weights == 'imagenet':
+    if include_top:
+      weights_path = get_file(
+          'vgg19_weights_tf_dim_ordering_tf_kernels.h5',
+          WEIGHTS_PATH,
+          cache_subdir='models')
+    else:
+      weights_path = get_file(
+          'vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5',
+          WEIGHTS_PATH_NO_TOP,
+          cache_subdir='models')
+    model.load_weights(weights_path)
+    if K.backend() == 'theano':
+      layer_utils.convert_all_kernels_in_model(model)
+
+    if K.image_data_format() == 'channels_first':
+      if include_top:
+        maxpool = model.get_layer(name='block5_pool')
+        shape = maxpool.output_shape[1:]
+        dense = model.get_layer(name='fc1')
+        layer_utils.convert_dense_weights_data_format(dense, shape,
+                                                      'channels_first')
+
+      if K.backend() == 'tensorflow':
+        warnings.warn('You are using the TensorFlow backend, yet you '
+                      'are using the Theano '
+                      'image data format convention '
+                      '(`image_data_format="channels_first"`). '
+                      'For best performance, set '
+                      '`image_data_format="channels_last"` in '
+                      'your Keras config '
+                      'at ~/.keras/keras.json.')
+  return model
diff --git a/tensorflow/contrib/keras/python/keras/applications/vgg19_test.py b/tensorflow/contrib/keras/python/keras/applications/vgg19_test.py
new file mode 100644
index 0000000000..f2db0da4f4
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/vgg19_test.py
@@ -0,0 +1,42 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for VGG19 application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class VGG19Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.VGG19(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.VGG19(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 512))
+
+  def test_with_pooling(self):
+    model = keras.applications.VGG19(weights=None,
+                                     include_top=False,
+                                     pooling='avg')
+    self.assertEqual(model.output_shape, (None, 512))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/applications/xception.py b/tensorflow/contrib/keras/python/keras/applications/xception.py
new file mode 100644
index 0000000000..3b08e73514
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/xception.py
@@ -0,0 +1,307 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=invalid-name
+"""Xception V1 model for Keras.
+
+On ImageNet, this model gets to a top-1 validation accuracy of 0.790
+and a top-5 validation accuracy of 0.945.
+
+Do note that the input image format for this model is different than for
+the VGG16 and ResNet models (299x299 instead of 224x224),
+and that the input preprocessing function
+is also different (same as Inception V3).
+
+Also do note that this model is only available for the TensorFlow backend,
+due to its reliance on `SeparableConvolution` layers.
+
+# Reference
+
+- [Xception: Deep Learning with Depthwise Separable
+Convolutions](https://arxiv.org/abs/1610.02357)
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import layers
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.engine.topology import get_source_inputs
+from tensorflow.contrib.keras.python.keras.layers import Activation
+from tensorflow.contrib.keras.python.keras.layers import BatchNormalization
+from tensorflow.contrib.keras.python.keras.layers import Conv2D
+from tensorflow.contrib.keras.python.keras.layers import Dense
+from tensorflow.contrib.keras.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers import Input
+from tensorflow.contrib.keras.python.keras.layers import MaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers import SeparableConv2D
+from tensorflow.contrib.keras.python.keras.models import Model
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+TF_WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels.h5'
+TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels_notop.h5'
+
+
+def Xception(include_top=True,
+             weights='imagenet',
+             input_tensor=None,
+             input_shape=None,
+             pooling=None,
+             classes=1000):
+  """Instantiates the Xception architecture.
+
+  Optionally loads weights pre-trained
+  on ImageNet. This model is available for TensorFlow only,
+  and can only be used with inputs following the TensorFlow
+  data format `(width, height, channels)`.
+  You should set `image_data_format="channels_last"` in your Keras config
+  located at ~/.keras/keras.json.
+
+  Note that the default input image size for this model is 299x299.
+
+  Arguments:
+      include_top: whether to include the fully-connected
+          layer at the top of the network.
+      weights: one of `None` (random initialization)
+          or "imagenet" (pre-training on ImageNet).
+      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+          to use as image input for the model.
+      input_shape: optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(299, 299, 3)`.
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 71.
+          E.g. `(150, 150, 3)` would be one valid value.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model will be
+              the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a 2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+  if weights not in {'imagenet', None}:
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization) or `imagenet` '
+                     '(pre-training on ImageNet).')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as imagenet with `include_top`'
+                     ' as true, `classes` should be 1000')
+
+  if K.backend() != 'tensorflow':
+    raise RuntimeError('The Xception model is only available with '
+                       'the TensorFlow backend.')
+  if K.image_data_format() != 'channels_last':
+    warnings.warn(
+        'The Xception model is only available for the '
+        'input data format "channels_last" '
+        '(width, height, channels). '
+        'However your settings specify the default '
+        'data format "channels_first" (channels, width, height). '
+        'You should set `image_data_format="channels_last"` in your Keras '
+        'config located at ~/.keras/keras.json. '
+        'The model being returned right now will expect inputs '
+        'to follow the "channels_last" data format.')
+    K.set_image_data_format('channels_last')
+    old_data_format = 'channels_first'
+  else:
+    old_data_format = None
+
+  # Determine proper input shape
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=299,
+      min_size=71,
+      data_format=K.image_data_format(),
+      include_top=include_top)
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    img_input = Input(tensor=input_tensor, shape=input_shape)
+
+  x = Conv2D(
+      32, (3, 3), strides=(2, 2), use_bias=False,
+      name='block1_conv1')(img_input)
+  x = BatchNormalization(name='block1_conv1_bn')(x)
+  x = Activation('relu', name='block1_conv1_act')(x)
+  x = Conv2D(64, (3, 3), use_bias=False, name='block1_conv2')(x)
+  x = BatchNormalization(name='block1_conv2_bn')(x)
+  x = Activation('relu', name='block1_conv2_act')(x)
+
+  residual = Conv2D(
+      128, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+  residual = BatchNormalization()(residual)
+
+  x = SeparableConv2D(
+      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv1')(x)
+  x = BatchNormalization(name='block2_sepconv1_bn')(x)
+  x = Activation('relu', name='block2_sepconv2_act')(x)
+  x = SeparableConv2D(
+      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv2')(x)
+  x = BatchNormalization(name='block2_sepconv2_bn')(x)
+
+  x = MaxPooling2D(
+      (3, 3), strides=(2, 2), padding='same', name='block2_pool')(x)
+  x = layers.add([x, residual])
+
+  residual = Conv2D(
+      256, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+  residual = BatchNormalization()(residual)
+
+  x = Activation('relu', name='block3_sepconv1_act')(x)
+  x = SeparableConv2D(
+      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv1')(x)
+  x = BatchNormalization(name='block3_sepconv1_bn')(x)
+  x = Activation('relu', name='block3_sepconv2_act')(x)
+  x = SeparableConv2D(
+      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv2')(x)
+  x = BatchNormalization(name='block3_sepconv2_bn')(x)
+
+  x = MaxPooling2D(
+      (3, 3), strides=(2, 2), padding='same', name='block3_pool')(x)
+  x = layers.add([x, residual])
+
+  residual = Conv2D(
+      728, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+  residual = BatchNormalization()(residual)
+
+  x = Activation('relu', name='block4_sepconv1_act')(x)
+  x = SeparableConv2D(
+      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv1')(x)
+  x = BatchNormalization(name='block4_sepconv1_bn')(x)
+  x = Activation('relu', name='block4_sepconv2_act')(x)
+  x = SeparableConv2D(
+      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv2')(x)
+  x = BatchNormalization(name='block4_sepconv2_bn')(x)
+
+  x = MaxPooling2D(
+      (3, 3), strides=(2, 2), padding='same', name='block4_pool')(x)
+  x = layers.add([x, residual])
+
+  for i in range(8):
+    residual = x
+    prefix = 'block' + str(i + 5)
+
+    x = Activation('relu', name=prefix + '_sepconv1_act')(x)
+    x = SeparableConv2D(
+        728, (3, 3), padding='same', use_bias=False,
+        name=prefix + '_sepconv1')(x)
+    x = BatchNormalization(name=prefix + '_sepconv1_bn')(x)
+    x = Activation('relu', name=prefix + '_sepconv2_act')(x)
+    x = SeparableConv2D(
+        728, (3, 3), padding='same', use_bias=False,
+        name=prefix + '_sepconv2')(x)
+    x = BatchNormalization(name=prefix + '_sepconv2_bn')(x)
+    x = Activation('relu', name=prefix + '_sepconv3_act')(x)
+    x = SeparableConv2D(
+        728, (3, 3), padding='same', use_bias=False,
+        name=prefix + '_sepconv3')(x)
+    x = BatchNormalization(name=prefix + '_sepconv3_bn')(x)
+
+    x = layers.add([x, residual])
+
+  residual = Conv2D(
+      1024, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+  residual = BatchNormalization()(residual)
+
+  x = Activation('relu', name='block13_sepconv1_act')(x)
+  x = SeparableConv2D(
+      728, (3, 3), padding='same', use_bias=False, name='block13_sepconv1')(x)
+  x = BatchNormalization(name='block13_sepconv1_bn')(x)
+  x = Activation('relu', name='block13_sepconv2_act')(x)
+  x = SeparableConv2D(
+      1024, (3, 3), padding='same', use_bias=False, name='block13_sepconv2')(x)
+  x = BatchNormalization(name='block13_sepconv2_bn')(x)
+
+  x = MaxPooling2D(
+      (3, 3), strides=(2, 2), padding='same', name='block13_pool')(x)
+  x = layers.add([x, residual])
+
+  x = SeparableConv2D(
+      1536, (3, 3), padding='same', use_bias=False, name='block14_sepconv1')(x)
+  x = BatchNormalization(name='block14_sepconv1_bn')(x)
+  x = Activation('relu', name='block14_sepconv1_act')(x)
+
+  x = SeparableConv2D(
+      2048, (3, 3), padding='same', use_bias=False, name='block14_sepconv2')(x)
+  x = BatchNormalization(name='block14_sepconv2_bn')(x)
+  x = Activation('relu', name='block14_sepconv2_act')(x)
+
+  if include_top:
+    x = GlobalAveragePooling2D(name='avg_pool')(x)
+    x = Dense(classes, activation='softmax', name='predictions')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D()(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D()(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+  # Create model.
+  model = Model(inputs, x, name='xception')
+
+  # load weights
+  if weights == 'imagenet':
+    if include_top:
+      weights_path = get_file(
+          'xception_weights_tf_dim_ordering_tf_kernels.h5',
+          TF_WEIGHTS_PATH,
+          cache_subdir='models')
+    else:
+      weights_path = get_file(
+          'xception_weights_tf_dim_ordering_tf_kernels_notop.h5',
+          TF_WEIGHTS_PATH_NO_TOP,
+          cache_subdir='models')
+    model.load_weights(weights_path)
+
+  if old_data_format:
+    K.set_image_data_format(old_data_format)
+  return model
+
+
+def preprocess_input(x):
+  x /= 255.
+  x -= 0.5
+  x *= 2.
+  return x
diff --git a/tensorflow/contrib/keras/python/keras/applications/xception_test.py b/tensorflow/contrib/keras/python/keras/applications/xception_test.py
new file mode 100644
index 0000000000..bb3cc1678e
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/xception_test.py
@@ -0,0 +1,42 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Xception application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class XceptionTest(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.Xception(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.Xception(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 2048))
+
+  def test_with_pooling(self):
+    model = keras.applications.Xception(weights=None,
+                                        include_top=False,
+                                        pooling='avg')
+    self.assertEqual(model.output_shape, (None, 2048))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
new file mode 100644
index 0000000000..d35ed45de3
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -0,0 +1,3605 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+# pylint: disable=redefined-outer-name
+# pylint: disable=redefined-builtin
+"""Keras backend API.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import defaultdict
+import json
+import os
+import warnings
+
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_module
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import ctc_ops as ctc
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gradients as gradients_module
+from tensorflow.python.ops import image_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variables as variables_module
+from tensorflow.python.training import moving_averages
+
+
+py_all = all
+py_sum = sum
+
+# INTERNAL UTILS
+
+# This is the default internal TF session used by Keras.
+# It can be set manually via `set_session(sess)`.
+_SESSION = None
+
+# This dictionary holds a mapping {graph: learning_phase}.
+# A learning phase is a bool tensor used to run Keras models in
+# either train mode (learning_phase == 1) or test mode (learning_phase == 0).
+_GRAPH_LEARNING_PHASES = {}
+
+# This dictionary holds a mapping {graph: UID_DICT}.
+# each UID_DICT is a dictionary mapping name prefixes to a current index,
+# used for generatic graph-specific string UIDs
+# for various names (e.g. layer names).
+_GRAPH_UID_DICTS = {}
+
+# This boolean flag can be set to True to leave variable initialization
+# up to the user.
+# Change its value via `manual_variable_initialization(value)`.
+_MANUAL_VAR_INIT = False
+
+# The type of float to use throughout a session.
+_FLOATX = 'float32'
+
+# Epsilon fuzz factor used throughout the codebase.
+_EPSILON = 10e-8
+
+# Default image data format, one of "channels_last", "channels_first".
+_IMAGE_DATA_FORMAT = 'channels_last'
+
+
+def backend():
+  """Publicly accessible method for determining the current backend.
+
+  Only exists for API compatibily with multi-backend Keras.
+
+  Returns:
+      The string "tensorflow".
+  """
+  return 'tensorflow'
+
+
+def epsilon():
+  """Returns the value of the fuzz factor used in numeric expressions.
+
+  Returns:
+      A float.
+
+  Example:
+  ```python
+      >>> keras.backend.epsilon()
+      1e-08
+  ```
+  """
+  return _EPSILON
+
+
+def set_epsilon(value):
+  """Sets the value of the fuzz factor used in numeric expressions.
+
+  Arguments:
+      value: float. New value of epsilon.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> K.epsilon()
+      1e-08
+      >>> K.set_epsilon(1e-05)
+      >>> K.epsilon()
+      1e-05
+  ```
+  """
+  global _EPSILON
+  _EPSILON = value
+
+
+def floatx():
+  """Returns the default float type, as a string.
+
+  E.g. 'float16', 'float32', 'float64'.
+
+  Returns:
+      String, the current default float type.
+
+  Example:
+  ```python
+      >>> keras.backend.floatx()
+      'float32'
+  ```
+  """
+  return _FLOATX
+
+
+def set_floatx(value):
+  """Sets the default float type.
+
+  Arguments:
+      value: String; 'float16', 'float32', or 'float64'.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> K.floatx()
+      'float32'
+      >>> K.set_floatx('float16')
+      >>> K.floatx()
+      'float16'
+  ```
+
+  Raises:
+      ValueError: In case of invalid value.
+  """
+  global _FLOATX
+  if value not in {'float16', 'float32', 'float64'}:
+    raise ValueError('Unknown floatx type: ' + str(value))
+  _FLOATX = str(value)
+
+
+def cast_to_floatx(x):
+  """Cast a Numpy array to the default Keras float type.
+
+  Arguments:
+      x: Numpy array.
+
+  Returns:
+      The same Numpy array, cast to its new type.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> K.floatx()
+      'float32'
+      >>> arr = numpy.array([1.0, 2.0], dtype='float64')
+      >>> arr.dtype
+      dtype('float64')
+      >>> new_arr = K.cast_to_floatx(arr)
+      >>> new_arr
+      array([ 1.,  2.], dtype=float32)
+      >>> new_arr.dtype
+      dtype('float32')
+  ```
+  """
+  return np.asarray(x, dtype=_FLOATX)
+
+
+def image_data_format():
+  """Returns the default image data format convention.
+
+  Returns:
+      A string, either `'channels_first'` or `'channels_last'`
+
+  Example:
+  ```python
+      >>> keras.backend.image_data_format()
+      'channels_first'
+  ```
+  """
+  return _IMAGE_DATA_FORMAT
+
+
+def set_image_data_format(data_format):
+  """Sets the value of the image data format convention.
+
+  Arguments:
+      data_format: string. `'channels_first'` or `'channels_last'`.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> K.image_data_format()
+      'channels_first'
+      >>> K.set_image_data_format('channels_last')
+      >>> K.image_data_format()
+      'channels_last'
+  ```
+
+  Raises:
+      ValueError: In case of invalid `data_format` value.
+  """
+  global _IMAGE_DATA_FORMAT
+  if data_format not in {'channels_last', 'channels_first'}:
+    raise ValueError('Unknown data_format:', data_format)
+  _IMAGE_DATA_FORMAT = str(data_format)
+
+
+def get_uid(prefix=''):
+  global _GRAPH_UID_DICTS  # pylint: disable=global-variable-not-assigned
+  graph = ops.get_default_graph()
+  if graph not in _GRAPH_UID_DICTS:
+    _GRAPH_UID_DICTS[graph] = defaultdict(int)
+  _GRAPH_UID_DICTS[graph][prefix] += 1
+  return _GRAPH_UID_DICTS[graph][prefix]
+
+
+def reset_uids():
+  global _GRAPH_UID_DICTS
+  _GRAPH_UID_DICTS = {}
+
+
+def clear_session():
+  """Destroys the current TF graph and creates a new one.
+
+  Useful to avoid clutter from old models / layers.
+  """
+  global _SESSION
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
+  ops.reset_default_graph()
+  reset_uids()
+  _SESSION = None
+  phase = array_ops.placeholder(dtype='bool', name='keras_learning_phase')
+  _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = phase
+
+
+def manual_variable_initialization(value):
+  """Sets the manual variable initialization flag.
+
+  This boolean flag determines whether
+  variables should be initialized
+  as they are instantiated (default), or if
+  the user should handle the initialization
+  (e.g. via `tf.initialize_all_variables()`).
+
+  Arguments:
+      value: Python boolean.
+  """
+  global _MANUAL_VAR_INIT
+  _MANUAL_VAR_INIT = value
+
+
+def learning_phase():
+  """Returns the learning phase flag.
+
+  The learning phase flag is a bool tensor (0 = test, 1 = train)
+  to be passed as input to any Keras function
+  that uses a different behavior at train time and test time.
+
+  Returns:
+      Learning phase (scalar integer tensor or Python integer).
+  """
+  graph = ops.get_default_graph()
+  if graph not in _GRAPH_LEARNING_PHASES:
+    phase = array_ops.placeholder(dtype='bool', name='keras_learning_phase')
+    _GRAPH_LEARNING_PHASES[graph] = phase
+  return _GRAPH_LEARNING_PHASES[graph]
+
+
+def set_learning_phase(value):
+  """Sets the learning phase to a fixed value.
+
+  Arguments:
+      value: Learning phase value, either 0 or 1 (integers).
+
+  Raises:
+      ValueError: if `value` is neither `0` nor `1`.
+  """
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
+  if value not in {0, 1}:
+    raise ValueError('Expected learning phase to be ' '0 or 1.')
+  _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value
+
+
+def get_session():
+  """Returns the TF session to be used by the backend.
+
+  If a default TensorFlow session is available, we will return it.
+
+  Else, we will return the global Keras session.
+
+  If no global Keras session exists at this point:
+  we will create a new global session.
+
+  Note that you can manually set the global session
+  via `K.set_session(sess)`.
+
+  Returns:
+      A TensorFlow session.
+  """
+  global _SESSION
+  if ops.get_default_session() is not None:
+    session = ops.get_default_session()
+  else:
+    if _SESSION is None:
+      if not os.environ.get('OMP_NUM_THREADS'):
+        config = config_pb2.ConfigProto(allow_soft_placement=True)
+      else:
+        num_thread = int(os.environ.get('OMP_NUM_THREADS'))
+        config = config_pb2.ConfigProto(
+            intra_op_parallelism_threads=num_thread, allow_soft_placement=True)
+      _SESSION = session_module.Session(config=config)
+    session = _SESSION
+  if not _MANUAL_VAR_INIT:
+    _initialize_variables()
+  return session
+
+
+def set_session(session):
+  """Sets the global TensorFlow session.
+
+  Arguments:
+      session: A TF Session.
+  """
+  global _SESSION
+  _SESSION = session
+
+
+# VARIABLE MANIPULATION
+
+
+def _convert_string_dtype(dtype):
+  if dtype == 'float16':
+    return dtypes_module.float16
+  if dtype == 'float32':
+    return dtypes_module.float32
+  elif dtype == 'float64':
+    return dtypes_module.float64
+  elif dtype == 'int16':
+    return dtypes_module.int16
+  elif dtype == 'int32':
+    return dtypes_module.int32
+  elif dtype == 'int64':
+    return dtypes_module.int64
+  elif dtype == 'uint8':
+    return dtypes_module.int8
+  elif dtype == 'uint16':
+    return dtypes_module.uint16
+  else:
+    raise ValueError('Unsupported dtype:', dtype)
+
+
+def _to_tensor(x, dtype):
+  x = ops.convert_to_tensor(x)
+  if x.dtype != dtype:
+    x = math_ops.cast(x, dtype)
+  return x
+
+
+def is_sparse(tensor):
+  """Returns whether a tensor is a sparse tensor.
+
+  Arguments:
+      tensor: A tensor instance.
+
+  Returns:
+      A boolean.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> a = K.placeholder((2, 2), sparse=False)
+      >>> print(K.is_sparse(a))
+      False
+      >>> b = K.placeholder((2, 2), sparse=True)
+      >>> print(K.is_sparse(b))
+      True
+  ```
+  """
+  return isinstance(tensor, sparse_tensor.SparseTensor)
+
+
+def to_dense(tensor):
+  """Converts a sparse tensor into a dense tensor and returns it.
+
+  Arguments:
+      tensor: A tensor instance (potentially sparse).
+
+  Returns:
+      A dense tensor.
+
+  Examples:
+  ```python
+      >>> from keras import backend as K
+      >>> b = K.placeholder((2, 2), sparse=True)
+      >>> print(K.is_sparse(b))
+      True
+      >>> c = K.to_dense(b)
+      >>> print(K.is_sparse(c))
+      False
+  ```
+  """
+  if is_sparse(tensor):
+    return sparse_ops.sparse_tensor_to_dense(tensor)
+  else:
+    return tensor
+
+
+name_scope = ops.name_scope
+
+
+def variable(value, dtype=None, name=None):
+  """Instantiates a variable and returns it.
+
+  Arguments:
+      value: Numpy array, initial value of the tensor.
+      dtype: Tensor type.
+      name: Optional name string for the tensor.
+
+  Returns:
+      A variable instance (with Keras metadata included).
+
+  Examples:
+  ```python
+      >>> from keras import backend as K
+      >>> val = np.array([[1, 2], [3, 4]])
+      >>> kvar = K.variable(value=val, dtype='float64', name='example_var')
+      >>> K.dtype(kvar)
+      'float64'
+      >>> print(kvar)
+      example_var
+      >>> kvar.eval()
+      array([[ 1.,  2.],
+             [ 3.,  4.]])
+  ```
+  """
+  if dtype is None:
+    dtype = floatx()
+  if hasattr(value, 'tocoo'):
+    sparse_coo = value.tocoo()
+    indices = np.concatenate((np.expand_dims(sparse_coo.row, 1), np.expand_dims(
+        sparse_coo.col, 1)), 1)
+    v = sparse_tensor.SparseTensor(
+        indices=indices, values=sparse_coo.data, dense_shape=sparse_coo.shape)
+    v._uses_learning_phase = False
+    return v
+  v = variables_module.Variable(
+      value, dtype=_convert_string_dtype(dtype), name=name)
+  v._uses_learning_phase = False
+  return v
+
+
+def _initialize_variables():
+  """Utility to initialize uninitialized variables on the fly.
+  """
+  variables = variables_module.global_variables()
+  uninitialized_variables = []
+  for v in variables:
+    if not hasattr(v, '_keras_initialized') or not v._keras_initialized:
+      uninitialized_variables.append(v)
+      v._keras_initialized = True
+  if uninitialized_variables:
+    sess = get_session()
+    sess.run(variables_module.variables_initializer(uninitialized_variables))
+
+
+def constant(value, dtype=None, shape=None, name=None):
+  if dtype is None:
+    dtype = floatx()
+  return constant_op.constant(value, dtype=dtype, shape=shape, name=name)
+
+
+def placeholder(shape=None, ndim=None, dtype=None, sparse=False, name=None):
+  """Instantiates a placeholder tensor and returns it.
+
+  Arguments:
+      shape: Shape of the placeholder
+          (integer tuple, may include `None` entries).
+      ndim: Number of axes of the tensor.
+          At least one of {`shape`, `ndim`} must be specified.
+          If both are specified, `shape` is used.
+      dtype: Placeholder type.
+      sparse: Boolean, whether the placeholder should have a sparse type.
+      name: Optional name string for the placeholder.
+
+  Returns:
+      Tensor instance (with Keras metadata included).
+
+  Examples:
+  ```python
+      >>> from keras import backend as K
+      >>> input_ph = K.placeholder(shape=(2, 4, 5))
+      >>> input_ph
+      <tf.Tensor 'Placeholder_4:0' shape=(2, 4, 5) dtype=float32>
+  ```
+  """
+  if dtype is None:
+    dtype = floatx()
+  if not shape:
+    if ndim:
+      shape = tuple([None for _ in range(ndim)])
+  if sparse:
+    x = array_ops.sparse_placeholder(dtype, shape=shape, name=name)
+  else:
+    x = array_ops.placeholder(dtype, shape=shape, name=name)
+  x._uses_learning_phase = False
+  return x
+
+
+def shape(x):
+  """Returns the symbolic shape of a tensor or variable.
+
+  Arguments:
+      x: A tensor or variable.
+
+  Returns:
+      A symbolic shape (which is itself a tensor).
+
+  Examples:
+  ```
+      # TensorFlow example
+      >>> from keras import backend as K
+      >>> tf_session = K.get_session()
+      >>> val = np.array([[1, 2], [3, 4]])
+      >>> kvar = K.variable(value=val)
+      >>> input = keras.backend.placeholder(shape=(2, 4, 5))
+      >>> K.shape(kvar)
+      <tf.Tensor 'Shape_8:0' shape=(2,) dtype=int32>
+      >>> K.shape(input)
+      <tf.Tensor 'Shape_9:0' shape=(3,) dtype=int32>
+      # To get integer shape (Instead, you can use K.int_shape(x))
+      >>> K.shape(kvar).eval(session=tf_session)
+      array([2, 2], dtype=int32)
+      >>> K.shape(input).eval(session=tf_session)
+      array([2, 4, 5], dtype=int32)
+  ```
+  """
+  return array_ops.shape(x)
+
+
+def int_shape(x):
+  """Returns the shape tensor or variable as a tuple of int or None entries.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tuple of integers (or None entries).
+
+  Examples:
+  ```python
+      >>> from keras import backend as K
+      >>> input = K.placeholder(shape=(2, 4, 5))
+      >>> K.int_shape(input)
+      (2, 4, 5)
+      >>> val = np.array([[1, 2], [3, 4]])
+      >>> kvar = K.variable(value=val)
+      >>> K.int_shape(kvar)
+      (2, 2)
+  ```
+  """
+  shape = x.get_shape()
+  try:
+    return tuple([i.__int__() for i in shape])
+  except ValueError:
+    return None
+
+
+def ndim(x):
+  """Returns the number of axes in a tensor, as an integer.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      Integer (scalar), number of axes.
+
+  Examples:
+  ```python
+      >>> from keras import backend as K
+      >>> input = K.placeholder(shape=(2, 4, 5))
+      >>> val = np.array([[1, 2], [3, 4]])
+      >>> kvar = K.variable(value=val)
+      >>> K.ndim(input)
+      3
+      >>> K.ndim(kvar)
+      2
+  ```
+  """
+  dims = x.get_shape()._dims
+  if dims is not None:
+    return len(dims)
+  return None
+
+
+def dtype(x):
+  """Returns the dtype of a Keras tensor or variable, as a string.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      String, dtype of `x`.
+
+  Examples:
+  ```python
+      >>> from keras import backend as K
+      >>> K.dtype(K.placeholder(shape=(2,4,5)))
+      'float32'
+      >>> K.dtype(K.placeholder(shape=(2,4,5), dtype='float32'))
+      'float32'
+      >>> K.dtype(K.placeholder(shape=(2,4,5), dtype='float64'))
+      'float64'
+      # Keras variable
+      >>> kvar = K.variable(np.array([[1, 2], [3, 4]]))
+      >>> K.dtype(kvar)
+      'float32_ref'
+      >>> kvar = K.variable(np.array([[1, 2], [3, 4]]), dtype='float32')
+      >>> K.dtype(kvar)
+      'float32_ref'
+  ```
+  """
+  return x.dtype.name
+
+
+def eval(x):
+  """Evaluates the value of a variable.
+
+  Arguments:
+      x: A variable.
+
+  Returns:
+      A Numpy array.
+
+  Examples:
+  ```python
+      >>> from keras import backend as K
+      >>> kvar = K.variable(np.array([[1, 2], [3, 4]]), dtype='float32')
+      >>> K.eval(kvar)
+      array([[ 1.,  2.],
+             [ 3.,  4.]], dtype=float32)
+  ```
+  """
+  return to_dense(x).eval(session=get_session())
+
+
+def zeros(shape, dtype=None, name=None):
+  """Instantiates an all-zeros variable and returns it.
+
+  Arguments:
+      shape: Tuple of integers, shape of returned Keras variable
+      dtype: String, data type of returned Keras variable
+      name: String, name of returned Keras variable
+
+  Returns:
+      A variable (including Keras metadata), filled with `0.0`.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> kvar = K.zeros((3,4))
+      >>> K.eval(kvar)
+      array([[ 0.,  0.,  0.,  0.],
+             [ 0.,  0.,  0.,  0.],
+             [ 0.,  0.,  0.,  0.]], dtype=float32)
+  ```
+  """
+  if dtype is None:
+    dtype = floatx()
+  shape = tuple(map(int, shape))
+  tf_dtype = _convert_string_dtype(dtype)
+  return variable(
+      init_ops.constant_initializer(0., dtype=tf_dtype)(shape), dtype, name)
+
+
+def ones(shape, dtype=None, name=None):
+  """Instantiates an all-ones tensor variable and returns it.
+
+  Arguments:
+      shape: Tuple of integers, shape of returned Keras variable.
+      dtype: String, data type of returned Keras variable.
+      name: String, name of returned Keras variable.
+
+  Returns:
+      A Keras variable, filled with `1.0`.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> kvar = K.ones((3,4))
+      >>> K.eval(kvar)
+      array([[ 1.,  1.,  1.,  1.],
+             [ 1.,  1.,  1.,  1.],
+             [ 1.,  1.,  1.,  1.]], dtype=float32)
+  ```
+  """
+  if dtype is None:
+    dtype = floatx()
+  shape = tuple(map(int, shape))
+  tf_dtype = _convert_string_dtype(dtype)
+  return variable(
+      init_ops.constant_initializer(1., dtype=tf_dtype)(shape), dtype, name)
+
+
+def eye(size, dtype=None, name=None):
+  """Instantiate an identity matrix and returns it.
+
+  Arguments:
+      size: Integer, number of rows/columns.
+      dtype: String, data type of returned Keras variable.
+      name: String, name of returned Keras variable.
+
+  Returns:
+      A Keras variable, an identity matrix.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> kvar = K.eye(3)
+      >>> K.eval(kvar)
+      array([[ 1.,  0.,  0.],
+             [ 0.,  1.,  0.],
+             [ 0.,  0.,  1.]], dtype=float32)
+  ```
+
+  """
+  return variable(np.eye(size), dtype, name)
+
+
+def zeros_like(x, dtype=None, name=None):
+  """Instantiates an all-zeros variable of the same shape as another tensor.
+
+  Arguments:
+      x: Keras variable or Keras tensor.
+      dtype: String, dtype of returned Keras variable.
+           None uses the dtype of x.
+      name: String, name for the variable to create.
+
+  Returns:
+      A Keras variable with the shape of x filled with zeros.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> kvar = K.variable(np.random.random((2,3)))
+      >>> kvar_zeros = K.zeros_like(kvar)
+      >>> K.eval(kvar_zeros)
+      array([[ 0.,  0.,  0.],
+             [ 0.,  0.,  0.]], dtype=float32)
+  ```
+  """
+  return array_ops.zeros_like(x, dtype=dtype, name=name)
+
+
+def ones_like(x, dtype=None, name=None):
+  """Instantiates an all-ones variable of the same shape as another tensor.
+
+  Arguments:
+      x: Keras variable or tensor.
+      dtype: String, dtype of returned Keras variable.
+           None uses the dtype of x.
+      name: String, name for the variable to create.
+
+  Returns:
+      A Keras variable with the shape of x filled with ones.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> kvar = K.variable(np.random.random((2,3)))
+      >>> kvar_ones = K.ones_like(kvar)
+      >>> K.eval(kvar_ones)
+      array([[ 1.,  1.,  1.],
+             [ 1.,  1.,  1.]], dtype=float32)
+  ```
+  """
+  return array_ops.ones_like(x, dtype=dtype, name=name)
+
+
+def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None):
+  """Instantiates a variable with values drawn from a uniform distribution.
+
+  Arguments:
+      shape: Tuple of integers, shape of returned Keras variable.
+      low: Float, lower boundary of the output interval.
+      high: Float, upper boundary of the output interval.
+      dtype: String, dtype of returned Keras variable.
+      name: String, name of returned Keras variable.
+      seed: Integer, random seed.
+
+  Returns:
+      A Keras variable, filled with drawn samples.
+
+  Example:
+  ```python
+      # TensorFlow example
+      >>> kvar = K.random_uniform_variable((2,3), 0, 1)
+      >>> kvar
+      <tensorflow.python.ops.variables.Variable object at 0x10ab40b10>
+      >>> K.eval(kvar)
+      array([[ 0.10940075,  0.10047495,  0.476143  ],
+             [ 0.66137183,  0.00869417,  0.89220798]], dtype=float32)
+  ```
+  """
+  if dtype is None:
+    dtype = floatx()
+  shape = tuple(map(int, shape))
+  tf_dtype = _convert_string_dtype(dtype)
+  if seed is None:
+    # ensure that randomness is conditioned by the Numpy RNG
+    seed = np.random.randint(10e8)
+  value = init_ops.random_uniform_initializer(
+      low, high, dtype=tf_dtype, seed=seed)(shape)
+  return variable(value, dtype=dtype, name=name)
+
+
+def random_normal_variable(shape, mean, scale, dtype=None, name=None,
+                           seed=None):
+  """Instantiates a variable with values drawn from a normal distribution.
+
+  Arguments:
+      shape: Tuple of integers, shape of returned Keras variable.
+      mean: Float, mean of the normal distribution.
+      scale: Float, standard deviation of the normal distribution.
+      dtype: String, dtype of returned Keras variable.
+      name: String, name of returned Keras variable.
+      seed: Integer, random seed.
+
+  Returns:
+      A Keras variable, filled with drawn samples.
+
+  Example:
+  ```python
+      # TensorFlow example
+      >>> kvar = K.random_normal_variable((2,3), 0, 1)
+      >>> kvar
+      <tensorflow.python.ops.variables.Variable object at 0x10ab12dd0>
+      >>> K.eval(kvar)
+      array([[ 1.19591331,  0.68685907, -0.63814116],
+             [ 0.92629528,  0.28055015,  1.70484698]], dtype=float32)
+  ```
+  """
+  if dtype is None:
+    dtype = floatx()
+  shape = tuple(map(int, shape))
+  tf_dtype = _convert_string_dtype(dtype)
+  if seed is None:
+    # ensure that randomness is conditioned by the Numpy RNG
+    seed = np.random.randint(10e8)
+  value = init_ops.random_normal_initializer(
+      mean, scale, dtype=tf_dtype, seed=seed)(shape)
+  return variable(value, dtype=dtype, name=name)
+
+
+def count_params(x):
+  """Returns the number of scalars in a Keras variable.
+
+  Arguments:
+      x: Keras variable.
+
+  Returns:
+      Integer, the number of scalars in `x`.
+
+  Example:
+  ```python
+      >>> kvar = K.zeros((2,3))
+      >>> K.count_params(kvar)
+      6
+      >>> K.eval(kvar)
+      array([[ 0.,  0.,  0.],
+             [ 0.,  0.,  0.]], dtype=float32)
+  ```
+  """
+  shape = x.get_shape()
+  return np.prod([shape[i]._value for i in range(len(shape))])
+
+
+def cast(x, dtype):
+  """Casts a tensor to a different dtype and returns it.
+
+  You can cast a Keras variable but it still returns a Keras tensor.
+
+  Arguments:
+      x: Keras tensor (or variable).
+      dtype: String, either (`'float16'`, `'float32'`, or `'float64'`).
+
+  Returns:
+      Keras tensor with dtype `dtype`.
+
+  Example:
+  ```python
+      >>> from keras import backend as K
+      >>> input = K.placeholder((2, 3), dtype='float32')
+      >>> input
+      <tf.Tensor 'Placeholder_2:0' shape=(2, 3) dtype=float32>
+      # It doesn't work in-place as below.
+      >>> K.cast(input, dtype='float16')
+      <tf.Tensor 'Cast_1:0' shape=(2, 3) dtype=float16>
+      >>> input
+      <tf.Tensor 'Placeholder_2:0' shape=(2, 3) dtype=float32>
+      # you need to assign it.
+      >>> input = K.cast(input, dtype='float16')
+      >>> input
+      <tf.Tensor 'Cast_2:0' shape=(2, 3) dtype=float16>
+  ```
+  """
+  return math_ops.cast(x, dtype)
+
+
+# UPDATES OPS
+
+
+def update(x, new_x):
+  return state_ops.assign(x, new_x)
+
+
+def update_add(x, increment):
+  return state_ops.assign_add(x, increment)
+
+
+def update_sub(x, decrement):
+  return state_ops.assign_sub(x, decrement)
+
+
+def moving_average_update(x, value, momentum):
+  return moving_averages.assign_moving_average(
+      x, value, momentum, zero_debias=False)
+
+
+# LINEAR ALGEBRA
+
+
+def dot(x, y):
+  """Multiplies 2 tensors (and/or variables) and returns a *tensor*.
+
+  When attempting to multiply a nD tensor
+  with a nD tensor, it reproduces the Theano behavior.
+  (e.g. `(2, 3) * (4, 3, 5) -> (2, 4, 5)`)
+
+  Arguments:
+      x: Tensor or variable.
+      y: Tensor or variable.
+
+  Returns:
+      A tensor, dot product of `x` and `y`.
+
+  Examples:
+  ```python
+      # dot product between tensors
+      >>> x = K.placeholder(shape=(2, 3))
+      >>> y = K.placeholder(shape=(3, 4))
+      >>> xy = K.dot(x, y)
+      >>> xy
+      <tf.Tensor 'MatMul_9:0' shape=(2, 4) dtype=float32>
+  ```
+
+  ```python
+      # dot product between tensors
+      >>> x = K.placeholder(shape=(32, 28, 3))
+      >>> y = K.placeholder(shape=(3, 4))
+      >>> xy = K.dot(x, y)
+      >>> xy
+      <tf.Tensor 'MatMul_9:0' shape=(32, 28, 4) dtype=float32>
+  ```
+
+  ```python
+      # Theano-like behavior example
+      >>> x = K.random_uniform_variable(shape=(2, 3), low=0, high=1)
+      >>> y = K.ones((4, 3, 5))
+      >>> xy = K.dot(x, y)
+      >>> K.int_shape(xy)
+      (2, 4, 5)
+  ```
+  """
+  if ndim(x) is not None and (ndim(x) > 2 or ndim(y) > 2):
+    x_shape = []
+    for i, s in zip(int_shape(x), array_ops.unstack(array_ops.shape(x))):
+      if i is not None:
+        x_shape.append(i)
+      else:
+        x_shape.append(s)
+    x_shape = tuple(x_shape)
+    y_shape = []
+    for i, s in zip(int_shape(y), array_ops.unstack(array_ops.shape(y))):
+      if i is not None:
+        y_shape.append(i)
+      else:
+        y_shape.append(s)
+    y_shape = tuple(y_shape)
+    y_permute_dim = list(range(ndim(y)))
+    y_permute_dim = [y_permute_dim.pop(-2)] + y_permute_dim
+    xt = array_ops.reshape(x, [-1, x_shape[-1]])
+    yt = array_ops.reshape(
+        array_ops.transpose(y, perm=y_permute_dim), [y_shape[-2], -1])
+    return array_ops.reshape(
+        math_ops.matmul(xt, yt), x_shape[:-1] + y_shape[:-2] + y_shape[-1:])
+  if is_sparse(x):
+    out = sparse_ops.sparse_tensor_dense_matmul(x, y)
+  else:
+    out = math_ops.matmul(x, y)
+  return out
+
+
+def batch_dot(x, y, axes=None):
+  """Batchwise dot product.
+
+  `batch_dot` is used to compute dot product of `x` and `y` when
+  `x` and `y` are data in batch, i.e. in a shape of
+  `(batch_size, :)`.
+  `batch_dot` results in a tensor or variable with less dimensions
+  than the input. If the number of dimensions is reduced to 1,
+  we use `expand_dims` to make sure that ndim is at least 2.
+
+  Arguments:
+      x: Keras tensor or variable with `ndim >= 2`.
+      y: Keras tensor or variable with `ndim >= 2`.
+      axes: list of (or single) int with target dimensions.
+          The lengths of `axes[0]` and `axes[1]` should be the same.
+
+  Returns:
+      A tensor with shape equal to the concatenation of `x`'s shape
+      (less the dimension that was summed over) and `y`'s shape
+      (less the batch dimension and the dimension that was summed over).
+      If the final rank is 1, we reshape it to `(batch_size, 1)`.
+
+  Examples:
+      Assume `x = [[1, 2], [3, 4]]` and `y = [[5, 6], [7, 8]]`
+      `batch_dot(x, y, axes=1) = [[17, 53]]` which is the main diagonal
+      of `x.dot(y.T)`, although we never have to calculate the off-diagonal
+      elements.
+
+      Shape inference:
+      Let `x`'s shape be `(100, 20)` and `y`'s shape be `(100, 30, 20)`.
+      If `axes` is (1, 2), to find the output shape of resultant tensor,
+          loop through each dimension in `x`'s shape and `y`'s shape:
+
+      * `x.shape[0]` : 100 : append to output shape
+      * `x.shape[1]` : 20 : do not append to output shape,
+          dimension 1 of `x` has been summed over. (`dot_axes[0]` = 1)
+      * `y.shape[0]` : 100 : do not append to output shape,
+          always ignore first dimension of `y`
+      * `y.shape[1]` : 30 : append to output shape
+      * `y.shape[2]` : 20 : do not append to output shape,
+          dimension 2 of `y` has been summed over. (`dot_axes[1]` = 2)
+      `output_shape` = `(100, 30)`
+
+  ```python
+      >>> x_batch = K.ones(shape=(32, 20, 1))
+      >>> y_batch = K.ones(shape=(32, 30, 20))
+      >>> xy_batch_dot = K.batch_dot(x_batch, y_batch, axes=[1, 2])
+      >>> K.int_shape(xy_batch_dot)
+      (32, 1, 30)
+  ```
+  """
+  if isinstance(axes, int):
+    axes = (axes, axes)
+  if ndim(x) == 2 and ndim(y) == 2:
+    if axes[0] == axes[1]:
+      out = math_ops.reduce_sum(math_ops.multiply(x, y), axes[0])
+    else:
+      out = math_ops.reduce_sum(
+          math_ops.multiply(array_ops.transpose(x, [1, 0]), y), axes[1])
+  else:
+    if axes is not None:
+      adj_x = None if axes[0] == ndim(x) - 1 else True
+      adj_y = True if axes[1] == ndim(y) - 1 else None
+    else:
+      adj_x = None
+      adj_y = None
+    out = math_ops.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y)
+  if ndim(out) == 1:
+    out = expand_dims(out, 1)
+  return out
+
+
+def transpose(x):
+  """Transposes a tensor and returns it.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tensor.
+
+  Examples:
+  ```python
+      >>> var = K.variable([[1, 2, 3], [4, 5, 6]])
+      >>> K.eval(var)
+      array([[ 1.,  2.,  3.],
+             [ 4.,  5.,  6.]], dtype=float32)
+      >>> var_transposed = K.transpose(var)
+      >>> K.eval(var_transposed)
+      array([[ 1.,  4.],
+             [ 2.,  5.],
+             [ 3.,  6.]], dtype=float32)
+  ```
+
+  ```python
+      >>> input = K.placeholder((2, 3))
+      >>> input
+      <tf.Tensor 'Placeholder_11:0' shape=(2, 3) dtype=float32>
+      >>> input_transposed = K.transpose(input)
+      >>> input_transposed
+      <tf.Tensor 'transpose_4:0' shape=(3, 2) dtype=float32>
+
+  ```
+  """
+  return array_ops.transpose(x)
+
+
+def gather(reference, indices):
+  """Retrieves the elements of indices `indices` in the tensor `reference`.
+
+  Arguments:
+      reference: A tensor.
+      indices: An integer tensor of indices.
+
+  Returns:
+      A tensor of same type as `reference`.
+  """
+  return array_ops.gather(reference, indices)
+
+
+# ELEMENT-WISE OPERATIONS
+
+
+def _normalize_axis(axis, ndim):
+  """Converts negative axes to positive values.
+
+  Arguments:
+      axis: Integer axis (possibly negative).
+      ndim: Rank of the tensor considered.
+
+  Returns:
+      Positive integer axis.
+  """
+  if isinstance(axis, tuple):
+    axis = list(axis)
+  if isinstance(axis, list):
+    for i, a in enumerate(axis):
+      if a is not None and a < 0:
+        axis[i] = a % ndim
+  else:
+    if axis is not None and axis < 0:
+      axis %= ndim
+  return axis
+
+
+def max(x, axis=None, keepdims=False):
+  """Maximum value in a tensor.
+
+  Arguments:
+      x: A tensor or variable.
+      axis: An integer, the axis to find maximum values.
+      keepdims: A boolean, whether to keep the dimensions or not.
+          If `keepdims` is `False`, the rank of the tensor is reduced
+          by 1. If `keepdims` is `True`,
+          the reduced dimension is retained with length 1.
+
+  Returns:
+      A tensor with maximum values of `x`.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  return math_ops.reduce_max(x, reduction_indices=axis, keep_dims=keepdims)
+
+
+def min(x, axis=None, keepdims=False):
+  """Minimum value in a tensor.
+
+  Arguments:
+      x: A tensor or variable.
+      axis: An integer, the axis to find minimum values.
+      keepdims: A boolean, whether to keep the dimensions or not.
+          If `keepdims` is `False`, the rank of the tensor is reduced
+          by 1. If `keepdims` is `True`,
+          the reduced dimension is retained with length 1.
+
+  Returns:
+      A tensor with miminum values of `x`.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  return math_ops.reduce_min(x, reduction_indices=axis, keep_dims=keepdims)
+
+
+def sum(x, axis=None, keepdims=False):
+  """Sum of the values in a tensor, alongside the specified axis.
+
+  Arguments:
+      x: A tensor or variable.
+      axis: An integer, the axis to sum over.
+      keepdims: A boolean, whether to keep the dimensions or not.
+          If `keepdims` is `False`, the rank of the tensor is reduced
+          by 1. If `keepdims` is `True`,
+          the reduced dimension is retained with length 1.
+
+  Returns:
+      A tensor with sum of `x`.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  return math_ops.reduce_sum(x, reduction_indices=axis, keep_dims=keepdims)
+
+
+def prod(x, axis=None, keepdims=False):
+  """Multiplies the values in a tensor, alongside the specified axis.
+
+  Arguments:
+      x: A tensor or variable.
+      axis: An integer, the axis to compute the product.
+      keepdims: A boolean, whether to keep the dimensions or not.
+          If `keepdims` is `False`, the rank of the tensor is reduced
+          by 1. If `keepdims` is `True`,
+          the reduced dimension is retained with length 1.
+
+  Returns:
+      A tensor with the product of elements of `x`.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  return math_ops.reduce_prod(x, reduction_indices=axis, keep_dims=keepdims)
+
+
+def var(x, axis=None, keepdims=False):
+  """Variance of a tensor, alongside the specified axis.
+
+  Arguments:
+      x: A tensor or variable.
+      axis: An integer, the axis to compute the variance.
+      keepdims: A boolean, whether to keep the dimensions or not.
+          If `keepdims` is `False`, the rank of the tensor is reduced
+          by 1. If `keepdims` is `True`,
+          the reduced dimension is retained with length 1.
+
+  Returns:
+      A tensor with the variance of elements of `x`.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  if x.dtype.base_dtype == dtypes_module.bool:
+    x = math_ops.cast(x, floatx())
+  m = math_ops.reduce_mean(x, reduction_indices=axis, keep_dims=True)
+  devs_squared = math_ops.square(x - m)
+  return math_ops.reduce_mean(
+      devs_squared, reduction_indices=axis, keep_dims=keepdims)
+
+
+def std(x, axis=None, keepdims=False):
+  """Standard deviation of a tensor, alongside the specified axis.
+
+  Arguments:
+      x: A tensor or variable.
+      axis: An integer, the axis to compute the standard deviation.
+      keepdims: A boolean, whether to keep the dimensions or not.
+          If `keepdims` is `False`, the rank of the tensor is reduced
+          by 1. If `keepdims` is `True`,
+          the reduced dimension is retained with length 1.
+
+  Returns:
+      A tensor with the standard deviation of elements of `x`.
+  """
+  return math_ops.sqrt(var(x, axis=axis, keepdims=keepdims))
+
+
+def mean(x, axis=None, keepdims=False):
+  """Mean of a tensor, alongside the specified axis.
+
+  Arguments:
+      x: A tensor or variable.
+      axis: A list of integer. Axes to compute the mean.
+      keepdims: A boolean, whether to keep the dimensions or not.
+          If `keepdims` is `False`, the rank of the tensor is reduced
+          by 1 for each entry in `axis`. If `keep_dims` is `True`,
+          the reduced dimensions are retained with length 1.
+
+  Returns:
+      A tensor with the mean of elements of `x`.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  if x.dtype.base_dtype == dtypes_module.bool:
+    x = math_ops.cast(x, floatx())
+  return math_ops.reduce_mean(x, reduction_indices=axis, keep_dims=keepdims)
+
+
+def any(x, axis=None, keepdims=False):
+  """Bitwise reduction (logical OR).
+
+  Arguments:
+      x: Tensor or variable.
+      axis: axis along which to perform the reduction.
+      keepdims: whether the drop or broadcast the reduction axes.
+
+  Returns:
+      A uint8 tensor (0s and 1s).
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  x = math_ops.cast(x, dtypes_module.bool)
+  x = math_ops.reduce_any(x, reduction_indices=axis, keep_dims=keepdims)
+  return math_ops.cast(x, dtypes_module.uint8)
+
+
+def all(x, axis=None, keepdims=False):
+  """Bitwise reduction (logical AND).
+
+  Arguments:
+      x: Tensor or variable.
+      axis: axis along which to perform the reduction.
+      keepdims: whether the drop or broadcast the reduction axes.
+
+  Returns:
+      A uint8 tensor (0s and 1s).
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  x = math_ops.cast(x, dtypes_module.bool)
+  x = math_ops.reduce_all(x, reduction_indices=axis, keep_dims=keepdims)
+  return math_ops.cast(x, dtypes_module.uint8)
+
+
+def argmax(x, axis=-1):
+  """Returns the index of the maximum value along an axis.
+
+  Arguments:
+      x: Tensor or variable.
+      axis: axis along which to perform the reduction.
+
+  Returns:
+      A tensor.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  return math_ops.argmax(x, axis)
+
+
+def argmin(x, axis=-1):
+  """Returns the index of the minimum value along an axis.
+
+  Arguments:
+      x: Tensor or variable.
+      axis: axis along which to perform the reduction.
+
+  Returns:
+      A tensor.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  return math_ops.argmin(x, axis)
+
+
+def square(x):
+  """Element-wise square.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.square(x)
+
+
+def abs(x):
+  """Element-wise absolute value.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.abs(x)
+
+
+def sqrt(x):
+  """Element-wise square root.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  zero = _to_tensor(0., x.dtype.base_dtype)
+  inf = _to_tensor(np.inf, x.dtype.base_dtype)
+  x = clip_ops.clip_by_value(x, zero, inf)
+  return math_ops.sqrt(x)
+
+
+def exp(x):
+  """Element-wise exponential.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.exp(x)
+
+
+def log(x):
+  """Element-wise log.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.log(x)
+
+
+def round(x):
+  """Element-wise rounding to the closest integer.
+
+  In case of tie, the rounding mode used is "half to even".
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.round(x)
+
+
+def sign(x):
+  """Element-wise sign.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.sign(x)
+
+
+def pow(x, a):
+  """Element-wise exponentiation.
+
+  Arguments:
+      x: Tensor or variable.
+      a: Python integer.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.pow(x, a)
+
+
+def clip(x, min_value, max_value):
+  """Element-wise value clipping.
+
+  Arguments:
+      x: Tensor or variable.
+      min_value: Python float or integer.
+      max_value: Python float or integer.
+
+  Returns:
+      A tensor.
+  """
+  if max_value is not None and max_value < min_value:
+    max_value = min_value
+  if max_value is None:
+    max_value = np.inf
+  min_value = _to_tensor(min_value, x.dtype.base_dtype)
+  max_value = _to_tensor(max_value, x.dtype.base_dtype)
+  return clip_ops.clip_by_value(x, min_value, max_value)
+
+
+def equal(x, y):
+  """Element-wise equality between two tensors.
+
+  Arguments:
+      x: Tensor or variable.
+      y: Tensor or variable.
+
+  Returns:
+      A bool tensor.
+  """
+  return math_ops.equal(x, y)
+
+
+def not_equal(x, y):
+  """Element-wise inequality between two tensors.
+
+  Arguments:
+      x: Tensor or variable.
+      y: Tensor or variable.
+
+  Returns:
+      A bool tensor.
+  """
+  return math_ops.not_equal(x, y)
+
+
+def greater(x, y):
+  """Element-wise truth value of (x > y).
+
+  Arguments:
+      x: Tensor or variable.
+      y: Tensor or variable.
+
+  Returns:
+      A bool tensor.
+  """
+  return math_ops.greater(x, y)
+
+
+def greater_equal(x, y):
+  """Element-wise truth value of (x >= y).
+
+  Arguments:
+      x: Tensor or variable.
+      y: Tensor or variable.
+
+  Returns:
+      A bool tensor.
+  """
+  return math_ops.greater_equal(x, y)
+
+
+def less(x, y):
+  """Element-wise truth value of (x < y).
+
+  Arguments:
+      x: Tensor or variable.
+      y: Tensor or variable.
+
+  Returns:
+      A bool tensor.
+  """
+  return math_ops.less(x, y)
+
+
+def less_equal(x, y):
+  """Element-wise truth value of (x <= y).
+
+  Arguments:
+      x: Tensor or variable.
+      y: Tensor or variable.
+
+  Returns:
+      A bool tensor.
+  """
+  return math_ops.less_equal(x, y)
+
+
+def maximum(x, y):
+  """Element-wise maximum of two tensors.
+
+  Arguments:
+      x: Tensor or variable.
+      y: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.maximum(x, y)
+
+
+def minimum(x, y):
+  """Element-wise minimum of two tensors.
+
+  Arguments:
+      x: Tensor or variable.
+      y: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.minimum(x, y)
+
+
+def sin(x):
+  """Computes sin of x element-wise.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.sin(x)
+
+
+def cos(x):
+  """Computes cos of x element-wise.
+
+  Arguments:
+      x: Tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return math_ops.cos(x)
+
+
+def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
+  """Computes mean and std for batch then apply batch_normalization on batch.
+
+  Arguments:
+      x: Input tensor or variable.
+      gamma: Tensor by which to scale the input.
+      beta: Tensor with which to center the input.
+      reduction_axes: iterable of integers,
+          axes over which to normalize.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tuple length of 3, `(normalized_tensor, mean, variance)`.
+  """
+  mean, var = nn.moments(
+      x, reduction_axes, shift=None, name=None, keep_dims=False)
+  if sorted(reduction_axes) == range(ndim(x))[:-1]:
+    normed = nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
+  else:
+    # need broadcasting
+    target_shape = []
+    for axis in range(ndim(x)):
+      if axis in reduction_axes:
+        target_shape.append(1)
+      else:
+        target_shape.append(array_ops.shape(x)[axis])
+    target_shape = array_ops.stack(target_shape)
+
+    broadcast_mean = array_ops.reshape(mean, target_shape)
+    broadcast_var = array_ops.reshape(var, target_shape)
+    if gamma is None:
+      broadcast_gamma = None
+    else:
+      broadcast_gamma = array_ops.reshape(gamma, target_shape)
+    if beta is None:
+      broadcast_beta = None
+    else:
+      broadcast_beta = array_ops.reshape(beta, target_shape)
+    normed = nn.batch_normalization(x, broadcast_mean, broadcast_var,
+                                    broadcast_beta, broadcast_gamma, epsilon)
+  return normed, mean, var
+
+
+def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
+  """Applies batch normalization on x given mean, var, beta and gamma.
+
+  I.e. returns:
+  `output = (x - mean) / (sqrt(var) + epsilon) * gamma + beta`
+
+  Arguments:
+      x: Input tensor or variable.
+      mean: Mean of batch.
+      var: Variance of batch.
+      beta: Tensor with which to center the input.
+      gamma: Tensor by which to scale the input.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tensor.
+  """
+  return nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
+
+
+# SHAPE OPERATIONS
+
+
+def concatenate(tensors, axis=-1):
+  """Concatenates a list of tensors alongside the specified axis.
+
+  Arguments:
+      tensors: list of tensors to concatenate.
+      axis: concatenation axis.
+
+  Returns:
+      A tensor.
+  """
+  if axis < 0:
+    rank = ndim(tensors[0])
+    if rank:
+      axis %= rank
+    else:
+      axis = 0
+
+  if py_all([is_sparse(x) for x in tensors]):
+    return sparse_ops.sparse_concat(axis, tensors)
+  else:
+    return array_ops.concat([to_dense(x) for x in tensors], axis)
+
+
+def reshape(x, shape):
+  """Reshapes a tensor to the specified shape.
+
+  Arguments:
+      x: Tensor or variable.
+      shape: Target shape tuple.
+
+  Returns:
+      A tensor.
+  """
+  return array_ops.reshape(x, shape)
+
+
+def permute_dimensions(x, pattern):
+  """Permutes axes in a tensor.
+
+  Arguments:
+      x: Tensor or variable.
+      pattern: A tuple of
+          dimension indices, e.g. `(0, 2, 1)`.
+
+  Returns:
+      A tensor.
+  """
+  return array_ops.transpose(x, perm=pattern)
+
+
+def resize_images(x, height_factor, width_factor, data_format):
+  """Resizes the images contained in a 4D tensor.
+
+  Arguments:
+      x: Tensor or variable to resize.
+      height_factor: Positive integer.
+      width_factor: Positive integer.
+      data_format: One of `"channels_first"`, `"channels_last"`.
+
+  Returns:
+      A tensor.
+
+  Raises:
+      ValueError: if `data_format` is neither
+          `channels_last` or `channels_first`.
+  """
+  if data_format == 'channels_first':
+    original_shape = int_shape(x)
+    new_shape = array_ops.shape(x)[2:]
+    new_shape *= constant_op.constant(
+        np.array([height_factor, width_factor]).astype('int32'))
+    x = permute_dimensions(x, [0, 2, 3, 1])
+    x = image_ops.resize_nearest_neighbor(x, new_shape)
+    x = permute_dimensions(x, [0, 3, 1, 2])
+    x.set_shape((None, None, original_shape[2] * height_factor
+                 if original_shape[2] is not None else None,
+                 original_shape[3] * width_factor
+                 if original_shape[3] is not None else None))
+    return x
+  elif data_format == 'channels_last':
+    original_shape = int_shape(x)
+    new_shape = array_ops.shape(x)[1:3]
+    new_shape *= constant_op.constant(
+        np.array([height_factor, width_factor]).astype('int32'))
+    x = image_ops.resize_nearest_neighbor(x, new_shape)
+    x.set_shape((None, original_shape[1] * height_factor
+                 if original_shape[1] is not None else None,
+                 original_shape[2] * width_factor
+                 if original_shape[2] is not None else None, None))
+    return x
+  else:
+    raise ValueError('Invalid data_format:', data_format)
+
+
+def resize_volumes(x, depth_factor, height_factor, width_factor, data_format):
+  """Resizes the volume contained in a 5D tensor.
+
+  Arguments:
+      x: Tensor or variable to resize.
+      depth_factor: Positive integer.
+      height_factor: Positive integer.
+      width_factor: Positive integer.
+      data_format: One of `"channels_first"`, `"channels_last"`.
+
+  Returns:
+      A tensor.
+
+  Raises:
+      ValueError: if `data_format` is neither
+          `channels_last` or `channels_first`.
+  """
+  if data_format == 'channels_first':
+    output = repeat_elements(x, depth_factor, axis=2)
+    output = repeat_elements(output, height_factor, axis=3)
+    output = repeat_elements(output, width_factor, axis=4)
+    return output
+  elif data_format == 'channels_last':
+    output = repeat_elements(x, depth_factor, axis=1)
+    output = repeat_elements(output, height_factor, axis=2)
+    output = repeat_elements(output, width_factor, axis=3)
+    return output
+  else:
+    raise ValueError('Invalid data_format:', data_format)
+
+
+def repeat_elements(x, rep, axis):
+  """Repeats the elements of a tensor along an axis, like `np.repeat`.
+
+  If `x` has shape `(s1, s2, s3)` and `axis` is `1`, the output
+  will have shape `(s1, s2 * rep, s3)`.
+
+  Arguments:
+      x: Tensor or variable.
+      rep: Python integer, number of times to repeat.
+      axis: Axis along which to repeat.
+
+  Raises:
+      ValueError: In case `x.shape[axis]` is undefined.
+
+  Returns:
+      A tensor.
+  """
+  x_shape = x.get_shape().as_list()
+  if x_shape[axis] is None:
+    raise ValueError('Axis ' + str(axis) + ' of input tensor '
+                     'should have a defined dimension, but is None. '
+                     'Full tensor shape: ' + str(tuple(x_shape)) + '. '
+                     'Typically you need to pass a fully-defined '
+                     '`input_shape` argument to your first layer.')
+  # slices along the repeat axis
+  splits = array_ops.split(value=x, num_or_size_splits=x_shape[axis], axis=axis)
+  # repeat each slice the given number of reps
+  x_rep = [s for s in splits for _ in range(rep)]
+  return concatenate(x_rep, axis)
+
+
+def repeat(x, n):
+  """Repeats a 2D tensor.
+
+  if `x` has shape (samples, dim) and `n` is `2`,
+  the output will have shape `(samples, 2, dim)`.
+
+  Arguments:
+      x: Tensor or variable.
+      n: Python integer, number of times to repeat.
+
+  Returns:
+      A tensor.
+  """
+  assert ndim(x) == 2
+  x = array_ops.expand_dims(x, 1)
+  pattern = array_ops.stack([1, n, 1])
+  return array_ops.tile(x, pattern)
+
+
+def arange(start, stop=None, step=1, dtype='int32'):
+  """Creates a 1D tensor containing a sequence of integers.
+
+  The function arguments use the same convention as
+  Theano's arange: if only one argument is provided,
+  it is in fact the "stop" argument.
+
+  The default type of the returned tensor is `'int32'` to
+  match TensorFlow's default.
+
+  Arguments:
+      start: Start value.
+      stop: Stop value.
+      step: Difference between two successive values.
+      dtype: Integer dtype to use.
+
+  Returns:
+      An integer tensor.
+
+  """
+  # Match the behavior of numpy and Theano by returning an empty seqence.
+  if stop is None and start < 0:
+    start = 0
+  result = math_ops.range(start, limit=stop, delta=step, name='arange')
+  if dtype != 'int32':
+    result = cast(result, dtype)
+  return result
+
+
+def tile(x, n):
+  """Creates a tensor by tiling `x` by `n`.
+
+  Arguments:
+      x: A tensor or variable
+      n: A list of integer. The length must be the same as the number of
+          dimensions in `x`.
+
+  Returns:
+      A tiled tensor.
+  """
+  if isinstance(n, int):
+    n = [n]
+  return array_ops.tile(x, n)
+
+
+def flatten(x):
+  """Flatten a tensor.
+
+  Arguments:
+      x: A tensor or variable.
+
+  Returns:
+      A tensor, reshaped into 1-D
+  """
+  return array_ops.reshape(x, [-1])
+
+
+def batch_flatten(x):
+  """Turn a nD tensor into a 2D tensor with same 0th dimension.
+
+  In other words, it flattens each data samples of a batch.
+
+  Arguments:
+      x: A tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  x = array_ops.reshape(x, array_ops.stack([-1, prod(shape(x)[1:])]))
+  return x
+
+
+def expand_dims(x, axis=-1):
+  """Adds a 1-sized dimension at index "dim".
+
+  Arguments:
+      x: A tensor or variable.
+      axis: Position where to add a new axis.
+
+  Returns:
+      A tensor with expended dimensions.
+  """
+  return array_ops.expand_dims(x, axis)
+
+
+def squeeze(x, axis):
+  """Removes a 1-dimension from the tensor at index "axis".
+
+  Arguments:
+      x: A tensor or variable.
+      axis: Axis to drop.
+
+  Returns:
+      A tensor with the same data as `x` but reduced dimensions.
+  """
+  return array_ops.squeeze(x, [axis])
+
+
+def temporal_padding(x, padding=(1, 1)):
+  """Pads the middle dimension of a 3D tensor.
+
+  Arguments:
+      x: Tensor or variable.
+      padding: Tuple of 2 integers, how many zeros to
+          add at the start and end of dim 1.
+
+  Returns:
+      A padded 3D tensor.
+  """
+  assert len(padding) == 2
+  pattern = [[0, 0], [padding[0], padding[1]], [0, 0]]
+  return array_ops.pad(x, pattern)
+
+
+def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None):
+  """Pads the 2nd and 3rd dimensions of a 4D tensor.
+
+  Arguments:
+      x: Tensor or variable.
+      padding: Tuple of 2 tuples, padding pattern.
+      data_format: One of `channels_last` or `channels_first`.
+
+  Returns:
+      A padded 4D tensor.
+
+  Raises:
+      ValueError: if `data_format` is neither
+          `channels_last` or `channels_first`.
+  """
+  assert len(padding) == 2
+  assert len(padding[0]) == 2
+  assert len(padding[1]) == 2
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  if data_format == 'channels_first':
+    pattern = [[0, 0], [0, 0], list(padding[0]), list(padding[1])]
+  else:
+    pattern = [[0, 0], list(padding[0]), list(padding[1]), [0, 0]]
+  return array_ops.pad(x, pattern)
+
+
+def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None):
+  """Pads 5D tensor with zeros along the depth, height, width dimensions.
+
+  Pads these dimensions with respectively
+  "padding[0]", "padding[1]" and "padding[2]" zeros left and right.
+
+  For 'channels_last' data_format,
+  the 2nd, 3rd and 4th dimension will be padded.
+  For 'channels_first' data_format,
+  the 3rd, 4th and 5th dimension will be padded.
+
+  Arguments:
+      x: Tensor or variable.
+      padding: Tuple of 3 tuples, padding pattern.
+      data_format: One of `channels_last` or `channels_first`.
+
+  Returns:
+      A padded 5D tensor.
+
+  Raises:
+      ValueError: if `data_format` is neither
+          `channels_last` or `channels_first`.
+
+  """
+  assert len(padding) == 3
+  assert len(padding[0]) == 2
+  assert len(padding[1]) == 2
+  assert len(padding[2]) == 2
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  if data_format == 'channels_first':
+    pattern = [[0, 0], [0, 0], [padding[0][0], padding[0][1]],
+               [padding[1][0], padding[1][1]], [padding[2][0], padding[2][1]]]
+  else:
+    pattern = [[0, 0], [padding[0][0], padding[0][1]],
+               [padding[1][0], padding[1][1]], [padding[2][0],
+                                                padding[2][1]], [0, 0]]
+  return array_ops.pad(x, pattern)
+
+
+def stack(x, axis=0):
+  """Stacks a list of rank `R` tensors into a rank `R+1` tensor.
+
+  Arguments:
+      x: List of tensors.
+      axis: Axis along which to perform stacking.
+
+  Returns:
+      A tensor.
+  """
+  return array_ops.stack(x, axis=axis)
+
+
+def one_hot(indices, num_classes):
+  """Computes the one-hot representation of an integer tensor.
+
+  Arguments:
+      indices: nD integer tensor of shape
+          `(batch_size, dim1, dim2, ... dim(n-1))`
+      num_classes: Integer, number of classes to consider.
+
+  Returns:
+      (n + 1)D one hot representation of the input
+      with shape `(batch_size, dim1, dim2, ... dim(n-1), num_classes)`
+
+  Returns:
+      The one-hot tensor.
+  """
+  return array_ops.one_hot(indices, depth=num_classes, axis=-1)
+
+
+def reverse(x, axes):
+  """Reverse a tensor along the specified axes.
+
+  Arguments:
+      x: Tensor to reverse.
+      axes: Integer or iterable of integers.
+          Axes to reverse.
+
+  Returns:
+      A tensor.
+  """
+  if isinstance(axes, int):
+    axes = [axes]
+  return array_ops.reverse(x, axes)
+
+
+# VALUE MANIPULATION
+
+
+def get_value(x):
+  """Returns the value of a variable.
+
+  Arguments:
+      x: input variable.
+
+  Returns:
+      A Numpy array.
+  """
+  return x.eval(session=get_session())
+
+
+def batch_get_value(tensors):
+  """Returns the value of more than one tensor variable.
+
+  Arguments:
+      tensors: list of ops to run.
+
+  Returns:
+      A list of Numpy arrays.
+  """
+  if tensors:
+    return get_session().run(tensors)
+  else:
+    return []
+
+
+def set_value(x, value):
+  """Sets the value of a variable, from a Numpy array.
+
+  Arguments:
+      x: Tensor to set to a new value.
+      value: Value to set the tensor to, as a Numpy array
+          (of the same shape).
+  """
+  value = np.asarray(value)
+  tf_dtype = _convert_string_dtype(x.dtype.name.split('_')[0])
+  if hasattr(x, '_assign_placeholder'):
+    assign_placeholder = x._assign_placeholder
+    assign_op = x._assign_op
+  else:
+    assign_placeholder = array_ops.placeholder(tf_dtype, shape=value.shape)
+    assign_op = x.assign(assign_placeholder)
+    x._assign_placeholder = assign_placeholder
+    x._assign_op = assign_op
+  get_session().run(assign_op, feed_dict={assign_placeholder: value})
+
+
+def batch_set_value(tuples):
+  """Sets the values of many tensor variables at once.
+
+  Arguments:
+      tuples: a list of tuples `(tensor, value)`.
+          `value` should be a Numpy array.
+  """
+  if tuples:
+    assign_ops = []
+    feed_dict = {}
+    for x, value in tuples:
+      value = np.asarray(value)
+      tf_dtype = _convert_string_dtype(x.dtype.name.split('_')[0])
+      if hasattr(x, '_assign_placeholder'):
+        assign_placeholder = x._assign_placeholder
+        assign_op = x._assign_op
+      else:
+        assign_placeholder = array_ops.placeholder(tf_dtype, shape=value.shape)
+        assign_op = x.assign(assign_placeholder)
+        x._assign_placeholder = assign_placeholder
+        x._assign_op = assign_op
+      assign_ops.append(assign_op)
+      feed_dict[assign_placeholder] = value
+    get_session().run(assign_ops, feed_dict=feed_dict)
+
+
+def print_tensor(x, message=''):
+  """Prints `message` and the tensor value when evaluated.
+
+  Arguments:
+      x: Tensor to print.
+      message: Message to print jointly with the tensor.
+
+  Returns:
+      The same tensor `x`, unchanged.
+  """
+  return logging_ops.Print(x, [x], message)
+
+
+# GRAPH MANIPULATION
+
+
+class Function(object):
+  """Runs a computation graph.
+
+  Arguments:
+      inputs: Feed placeholders to the computation graph.
+      outputs: Output tensors to fetch.
+      updates: Additional update ops to be run at function call.
+  """
+
+  def __init__(self, inputs, outputs, updates=None):
+    updates = updates or []
+    if not isinstance(inputs, (list, tuple)):
+      raise TypeError('`inputs` to a TensorFlow backend function '
+                      'should be a list or tuple.')
+    if not isinstance(outputs, (list, tuple)):
+      raise TypeError('`outputs` of a TensorFlow backend function '
+                      'should be a list or tuple.')
+    if not isinstance(updates, (list, tuple)):
+      raise TypeError('`updates` in a TensorFlow backend function '
+                      'should be a list or tuple.')
+    self.inputs = list(inputs)
+    self.outputs = list(outputs)
+    with ops.control_dependencies(self.outputs):
+      updates_ops = []
+      for update in updates:
+        if isinstance(update, tuple):
+          p, new_p = update
+          updates_ops.append(state_ops.assign(p, new_p))
+        else:
+          # assumed already an op
+          updates_ops.append(update)
+      self.updates_op = control_flow_ops.group(*updates_ops)
+
+  def __call__(self, inputs):
+    if not isinstance(inputs, (list, tuple)):
+      raise TypeError('`inputs` should be a list or tuple.')
+    feed_dict = {}
+    for tensor, value in zip(self.inputs, inputs):
+      if is_sparse(tensor):
+        sparse_coo = value.tocoo()
+        indices = np.concatenate((np.expand_dims(sparse_coo.row, 1),
+                                  np.expand_dims(sparse_coo.col, 1)), 1)
+        value = (indices, sparse_coo.data, sparse_coo.shape)
+      feed_dict[tensor] = value
+    session = get_session()
+    updated = session.run(self.outputs + [self.updates_op], feed_dict=feed_dict)
+    return updated[:len(self.outputs)]
+
+
+def function(inputs, outputs, updates=None, **kwargs):
+  """Instantiates a Keras function.
+
+  Arguments:
+      inputs: List of placeholder tensors.
+      outputs: List of output tensors.
+      updates: List of update ops.
+      **kwargs: Not used with TensorFlow.
+
+  Returns:
+      Output values as Numpy arrays.
+  """
+  if kwargs:
+    msg = [
+        'Expected no kwargs, you passed %s' % len(kwargs),
+        'kwargs passed to function are ignored with Tensorflow backend'
+    ]
+    warnings.warn('\n'.join(msg))
+  return Function(inputs, outputs, updates=updates)
+
+
+def gradients(loss, variables):
+  """Returns the gradients of `variables` w.r.t. `loss`.
+
+  Arguments:
+      loss: Scalar tensor to minimize.
+      variables: List of variables.
+
+  Returns:
+      A gradients tensor.
+  """
+  return gradients_module.gradients(
+      loss, variables, colocate_gradients_with_ops=True)
+
+
+def stop_gradient(variables):
+  """Returns `variables` but with zero gradient w.r.t. every other variable.
+
+  Arguments:
+      variables: List of variables.
+
+  Returns:
+      The same list of variables.
+  """
+  return array_ops.stop_gradient(variables)
+
+
+# CONTROL FLOW
+
+
+def rnn(step_function,
+        inputs,
+        initial_states,
+        go_backwards=False,
+        mask=None,
+        constants=None,
+        unroll=False):
+  """Iterates over the time dimension of a tensor.
+
+  Arguments:
+      step_function: RNN step function.
+          Parameters;
+              input; tensor with shape `(samples, ...)` (no time dimension),
+                  representing input for the batch of samples at a certain
+                  time step.
+              states; list of tensors.
+          Returns;
+              output; tensor with shape `(samples, output_dim)`
+                  (no time dimension).
+              new_states; list of tensors, same length and shapes
+                  as 'states'. The first state in the list must be the
+                  output tensor at the previous timestep.
+      inputs: tensor of temporal data of shape `(samples, time, ...)`
+          (at least 3D).
+      initial_states: tensor with shape (samples, output_dim)
+          (no time dimension),
+          containing the initial values for the states used in
+          the step function.
+      go_backwards: boolean. If True, do the iteration over
+          the time dimension in reverse order.
+      mask: binary tensor with shape `(samples, time, 1)`,
+          with a zero for every element that is masked.
+      constants: a list of constant values passed at each step.
+      unroll: whether to unroll the RNN or to use a symbolic loop
+          (`while_loop` or `scan` depending on backend).
+
+  Returns:
+      A tuple, `(last_output, outputs, new_states)`.
+          last_output: the latest output of the rnn, of shape `(samples, ...)`
+          outputs: tensor with shape `(samples, time, ...)` where each
+              entry `outputs[s, t]` is the output of the step function
+              at time `t` for sample `s`.
+          new_states: list of tensors, latest states returned by
+              the step function, of shape `(samples, ...)`.
+
+  Raises:
+      ValueError: if input dimension is less than 3.
+      ValueError: if `unroll` is `True` but input timestep is not a fixed
+      number.
+      ValueError: if `mask` is provided (not `None`) but states is not provided
+          (`len(states)` == 0).
+  """
+  ndim = len(inputs.get_shape())
+  if ndim < 3:
+    raise ValueError('Input should be at least 3D.')
+  axes = [1, 0] + list(range(2, ndim))
+  inputs = array_ops.transpose(inputs, (axes))
+
+  if mask is not None:
+    if mask.dtype != dtypes_module.bool:
+      mask = math_ops.cast(mask, dtypes_module.bool)
+    if len(mask.get_shape()) == ndim - 1:
+      mask = expand_dims(mask)
+    mask = array_ops.transpose(mask, axes)
+
+  if constants is None:
+    constants = []
+
+  if unroll:
+    if not inputs.get_shape()[0]:
+      raise ValueError('Unrolling requires a ' 'fixed number of timesteps.')
+    states = initial_states
+    successive_states = []
+    successive_outputs = []
+
+    input_list = array_ops.unstack(inputs)
+    if go_backwards:
+      input_list.reverse()
+
+    if mask is not None:
+      mask_list = array_ops.unstack(mask)
+      if go_backwards:
+        mask_list.reverse()
+
+      for inp, mask_t in zip(input_list, mask_list):
+        output, new_states = step_function(inp, states + constants)
+
+        # tf.where needs its condition tensor
+        # to be the same shape as its two
+        # result tensors, but in our case
+        # the condition (mask) tensor is
+        # (nsamples, 1), and A and B are (nsamples, ndimensions).
+        # So we need to
+        # broadcast the mask to match the shape of A and B.
+        # That's what the tile call does,
+        # it just repeats the mask along its second dimension
+        # n times.
+        tiled_mask_t = array_ops.tile(mask_t,
+                                      array_ops.stack(
+                                          [1, array_ops.shape(output)[1]]))
+
+        if not successive_outputs:
+          prev_output = zeros_like(output)
+        else:
+          prev_output = successive_outputs[-1]
+
+        output = array_ops.where(tiled_mask_t, output, prev_output)
+
+        return_states = []
+        for state, new_state in zip(states, new_states):
+          # (see earlier comment for tile explanation)
+          tiled_mask_t = array_ops.tile(mask_t,
+                                        array_ops.stack(
+                                            [1, array_ops.shape(new_state)[1]]))
+          return_states.append(array_ops.where(tiled_mask_t, new_state, state))
+        states = return_states
+        successive_outputs.append(output)
+        successive_states.append(states)
+        last_output = successive_outputs[-1]
+        new_states = successive_states[-1]
+        outputs = array_ops.stack(successive_outputs)
+    else:
+      for inp in input_list:
+        output, states = step_function(inp, states + constants)
+        successive_outputs.append(output)
+        successive_states.append(states)
+      last_output = successive_outputs[-1]
+      new_states = successive_states[-1]
+      outputs = array_ops.stack(successive_outputs)
+
+  else:
+    if go_backwards:
+      inputs = reverse(inputs, 0)
+
+    states = tuple(initial_states)
+
+    time_steps = array_ops.shape(inputs)[0]
+    outputs, _ = step_function(inputs[0], initial_states + constants)
+    output_ta = tensor_array_ops.TensorArray(
+        dtype=outputs.dtype, size=time_steps, tensor_array_name='output_ta')
+    input_ta = tensor_array_ops.TensorArray(
+        dtype=inputs.dtype, size=time_steps, tensor_array_name='input_ta')
+    input_ta = input_ta.unstack(inputs)
+    time = constant_op.constant(0, dtype='int32', name='time')
+
+    if mask is not None:
+      if not states:
+        raise ValueError('No initial states provided! '
+                         'When using masking in an RNN, you should '
+                         'provide initial states '
+                         '(and your step function should return '
+                         'as its first state at time `t` '
+                         'the output at time `t-1`).')
+      if go_backwards:
+        mask = reverse(mask, 0)
+
+      mask_ta = tensor_array_ops.TensorArray(
+          dtype=dtypes_module.bool,
+          size=time_steps,
+          tensor_array_name='mask_ta')
+      mask_ta = mask_ta.unstack(mask)
+
+      def _step(time, output_ta_t, *states):
+        """RNN step function.
+
+        Arguments:
+            time: Current timestep value.
+            output_ta_t: TensorArray.
+            *states: List of states.
+
+        Returns:
+            Tuple: `(time + 1,output_ta_t) + tuple(new_states)`
+        """
+        current_input = input_ta.read(time)
+        mask_t = mask_ta.read(time)
+        output, new_states = step_function(current_input,
+                                           tuple(states) + tuple(constants))
+        for state, new_state in zip(states, new_states):
+          new_state.set_shape(state.get_shape())
+        tiled_mask_t = array_ops.tile(mask_t,
+                                      array_ops.stack(
+                                          [1, array_ops.shape(output)[1]]))
+        output = array_ops.where(tiled_mask_t, output, states[0])
+        new_states = [
+            array_ops.where(tiled_mask_t, new_states[i], states[i])
+            for i in range(len(states))
+        ]
+        output_ta_t = output_ta_t.write(time, output)
+        return (time + 1, output_ta_t) + tuple(new_states)
+    else:
+
+      def _step(time, output_ta_t, *states):
+        """RNN step function.
+
+        Arguments:
+            time: Current timestep value.
+            output_ta_t: TensorArray.
+            *states: List of states.
+
+        Returns:
+            Tuple: `(time + 1,output_ta_t) + tuple(new_states)`
+        """
+        current_input = input_ta.read(time)
+        output, new_states = step_function(current_input,
+                                           tuple(states) + tuple(constants))
+        for state, new_state in zip(states, new_states):
+          new_state.set_shape(state.get_shape())
+        output_ta_t = output_ta_t.write(time, output)
+        return (time + 1, output_ta_t) + tuple(new_states)
+
+    final_outputs = control_flow_ops.while_loop(
+        cond=lambda time, *_: time < time_steps,
+        body=_step,
+        loop_vars=(time, output_ta) + states,
+        parallel_iterations=32,
+        swap_memory=True)
+    last_time = final_outputs[0]
+    output_ta = final_outputs[1]
+    new_states = final_outputs[2:]
+
+    outputs = output_ta.stack()
+    last_output = output_ta.read(last_time - 1)
+
+  axes = [1, 0] + list(range(2, len(outputs.get_shape())))
+  outputs = array_ops.transpose(outputs, axes)
+  return last_output, outputs, new_states
+
+
+def switch(condition, then_expression, else_expression):
+  """Switches between two operations depending on a scalar value.
+
+  Note that both `then_expression` and `else_expression`
+  should be symbolic tensors of the *same shape*.
+
+  Arguments:
+      condition: scalar tensor (`int` or `bool`).
+      then_expression: either a tensor, or a callable that returns a tensor.
+      else_expression: either a tensor, or a callable that returns a tensor.
+
+  Returns:
+      The selected tensor.
+  """
+  if condition.dtype != dtypes_module.bool:
+    condition = math_ops.cast(condition, 'bool')
+  if not callable(then_expression):
+
+    def then_expression_fn():
+      return then_expression
+  else:
+    then_expression_fn = then_expression
+  if not callable(else_expression):
+
+    def else_expression_fn():
+      return else_expression
+  else:
+    else_expression_fn = else_expression
+  x = control_flow_ops.cond(condition, then_expression_fn, else_expression_fn)
+  return x
+
+
+def in_train_phase(x, alt, training=None):
+  """Selects `x` in train phase, and `alt` otherwise.
+
+  Note that `alt` should have the *same shape* as `x`.
+
+  Arguments:
+      x: What to return in train phase
+          (tensor or callable that returns a tensor).
+      alt: What to return otherwise
+          (tensor or callable that returns a tensor).
+      training: Optional scalar tensor
+          (or Python boolean, or Python integer)
+          specifing the learning phase.
+
+  Returns:
+      Either `x` or `alt` based on the `training` flag.
+      the `training` flag defaults to `K.learning_phase()`.
+  """
+  if training is None:
+    training = learning_phase()
+    uses_learning_phase = True
+  else:
+    uses_learning_phase = False
+
+  if training is 1 or training is True:
+    if callable(x):
+      return x()
+    else:
+      return x
+
+  elif training is 0 or training is False:
+    if callable(alt):
+      return alt()
+    else:
+      return alt
+
+  # else: assume learning phase is a placeholder tensor.
+  x = switch(training, x, alt)
+  if uses_learning_phase:
+    x._uses_learning_phase = True
+  return x
+
+
+def in_test_phase(x, alt, training=None):
+  """Selects `x` in test phase, and `alt` otherwise.
+
+  Note that `alt` should have the *same shape* as `x`.
+
+  Arguments:
+      x: What to return in test phase
+          (tensor or callable that returns a tensor).
+      alt: What to return otherwise
+          (tensor or callable that returns a tensor).
+      training: Optional scalar tensor
+          (or Python boolean, or Python integer)
+          specifing the learning phase.
+
+  Returns:
+      Either `x` or `alt` based on `K.learning_phase`.
+  """
+  return in_train_phase(alt, x, training=training)
+
+
+# NN OPERATIONS
+
+
+def relu(x, alpha=0., max_value=None):
+  """Rectified linear unit.
+
+  With default values, it returns element-wise `max(x, 0)`.
+
+  Arguments:
+      x: A tensor or variable.
+      alpha: A scalar, slope of negative section (default=`0.`).
+      max_value: Saturation threshold.
+
+  Returns:
+      A tensor.
+  """
+  if alpha != 0.:
+    negative_part = nn.relu(-x)
+  x = nn.relu(x)
+  if max_value is not None:
+    max_value = _to_tensor(max_value, x.dtype.base_dtype)
+    zero = _to_tensor(0., x.dtype.base_dtype)
+    x = clip_ops.clip_by_value(x, zero, max_value)
+  if alpha != 0.:
+    alpha = _to_tensor(alpha, x.dtype.base_dtype)
+    x -= alpha * negative_part
+  return x
+
+
+def elu(x, alpha=1.):
+  """Exponential linear unit.
+
+  Arguments:
+      x: A tenor or variable to compute the activation function for.
+      alpha: A scalar, slope of positive section.
+
+  Returns:
+      A tensor.
+  """
+  res = nn.elu(x)
+  if alpha == 1:
+    return res
+  else:
+    return array_ops.where(x > 0, res, alpha * res)
+
+
+def softmax(x):
+  """Softmax of a tensor.
+
+  Arguments:
+      x: A tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return nn.softmax(x)
+
+
+def softplus(x):
+  """Softplus of a tensor.
+
+  Arguments:
+      x: A tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return nn.softplus(x)
+
+
+def softsign(x):
+  """Softsign of a tensor.
+
+  Arguments:
+      x: A tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return nn.softsign(x)
+
+
+def categorical_crossentropy(output, target, from_logits=False):
+  """Categorical crossentropy between an output tensor and a target tensor.
+
+  Arguments:
+      output: A tensor resulting from a softmax
+          (unless `from_logits` is True, in which
+          case `output` is expected to be the logits).
+      target: A tensor of the same shape as `output`.
+      from_logits: Boolean, whether `output` is the
+          result of a softmax, or is a tensor of logits.
+
+  Returns:
+      Output tensor.
+  """
+  # Note: nn.softmax_cross_entropy_with_logits
+  # expects logits, Keras expects probabilities.
+  if not from_logits:
+    # scale preds so that the class probas of each sample sum to 1
+    output /= math_ops.reduce_sum(
+        output, reduction_indices=len(output.get_shape()) - 1, keep_dims=True)
+    # manual computation of crossentropy
+    epsilon = _to_tensor(_EPSILON, output.dtype.base_dtype)
+    output = clip_ops.clip_by_value(output, epsilon, 1. - epsilon)
+    return -math_ops.reduce_sum(
+        target * math_ops.log(output),
+        reduction_indices=len(output.get_shape()) - 1)
+  else:
+    return nn.softmax_cross_entropy_with_logits(labels=target, logits=output)
+
+
+def sparse_categorical_crossentropy(output, target, from_logits=False):
+  """Categorical crossentropy with integer targets.
+
+  Arguments:
+      output: A tensor resulting from a softmax
+          (unless `from_logits` is True, in which
+          case `output` is expected to be the logits).
+      target: An integer tensor.
+      from_logits: Boolean, whether `output` is the
+          result of a softmax, or is a tensor of logits.
+
+  Returns:
+      Output tensor.
+  """
+  # Note: nn.softmax_cross_entropy_with_logits
+  # expects logits, Keras expects probabilities.
+  if not from_logits:
+    epsilon = _to_tensor(_EPSILON, output.dtype.base_dtype)
+    output = clip_ops.clip_by_value(output, epsilon, 1 - epsilon)
+    output = math_ops.log(output)
+
+  output_shape = output.get_shape()
+  targets = cast(flatten(target), 'int64')
+  logits = array_ops.reshape(output, [-1, int(output_shape[-1])])
+  res = nn.sparse_softmax_cross_entropy_with_logits(
+      labels=targets, logits=logits)
+  if len(output_shape) == 3:
+    # if our output includes timesteps we need to reshape
+    return array_ops.reshape(res, array_ops.shape(output)[:-1])
+  else:
+    return res
+
+
+def binary_crossentropy(output, target, from_logits=False):
+  """Binary crossentropy between an output tensor and a target tensor.
+
+  Arguments:
+      output: A tensor.
+      target: A tensor with the same shape as `output`.
+      from_logits: Whether `output` is expected to be a logits tensor.
+          By default, we consider that `output`
+          encodes a probability distribution.
+
+  Returns:
+      A tensor.
+  """
+  # Note: nn.softmax_cross_entropy_with_logits
+  # expects logits, Keras expects probabilities.
+  if not from_logits:
+    # transform back to logits
+    epsilon = _to_tensor(_EPSILON, output.dtype.base_dtype)
+    output = clip_ops.clip_by_value(output, epsilon, 1 - epsilon)
+    output = math_ops.log(output / (1 - output))
+  return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
+
+
+def sigmoid(x):
+  """Element-wise sigmoid.
+
+  Arguments:
+      x: A tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return nn.sigmoid(x)
+
+
+def hard_sigmoid(x):
+  """Segment-wise linear approximation of sigmoid.
+
+  Faster than sigmoid.
+  Returns `0.` if `x < -2.5`, `1.` if `x > 2.5`.
+  In `-2.5 <= x <= 2.5`, returns `0.2 * x + 0.5`.
+
+  Arguments:
+      x: A tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  x = (0.2 * x) + 0.5
+  zero = _to_tensor(0., x.dtype.base_dtype)
+  one = _to_tensor(1., x.dtype.base_dtype)
+  x = clip_ops.clip_by_value(x, zero, one)
+  return x
+
+
+def tanh(x):
+  """Element-wise tanh.
+
+  Arguments:
+      x: A tensor or variable.
+
+  Returns:
+      A tensor.
+  """
+  return nn.tanh(x)
+
+
+def dropout(x, level, noise_shape=None, seed=None):
+  """Sets entries in `x` to zero at random, while scaling the entire tensor.
+
+  Arguments:
+      x: tensor
+      level: fraction of the entries in the tensor
+          that will be set to 0.
+      noise_shape: shape for randomly generated keep/drop flags,
+          must be broadcastable to the shape of `x`
+      seed: random seed to ensure determinism.
+
+  Returns:
+      A tensor.
+  """
+  retain_prob = 1. - level
+  if seed is None:
+    seed = np.random.randint(10e6)
+  # the dummy 1. works around a TF bug
+  # (float32_ref vs. float32 incomptability)
+  return nn.dropout(x * 1., retain_prob, noise_shape, seed=seed)
+
+
+def l2_normalize(x, axis):
+  """Normalizes a tensor wrt the L2 norm alongside the specified axis.
+
+  Arguments:
+      x: Tensor or variable.
+      axis: axis along which to perform normalization.
+
+  Returns:
+      A tensor.
+  """
+  if axis < 0:
+    axis %= len(x.get_shape())
+  return nn.l2_normalize(x, dim=axis)
+
+
+def in_top_k(predictions, targets, k):
+  """Returns whether the `targets` are in the top `k` `predictions`.
+
+  Arguments:
+      predictions: A tensor of shape `batch_size` x classes and type `float32`.
+      targets: A tensor of shape batch_size and type `int32` or `int64`.
+      k: An `int`, number of top elements to consider.
+
+  Returns:
+      A tensor of shape `batch_size` and type `bool`. `output_i` is `True` if
+      `targets_i` is within top-k values of `predictions_i`
+  """
+  return nn.in_top_k(predictions, targets, k)
+
+
+# CONVOLUTIONS
+
+
+def _preprocess_deconv_output_shape(x, shape, data_format):
+  if data_format == 'channels_first':
+    shape = (shape[0], shape[2], shape[3], shape[1])
+
+  if shape[0] is None:
+    shape = (array_ops.shape(x)[0],) + tuple(shape[1:])
+    shape = array_ops.stack(list(shape))
+  return shape
+
+
+def _preprocess_conv2d_input(x, data_format):
+  if dtype(x) == 'float64':
+    x = math_ops.cast(x, 'float32')
+  if data_format == 'channels_first':
+    # TF uses the last dimension as channel dimension,
+    # instead of the 2nd one.
+    # TH input shape: (samples, input_depth, rows, cols)
+    # TF input shape: (samples, rows, cols, input_depth)
+    x = array_ops.transpose(x, (0, 2, 3, 1))
+  return x
+
+
+def _preprocess_conv3d_input(x, data_format):
+  if dtype(x) == 'float64':
+    x = math_ops.cast(x, 'float32')
+  if data_format == 'channels_first':
+    x = array_ops.transpose(x, (0, 2, 3, 4, 1))
+  return x
+
+
+def _preprocess_conv2d_kernel(kernel, data_format):
+  if dtype(kernel) == 'float64':
+    kernel = math_ops.cast(kernel, 'float32')
+  if data_format == 'channels_first':
+    kernel = array_ops.transpose(kernel, (2, 3, 1, 0))
+  return kernel
+
+
+def _preprocess_conv3d_kernel(kernel, data_format):
+  if dtype(kernel) == 'float64':
+    kernel = math_ops.cast(kernel, 'float32')
+  if data_format == 'channels_first':
+    kernel = array_ops.transpose(kernel, (2, 3, 4, 1, 0))
+  return kernel
+
+
+def _preprocess_padding(padding):
+  if padding == 'same':
+    padding = 'SAME'
+  elif padding == 'valid':
+    padding = 'VALID'
+  else:
+    raise ValueError('Invalid border mode:', padding)
+  return padding
+
+
+def _postprocess_conv2d_output(x, data_format):
+  if data_format == 'channels_first':
+    x = array_ops.transpose(x, (0, 3, 1, 2))
+
+  if floatx() == 'float64':
+    x = math_ops.cast(x, 'float64')
+  return x
+
+
+def _postprocess_conv3d_output(x, data_format):
+  if data_format == 'channels_first':
+    x = array_ops.transpose(x, (0, 4, 1, 2, 3))
+
+  if floatx() == 'float64':
+    x = math_ops.cast(x, 'float64')
+  return x
+
+
+def conv1d(x,
+           kernel,
+           strides=1,
+           padding='valid',
+           data_format=None,
+           dilation_rate=1):
+  """1D convolution.
+
+  Arguments:
+      x: Tensor or variable.
+      kernel: kernel tensor.
+      strides: stride integer.
+      padding: string, `"same"`, `"causal"` or `"valid"`.
+      data_format: string, one of "channels_last", "channels_first".
+      dilation_rate: integer dilate rate.
+
+  Returns:
+      A tensor, result of 1D convolution.
+  """
+  kernel_shape = kernel.get_shape().as_list()
+  if padding == 'causal':
+    # causal (dilated) convolution:
+    left_pad = dilation_rate * (kernel_shape[0] - 1)
+    x = temporal_padding(x, (left_pad, 0))
+    padding = 'valid'
+  padding = _preprocess_padding(padding)
+  if data_format == 'channels_last':
+    tf_data_format = 'NWC'
+  else:
+    tf_data_format = 'NCW'
+  x = nn.convolution(
+      input=x,
+      filter=kernel,
+      dilation_rate=(dilation_rate,),
+      strides=(strides,),
+      padding=padding,
+      data_format=tf_data_format)
+  return x
+
+
+def conv2d(x,
+           kernel,
+           strides=(1, 1),
+           padding='valid',
+           data_format=None,
+           dilation_rate=(1, 1)):
+  """2D convolution.
+
+  Arguments:
+      x: Tensor or variable.
+      kernel: kernel tensor.
+      strides: strides tuple.
+      padding: string, `"same"` or `"valid"`.
+      data_format: `"channels_last"` or `"channels_first"`.
+          Whether to use Theano or TensorFlow data format
+          for inputs/kernels/ouputs.
+      dilation_rate: tuple of 2 integers.
+
+  Returns:
+      A tensor, result of 2D convolution.
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  # With 4d inputs, nn.convolution only supports
+  # data_format NHWC, so we transpose the inputs
+  # in case we are in data_format channels_first.
+  x = _preprocess_conv2d_input(x, data_format)
+  padding = _preprocess_padding(padding)
+  x = nn.convolution(
+      input=x,
+      filter=kernel,
+      dilation_rate=dilation_rate,
+      strides=strides,
+      padding=padding,
+      data_format='NHWC')
+  return _postprocess_conv2d_output(x, data_format)
+
+
+def conv2d_transpose(x,
+                     kernel,
+                     output_shape,
+                     strides=(1, 1),
+                     padding='valid',
+                     data_format=None):
+  """2D deconvolution (i.e.
+
+  transposed convolution).
+
+  Arguments:
+      x: Tensor or variable.
+      kernel: kernel tensor.
+      output_shape: 1D int tensor for the output shape.
+      strides: strides tuple.
+      padding: string, `"same"` or `"valid"`.
+      data_format: `"channels_last"` or `"channels_first"`.
+          Whether to use Theano or TensorFlow data format
+          for inputs/kernels/ouputs.
+
+  Returns:
+      A tensor, result of transposed 2D convolution.
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+  if isinstance(output_shape, (tuple, list)):
+    output_shape = array_ops.stack(output_shape)
+
+  x = _preprocess_conv2d_input(x, data_format)
+  output_shape = _preprocess_deconv_output_shape(x, output_shape, data_format)
+  padding = _preprocess_padding(padding)
+  strides = (1,) + strides + (1,)
+
+  x = nn.conv2d_transpose(x, kernel, output_shape, strides, padding=padding)
+  x = _postprocess_conv2d_output(x, data_format)
+  return x
+
+
+def separable_conv2d(x,
+                     depthwise_kernel,
+                     pointwise_kernel,
+                     strides=(1, 1),
+                     padding='valid',
+                     data_format=None,
+                     dilation_rate=(1, 1)):
+  """2D convolution with separable filters.
+
+  Arguments:
+      x: input tensor
+      depthwise_kernel: convolution kernel for the depthwise convolution.
+      pointwise_kernel: kernel for the 1x1 convolution.
+      strides: strides tuple (length 2).
+      padding: padding mode, "valid" or "same".
+      data_format: data format, "channels_first" or "channels_last".
+      dilation_rate: tuple of integers,
+          dilation rates for the separable convolution.
+
+  Returns:
+      Output tensor.
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  x = _preprocess_conv2d_input(x, data_format)
+  padding = _preprocess_padding(padding)
+  strides = (1,) + strides + (1,)
+
+  x = nn.separable_conv2d(
+      x,
+      depthwise_kernel,
+      pointwise_kernel,
+      strides=strides,
+      padding=padding,
+      rate=dilation_rate)
+  return _postprocess_conv2d_output(x, data_format)
+
+
+def conv3d(x,
+           kernel,
+           strides=(1, 1, 1),
+           padding='valid',
+           data_format=None,
+           dilation_rate=(1, 1, 1)):
+  """3D convolution.
+
+  Arguments:
+      x: Tensor or variable.
+      kernel: kernel tensor.
+      strides: strides tuple.
+      padding: string, `"same"` or `"valid"`.
+      data_format: `"channels_last"` or `"channels_first"`.
+          Whether to use Theano or TensorFlow data format
+          for inputs/kernels/ouputs.
+      dilation_rate: tuple of 3 integers.
+
+  Returns:
+      A tensor, result of 3D convolution.
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  # With 5d inputs, nn.convolution only supports
+  # data_format NDHWC, so we transpose the inputs
+  # in case we are in data_format channels_first.
+  x = _preprocess_conv3d_input(x, data_format)
+  padding = _preprocess_padding(padding)
+  x = nn.convolution(
+      input=x,
+      filter=kernel,
+      dilation_rate=dilation_rate,
+      strides=strides,
+      padding=padding,
+      data_format='NDHWC')
+  return _postprocess_conv3d_output(x, data_format)
+
+
+def pool2d(x,
+           pool_size,
+           strides=(1, 1),
+           padding='valid',
+           data_format=None,
+           pool_mode='max'):
+  """2D Pooling.
+
+  Arguments:
+      x: Tensor or variable.
+      pool_size: tuple of 2 integers.
+      strides: tuple of 2 integers.
+      padding: one of `"valid"`, `"same"`.
+      data_format: one of `"channels_first"`, `"channels_last"`.
+      pool_mode: one of `"max"`, `"avg"`.
+
+  Returns:
+      A tensor, result of 2D pooling.
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
+      ValueError: if `pool_mode` is neither `max` or `avg`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  padding = _preprocess_padding(padding)
+  strides = (1,) + strides + (1,)
+  pool_size = (1,) + pool_size + (1,)
+
+  x = _preprocess_conv2d_input(x, data_format)
+
+  if pool_mode == 'max':
+    x = nn.max_pool(x, pool_size, strides, padding=padding)
+  elif pool_mode == 'avg':
+    x = nn.avg_pool(x, pool_size, strides, padding=padding)
+  else:
+    raise ValueError('Invalid pooling mode:', pool_mode)
+
+  return _postprocess_conv2d_output(x, data_format)
+
+
+def pool3d(x,
+           pool_size,
+           strides=(1, 1, 1),
+           padding='valid',
+           data_format=None,
+           pool_mode='max'):
+  """3D Pooling.
+
+  Arguments:
+      x: Tensor or variable.
+      pool_size: tuple of 3 integers.
+      strides: tuple of 3 integers.
+      padding: one of `"valid"`, `"same"`.
+      data_format: one of `"channels_first"`, `"channels_last"`.
+      pool_mode: one of `"max"`, `"avg"`.
+
+  Returns:
+      A tensor, result of 3D pooling.
+
+  Raises:
+      ValueError: if `data_format` is neither
+          `channels_last` or `channels_first`.
+      ValueError: if `pool_mode` is neither `max` or `avg`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  padding = _preprocess_padding(padding)
+  strides = (1,) + strides + (1,)
+  pool_size = (1,) + pool_size + (1,)
+
+  x = _preprocess_conv3d_input(x, data_format)
+
+  if pool_mode == 'max':
+    x = nn.max_pool3d(x, pool_size, strides, padding=padding)
+  elif pool_mode == 'avg':
+    x = nn.avg_pool3d(x, pool_size, strides, padding=padding)
+  else:
+    raise ValueError('Invalid pooling mode:', pool_mode)
+
+  return _postprocess_conv3d_output(x, data_format)
+
+
+def bias_add(x, bias, data_format=None):
+  """Adds a bias vector to a tensor.
+
+  Arguments:
+      x: Tensor or variable.
+      bias: Bias tensor to add.
+      data_format: Data format for 3D, 4D or 5D tensors:
+          one of "channels_first", "channels_last".
+
+  Returns:
+      Output tensor.
+
+  Raises:
+      ValueError: In case of invalid `data_format` argument.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+  if ndim(x) == 5:
+    if data_format == 'channels_first':
+      x += reshape(bias, (1, int_shape(bias)[0], 1, 1, 1))
+    elif data_format == 'channels_last':
+      x += reshape(bias, (1, 1, 1, 1, int_shape(bias)[0]))
+  elif ndim(x) == 4:
+    if data_format == 'channels_first':
+      # No support yet for NCHW in bias_add.
+      x += reshape(bias, (1, int_shape(bias)[0], 1, 1))
+    elif data_format == 'channels_last':
+      x = nn.bias_add(x, bias, data_format='NHWC')
+  elif ndim(x) == 3:
+    if data_format == 'channels_first':
+      x += reshape(bias, (1, int_shape(bias)[0], 1))
+    elif data_format == 'channels_last':
+      x += reshape(bias, (1, 1, int_shape(bias)[0]))
+  else:
+    x = nn.bias_add(x, bias)
+  return x
+
+
+# RANDOMNESS
+
+
+def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
+  """Returns a tensor with normal distribution of values.
+
+  Arguments:
+      shape: A tuple of integers, the shape of tensor to create.
+      mean: A float, mean of the normal distribution to draw samples.
+      stddev: A float, standard deviation of the normal distribution
+          to draw samples.
+      dtype: String, dtype of returned tensor.
+      seed: Integer, random seed.
+
+  Returns:
+      A tensor.
+  """
+  if dtype is None:
+    dtype = floatx()
+  if seed is None:
+    seed = np.random.randint(10e6)
+  return random_ops.random_normal(
+      shape, mean=mean, stddev=stddev, dtype=dtype, seed=seed)
+
+
+def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None):
+  """Returns a tensor with uniform distribution of values.
+
+  Arguments:
+      shape: A tuple of integers, the shape of tensor to create.
+      minval: A float, lower boundary of the uniform distribution
+          to draw samples.
+      maxval: A float, upper boundary of the uniform distribution
+          to draw samples.
+      dtype: String, dtype of returned tensor.
+      seed: Integer, random seed.
+
+  Returns:
+      A tensor.
+  """
+  if dtype is None:
+    dtype = floatx()
+  if seed is None:
+    seed = np.random.randint(10e6)
+  return random_ops.random_uniform(
+      shape, minval=minval, maxval=maxval, dtype=dtype, seed=seed)
+
+
+def random_binomial(shape, p=0.0, dtype=None, seed=None):
+  """Returns a tensor with random binomial distribution of values.
+
+  Arguments:
+      shape: A tuple of integers, the shape of tensor to create.
+      p: A float, `0. <= p <= 1`, probability of binomial distribution.
+      dtype: String, dtype of returned tensor.
+      seed: Integer, random seed.
+
+  Returns:
+      A tensor.
+  """
+  if dtype is None:
+    dtype = floatx()
+  if seed is None:
+    seed = np.random.randint(10e6)
+  return array_ops.where(
+      random_ops.random_uniform(shape, dtype=dtype, seed=seed) <= p,
+      array_ops.ones(shape, dtype=dtype), array_ops.zeros(shape, dtype=dtype))
+
+
+def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
+  """Returns a tensor with truncated random normal distribution of values.
+
+  The generated values follow a normal distribution
+  with specified mean and standard deviation,
+  except that values whose magnitude is more than
+  two standard deviations from the mean are dropped and re-picked.
+
+  Arguments:
+      shape: A tuple of integers, the shape of tensor to create.
+      mean: Mean of the values.
+      stddev: Standard deviation of the values.
+      dtype: String, dtype of returned tensor.
+      seed: Integer, random seed.
+
+  Returns:
+      A tensor.
+  """
+  if dtype is None:
+    dtype = floatx()
+  if seed is None:
+    seed = np.random.randint(10e6)
+  return random_ops.truncated_normal(
+      shape, mean, stddev, dtype=dtype, seed=seed)
+
+
+# CTC
+# tensorflow has a native implemenation, but it uses sparse tensors
+# and therefore requires a wrapper for Keras. The functions below convert
+# dense to sparse tensors and also wraps up the beam search code that is
+# in tensorflow's CTC implementation
+
+
+def ctc_label_dense_to_sparse(labels, label_lengths):
+  """Converts CTC labels from dense to sparse.
+
+  Arguments:
+      labels: dense CTC labels.
+      label_lengths: length of the labels.
+
+  Returns:
+      A sparse tensor representation of the lablels.
+  """
+  label_shape = array_ops.shape(labels)
+  num_batches_tns = array_ops.stack([label_shape[0]])
+  max_num_labels_tns = array_ops.stack([label_shape[1]])
+
+  def range_less_than(_, current_input):
+    return array_ops.expand_dims(math_ops.range(
+        label_shape[1]), 0) < array_ops.fill(max_num_labels_tns, current_input)
+
+  init = math_ops.cast(
+      array_ops.fill([1, label_shape[1]], 0), dtypes_module.bool)
+  dense_mask = functional_ops.scan(
+      range_less_than, label_lengths, initializer=init, parallel_iterations=1)
+  dense_mask = dense_mask[:, 0, :]
+
+  label_array = array_ops.reshape(
+      array_ops.tile(math_ops.range(0, label_shape[1]), num_batches_tns),
+      label_shape)
+  label_ind = array_ops.boolean_mask(label_array, dense_mask)
+
+  batch_array = array_ops.transpose(
+      array_ops.reshape(
+          array_ops.tile(math_ops.range(0, label_shape[0]), max_num_labels_tns),
+          reverse(label_shape, 0)))
+  batch_ind = array_ops.boolean_mask(batch_array, dense_mask)
+  indices = array_ops.transpose(
+      array_ops.reshape(concatenate([batch_ind, label_ind], axis=0), [2, -1]))
+
+  vals_sparse = array_ops.gather_nd(labels, indices)
+
+  return sparse_tensor.SparseTensor(
+      math_ops.to_int64(indices), vals_sparse, math_ops.to_int64(label_shape))
+
+
+def ctc_batch_cost(y_true, y_pred, input_length, label_length):
+  """Runs CTC loss algorithm on each batch element.
+
+  Arguments:
+      y_true: tensor `(samples, max_string_length)`
+          containing the truth labels.
+      y_pred: tensor `(samples, time_steps, num_categories)`
+          containing the prediction, or output of the softmax.
+      input_length: tensor `(samples, 1)` containing the sequence length for
+          each batch item in `y_pred`.
+      label_length: tensor `(samples, 1)` containing the sequence length for
+          each batch item in `y_true`.
+
+  Returns:
+      Tensor with shape (samples,1) containing the
+          CTC loss of each element.
+  """
+  label_length = math_ops.to_int32(array_ops.squeeze(label_length))
+  input_length = math_ops.to_int32(array_ops.squeeze(input_length))
+  sparse_labels = math_ops.to_int32(
+      ctc_label_dense_to_sparse(y_true, label_length))
+
+  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+
+  return array_ops.expand_dims(
+      ctc.ctc_loss(
+          inputs=y_pred, labels=sparse_labels, sequence_length=input_length), 1)
+
+
+def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
+  """Decodes the output of a softmax.
+
+  Can use either greedy search (also known as best path)
+  or a constrained dictionary search.
+
+  Arguments:
+      y_pred: tensor `(samples, time_steps, num_categories)`
+          containing the prediction, or output of the softmax.
+      input_length: tensor `(samples, )` containing the sequence length for
+          each batch item in `y_pred`.
+      greedy: perform much faster best-path search if `true`.
+          This does not use a dictionary.
+      beam_width: if `greedy` is `false`: a beam search decoder will be used
+          with a beam of this width.
+      top_paths: if `greedy` is `false`,
+          how many of the most probable paths will be returned.
+
+  Returns:
+      Tuple:
+          List: if `greedy` is `true`, returns a list of one element that
+              contains the decoded sequence.
+              If `false`, returns the `top_paths` most probable
+              decoded sequences.
+              Important: blank labels are returned as `-1`.
+          Tensor `(top_paths, )` that contains
+              the log probability of each decoded sequence.
+  """
+  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+  input_length = math_ops.to_int32(input_length)
+
+  if greedy:
+    (decoded, log_prob) = ctc.ctc_greedy_decoder(
+        inputs=y_pred, sequence_length=input_length)
+  else:
+    (decoded, log_prob) = ctc.ctc_beam_search_decoder(
+        inputs=y_pred,
+        sequence_length=input_length,
+        beam_width=beam_width,
+        top_paths=top_paths)
+  decoded_dense = [
+      sparse_ops.sparse_to_dense(
+          st.indices, st.dense_shape, st.values, default_value=-1)
+      for st in decoded
+  ]
+  return (decoded_dense, log_prob)
+
+
+# HIGH ORDER FUNCTIONS
+
+
+def map_fn(fn, elems, name=None):
+  """Map the function fn over the elements elems and return the outputs.
+
+  Arguments:
+      fn: Callable that will be called upon each element in elems
+      elems: tensor
+      name: A string name for the map node in the graph
+
+  Returns:
+      Tensor with first dimension equal to the elems and second depending on
+      fn
+  """
+  return functional_ops.map_fn(fn, elems, name=name)
+
+
+def foldl(fn, elems, initializer=None, name=None):
+  """Reduce elems using fn to combine them from left to right.
+
+  Arguments:
+      fn: Callable that will be called upon each element in elems and an
+          accumulator, for instance `lambda acc, x: acc + x`
+      elems: tensor
+      initializer: The first value used (`elems[0]` in case of None)
+      name: A string name for the foldl node in the graph
+
+  Returns:
+      Same type and shape as initializer
+  """
+  return functional_ops.foldl(fn, elems, initializer=initializer, name=name)
+
+
+def foldr(fn, elems, initializer=None, name=None):
+  """Reduce elems using fn to combine them from right to left.
+
+  Arguments:
+      fn: Callable that will be called upon each element in elems and an
+          accumulator, for instance `lambda acc, x: acc + x`
+      elems: tensor
+      initializer: The first value used (`elems[-1]` in case of None)
+      name: A string name for the foldr node in the graph
+
+  Returns:
+      Same type and shape as initializer
+  """
+  return functional_ops.foldr(fn, elems, initializer=initializer, name=name)
+
+
+# Load Keras default configuration from config file if present.
+_keras_base_dir = os.path.expanduser('~')
+if not os.access(_keras_base_dir, os.W_OK):
+  _keras_base_dir = '/tmp'
+_keras_dir = os.path.join(_keras_base_dir, '.keras')
+if not os.path.exists(_keras_dir):
+  try:
+    os.makedirs(_keras_dir)
+  except FileExistsError:  # pylint: disable=undefined-variable
+    pass
+_config_path = os.path.expanduser(os.path.join(_keras_dir, 'keras.json'))
+if os.path.exists(_config_path):
+  _config = json.load(open(_config_path))
+  _floatx = _config.get('floatx', floatx())
+  assert _floatx in {'float16', 'float32', 'float64'}
+  _epsilon = _config.get('epsilon', epsilon())
+  assert isinstance(_epsilon, float)
+  _backend = backend()
+  _image_data_format = _config.get('image_data_format', image_data_format())
+  assert _image_data_format in {'channels_last', 'channels_first'}
+  set_floatx(_floatx)
+  set_epsilon(_epsilon)
+  set_image_data_format(_image_data_format)
diff --git a/tensorflow/contrib/keras/python/keras/backend_test.py b/tensorflow/contrib/keras/python/keras/backend_test.py
new file mode 100644
index 0000000000..fd9db1f327
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/backend_test.py
@@ -0,0 +1,457 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras backend."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import inspect
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+def compare_single_input_op_to_numpy(keras_op,
+                                     np_op,
+                                     input_shape,
+                                     dtype='float32',
+                                     negative_values=True,
+                                     keras_args=None,
+                                     keras_kwargs=None,
+                                     np_args=None,
+                                     np_kwargs=None):
+  keras_args = keras_args or []
+  keras_kwargs = keras_kwargs or {}
+  np_args = np_args or []
+  np_kwargs = np_kwargs or {}
+  inputs = 2. * np.random.random(input_shape)
+  if negative_values:
+    inputs -= 1.
+  keras_output = keras_op(keras.backend.variable(inputs, dtype=dtype),
+                          *keras_args, **keras_kwargs)
+  keras_output = keras.backend.eval(keras_output)
+  np_output = np_op(inputs.astype(dtype), *np_args, **np_kwargs)
+  try:
+    np.testing.assert_allclose(keras_output, np_output, atol=1e-4)
+  except AssertionError:
+    raise AssertionError('Test for op `' + str(keras_op.__name__) + '` failed; '
+                         'Expected ' + str(np_output) + ' but got ' +
+                         str(keras_output))
+
+
+def compare_two_inputs_op_to_numpy(keras_op,
+                                   np_op,
+                                   input_shape_a,
+                                   input_shape_b,
+                                   dtype='float32',
+                                   keras_args=None,
+                                   keras_kwargs=None,
+                                   np_args=None,
+                                   np_kwargs=None):
+  keras_args = keras_args or []
+  keras_kwargs = keras_kwargs or {}
+  np_args = np_args or []
+  np_kwargs = np_kwargs or {}
+  input_a = np.random.random(input_shape_a)
+  input_b = np.random.random(input_shape_b)
+  keras_output = keras_op(keras.backend.variable(input_a, dtype=dtype),
+                          keras.backend.variable(input_b, dtype=dtype),
+                          *keras_args, **keras_kwargs)
+  keras_output = keras.backend.eval(keras_output)
+  np_output = np_op(input_a.astype(dtype), input_b.astype(dtype),
+                    *np_args, **np_kwargs)
+  try:
+    np.testing.assert_allclose(keras_output, np_output, atol=1e-4)
+  except AssertionError:
+    raise AssertionError('Test for op `' + str(keras_op.__name__) + '` failed; '
+                         'Expected ' + str(np_output) + ' but got ' +
+                         str(keras_output))
+
+
+class BackendUtilsTest(test.TestCase):
+
+  def test_backend(self):
+    self.assertEqual(keras.backend.backend(), 'tensorflow')
+
+  def test_espilon(self):
+    epsilon = 1e-2
+    keras.backend.set_epsilon(epsilon)
+    self.assertEqual(keras.backend.epsilon(), epsilon)
+    keras.backend.set_epsilon(1e-7)
+
+  def test_floatx(self):
+    floatx = 'float64'
+    keras.backend.set_floatx(floatx)
+    self.assertEqual(keras.backend.floatx(), floatx)
+    keras.backend.set_floatx('float32')
+
+  def test_image_data_format(self):
+    image_data_format = 'channels_first'
+    keras.backend.set_image_data_format(image_data_format)
+    self.assertEqual(keras.backend.image_data_format(), image_data_format)
+    keras.backend.set_image_data_format('channels_last')
+
+  def test_get_uid(self):
+    self.assertEqual(keras.backend.get_uid('foo'), 1)
+    self.assertEqual(keras.backend.get_uid('foo'), 2)
+
+
+class BackendVariableTest(test.TestCase):
+
+  def test_zeros(self):
+    with self.test_session():
+      x = keras.backend.zeros((3, 4))
+      val = keras.backend.eval(x)
+      self.assertAllClose(val, np.zeros((3, 4)))
+
+  def test_ones(self):
+    with self.test_session():
+      x = keras.backend.ones((3, 4))
+      val = keras.backend.eval(x)
+      self.assertAllClose(val, np.ones((3, 4)))
+
+  def test_eye(self):
+    with self.test_session():
+      x = keras.backend.eye(4)
+      val = keras.backend.eval(x)
+      self.assertAllClose(val, np.eye(4))
+
+  def test_zeros_like(self):
+    with self.test_session():
+      x = keras.backend.zeros((3, 4))
+      y = keras.backend.zeros_like(x)
+      val = keras.backend.eval(y)
+      self.assertAllClose(val, np.zeros((3, 4)))
+
+  def test_ones_like(self):
+    with self.test_session():
+      x = keras.backend.zeros((3, 4))
+      y = keras.backend.ones_like(x)
+      val = keras.backend.eval(y)
+      self.assertAllClose(val, np.ones((3, 4)))
+
+  def test_random_uniform_variable(self):
+    with self.test_session():
+      x = keras.backend.random_uniform_variable((30, 20), low=1, high=2, seed=0)
+      val = keras.backend.eval(x)
+      self.assertAllClose(val.mean(), 1.5, atol=1e-1)
+      self.assertAllClose(val.max(), 2., atol=1e-1)
+      self.assertAllClose(val.min(), 1., atol=1e-1)
+
+  def test_random_normal_variable(self):
+    with self.test_session():
+      x = keras.backend.random_normal_variable((30, 20), 1., 0.5,
+                                               seed=0)
+      val = keras.backend.eval(x)
+      self.assertAllClose(val.mean(), 1., atol=1e-1)
+      self.assertAllClose(val.std(), 0.5, atol=1e-1)
+
+  def test_count_params(self):
+    with self.test_session():
+      x = keras.backend.zeros((4, 5))
+      val = keras.backend.count_params(x)
+      self.assertAllClose(val, 20)
+
+
+class BackendLinearAlgebraTest(test.TestCase):
+
+  def test_dot(self):
+    x = keras.backend.placeholder(shape=(2, 3))
+    y = keras.backend.placeholder(shape=(3, 4))
+    xy = keras.backend.dot(x, y)
+    self.assertEqual(xy.get_shape().as_list(), [2, 4])
+
+    x = keras.backend.placeholder(shape=(32, 28, 3))
+    y = keras.backend.placeholder(shape=(3, 4))
+    xy = keras.backend.dot(x, y)
+    self.assertEqual(xy.get_shape().as_list(), [32, 28, 4])
+
+  def test_batch_dot(self):
+    x = keras.backend.ones(shape=(32, 20, 1))
+    y = keras.backend.ones(shape=(32, 30, 20))
+    xy = keras.backend.batch_dot(x, y, axes=[1, 2])
+    self.assertEqual(xy.get_shape().as_list(), [32, 1, 30])
+
+  def test_reduction_ops(self):
+    ops_to_test = [
+        (keras.backend.max, np.max),
+        (keras.backend.min, np.min),
+        (keras.backend.sum, np.sum),
+        (keras.backend.prod, np.prod),
+        (keras.backend.var, np.var),
+        (keras.backend.std, np.std),
+        (keras.backend.mean, np.mean),
+        (keras.backend.argmin, np.argmin),
+        (keras.backend.argmax, np.argmax),
+    ]
+    for keras_op, np_op in ops_to_test:
+      with self.test_session():
+        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
+                                         keras_kwargs={'axis': 1},
+                                         np_kwargs={'axis': 1})
+        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
+                                         keras_kwargs={'axis': -1},
+                                         np_kwargs={'axis': -1})
+        if 'keepdims' in inspect.getargspec(keras_op).args:
+          compare_single_input_op_to_numpy(keras_op, np_op,
+                                           input_shape=(4, 7, 5),
+                                           keras_kwargs={'axis': 1,
+                                                         'keepdims': True},
+                                           np_kwargs={'axis': 1,
+                                                      'keepdims': True})
+
+  def test_elementwise_ops(self):
+    ops_to_test = [
+        (keras.backend.square, np.square),
+        (keras.backend.abs, np.abs),
+        (keras.backend.round, np.round),
+        (keras.backend.sign, np.sign),
+        (keras.backend.sin, np.sin),
+        (keras.backend.cos, np.cos),
+        (keras.backend.exp, np.exp),
+    ]
+    for keras_op, np_op in ops_to_test:
+      with self.test_session():
+        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7))
+
+    ops_to_test = [
+        (keras.backend.sqrt, np.sqrt),
+        (keras.backend.log, np.log),
+    ]
+    for keras_op, np_op in ops_to_test:
+      with self.test_session():
+        compare_single_input_op_to_numpy(keras_op, np_op,
+                                         input_shape=(4, 7),
+                                         negative_values=False)
+
+    with self.test_session():
+      compare_single_input_op_to_numpy(
+          keras.backend.clip, np.clip,
+          input_shape=(6, 4),
+          keras_kwargs={'min_value': 0.1, 'max_value': 2.4},
+          np_kwargs={'a_min': 0.1, 'a_max': 1.4})
+
+    with self.test_session():
+      compare_single_input_op_to_numpy(
+          keras.backend.pow, np.power,
+          input_shape=(6, 4),
+          keras_args=[3],
+          np_args=[3])
+
+  def test_two_tensor_ops(self):
+    ops_to_test = [
+        (keras.backend.equal, np.equal),
+        (keras.backend.not_equal, np.not_equal),
+        (keras.backend.greater, np.greater),
+        (keras.backend.greater_equal, np.greater_equal),
+        (keras.backend.less, np.less),
+        (keras.backend.less_equal, np.less_equal),
+        (keras.backend.maximum, np.maximum),
+        (keras.backend.minimum, np.minimum),
+    ]
+    for keras_op, np_op in ops_to_test:
+      with self.test_session():
+        compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                       input_shape_a=(4, 7),
+                                       input_shape_b=(4, 7))
+
+
+class BackendShapeOpsTest(test.TestCase):
+
+  def test_reshape(self):
+    with self.test_session():
+      compare_single_input_op_to_numpy(keras.backend.reshape, np.reshape,
+                                       input_shape=(4, 7),
+                                       keras_args=[(2, 14)],
+                                       np_args=[(2, 14)])
+
+  def test_concatenate(self):
+    a = keras.backend.variable(np.ones((1, 2, 3)))
+    b = keras.backend.variable(np.ones((1, 2, 2)))
+    y = keras.backend.concatenate([a, b], axis=-1)
+    self.assertEqual(y.get_shape().as_list(), [1, 2, 5])
+
+  def test_permute_dimensions(self):
+    with self.test_session():
+      compare_single_input_op_to_numpy(keras.backend.permute_dimensions,
+                                       np.transpose,
+                                       input_shape=(4, 7),
+                                       keras_args=[(1, 0)],
+                                       np_args=[(1, 0)])
+
+  def test_resize_images(self):
+    height_factor = 2
+    width_factor = 2
+    data_format = 'channels_last'
+    x = keras.backend.variable(np.ones((1, 2, 2, 3)))
+    y = keras.backend.resize_images(x,
+                                    height_factor,
+                                    width_factor,
+                                    data_format)
+    self.assertEqual(y.get_shape().as_list(), [1, 4, 4, 3])
+
+    data_format = 'channels_first'
+    x = keras.backend.variable(np.ones((1, 3, 2, 2)))
+    y = keras.backend.resize_images(x,
+                                    height_factor,
+                                    width_factor,
+                                    data_format)
+    self.assertEqual(y.get_shape().as_list(), [1, 3, 4, 4])
+
+  def test_resize_volumes(self):
+    height_factor = 2
+    width_factor = 2
+    depth_factor = 2
+    data_format = 'channels_last'
+    x = keras.backend.variable(np.ones((1, 2, 2, 2, 3)))
+    y = keras.backend.resize_volumes(x,
+                                     depth_factor,
+                                     height_factor,
+                                     width_factor,
+                                     data_format)
+    self.assertEqual(y.get_shape().as_list(), [1, 4, 4, 4, 3])
+
+    data_format = 'channels_first'
+    x = keras.backend.variable(np.ones((1, 3, 2, 2, 2)))
+    y = keras.backend.resize_volumes(x,
+                                     depth_factor,
+                                     height_factor,
+                                     width_factor,
+                                     data_format)
+    self.assertEqual(y.get_shape().as_list(), [1, 3, 4, 4, 4])
+
+  def test_repeat_elements(self):
+    x = keras.backend.variable(np.ones((1, 3, 2)))
+    y = keras.backend.repeat_elements(x, 3, axis=1)
+    self.assertEqual(y.get_shape().as_list(), [1, 9, 2])
+
+  def test_repeat(self):
+    x = keras.backend.variable(np.ones((1, 3)))
+    y = keras.backend.repeat(x, 2)
+    self.assertEqual(y.get_shape().as_list(), [1, 2, 3])
+
+  def test_flatten(self):
+    with self.test_session():
+      compare_single_input_op_to_numpy(keras.backend.flatten,
+                                       np.reshape,
+                                       input_shape=(4, 7, 6),
+                                       np_args=[(4 * 7 * 6,)])
+
+  def test_batch_flatten(self):
+    with self.test_session():
+      compare_single_input_op_to_numpy(keras.backend.batch_flatten,
+                                       np.reshape,
+                                       input_shape=(4, 7, 6),
+                                       np_args=[(4, 7 * 6)])
+
+  def test_temporal_padding(self):
+
+    def ref_op(x, padding):
+      shape = list(x.shape)
+      shape[1] += padding[0] + padding[1]
+      y = np.zeros(tuple(shape))
+      y[:, padding[0]:-padding[1], :] = x
+      return y
+
+    with self.test_session():
+      compare_single_input_op_to_numpy(keras.backend.temporal_padding,
+                                       ref_op,
+                                       input_shape=(4, 7, 6),
+                                       keras_args=[(2, 3)],
+                                       np_args=[(2, 3)])
+
+  def test_spatial_2d_padding(self):
+
+    def ref_op(x, padding, data_format='channels_last'):
+      shape = list(x.shape)
+      if data_format == 'channels_last':
+        shape[1] += padding[0][0] + padding[0][1]
+        shape[2] += padding[1][0] + padding[1][1]
+        y = np.zeros(tuple(shape))
+        y[:, padding[0][0]:-padding[0][1], padding[1][0]:-padding[1][1], :] = x
+      else:
+        shape[2] += padding[0][0] + padding[0][1]
+        shape[3] += padding[1][0] + padding[1][1]
+        y = np.zeros(tuple(shape))
+        y[:, :, padding[0][0]:-padding[0][1], padding[1][0]:-padding[1][1]] = x
+      return y
+
+    with self.test_session():
+      compare_single_input_op_to_numpy(
+          keras.backend.spatial_2d_padding,
+          ref_op,
+          input_shape=(2, 3, 2, 3),
+          keras_args=[((2, 3), (1, 2))],
+          keras_kwargs={'data_format': 'channels_last'},
+          np_args=[((2, 3), (1, 2))],
+          np_kwargs={'data_format': 'channels_last'})
+      compare_single_input_op_to_numpy(
+          keras.backend.spatial_2d_padding,
+          ref_op,
+          input_shape=(2, 3, 2, 3),
+          keras_args=[((2, 3), (1, 2))],
+          keras_kwargs={'data_format': 'channels_first'},
+          np_args=[((2, 3), (1, 2))],
+          np_kwargs={'data_format': 'channels_first'})
+
+  def test_spatial_3d_padding(self):
+
+    def ref_op(x, padding, data_format='channels_last'):
+      shape = list(x.shape)
+      if data_format == 'channels_last':
+        shape[1] += padding[0][0] + padding[0][1]
+        shape[2] += padding[1][0] + padding[1][1]
+        shape[3] += padding[2][0] + padding[2][1]
+        y = np.zeros(tuple(shape))
+        y[:,
+          padding[0][0]:-padding[0][1],
+          padding[1][0]:-padding[1][1],
+          padding[2][0]:-padding[2][1],
+          :] = x
+      else:
+        shape[2] += padding[0][0] + padding[0][1]
+        shape[3] += padding[1][0] + padding[1][1]
+        shape[4] += padding[2][0] + padding[2][1]
+        y = np.zeros(tuple(shape))
+        y[:, :,
+          padding[0][0]:-padding[0][1],
+          padding[1][0]:-padding[1][1],
+          padding[2][0]:-padding[2][1]] = x
+      return y
+
+    with self.test_session():
+      compare_single_input_op_to_numpy(
+          keras.backend.spatial_3d_padding,
+          ref_op,
+          input_shape=(2, 3, 2, 3, 2),
+          keras_args=[((2, 3), (1, 2), (2, 3))],
+          keras_kwargs={'data_format': 'channels_last'},
+          np_args=[((2, 3), (1, 2), (2, 3))],
+          np_kwargs={'data_format': 'channels_last'})
+      compare_single_input_op_to_numpy(
+          keras.backend.spatial_3d_padding,
+          ref_op,
+          input_shape=(2, 3, 2, 3, 2),
+          keras_args=[((2, 3), (1, 2), (2, 3))],
+          keras_kwargs={'data_format': 'channels_first'},
+          np_args=[((2, 3), (1, 2), (2, 3))],
+          np_kwargs={'data_format': 'channels_first'})
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/callbacks.py b/tensorflow/contrib/keras/python/keras/callbacks.py
new file mode 100644
index 0000000000..345db2791c
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/callbacks.py
@@ -0,0 +1,947 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras callbacks: utilities called at certain points during model training.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import deque
+from collections import Iterable
+from collections import OrderedDict
+import csv
+import json
+import os
+import time
+import warnings
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.ops import array_ops
+from tensorflow.python.summary import summary as tf_summary
+
+
+# pylint: disable=g-import-not-at-top
+try:
+  import requests
+except ImportError:
+  requests = None
+# pylint: enable=g-import-not-at-top
+
+
+class CallbackList(object):
+  """Container abstracting a list of callbacks.
+
+  Arguments:
+      callbacks: List of `Callback` instances.
+      queue_length: Queue length for keeping
+          running statistics over callback execution time.
+  """
+
+  def __init__(self, callbacks=None, queue_length=10):
+    callbacks = callbacks or []
+    self.callbacks = [c for c in callbacks]
+    self.queue_length = queue_length
+
+  def append(self, callback):
+    self.callbacks.append(callback)
+
+  def set_params(self, params):
+    for callback in self.callbacks:
+      callback.set_params(params)
+
+  def set_model(self, model):
+    for callback in self.callbacks:
+      callback.set_model(model)
+
+  def on_epoch_begin(self, epoch, logs=None):
+    """Called at the start of an epoch.
+
+    Arguments:
+        epoch: integer, index of epoch.
+        logs: dictionary of logs.
+    """
+    logs = logs or {}
+    for callback in self.callbacks:
+      callback.on_epoch_begin(epoch, logs)
+    self._delta_t_batch = 0.
+    self._delta_ts_batch_begin = deque([], maxlen=self.queue_length)
+    self._delta_ts_batch_end = deque([], maxlen=self.queue_length)
+
+  def on_epoch_end(self, epoch, logs=None):
+    """Called at the end of an epoch.
+
+    Arguments:
+        epoch: integer, index of epoch.
+        logs: dictionary of logs.
+    """
+    logs = logs or {}
+    for callback in self.callbacks:
+      callback.on_epoch_end(epoch, logs)
+
+  def on_batch_begin(self, batch, logs=None):
+    """Called right before processing a batch.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dictionary of logs.
+    """
+    logs = logs or {}
+    t_before_callbacks = time.time()
+    for callback in self.callbacks:
+      callback.on_batch_begin(batch, logs)
+    self._delta_ts_batch_begin.append(time.time() - t_before_callbacks)
+    delta_t_median = np.median(self._delta_ts_batch_begin)
+    if (self._delta_t_batch > 0. and
+        delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
+      warnings.warn('Method on_batch_begin() is slow compared '
+                    'to the batch update (%f). Check your callbacks.' %
+                    delta_t_median)
+    self._t_enter_batch = time.time()
+
+  def on_batch_end(self, batch, logs=None):
+    """Called at the end of a batch.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dictionary of logs.
+    """
+    logs = logs or {}
+    if not hasattr(self, '_t_enter_batch'):
+      self._t_enter_batch = time.time()
+    self._delta_t_batch = time.time() - self._t_enter_batch
+    t_before_callbacks = time.time()
+    for callback in self.callbacks:
+      callback.on_batch_end(batch, logs)
+    self._delta_ts_batch_end.append(time.time() - t_before_callbacks)
+    delta_t_median = np.median(self._delta_ts_batch_end)
+    if (self._delta_t_batch > 0. and
+        (delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1)):
+      warnings.warn('Method on_batch_end() is slow compared '
+                    'to the batch update (%f). Check your callbacks.' %
+                    delta_t_median)
+
+  def on_train_begin(self, logs=None):
+    """Called at the beginning of training.
+
+    Arguments:
+        logs: dictionary of logs.
+    """
+    logs = logs or {}
+    for callback in self.callbacks:
+      callback.on_train_begin(logs)
+
+  def on_train_end(self, logs=None):
+    """Called at the end of training.
+
+    Arguments:
+        logs: dictionary of logs.
+    """
+    logs = logs or {}
+    for callback in self.callbacks:
+      callback.on_train_end(logs)
+
+  def __iter__(self):
+    return iter(self.callbacks)
+
+
+class Callback(object):
+  """Abstract base class used to build new callbacks.
+
+  # Properties
+      params: dict. Training parameters
+          (eg. verbosity, batch size, number of epochs...).
+      model: instance of `keras.models.Model`.
+          Reference of the model being trained.
+
+  The `logs` dictionary that callback methods
+  take as argument will contain keys for quantities relevant to
+  the current batch or epoch.
+
+  Currently, the `.fit()` method of the `Sequential` model class
+  will include the following quantities in the `logs` that
+  it passes to its callbacks:
+
+      on_epoch_end: logs include `acc` and `loss`, and
+          optionally include `val_loss`
+          (if validation is enabled in `fit`), and `val_acc`
+          (if validation and accuracy monitoring are enabled).
+      on_batch_begin: logs include `size`,
+          the number of samples in the current batch.
+      on_batch_end: logs include `loss`, and optionally `acc`
+          (if accuracy monitoring is enabled).
+  """
+
+  def __init__(self):
+    self.validation_data = None
+
+  def set_params(self, params):
+    self.params = params
+
+  def set_model(self, model):
+    self.model = model
+
+  def on_epoch_begin(self, epoch, logs=None):
+    pass
+
+  def on_epoch_end(self, epoch, logs=None):
+    pass
+
+  def on_batch_begin(self, batch, logs=None):
+    pass
+
+  def on_batch_end(self, batch, logs=None):
+    pass
+
+  def on_train_begin(self, logs=None):
+    pass
+
+  def on_train_end(self, logs=None):
+    pass
+
+
+class BaseLogger(Callback):
+  """Callback that accumulates epoch averages of metrics.
+
+  This callback is automatically applied to every Keras model.
+  """
+
+  def on_epoch_begin(self, epoch, logs=None):
+    self.seen = 0
+    self.totals = {}
+
+  def on_batch_end(self, batch, logs=None):
+    logs = logs or {}
+    batch_size = logs.get('size', 0)
+    self.seen += batch_size
+
+    for k, v in logs.items():
+      if k in self.totals:
+        self.totals[k] += v * batch_size
+      else:
+        self.totals[k] = v * batch_size
+
+  def on_epoch_end(self, epoch, logs=None):
+    if logs is not None:
+      for k in self.params['metrics']:
+        if k in self.totals:
+          # Make value available to next callbacks.
+          logs[k] = self.totals[k] / self.seen
+
+
+class ProgbarLogger(Callback):
+  """Callback that prints metrics to stdout.
+
+  Arguments:
+      count_mode: One of "steps" or "samples".
+          Whether the progress bar should
+          count samples seens or steps (batches) seen.
+
+  Raises:
+      ValueError: In case of invalid `count_mode`.
+  """
+
+  def __init__(self, count_mode='samples'):
+    super(ProgbarLogger, self).__init__()
+    if count_mode == 'samples':
+      self.use_steps = False
+    elif count_mode == 'steps':
+      self.use_steps = True
+    else:
+      raise ValueError('Unknown `count_mode`: ' + str(count_mode))
+
+  def on_train_begin(self, logs=None):
+    self.verbose = self.params['verbose']
+    self.epochs = self.params['epochs']
+
+  def on_epoch_begin(self, epoch, logs=None):
+    if self.verbose:
+      print('Epoch %d/%d' % (epoch + 1, self.epochs))
+      if self.use_steps:
+        target = self.params['steps']
+      else:
+        target = self.params['samples']
+      self.target = target
+      self.progbar = Progbar(target=self.target, verbose=self.verbose)
+    self.seen = 0
+
+  def on_batch_begin(self, batch, logs=None):
+    if self.seen < self.target:
+      self.log_values = []
+
+  def on_batch_end(self, batch, logs=None):
+    logs = logs or {}
+    batch_size = logs.get('size', 0)
+    if self.use_steps:
+      self.seen += 1
+    else:
+      self.seen += batch_size
+
+    for k in self.params['metrics']:
+      if k in logs:
+        self.log_values.append((k, logs[k]))
+
+    # Skip progbar update for the last batch;
+    # will be handled by on_epoch_end.
+    if self.verbose and self.seen < self.target:
+      self.progbar.update(self.seen, self.log_values)
+
+  def on_epoch_end(self, epoch, logs=None):
+    logs = logs or {}
+    for k in self.params['metrics']:
+      if k in logs:
+        self.log_values.append((k, logs[k]))
+    if self.verbose:
+      self.progbar.update(self.seen, self.log_values, force=True)
+
+
+class History(Callback):
+  """Callback that records events into a `History` object.
+
+  This callback is automatically applied to
+  every Keras model. The `History` object
+  gets returned by the `fit` method of models.
+  """
+
+  def on_train_begin(self, logs=None):
+    self.epoch = []
+    self.history = {}
+
+  def on_epoch_end(self, epoch, logs=None):
+    logs = logs or {}
+    self.epoch.append(epoch)
+    for k, v in logs.items():
+      self.history.setdefault(k, []).append(v)
+
+
+class ModelCheckpoint(Callback):
+  """Save the model after every epoch.
+
+  `filepath` can contain named formatting options,
+  which will be filled the value of `epoch` and
+  keys in `logs` (passed in `on_epoch_end`).
+
+  For example: if `filepath` is `weights.{epoch:02d}-{val_loss:.2f}.hdf5`,
+  then the model checkpoints will be saved with the epoch number and
+  the validation loss in the filename.
+
+  Arguments:
+      filepath: string, path to save the model file.
+      monitor: quantity to monitor.
+      verbose: verbosity mode, 0 or 1.
+      save_best_only: if `save_best_only=True`,
+          the latest best model according to
+          the quantity monitored will not be overwritten.
+      mode: one of {auto, min, max}.
+          If `save_best_only=True`, the decision
+          to overwrite the current save file is made
+          based on either the maximization or the
+          minimization of the monitored quantity. For `val_acc`,
+          this should be `max`, for `val_loss` this should
+          be `min`, etc. In `auto` mode, the direction is
+          automatically inferred from the name of the monitored quantity.
+      save_weights_only: if True, then only the model's weights will be
+          saved (`model.save_weights(filepath)`), else the full model
+          is saved (`model.save(filepath)`).
+      period: Interval (number of epochs) between checkpoints.
+  """
+
+  def __init__(self,
+               filepath,
+               monitor='val_loss',
+               verbose=0,
+               save_best_only=False,
+               save_weights_only=False,
+               mode='auto',
+               period=1):
+    super(ModelCheckpoint, self).__init__()
+    self.monitor = monitor
+    self.verbose = verbose
+    self.filepath = filepath
+    self.save_best_only = save_best_only
+    self.save_weights_only = save_weights_only
+    self.period = period
+    self.epochs_since_last_save = 0
+
+    if mode not in ['auto', 'min', 'max']:
+      warnings.warn('ModelCheckpoint mode %s is unknown, '
+                    'fallback to auto mode.' % (mode), RuntimeWarning)
+      mode = 'auto'
+
+    if mode == 'min':
+      self.monitor_op = np.less
+      self.best = np.Inf
+    elif mode == 'max':
+      self.monitor_op = np.greater
+      self.best = -np.Inf
+    else:
+      if 'acc' in self.monitor or self.monitor.startswith('fmeasure'):
+        self.monitor_op = np.greater
+        self.best = -np.Inf
+      else:
+        self.monitor_op = np.less
+        self.best = np.Inf
+
+  def on_epoch_end(self, epoch, logs=None):
+    logs = logs or {}
+    self.epochs_since_last_save += 1
+    if self.epochs_since_last_save >= self.period:
+      self.epochs_since_last_save = 0
+      filepath = self.filepath.format(epoch=epoch, **logs)
+      if self.save_best_only:
+        current = logs.get(self.monitor)
+        if current is None:
+          warnings.warn('Can save best model only with %s available, '
+                        'skipping.' % (self.monitor), RuntimeWarning)
+        else:
+          if self.monitor_op(current, self.best):
+            if self.verbose > 0:
+              print('Epoch %05d: %s improved from %0.5f to %0.5f,'
+                    ' saving model to %s' % (epoch, self.monitor, self.best,
+                                             current, filepath))
+            self.best = current
+            if self.save_weights_only:
+              self.model.save_weights(filepath, overwrite=True)
+            else:
+              self.model.save(filepath, overwrite=True)
+          else:
+            if self.verbose > 0:
+              print('Epoch %05d: %s did not improve' % (epoch, self.monitor))
+      else:
+        if self.verbose > 0:
+          print('Epoch %05d: saving model to %s' % (epoch, filepath))
+        if self.save_weights_only:
+          self.model.save_weights(filepath, overwrite=True)
+        else:
+          self.model.save(filepath, overwrite=True)
+
+
+class EarlyStopping(Callback):
+  """Stop training when a monitored quantity has stopped improving.
+
+  Arguments:
+      monitor: quantity to be monitored.
+      min_delta: minimum change in the monitored quantity
+          to qualify as an improvement, i.e. an absolute
+          change of less than min_delta, will count as no
+          improvement.
+      patience: number of epochs with no improvement
+          after which training will be stopped.
+      verbose: verbosity mode.
+      mode: one of {auto, min, max}. In `min` mode,
+          training will stop when the quantity
+          monitored has stopped decreasing; in `max`
+          mode it will stop when the quantity
+          monitored has stopped increasing; in `auto`
+          mode, the direction is automatically inferred
+          from the name of the monitored quantity.
+  """
+
+  def __init__(self,
+               monitor='val_loss',
+               min_delta=0,
+               patience=0,
+               verbose=0,
+               mode='auto'):
+    super(EarlyStopping, self).__init__()
+
+    self.monitor = monitor
+    self.patience = patience
+    self.verbose = verbose
+    self.min_delta = min_delta
+    self.wait = 0
+    self.stopped_epoch = 0
+
+    if mode not in ['auto', 'min', 'max']:
+      warnings.warn('EarlyStopping mode %s is unknown, '
+                    'fallback to auto mode.' % (self.mode), RuntimeWarning)
+      mode = 'auto'
+
+    if mode == 'min':
+      self.monitor_op = np.less
+    elif mode == 'max':
+      self.monitor_op = np.greater
+    else:
+      if 'acc' in self.monitor or self.monitor.startswith('fmeasure'):
+        self.monitor_op = np.greater
+      else:
+        self.monitor_op = np.less
+
+    if self.monitor_op == np.greater:
+      self.min_delta *= 1
+    else:
+      self.min_delta *= -1
+
+  def on_train_begin(self, logs=None):
+    self.wait = 0  # Allow instances to be re-used
+    self.best = np.Inf if self.monitor_op == np.less else -np.Inf
+
+  def on_epoch_end(self, epoch, logs=None):
+    current = logs.get(self.monitor)
+    if current is None:
+      warnings.warn('Early stopping requires %s available!' % (self.monitor),
+                    RuntimeWarning)
+
+    if self.monitor_op(current - self.min_delta, self.best):
+      self.best = current
+      self.wait = 0
+    else:
+      if self.wait >= self.patience:
+        self.stopped_epoch = epoch
+        self.model.stop_training = True
+      self.wait += 1
+
+  def on_train_end(self, logs=None):
+    if self.stopped_epoch > 0 and self.verbose > 0:
+      print('Epoch %05d: early stopping' % (self.stopped_epoch))
+
+
+class RemoteMonitor(Callback):
+  """Callback used to stream events to a server.
+
+  Requires the `requests` library.
+  Events are sent to `root + '/publish/epoch/end/'` by default. Calls are
+  HTTP POST, with a `data` argument which is a
+  JSON-encoded dictionary of event data.
+
+  Arguments:
+      root: String; root url of the target server.
+      path: String; path relative to `root` to which the events will be sent.
+      field: String; JSON field under which the data will be stored.
+      headers: Dictionary; optional custom HTTP headers.
+          Defaults to:
+          `{'Accept': 'application/json',
+            'Content-Type': 'application/json'}`
+  """
+
+  def __init__(self,
+               root='http://localhost:9000',
+               path='/publish/epoch/end/',
+               field='data',
+               headers=None):
+    super(RemoteMonitor, self).__init__()
+    if headers is None:
+      headers = {
+          'Accept': 'application/json',
+          'Content-Type': 'application/json'
+      }
+    self.root = root
+    self.path = path
+    self.field = field
+    self.headers = headers
+
+  def on_epoch_end(self, epoch, logs=None):
+    if requests is None:
+      raise ImportError('RemoteMonitor requires ' 'the `requests` library.')
+    logs = logs or {}
+    send = {}
+    send['epoch'] = epoch
+    for k, v in logs.items():
+      send[k] = v
+    try:
+      requests.post(
+          self.root + self.path, {self.field: json.dumps(send)},
+          headers=self.headers)
+    except requests.exceptions.RequestException:
+      warnings.warn('Warning: could not reach RemoteMonitor '
+                    'root server at ' + str(self.root))
+
+
+class LearningRateScheduler(Callback):
+  """Learning rate scheduler.
+
+  Arguments:
+      schedule: a function that takes an epoch index as input
+          (integer, indexed from 0) and returns a new
+          learning rate as output (float).
+  """
+
+  def __init__(self, schedule):
+    super(LearningRateScheduler, self).__init__()
+    self.schedule = schedule
+
+  def on_epoch_begin(self, epoch, logs=None):
+    if not hasattr(self.model.optimizer, 'lr'):
+      raise ValueError('Optimizer must have a "lr" attribute.')
+    lr = self.schedule(epoch)
+    if not isinstance(lr, (float, np.float32, np.float64)):
+      raise ValueError('The output of the "schedule" function '
+                       'should be float.')
+    K.set_value(self.model.optimizer.lr, lr)
+
+
+class TensorBoard(Callback):
+  """Tensorboard basic visualizations.
+
+  This callback writes a log for TensorBoard, which allows
+  you to visualize dynamic graphs of your training and test
+  metrics, as well as activation histograms for the different
+  layers in your model.
+
+  Arguments:
+      log_dir: the path of the directory where to save the log
+          files to be parsed by Tensorboard.
+      histogram_freq: frequency (in epochs) at which to compute activation
+          histograms for the layers of the model. If set to 0,
+          histograms won't be computed.
+      write_graph: whether to visualize the graph in Tensorboard.
+          The log file can become quite large when
+          write_graph is set to True.
+      write_images: whether to write model weights to visualize as
+          image in Tensorboard.
+  """
+
+  def __init__(self,
+               log_dir='./logs',
+               histogram_freq=0,
+               write_graph=True,
+               write_images=False):
+    super(TensorBoard, self).__init__()
+    self.log_dir = log_dir
+    self.histogram_freq = histogram_freq
+    self.merged = None
+    self.write_graph = write_graph
+    self.write_images = write_images
+
+  def set_model(self, model):
+    self.model = model
+    self.sess = K.get_session()
+    if self.histogram_freq and self.merged is None:
+      for layer in self.model.layers:
+
+        for weight in layer.weights:
+          tf_summary.histogram(weight.name, weight)
+          if self.write_images:
+            w_img = array_ops.squeeze(weight)
+            shape = w_img.get_shape()
+            if len(shape) > 1 and shape[0] > shape[1]:
+              w_img = array_ops.transpose(w_img)
+            if len(shape) == 1:
+              w_img = array_ops.expand_dims(w_img, 0)
+            w_img = array_ops.expand_dims(array_ops.expand_dims(w_img, 0), -1)
+            tf_summary.image(weight.name, w_img)
+
+        if hasattr(layer, 'output'):
+          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
+    self.merged = tf_summary.merge_all()
+
+    if self.write_graph:
+      self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
+    else:
+      self.writer = tf_summary.FileWriter(self.log_dir)
+
+  def on_epoch_end(self, epoch, logs=None):
+    logs = logs or {}
+
+    if self.validation_data and self.histogram_freq:
+      if epoch % self.histogram_freq == 0:
+        # TODO(fchollet): implement batched calls to sess.run
+        # (current call will likely go OOM on GPU)
+        if self.model.uses_learning_phase:
+          cut_v_data = len(self.model.inputs)
+          val_data = self.validation_data[:cut_v_data] + [0]
+          tensors = self.model.inputs + [K.learning_phase()]
+        else:
+          val_data = self.validation_data
+          tensors = self.model.inputs
+        feed_dict = dict(zip(tensors, val_data))
+        result = self.sess.run([self.merged], feed_dict=feed_dict)
+        summary_str = result[0]
+        self.writer.add_summary(summary_str, epoch)
+
+    for name, value in logs.items():
+      if name in ['batch', 'size']:
+        continue
+      summary = tf_summary.Summary()
+      summary_value = summary.value.add()
+      summary_value.simple_value = value.item()
+      summary_value.tag = name
+      self.writer.add_summary(summary, epoch)
+    self.writer.flush()
+
+  def on_train_end(self, _):
+    self.writer.close()
+
+
+class ReduceLROnPlateau(Callback):
+  """Reduce learning rate when a metric has stopped improving.
+
+  Models often benefit from reducing the learning rate by a factor
+  of 2-10 once learning stagnates. This callback monitors a
+  quantity and if no improvement is seen for a 'patience' number
+  of epochs, the learning rate is reduced.
+
+  Example:
+      ```python
+          reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
+                                        patience=5, min_lr=0.001)
+          model.fit(X_train, Y_train, callbacks=[reduce_lr])
+      ```
+
+  Arguments:
+      monitor: quantity to be monitored.
+      factor: factor by which the learning rate will
+          be reduced. new_lr = lr * factor
+      patience: number of epochs with no improvement
+          after which learning rate will be reduced.
+      verbose: int. 0: quiet, 1: update messages.
+      mode: one of {auto, min, max}. In `min` mode,
+          lr will be reduced when the quantity
+          monitored has stopped decreasing; in `max`
+          mode it will be reduced when the quantity
+          monitored has stopped increasing; in `auto`
+          mode, the direction is automatically inferred
+          from the name of the monitored quantity.
+      epsilon: threshold for measuring the new optimum,
+          to only focus on significant changes.
+      cooldown: number of epochs to wait before resuming
+          normal operation after lr has been reduced.
+      min_lr: lower bound on the learning rate.
+  """
+
+  def __init__(self,
+               monitor='val_loss',
+               factor=0.1,
+               patience=10,
+               verbose=0,
+               mode='auto',
+               epsilon=1e-4,
+               cooldown=0,
+               min_lr=0):
+    super(ReduceLROnPlateau, self).__init__()
+
+    self.monitor = monitor
+    if factor >= 1.0:
+      raise ValueError('ReduceLROnPlateau ' 'does not support a factor >= 1.0.')
+    self.factor = factor
+    self.min_lr = min_lr
+    self.epsilon = epsilon
+    self.patience = patience
+    self.verbose = verbose
+    self.cooldown = cooldown
+    self.cooldown_counter = 0  # Cooldown counter.
+    self.wait = 0
+    self.best = 0
+    self.mode = mode
+    self.monitor_op = None
+    self._reset()
+
+  def _reset(self):
+    """Resets wait counter and cooldown counter.
+    """
+    if self.mode not in ['auto', 'min', 'max']:
+      warnings.warn('Learning Rate Plateau Reducing mode %s is unknown, '
+                    'fallback to auto mode.' % (self.mode), RuntimeWarning)
+      self.mode = 'auto'
+    if (self.mode == 'min' or
+        (self.mode == 'auto' and 'acc' not in self.monitor)):
+      self.monitor_op = lambda a, b: np.less(a, b - self.epsilon)
+      self.best = np.Inf
+    else:
+      self.monitor_op = lambda a, b: np.greater(a, b + self.epsilon)
+      self.best = -np.Inf
+    self.cooldown_counter = 0
+    self.wait = 0
+    self.lr_epsilon = self.min_lr * 1e-4
+
+  def on_train_begin(self, logs=None):
+    self._reset()
+
+  def on_epoch_end(self, epoch, logs=None):
+    logs = logs or {}
+    logs['lr'] = K.get_value(self.model.optimizer.lr)
+    current = logs.get(self.monitor)
+    if current is None:
+      warnings.warn('Learning Rate Plateau Reducing requires %s available!' %
+                    self.monitor, RuntimeWarning)
+    else:
+      if self.in_cooldown():
+        self.cooldown_counter -= 1
+        self.wait = 0
+
+      if self.monitor_op(current, self.best):
+        self.best = current
+        self.wait = 0
+      elif not self.in_cooldown():
+        if self.wait >= self.patience:
+          old_lr = float(K.get_value(self.model.optimizer.lr))
+          if old_lr > self.min_lr + self.lr_epsilon:
+            new_lr = old_lr * self.factor
+            new_lr = max(new_lr, self.min_lr)
+            K.set_value(self.model.optimizer.lr, new_lr)
+            if self.verbose > 0:
+              print('\nEpoch %05d: reducing learning rate to %s.' % (epoch,
+                                                                     new_lr))
+            self.cooldown_counter = self.cooldown
+            self.wait = 0
+        self.wait += 1
+
+  def in_cooldown(self):
+    return self.cooldown_counter > 0
+
+
+class CSVLogger(Callback):
+  """Callback that streams epoch results to a csv file.
+
+  Supports all values that can be represented as a string,
+  including 1D iterables such as np.ndarray.
+
+  Example:
+      ```python
+          csv_logger = CSVLogger('training.log')
+          model.fit(X_train, Y_train, callbacks=[csv_logger])
+      ```
+
+  Arguments:
+      filename: filename of the csv file, e.g. 'run/log.csv'.
+      separator: string used to separate elements in the csv file.
+      append: True: append if file exists (useful for continuing
+          training). False: overwrite existing file,
+  """
+
+  def __init__(self, filename, separator=',', append=False):
+    self.sep = separator
+    self.filename = filename
+    self.append = append
+    self.writer = None
+    self.keys = None
+    self.append_header = True
+    super(CSVLogger, self).__init__()
+
+  def on_train_begin(self, logs=None):
+    if self.append:
+      if os.path.exists(self.filename):
+        with open(self.filename) as f:
+          self.append_header = not bool(len(f.readline()))
+      self.csv_file = open(self.filename, 'a')
+    else:
+      self.csv_file = open(self.filename, 'w')
+
+  def on_epoch_end(self, epoch, logs=None):
+    logs = logs or {}
+
+    def handle_value(k):
+      is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0
+      if isinstance(k, Iterable) and not is_zero_dim_ndarray:
+        return '"[%s]"' % (', '.join(map(str, k)))
+      else:
+        return k
+
+    if not self.writer:
+      self.keys = sorted(logs.keys())
+
+      class CustomDialect(csv.excel):
+        delimiter = self.sep
+
+      self.writer = csv.DictWriter(
+          self.csv_file,
+          fieldnames=['epoch'] + self.keys,
+          dialect=CustomDialect)
+      if self.append_header:
+        self.writer.writeheader()
+
+    row_dict = OrderedDict({'epoch': epoch})
+    row_dict.update((key, handle_value(logs[key])) for key in self.keys)
+    self.writer.writerow(row_dict)
+    self.csv_file.flush()
+
+  def on_train_end(self, logs=None):
+    self.csv_file.close()
+    self.writer = None
+
+
+class LambdaCallback(Callback):
+  """Callback for creating simple, custom callbacks on-the-fly.
+
+  This callback is constructed with anonymous functions that will be called
+  at the appropriate time. Note that the callbacks expects positional
+  arguments, as:
+   - `on_epoch_begin` and `on_epoch_end` expect two positional arguments:
+      `epoch`, `logs`
+   - `on_batch_begin` and `on_batch_end` expect two positional arguments:
+      `batch`, `logs`
+   - `on_train_begin` and `on_train_end` expect one positional argument:
+      `logs`
+
+  Arguments:
+      on_epoch_begin: called at the beginning of every epoch.
+      on_epoch_end: called at the end of every epoch.
+      on_batch_begin: called at the beginning of every batch.
+      on_batch_end: called at the end of every batch.
+      on_train_begin: called at the beginning of model training.
+      on_train_end: called at the end of model training.
+
+  Example:
+      ```python
+      # Print the batch number at the beginning of every batch.
+      batch_print_callback = LambdaCallback(
+          on_batch_begin=lambda batch,logs: print(batch))
+
+      # Plot the loss after every epoch.
+      import numpy as np
+      import matplotlib.pyplot as plt
+      plot_loss_callback = LambdaCallback(
+          on_epoch_end=lambda epoch, logs: plt.plot(np.arange(epoch),
+                                                    logs['loss']))
+
+      # Terminate some processes after having finished model training.
+      processes = ...
+      cleanup_callback = LambdaCallback(
+          on_train_end=lambda logs: [
+              p.terminate() for p in processes if p.is_alive()])
+
+      model.fit(...,
+                callbacks=[batch_print_callback,
+                           plot_loss_callback,
+                           cleanup_callback])
+      ```
+  """
+
+  def __init__(self,
+               on_epoch_begin=None,
+               on_epoch_end=None,
+               on_batch_begin=None,
+               on_batch_end=None,
+               on_train_begin=None,
+               on_train_end=None,
+               **kwargs):
+    super(LambdaCallback, self).__init__()
+    self.__dict__.update(kwargs)
+    if on_epoch_begin is not None:
+      self.on_epoch_begin = on_epoch_begin
+    else:
+      self.on_epoch_begin = lambda epoch, logs: None
+    if on_epoch_end is not None:
+      self.on_epoch_end = on_epoch_end
+    else:
+      self.on_epoch_end = lambda epoch, logs: None
+    if on_batch_begin is not None:
+      self.on_batch_begin = on_batch_begin
+    else:
+      self.on_batch_begin = lambda batch, logs: None
+    if on_batch_end is not None:
+      self.on_batch_end = on_batch_end
+    else:
+      self.on_batch_end = lambda batch, logs: None
+    if on_train_begin is not None:
+      self.on_train_begin = on_train_begin
+    else:
+      self.on_train_begin = lambda logs: None
+    if on_train_end is not None:
+      self.on_train_end = on_train_end
+    else:
+      self.on_train_end = lambda logs: None
diff --git a/tensorflow/contrib/keras/python/keras/callbacks_test.py b/tensorflow/contrib/keras/python/keras/callbacks_test.py
new file mode 100644
index 0000000000..412f736e16
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/callbacks_test.py
@@ -0,0 +1,619 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras callbacks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import csv
+import multiprocessing
+import os
+import re
+import shutil
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+try:
+  import h5py  # pylint:disable=g-import-not-at-top
+except ImportError:
+  h5py = None
+
+
+TRAIN_SAMPLES = 10
+TEST_SAMPLES = 10
+NUM_CLASSES = 2
+INPUT_DIM = 3
+NUM_HIDDEN = 5
+BATCH_SIZE = 5
+
+
+class KerasCallbacksTest(test.TestCase):
+
+  def test_ModelCheckpoint(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      np.random.seed(1337)
+
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      filepath = os.path.join(temp_dir, 'checkpoint.h5')
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+      # case 1
+      monitor = 'val_loss'
+      save_best_only = False
+      mode = 'auto'
+
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='rmsprop',
+          metrics=['accuracy'])
+
+      cbks = [
+          keras.callbacks.ModelCheckpoint(
+              filepath,
+              monitor=monitor,
+              save_best_only=save_best_only,
+              mode=mode)
+      ]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=1,
+          verbose=0)
+      assert os.path.exists(filepath)
+      os.remove(filepath)
+
+      # case 2
+      mode = 'min'
+      cbks = [
+          keras.callbacks.ModelCheckpoint(
+              filepath,
+              monitor=monitor,
+              save_best_only=save_best_only,
+              mode=mode)
+      ]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=1,
+          verbose=0)
+      assert os.path.exists(filepath)
+      os.remove(filepath)
+
+      # case 3
+      mode = 'max'
+      monitor = 'val_acc'
+      cbks = [
+          keras.callbacks.ModelCheckpoint(
+              filepath,
+              monitor=monitor,
+              save_best_only=save_best_only,
+              mode=mode)
+      ]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=1,
+          verbose=0)
+      assert os.path.exists(filepath)
+      os.remove(filepath)
+
+      # case 4
+      save_best_only = True
+      cbks = [
+          keras.callbacks.ModelCheckpoint(
+              filepath,
+              monitor=monitor,
+              save_best_only=save_best_only,
+              mode=mode)
+      ]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=1,
+          verbose=0)
+      assert os.path.exists(filepath)
+      os.remove(filepath)
+
+      # case 5
+      save_best_only = False
+      period = 2
+      mode = 'auto'
+
+      filepath = os.path.join(temp_dir, 'checkpoint.{epoch:02d}.h5')
+      cbks = [
+          keras.callbacks.ModelCheckpoint(
+              filepath,
+              monitor=monitor,
+              save_best_only=save_best_only,
+              mode=mode,
+              period=period)
+      ]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=4,
+          verbose=0)
+      assert os.path.exists(filepath.format(epoch=1))
+      assert os.path.exists(filepath.format(epoch=3))
+      os.remove(filepath.format(epoch=1))
+      os.remove(filepath.format(epoch=3))
+      assert not os.path.exists(filepath.format(epoch=0))
+      assert not os.path.exists(filepath.format(epoch=2))
+
+  def test_EarlyStopping(self):
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='rmsprop',
+          metrics=['accuracy'])
+      mode = 'max'
+      monitor = 'val_acc'
+      patience = 0
+      cbks = [
+          keras.callbacks.EarlyStopping(
+              patience=patience, monitor=monitor, mode=mode)
+      ]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=20,
+          verbose=0)
+
+      mode = 'auto'
+      monitor = 'val_acc'
+      patience = 2
+      cbks = [
+          keras.callbacks.EarlyStopping(
+              patience=patience, monitor=monitor, mode=mode)
+      ]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=20,
+          verbose=0)
+
+  def test_EarlyStopping_reuse(self):
+    with self.test_session():
+      np.random.seed(1337)
+      patience = 3
+      data = np.random.random((100, 1))
+      labels = np.where(data > 0.5, 1, 0)
+      model = keras.models.Sequential((keras.layers.Dense(
+          1, input_dim=1, activation='relu'), keras.layers.Dense(
+              1, activation='sigmoid'),))
+      model.compile(
+          optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
+      stopper = keras.callbacks.EarlyStopping(monitor='acc', patience=patience)
+      weights = model.get_weights()
+
+      hist = model.fit(data, labels, callbacks=[stopper], verbose=0)
+      assert len(hist.epoch) >= patience
+
+      # This should allow training to go for at least `patience` epochs
+      model.set_weights(weights)
+      hist = model.fit(data, labels, callbacks=[stopper], verbose=0)
+    assert len(hist.epoch) >= patience
+
+  def test_LearningRateScheduler(self):
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+
+      cbks = [keras.callbacks.LearningRateScheduler(lambda x: 1. / (1. + x))]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=5,
+          verbose=0)
+      assert (float(keras.backend.get_value(model.optimizer.lr)) - 0.2
+             ) < keras.backend.epsilon()
+
+  def test_ReduceLROnPlateau(self):
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+
+      def make_model():
+        np.random.seed(1337)
+        model = keras.models.Sequential()
+        model.add(
+            keras.layers.Dense(
+                NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+        model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+
+        model.compile(
+            loss='categorical_crossentropy',
+            optimizer=keras.optimizers.SGD(lr=0.1),
+            metrics=['accuracy'])
+        return model
+
+      model = make_model()
+
+      # This should reduce the LR after the first epoch (due to high epsilon).
+      cbks = [
+          keras.callbacks.ReduceLROnPlateau(
+              monitor='val_loss',
+              factor=0.1,
+              epsilon=10,
+              patience=1,
+              cooldown=5)
+      ]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=5,
+          verbose=0)
+      assert np.allclose(
+          float(keras.backend.get_value(model.optimizer.lr)),
+          0.01,
+          atol=keras.backend.epsilon())
+
+      model = make_model()
+      cbks = [
+          keras.callbacks.ReduceLROnPlateau(
+              monitor='val_loss', factor=0.1, epsilon=0, patience=1, cooldown=5)
+      ]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=5,
+          verbose=0)
+      assert np.allclose(
+          float(keras.backend.get_value(model.optimizer.lr)),
+          0.1,
+          atol=keras.backend.epsilon())
+
+  def test_CSVLogger(self):
+    with self.test_session():
+      np.random.seed(1337)
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir)
+      filepath = os.path.join(temp_dir, 'log.tsv')
+
+      sep = '\t'
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+
+      def make_model():
+        np.random.seed(1337)
+        model = keras.models.Sequential()
+        model.add(
+            keras.layers.Dense(
+                NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+        model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+
+        model.compile(
+            loss='categorical_crossentropy',
+            optimizer=keras.optimizers.SGD(lr=0.1),
+            metrics=['accuracy'])
+        return model
+
+      # case 1, create new file with defined separator
+      model = make_model()
+      cbks = [keras.callbacks.CSVLogger(filepath, separator=sep)]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=1,
+          verbose=0)
+
+      assert os.path.exists(filepath)
+      with open(filepath) as csvfile:
+        dialect = csv.Sniffer().sniff(csvfile.read())
+      assert dialect.delimiter == sep
+      del model
+      del cbks
+
+      # case 2, append data to existing file, skip header
+      model = make_model()
+      cbks = [keras.callbacks.CSVLogger(filepath, separator=sep, append=True)]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=1,
+          verbose=0)
+
+      # case 3, reuse of CSVLogger object
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=1,
+          verbose=0)
+
+      with open(filepath) as csvfile:
+        output = ' '.join(csvfile.readlines())
+        assert len(re.findall('epoch', output)) == 1
+
+      os.remove(filepath)
+
+  def test_TensorBoard(self):
+    np.random.seed(1337)
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+
+    def data_generator(train):
+      if train:
+        max_batch_index = len(x_train) // BATCH_SIZE
+      else:
+        max_batch_index = len(x_test) // BATCH_SIZE
+      i = 0
+      while 1:
+        if train:
+          yield (x_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
+                 y_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
+        else:
+          yield (x_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
+                 y_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
+        i += 1
+        i %= max_batch_index
+
+    # case: Sequential
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+
+      tsb = keras.callbacks.TensorBoard(
+          log_dir=temp_dir, histogram_freq=1, write_images=True)
+      cbks = [tsb]
+
+      # fit with validation data
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=3,
+          verbose=0)
+
+      # fit with validation data and accuracy
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=2,
+          verbose=0)
+
+      # fit generator with validation data
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator without validation data
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator with validation data and accuracy
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator without validation data and accuracy
+      model.fit_generator(
+          data_generator(True), len(x_train), epochs=2, callbacks=cbks)
+      assert os.path.exists(temp_dir)
+
+  def test_LambdaCallback(self):
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+
+      # Start an arbitrary process that should run during model
+      # training and be terminated after training has completed.
+      def target():
+        while True:
+          pass
+
+      p = multiprocessing.Process(target=target)
+      p.start()
+      cleanup_callback = keras.callbacks.LambdaCallback(
+          on_train_end=lambda logs: p.terminate())
+
+      cbks = [cleanup_callback]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=5,
+          verbose=0)
+      p.join()
+      assert not p.is_alive()
+
+  def test_TensorBoard_with_ReduceLROnPlateau(self):
+    with self.test_session():
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
+
+      cbks = [
+          keras.callbacks.ReduceLROnPlateau(
+              monitor='val_loss', factor=0.5, patience=4, verbose=1),
+          keras.callbacks.TensorBoard(log_dir=temp_dir)
+      ]
+
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=2,
+          verbose=0)
+
+      assert os.path.exists(temp_dir)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/constraints.py b/tensorflow/contrib/keras/python/keras/constraints.py
new file mode 100644
index 0000000000..91d6153862
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/constraints.py
@@ -0,0 +1,199 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Constraints: functions that impose constraints on weights values.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import serialize_keras_object
+
+
+class Constraint(object):
+
+  def __call__(self, w):
+    return w
+
+  def get_config(self):
+    return {}
+
+
+class MaxNorm(Constraint):
+  """MaxNorm weight constraint.
+
+  Constrains the weights incident to each hidden unit
+  to have a norm less than or equal to a desired value.
+
+  Arguments:
+      m: the maximum norm for the incoming weights.
+      axis: integer, axis along which to calculate weight norms.
+          For instance, in a `Dense` layer the weight matrix
+          has shape `(input_dim, output_dim)`,
+          set `axis` to `0` to constrain each weight vector
+          of length `(input_dim,)`.
+          In a `Convolution2D` layer with `data_format="channels_last"`,
+          the weight tensor has shape
+          `(rows, cols, input_depth, output_depth)`,
+          set `axis` to `[0, 1, 2]`
+          to constrain the weights of each filter tensor of size
+          `(rows, cols, input_depth)`.
+
+  References:
+      - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting
+        Srivastava, Hinton, et al.
+        2014](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)
+  """
+
+  def __init__(self, max_value=2, axis=0):
+    self.max_value = max_value
+    self.axis = axis
+
+  def __call__(self, w):
+    norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
+    desired = K.clip(norms, 0, self.max_value)
+    w *= (desired / (K.epsilon() + norms))
+    return w
+
+  def get_config(self):
+    return {'max_value': self.max_value, 'axis': self.axis}
+
+
+class NonNeg(Constraint):
+  """Constrains the weights to be non-negative.
+  """
+
+  def __call__(self, w):
+    w *= K.cast(w >= 0., K.floatx())
+    return w
+
+
+class UnitNorm(Constraint):
+  """Constrains the weights incident to each hidden unit to have unit norm.
+
+  Arguments:
+      axis: integer, axis along which to calculate weight norms.
+          For instance, in a `Dense` layer the weight matrix
+          has shape `(input_dim, output_dim)`,
+          set `axis` to `0` to constrain each weight vector
+          of length `(input_dim,)`.
+          In a `Convolution2D` layer with `data_format="channels_last"`,
+          the weight tensor has shape
+          `(rows, cols, input_depth, output_depth)`,
+          set `axis` to `[0, 1, 2]`
+          to constrain the weights of each filter tensor of size
+          `(rows, cols, input_depth)`.
+  """
+
+  def __init__(self, axis=0):
+    self.axis = axis
+
+  def __call__(self, w):
+    return w / (
+        K.epsilon() + K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True)))
+
+  def get_config(self):
+    return {'axis': self.axis}
+
+
+class MinMaxNorm(Constraint):
+  """MinMaxNorm weight constraint.
+
+  Constrains the weights incident to each hidden unit
+  to have the norm between a lower bound and an upper bound.
+
+  Arguments:
+      min_value: the minimum norm for the incoming weights.
+      max_value: the maximum norm for the incoming weights.
+      rate: rate for enforcing the constraint: weights will be
+          rescaled to yield
+          `(1 - rate) * norm + rate * norm.clip(min_value, max_value)`.
+          Effectively, this means that rate=1.0 stands for strict
+          enforcement of the constraint, while rate<1.0 means that
+          weights will be rescaled at each step to slowly move
+          towards a value inside the desired interval.
+      axis: integer, axis along which to calculate weight norms.
+          For instance, in a `Dense` layer the weight matrix
+          has shape `(input_dim, output_dim)`,
+          set `axis` to `0` to constrain each weight vector
+          of length `(input_dim,)`.
+          In a `Convolution2D` layer with `dim_ordering="tf"`,
+          the weight tensor has shape
+          `(rows, cols, input_depth, output_depth)`,
+          set `axis` to `[0, 1, 2]`
+          to constrain the weights of each filter tensor of size
+          `(rows, cols, input_depth)`.
+  """
+
+  def __init__(self, min_value=0.0, max_value=1.0, rate=1.0, axis=0):
+    self.min_value = min_value
+    self.max_value = max_value
+    self.rate = rate
+    self.axis = axis
+
+  def __call__(self, w):
+    norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
+    desired = (self.rate * K.clip(norms, self.min_value, self.max_value) +
+               (1 - self.rate) * norms)
+    w *= (desired / (K.epsilon() + norms))
+    return w
+
+  def get_config(self):
+    return {
+        'min_value': self.min_value,
+        'max_value': self.max_value,
+        'rate': self.rate,
+        'axis': self.axis
+    }
+
+
+# Aliases.
+
+# pylint: disable=invalid-name
+max_norm = MaxNorm
+non_neg = NonNeg
+unit_norm = UnitNorm
+min_max_norm = MinMaxNorm
+
+# pylint: enable=invalid-name
+
+
+def serialize(constraint):
+  return serialize_keras_object(constraint)
+
+
+def deserialize(config, custom_objects=None):
+  return deserialize_keras_object(
+      config,
+      module_objects=globals(),
+      custom_objects=custom_objects,
+      printable_module_name='constraint')
+
+
+def get(identifier):
+  if identifier is None:
+    return None
+  if isinstance(identifier, dict):
+    return deserialize(identifier)
+  elif isinstance(identifier, six.string_types):
+    config = {'class_name': str(identifier), 'config': {}}
+    return deserialize(config)
+  elif callable(identifier):
+    return identifier
+  else:
+    raise ValueError('Could not interpret constraint identifier:', identifier)
diff --git a/tensorflow/contrib/keras/python/keras/constraints_test.py b/tensorflow/contrib/keras/python/keras/constraints_test.py
new file mode 100644
index 0000000000..36fbee7fd5
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/constraints_test.py
@@ -0,0 +1,103 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras weights constraints."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+def get_test_values():
+  return [0.1, 0.5, 3, 8, 1e-7]
+
+
+def get_example_array():
+  np.random.seed(3537)
+  example_array = np.random.random((100, 100)) * 100. - 50.
+  example_array[0, 0] = 0.  # 0 could possibly cause trouble
+  return example_array
+
+
+class KerasConstraintsTest(test.TestCase):
+
+  def test_serialization(self):
+    all_activations = ['max_norm', 'non_neg',
+                       'unit_norm', 'min_max_norm']
+    for name in all_activations:
+      fn = keras.constraints.get(name)
+      ref_fn = getattr(keras.constraints, name)()
+      assert fn.__class__ == ref_fn.__class__
+      config = keras.constraints.serialize(fn)
+      fn = keras.constraints.deserialize(config)
+      assert fn.__class__ == ref_fn.__class__
+
+  def test_max_norm(self):
+    with self.test_session():
+      array = get_example_array()
+      for m in get_test_values():
+        norm_instance = keras.constraints.max_norm(m)
+        normed = norm_instance(keras.backend.variable(array))
+        assert np.all(keras.backend.eval(normed) < m)
+
+      # a more explicit example
+      norm_instance = keras.constraints.max_norm(2.0)
+      x = np.array([[0, 0, 0], [1.0, 0, 0], [3, 0, 0], [3, 3, 3]]).T
+      x_normed_target = np.array([[0, 0, 0], [1.0, 0, 0],
+                                  [2.0, 0, 0],
+                                  [2. / np.sqrt(3),
+                                   2. / np.sqrt(3),
+                                   2. / np.sqrt(3)]]).T
+      x_normed_actual = keras.backend.eval(
+          norm_instance(keras.backend.variable(x)))
+      self.assertAllClose(x_normed_actual, x_normed_target, rtol=1e-05)
+
+  def test_non_neg(self):
+    with self.test_session():
+      non_neg_instance = keras.constraints.non_neg()
+      normed = non_neg_instance(keras.backend.variable(get_example_array()))
+      assert np.all(np.min(keras.backend.eval(normed), axis=1) == 0.)
+
+  def test_unit_norm(self):
+    with self.test_session():
+      unit_norm_instance = keras.constraints.unit_norm()
+      normalized = unit_norm_instance(
+          keras.backend.variable(get_example_array()))
+      norm_of_normalized = np.sqrt(
+          np.sum(keras.backend.eval(normalized) ** 2, axis=0))
+      # In the unit norm constraint, it should be equal to 1.
+      difference = norm_of_normalized - 1.
+      largest_difference = np.max(np.abs(difference))
+      assert np.abs(largest_difference) < 10e-5
+
+  def test_min_max_norm(self):
+    with self.test_session():
+      array = get_example_array()
+      for m in get_test_values():
+        norm_instance = keras.constraints.min_max_norm(min_value=m,
+                                                       max_value=m * 2)
+        normed = norm_instance(keras.backend.variable(array))
+        value = keras.backend.eval(normed)
+        l2 = np.sqrt(np.sum(np.square(value), axis=0))
+        assert not l2[l2 < m]
+        assert not l2[l2 > m * 2 + 1e-5]
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/datasets/__init__.py b/tensorflow/contrib/keras/python/keras/datasets/__init__.py
new file mode 100644
index 0000000000..fe8dee54db
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/datasets/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras datasets: utilities for downloading and pre-processing common datasets.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.datasets import boston_housing
+from tensorflow.contrib.keras.python.keras.datasets import cifar10
+from tensorflow.contrib.keras.python.keras.datasets import cifar100
+from tensorflow.contrib.keras.python.keras.datasets import imdb
+from tensorflow.contrib.keras.python.keras.datasets import mnist
+from tensorflow.contrib.keras.python.keras.datasets import reuters
+
diff --git a/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py b/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py
new file mode 100644
index 0000000000..ac0f0fd422
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py
@@ -0,0 +1,56 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Boston housing price regression dataset.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+def load_data(path='boston_housing.npz', seed=113, test_split=0.2):
+  """Loads the Boston Housing dataset.
+
+  Arguments:
+      path: path where to cache the dataset locally
+          (relative to ~/.keras/datasets).
+      seed: Random seed for shuffling the data
+          before computing the test split.
+      test_split: fraction of the data to reserve as test set.
+
+  Returns:
+      Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+  """
+  assert 0 <= test_split < 1
+  path = get_file(
+      path, origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz')
+  f = np.load(path)
+  x = f['x']
+  y = f['y']
+  f.close()
+
+  np.random.seed(seed)
+  np.random.shuffle(x)
+  np.random.seed(seed)
+  np.random.shuffle(y)
+
+  x_train = np.array(x[:int(len(x) * (1 - test_split))])
+  y_train = np.array(y[:int(len(x) * (1 - test_split))])
+  x_test = np.array(x[int(len(x) * (1 - test_split)):])
+  y_test = np.array(y[int(len(x) * (1 - test_split)):])
+  return (x_train, y_train), (x_test, y_test)
diff --git a/tensorflow/contrib/keras/python/keras/datasets/cifar.py b/tensorflow/contrib/keras/python/keras/datasets/cifar.py
new file mode 100644
index 0000000000..564709c0ee
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/datasets/cifar.py
@@ -0,0 +1,53 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities used by the CIFAR10 and CIFAR100 datasets.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+from six.moves import cPickle
+
+
+def load_batch(fpath, label_key='labels'):
+  """Internal utility for parsing CIFAR data.
+
+  Arguments:
+      fpath: path the file to parse.
+      label_key: key for label data in the retrieve
+          dictionary.
+
+  Returns:
+      A tuple `(data, labels)`.
+  """
+  f = open(fpath, 'rb')
+  if sys.version_info < (3,):
+    d = cPickle.load(f)
+  else:
+    d = cPickle.load(f, encoding='bytes')
+    # decode utf8
+    d_decoded = {}
+    for k, v in d.items():
+      d_decoded[k.decode('utf8')] = v
+    d = d_decoded
+  f.close()
+  data = d['data']
+  labels = d[label_key]
+
+  data = data.reshape(data.shape[0], 3, 32, 32)
+  return data, labels
diff --git a/tensorflow/contrib/keras/python/keras/datasets/cifar10.py b/tensorflow/contrib/keras/python/keras/datasets/cifar10.py
new file mode 100644
index 0000000000..11618b8552
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/datasets/cifar10.py
@@ -0,0 +1,61 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""CIFAR10 small image classification dataset.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.datasets.cifar import load_batch
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+def load_data():
+  """Loads CIFAR10 dataset.
+
+  Returns:
+      Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+  """
+  dirname = 'cifar-10-batches-py'
+  origin = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
+  path = get_file(dirname, origin=origin, untar=True)
+
+  num_train_samples = 50000
+
+  x_train = np.zeros((num_train_samples, 3, 32, 32), dtype='uint8')
+  y_train = np.zeros((num_train_samples,), dtype='uint8')
+
+  for i in range(1, 6):
+    fpath = os.path.join(path, 'data_batch_' + str(i))
+    data, labels = load_batch(fpath)
+    x_train[(i - 1) * 10000:i * 10000, :, :, :] = data
+    y_train[(i - 1) * 10000:i * 10000] = labels
+
+  fpath = os.path.join(path, 'test_batch')
+  x_test, y_test = load_batch(fpath)
+
+  y_train = np.reshape(y_train, (len(y_train), 1))
+  y_test = np.reshape(y_test, (len(y_test), 1))
+
+  if K.image_data_format() == 'channels_last':
+    x_train = x_train.transpose(0, 2, 3, 1)
+    x_test = x_test.transpose(0, 2, 3, 1)
+
+  return (x_train, y_train), (x_test, y_test)
diff --git a/tensorflow/contrib/keras/python/keras/datasets/cifar100.py b/tensorflow/contrib/keras/python/keras/datasets/cifar100.py
new file mode 100644
index 0000000000..eba3ee6415
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/datasets/cifar100.py
@@ -0,0 +1,62 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""CIFAR100 small image classification dataset.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.datasets.cifar import load_batch
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+def load_data(label_mode='fine'):
+  """Loads CIFAR100 dataset.
+
+  Arguments:
+      label_mode: one of "fine", "coarse".
+
+  Returns:
+      Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+
+  Raises:
+      ValueError: in case of invalid `label_mode`.
+  """
+  if label_mode not in ['fine', 'coarse']:
+    raise ValueError('label_mode must be one of "fine" "coarse".')
+
+  dirname = 'cifar-100-python'
+  origin = 'http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
+  path = get_file(dirname, origin=origin, untar=True)
+
+  fpath = os.path.join(path, 'train')
+  x_train, y_train = load_batch(fpath, label_key=label_mode + '_labels')
+
+  fpath = os.path.join(path, 'test')
+  x_test, y_test = load_batch(fpath, label_key=label_mode + '_labels')
+
+  y_train = np.reshape(y_train, (len(y_train), 1))
+  y_test = np.reshape(y_test, (len(y_test), 1))
+
+  if K.image_data_format() == 'channels_last':
+    x_train = x_train.transpose(0, 2, 3, 1)
+    x_test = x_test.transpose(0, 2, 3, 1)
+
+  return (x_train, y_train), (x_test, y_test)
diff --git a/tensorflow/contrib/keras/python/keras/datasets/imdb.py b/tensorflow/contrib/keras/python/keras/datasets/imdb.py
new file mode 100644
index 0000000000..2688e8bede
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/datasets/imdb.py
@@ -0,0 +1,150 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""IMDB movie review sentiment classification dataset.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+import numpy as np
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+def load_data(path='imdb.npz',
+              num_words=None,
+              skip_top=0,
+              maxlen=None,
+              seed=113,
+              start_char=1,
+              oov_char=2,
+              index_from=3):
+  """Loads the IMDB dataset.
+
+  Arguments:
+      path: where to cache the data (relative to `~/.keras/dataset`).
+      num_words: max number of words to include. Words are ranked
+          by how often they occur (in the training set) and only
+          the most frequent words are kept
+      skip_top: skip the top N most frequently occuring words
+          (which may not be informative).
+      maxlen: truncate sequences after this length.
+      seed: random seed for sample shuffling.
+      start_char: The start of a sequence will be marked with this character.
+          Set to 1 because 0 is usually the padding character.
+      oov_char: words that were cut out because of the `num_words`
+          or `skip_top` limit will be replaced with this character.
+      index_from: index actual words with this index and higher.
+
+  Returns:
+      Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+
+  Raises:
+      ValueError: in case `maxlen` is so low
+          that no input sequence could be kept.
+
+  Note that the 'out of vocabulary' character is only used for
+  words that were present in the training set but are not included
+  because they're not making the `num_words` cut here.
+  Words that were not seen in the training set but are in the test set
+  have simply been skipped.
+  """
+  path = get_file(
+      path, origin='https://s3.amazonaws.com/text-datasets/imdb.npz')
+  f = np.load(path)
+  x_train = f['x_train']
+  labels_train = f['y_train']
+  x_test = f['x_test']
+  labels_test = f['y_test']
+  f.close()
+
+  np.random.seed(seed)
+  np.random.shuffle(x_train)
+  np.random.seed(seed)
+  np.random.shuffle(labels_train)
+
+  np.random.seed(seed * 2)
+  np.random.shuffle(x_test)
+  np.random.seed(seed * 2)
+  np.random.shuffle(labels_test)
+
+  xs = np.concatenate([x_train, x_test])
+  labels = np.concatenate([labels_train, labels_test])
+
+  if start_char is not None:
+    xs = [[start_char] + [w + index_from for w in x] for x in xs]
+  elif index_from:
+    xs = [[w + index_from for w in x] for x in xs]
+
+  if maxlen:
+    new_xs = []
+    new_labels = []
+    for x, y in zip(xs, labels):
+      if len(x) < maxlen:
+        new_xs.append(x)
+        new_labels.append(y)
+    xs = new_xs
+    labels = new_labels
+  if not xs:
+    raise ValueError('After filtering for sequences shorter than maxlen=' + str(
+        maxlen) + ', no sequence was kept. '
+                     'Increase maxlen.')
+  if not num_words:
+    num_words = max([max(x) for x in xs])
+
+  # by convention, use 2 as OOV word
+  # reserve 'index_from' (=3 by default) characters:
+  # 0 (padding), 1 (start), 2 (OOV)
+  if oov_char is not None:
+    xs = [[oov_char if (w >= num_words or w < skip_top) else w for w in x]
+          for x in xs]
+  else:
+    new_xs = []
+    for x in xs:
+      nx = []
+      for w in x:
+        if w >= num_words or w < skip_top:
+          nx.append(w)
+      new_xs.append(nx)
+    xs = new_xs
+
+  x_train = np.array(xs[:len(x_train)])
+  y_train = np.array(labels[:len(x_train)])
+
+  x_test = np.array(xs[len(x_train):])
+  y_test = np.array(labels[len(x_train):])
+
+  return (x_train, y_train), (x_test, y_test)
+
+
+def get_word_index(path='imdb_word_index.json'):
+  """Retrieves the dictionary mapping word indices back to words.
+
+  Arguments:
+      path: where to cache the data (relative to `~/.keras/dataset`).
+
+  Returns:
+      The word index dictionary.
+  """
+  path = get_file(
+      path,
+      origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json')
+  f = open(path)
+  data = json.load(f)
+  f.close()
+  return data
diff --git a/tensorflow/contrib/keras/python/keras/datasets/mnist.py b/tensorflow/contrib/keras/python/keras/datasets/mnist.py
new file mode 100644
index 0000000000..aaced003d0
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/datasets/mnist.py
@@ -0,0 +1,44 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""MNIST handwritten digits classification dataset.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+def load_data(path='mnist.npz'):
+  """Loads the MNIST dataset.
+
+  Arguments:
+      path: path where to cache the dataset locally
+          (relative to ~/.keras/datasets).
+
+  Returns:
+      Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+  """
+  path = get_file(
+      path, origin='https://s3.amazonaws.com/img-datasets/mnist.npz')
+  f = np.load(path)
+  x_train = f['x_train']
+  y_train = f['y_train']
+  x_test = f['x_test']
+  y_test = f['y_test']
+  f.close()
+  return (x_train, y_train), (x_test, y_test)
diff --git a/tensorflow/contrib/keras/python/keras/datasets/reuters.py b/tensorflow/contrib/keras/python/keras/datasets/reuters.py
new file mode 100644
index 0000000000..81e940a846
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/datasets/reuters.py
@@ -0,0 +1,136 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Reuters newswire topic classification dataset.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+import numpy as np
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+
+def load_data(path='reuters.npz',
+              num_words=None,
+              skip_top=0,
+              maxlen=None,
+              test_split=0.2,
+              seed=113,
+              start_char=1,
+              oov_char=2,
+              index_from=3):
+  """Loads the Reuters newswire classification dataset.
+
+  Arguments:
+      path: where to cache the data (relative to `~/.keras/dataset`).
+      num_words: max number of words to include. Words are ranked
+          by how often they occur (in the training set) and only
+          the most frequent words are kept
+      skip_top: skip the top N most frequently occuring words
+          (which may not be informative).
+      maxlen: truncate sequences after this length.
+      test_split: Fraction of the dataset to be used as test data.
+      seed: random seed for sample shuffling.
+      start_char: The start of a sequence will be marked with this character.
+          Set to 1 because 0 is usually the padding character.
+      oov_char: words that were cut out because of the `num_words`
+          or `skip_top` limit will be replaced with this character.
+      index_from: index actual words with this index and higher.
+
+  Returns:
+      Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+
+  Note that the 'out of vocabulary' character is only used for
+  words that were present in the training set but are not included
+  because they're not making the `num_words` cut here.
+  Words that were not seen in the training set but are in the test set
+  have simply been skipped.
+  """
+  path = get_file(
+      path, origin='https://s3.amazonaws.com/text-datasets/reuters.npz')
+  npzfile = np.load(path)
+  xs = npzfile['x']
+  labels = npzfile['y']
+  npzfile.close()
+
+  np.random.seed(seed)
+  np.random.shuffle(xs)
+  np.random.seed(seed)
+  np.random.shuffle(labels)
+
+  if start_char is not None:
+    xs = [[start_char] + [w + index_from for w in x] for x in xs]
+  elif index_from:
+    xs = [[w + index_from for w in x] for x in xs]
+
+  if maxlen:
+    new_xs = []
+    new_labels = []
+    for x, y in zip(xs, labels):
+      if len(x) < maxlen:
+        new_xs.append(x)
+        new_labels.append(y)
+    xs = new_xs
+    labels = new_labels
+
+  if not num_words:
+    num_words = max([max(x) for x in xs])
+
+  # by convention, use 2 as OOV word
+  # reserve 'index_from' (=3 by default) characters:
+  # 0 (padding), 1 (start), 2 (OOV)
+  if oov_char is not None:
+    xs = [[oov_char if (w >= num_words or w < skip_top) else w for w in x]
+          for x in xs]
+  else:
+    new_xs = []
+    for x in xs:
+      nx = []
+      for w in x:
+        if w >= num_words or w < skip_top:
+          nx.append(w)
+      new_xs.append(nx)
+    xs = new_xs
+
+  x_train = np.array(xs[:int(len(xs) * (1 - test_split))])
+  y_train = np.array(labels[:int(len(xs) * (1 - test_split))])
+
+  x_test = np.array(xs[int(len(xs) * (1 - test_split)):])
+  y_test = np.array(labels[int(len(xs) * (1 - test_split)):])
+
+  return (x_train, y_train), (x_test, y_test)
+
+
+def get_word_index(path='reuters_word_index.json'):
+  """Retrieves the dictionary mapping word indices back to words.
+
+  Arguments:
+      path: where to cache the data (relative to `~/.keras/dataset`).
+
+  Returns:
+      The word index dictionary.
+  """
+  path = get_file(
+      path,
+      origin='https://s3.amazonaws.com/text-datasets/reuters_word_index.json')
+  f = open(path)
+  data = json.load(f)
+  f.close()
+  return data
diff --git a/tensorflow/contrib/keras/python/keras/engine/__init__.py b/tensorflow/contrib/keras/python/keras/engine/__init__.py
new file mode 100644
index 0000000000..0a1dc3dd2d
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/engine/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Keras Engine: graph topology and training loop functionality.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.engine.topology import get_source_inputs
+from tensorflow.contrib.keras.python.keras.engine.topology import Input
+from tensorflow.contrib.keras.python.keras.engine.topology import InputLayer
+from tensorflow.contrib.keras.python.keras.engine.topology import InputSpec
+from tensorflow.contrib.keras.python.keras.engine.topology import Layer
+from tensorflow.contrib.keras.python.keras.engine.training import Model
+
+
+# Note: topology.Node is an internal class,
+# it isn't meant to be used by Keras users.
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology.py b/tensorflow/contrib/keras/python/keras/engine/topology.py
new file mode 100644
index 0000000000..0f506ff0a4
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/engine/topology.py
@@ -0,0 +1,2952 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Base layer code and base model (Container) code.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import inspect
+import json
+import os
+import re
+import warnings
+
+import numpy as np
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras.utils import conv_utils
+from tensorflow.contrib.keras.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
+from tensorflow.contrib.keras.python.keras.utils.layer_utils import print_summary as print_layer_summary
+from tensorflow.python.framework import tensor_shape
+
+
+# pylint: disable=g-import-not-at-top
+try:
+  import h5py
+except ImportError:
+  h5py = None
+
+try:
+  import yaml
+except ImportError:
+  yaml = None
+# pylint: enable=g-import-not-at-top
+
+
+class InputSpec(object):
+  """Specifies the ndim, dtype and shape of every input to a layer.
+
+  Every layer should expose (if appropriate) an `input_spec` attribute:
+  a list of instances of InputSpec (one per input tensor).
+
+  A None entry in a shape is compatible with any dimension,
+  a None shape is compatible with any shape.
+
+  Arguments:
+      dtype: Expected datatype of the input.
+      shape: Shape tuple, expected shape of the input
+          (may include None for unchecked axes).
+      ndim: Integer, expected rank of the input.
+      max_ndim: Integer, maximum rank of the input.
+      min_ndim: Integer, minimum rank of the input.
+      axes: Dictionary mapping integer axes to
+          a specific dimension value.
+  """
+
+  def __init__(self,
+               dtype=None,
+               shape=None,
+               ndim=None,
+               max_ndim=None,
+               min_ndim=None,
+               axes=None):
+    self.dtype = dtype
+    self.shape = shape
+    if shape is not None:
+      self.ndim = len(shape)
+    else:
+      self.ndim = ndim
+    self.max_ndim = max_ndim
+    self.min_ndim = min_ndim
+    self.axes = axes or {}
+
+
+class Node(object):
+  """A `Node` describes the connectivity between two layers.
+
+  Each time a layer is connected to some new input,
+  a node is added to `layer.inbound_nodes`.
+  Each time the output of a layer is used by another layer,
+  a node is added to `layer.outbound_nodes`.
+
+  Arguments:
+      outbound_layer: the layer that takes
+          `input_tensors` and turns them into `output_tensors`
+          (the node gets created when the `call`
+          method of the layer was called).
+      inbound_layers: a list of layers, the same length as `input_tensors`,
+          the layers from where `input_tensors` originate.
+      node_indices: a list of integers, the same length as `inbound_layers`.
+          `node_indices[i]` is the origin node of `input_tensors[i]`
+          (necessary since each inbound layer might have several nodes,
+          e.g. if the layer is being shared with a different data stream).
+      tensor_indices: a list of integers,
+          the same length as `inbound_layers`.
+          `tensor_indices[i]` is the index of `input_tensors[i]` within the
+          output of the inbound layer
+          (necessary since each inbound layer might
+          have multiple tensor outputs, with each one being
+          independently manipulable).
+      input_tensors: list of input tensors.
+      output_tensors: list of output tensors.
+      input_masks: list of input masks (a mask can be a tensor, or None).
+      output_masks: list of output masks (a mask can be a tensor, or None).
+      arguments: dictionary of keyword arguments that were passed to the
+          `call` method of the layer at the call that created the node.
+
+  `node_indices` and `tensor_indices` are basically fine-grained coordinates
+  describing the origin of the `input_tensors`.
+
+  A node from layer A to layer B is added to:
+      A.outbound_nodes
+      B.inbound_nodes
+  """
+
+  def __init__(self,
+               outbound_layer,
+               inbound_layers,
+               node_indices,
+               tensor_indices,
+               input_tensors,
+               output_tensors,
+               input_masks,
+               output_masks,
+               arguments=None):
+    # Layer instance (NOT a list).
+    # this is the layer that takes a list of input tensors
+    # and turns them into a list of output tensors.
+    # the current node will be added to
+    # the inbound_nodes of outbound_layer.
+    self.outbound_layer = outbound_layer
+
+    # The following 3 properties describe where
+    # the input tensors come from: which layers,
+    # and for each layer, which node and which
+    # tensor output of each node.
+
+    # List of layer instances.
+    self.inbound_layers = inbound_layers
+    # List of integers, 1:1 mapping with inbound_layers.
+    self.node_indices = node_indices
+    # List of integers, 1:1 mapping with inbound_layers.
+    self.tensor_indices = tensor_indices
+
+    # Following 2 properties:
+    # tensor inputs and outputs of outbound_layer.
+
+    # List of tensors. 1:1 mapping with inbound_layers.
+    self.input_tensors = input_tensors
+    # List of tensors, created by outbound_layer.call().
+    self.output_tensors = output_tensors
+
+    # Following 2 properties: input and output masks.
+    # List of tensors, 1:1 mapping with input_tensor.
+    self.input_masks = input_masks
+    # List of tensors, created by outbound_layer.compute_mask().
+    self.output_masks = output_masks
+
+    # Following 2 properties: input and output shapes.
+
+    # List of shape tuples, shapes of input_tensors.
+    self.input_shapes = [K.int_shape(x) for x in input_tensors]
+    # List of shape tuples, shapes of output_tensors.
+    self.output_shapes = [K.int_shape(x) for x in output_tensors]
+
+    # Optional keyword arguments to layer's `call`.
+    self.arguments = arguments
+
+    # Add nodes to all layers involved.
+    for layer in inbound_layers:
+      if layer is not None:
+        layer.outbound_nodes.append(self)
+    outbound_layer.inbound_nodes.append(self)
+
+  def get_config(self):
+    inbound_names = []
+    for layer in self.inbound_layers:
+      if layer:
+        inbound_names.append(layer.name)
+      else:
+        inbound_names.append(None)
+    return {
+        'outbound_layer':
+            self.outbound_layer.name if self.outbound_layer else None,
+        'inbound_layers':
+            inbound_names,
+        'node_indices':
+            self.node_indices,
+        'tensor_indices':
+            self.tensor_indices
+    }
+
+
+class Layer(object):
+  """Abstract base layer class.
+
+  # Properties
+      name: String, must be unique within a model.
+      input_spec: List of InputSpec class instances
+          each entry describes one required input:
+              - ndim
+              - dtype
+          A layer with `n` input tensors must have
+          an `input_spec` of length `n`.
+      trainable: Boolean, whether the layer weights
+          will be updated during training.
+      uses_learning_phase: Whether any operation
+          of the layer uses `K.in_training_phase()`
+          or `K.in_test_phase()`.
+      input_shape: Shape tuple. Provided for convenience,
+          but note that there may be cases in which this
+          attribute is ill-defined (e.g. a shared layer
+          with multiple input shapes), in which case
+          requesting `input_shape` will raise an Exception.
+          Prefer using `layer.get_input_shape_for(input_shape)`,
+          or `layer.get_input_shape_at(node_index)`.
+      output_shape: Shape tuple. See above.
+      inbound_nodes: List of nodes.
+      outbound_nodes: List of nodes.
+      input, output: Input/output tensor(s). Note that if the layer is used
+          more than once (shared layer), this is ill-defined
+          and will raise an exception. In such cases, use
+          `layer.get_input_at(node_index)`.
+      input_mask, output_mask: Same as above, for masks.
+      trainable_weights: List of variables.
+      non_trainable_weights: List of variables.
+      weights: The concatenation of the lists trainable_weights and
+          non_trainable_weights (in this order).
+      constraints: Dict mapping weights to constraints.
+
+  # Methods
+      call(x, mask=None): Where the layer's logic lives.
+      __call__(x, mask=None): Wrapper around the layer logic (`call`).
+          If x is a Keras tensor:
+              - Connect current layer with last layer from tensor:
+                  `self._add_inbound_node(last_layer)`
+              - Add layer to tensor history
+          If layer is not built:
+              - Build from inputs shape
+      get_weights()
+      set_weights(weights)
+      get_config()
+      count_params()
+      _compute_output_shape(input_shape)
+      compute_mask(x, mask)
+      get_input_at(node_index)
+      get_output_at(node_index)
+      get_input_shape_at(node_index)
+      get_output_shape_at(node_index)
+      get_input_mask_at(node_index)
+      get_output_mask_at(node_index)
+
+  # Class Methods
+      from_config(config)
+
+  # Internal methods:
+      build(input_shape)
+      _add_inbound_node(layer, index=0)
+      assert_input_compatibility()
+  """
+
+  def __init__(self, **kwargs):
+    self.input_spec = None
+    self.supports_masking = False
+
+    # These properties will be set upon call of self.build()
+    self._trainable_weights = []
+    self._non_trainable_weights = []
+    self._constraints = {}  # dict {tensor: constraint instance}
+    self.built = False
+
+    # These lists will be filled via successive calls
+    # to self._add_inbound_node().
+    self.inbound_nodes = []
+    self.outbound_nodes = []
+
+    # These properties should be set by the user via keyword arguments.
+    # note that 'dtype', 'input_shape' and 'batch_input_shape'
+    # are only applicable to input layers: do not pass these keywords
+    # to non-input layers.
+    allowed_kwargs = {
+        'input_shape', 'batch_input_shape', 'batch_size', 'dtype', 'name',
+        'trainable', 'weights'
+    }
+    for kwarg in kwargs:
+      if kwarg not in allowed_kwargs:
+        raise TypeError('Keyword argument not understood:', kwarg)
+    name = kwargs.get('name')
+    if not name:
+      prefix = self.__class__.__name__
+      name = _to_snake_case(prefix) + '_' + str(K.get_uid(prefix))
+    self.name = name
+
+    self.trainable = kwargs.get('trainable', True)
+    if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
+      # In this case we will later create an input layer
+      # to insert before the current layer
+      if 'batch_input_shape' in kwargs:
+        batch_input_shape = tuple(kwargs['batch_input_shape'])
+      elif 'input_shape' in kwargs:
+        if 'batch_size' in kwargs:
+          batch_size = kwargs['batch_size']
+        else:
+          batch_size = None
+        batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
+      self.batch_input_shape = batch_input_shape
+      dtype = kwargs.get('dtype', K.floatx())
+      self.dtype = dtype
+    if 'weights' in kwargs:
+      self._initial_weights = kwargs['weights']
+    else:
+      self._initial_weights = None
+
+  @property
+  def constraints(self):
+    return self._constraints
+
+  @constraints.setter
+  def constraints(self, constraints):
+    self._constraints = constraints
+
+  @property
+  def trainable_weights(self):
+    trainable = getattr(self, 'trainable', True)
+    if trainable:
+      return self._trainable_weights
+    else:
+      return []
+
+  @trainable_weights.setter
+  def trainable_weights(self, weights):
+    self._trainable_weights = weights
+
+  @property
+  def non_trainable_weights(self):
+    trainable = getattr(self, 'trainable', True)
+    if not trainable:
+      return self._trainable_weights + self._non_trainable_weights
+    else:
+      return self._non_trainable_weights
+
+  @non_trainable_weights.setter
+  def non_trainable_weights(self, weights):
+    self._non_trainable_weights = weights
+
+  def add_weight(self,
+                 shape,
+                 initializer,
+                 name=None,
+                 trainable=True,
+                 regularizer=None,
+                 constraint=None):
+    """Adds a weight variable to the layer.
+
+    Arguments:
+        shape: The shape tuple of the weight.
+        initializer: An Initializer instance (callable).
+        name: String, the name for the weight variable.
+        trainable: A boolean, whether the weight should
+            be trained via backprop or not (assuming
+            that the layer itself is also trainable).
+        regularizer: An optional Regularizer instance.
+        constraint: An optional Constraint instance.
+
+    Returns:
+        The created weight variable.
+    """
+    shape = tuple(tensor_shape.TensorShape(shape).as_list())
+    initializer = initializers.get(initializer)
+    weight = K.variable(initializer(shape), dtype=K.floatx(), name=name)
+    if regularizer is not None:
+      self.add_loss(regularizer(weight))
+    if constraint is not None:
+      self.constraints[weight] = constraint
+    if trainable:
+      self._trainable_weights.append(weight)
+    else:
+      self._non_trainable_weights.append(weight)
+    return weight
+
+  def assert_input_compatibility(self, inputs):
+    """Checks compatibility between the layer and provided inputs.
+
+    This checks that the tensor(s) `input`
+    verify the input assumptions of the layer
+    (if any). If not, exceptions are raised.
+
+    Arguments:
+        inputs: input tensor or list of input tensors.
+
+    Raises:
+        ValueError: in case of mismatch between
+            the provided inputs and the expectations of the layer.
+    """
+    if not self.input_spec:
+      return
+    if not isinstance(self.input_spec, (list, tuple)):
+      input_spec = _to_list(self.input_spec)
+    else:
+      input_spec = self.input_spec
+    inputs = _to_list(inputs)
+    if len(inputs) != len(input_spec):
+      raise ValueError('Layer ' + self.name + ' expects ' + str(
+          len(input_spec)) + ' inputs, '
+                       'but it received ' + str(len(inputs)) +
+                       ' input tensors. Input received: ' + str(input))
+    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
+      if spec is None:
+        continue
+
+      # Check ndim.
+      if spec.ndim is not None:
+        if K.ndim(x) != spec.ndim:
+          raise ValueError('Input ' + str(
+              input_index) + ' is incompatible with layer ' + self.name +
+                           ': expected ndim=' + str(
+                               spec.ndim) + ', found ndim=' + str(K.ndim(x)))
+      if spec.max_ndim is not None:
+        ndim = K.ndim(x)
+        if ndim is not None and ndim > spec.max_ndim:
+          raise ValueError('Input ' + str(
+              input_index) + ' is incompatible with layer ' + self.name +
+                           ': expected max_ndim=' + str(spec.max_ndim) +
+                           ', found ndim=' + str(K.ndim(x)))
+      if spec.min_ndim is not None:
+        ndim = K.ndim(x)
+        if ndim is not None and ndim < spec.min_ndim:
+          raise ValueError('Input ' + str(
+              input_index) + ' is incompatible with layer ' + self.name +
+                           ': expected min_ndim=' + str(spec.min_ndim) +
+                           ', found ndim=' + str(K.ndim(x)))
+      # Check dtype.
+      if spec.dtype is not None:
+        if K.dtype(x) != spec.dtype:
+          raise ValueError('Input ' + str(
+              input_index) + ' is incompatible with layer ' + self.name +
+                           ': expected dtype=' + str(
+                               spec.dtype) + ', found dtype=' + str(K.dtype(x)))
+      # Check specific shape axes.
+      if spec.axes:
+        try:
+          x_shape = K.int_shape(x)
+        except TypeError:
+          x_shape = None
+        if x_shape is not None:
+          for axis, value in spec.axes.items():
+            if hasattr(value, 'value'):
+              value = value.value
+            if value is not None and x_shape[int(axis)] not in {value, None}:
+              raise ValueError(
+                  'Input ' + str(input_index) + ' is incompatible with layer ' +
+                  self.name + ': expected axis ' + str(
+                      axis) + ' of input shape to have '
+                  'value ' + str(value) + ' but got shape ' + str(x_shape))
+      # Check shape.
+      if spec.shape is not None:
+        try:
+          x_shape = K.int_shape(x)
+        except TypeError:
+          x_shape = None
+        if x_shape is not None:
+          for spec_dim, dim in zip(spec.shape, x_shape):
+            if hasattr(spec_dim, 'value'):
+              spec_dim = spec_dim.value
+            if spec_dim is not None and dim is not None:
+              if spec_dim != dim:
+                raise ValueError('Input ' + str(
+                    input_index) + ' is incompatible with layer ' + self.name +
+                                 ': expected shape=' + str(spec.shape) +
+                                 ', found shape=' + str(x_shape))
+
+  def call(self, inputs):
+    """This is where the layer's logic lives.
+
+    Arguments:
+        inputs: input tensor, or list/tuple of input tensors.
+
+    Returns:
+        A tensor or list/tuple of tensors.
+    """
+    return inputs
+
+  def __call__(self, inputs, **kwargs):
+    """Wrapper around self.call(), for handling internal references.
+
+    If a Keras tensor is passed:
+        - We call self._add_inbound_node().
+        - If necessary, we `build` the layer to match
+            the shape of the input(s).
+        - We update the _keras_history of the output tensor(s)
+            with the current layer.
+            This is done as part of _add_inbound_node().
+
+    Arguments:
+        inputs: Can be a tensor or list/tuple of tensors.
+        **kwargs: Additional keyword arguments to be passed to `call()`.
+
+    Returns:
+        Output of the layer's `call` method.
+
+    Raises:
+        ValueError: in case the layer is missing shape information
+            for its `build` call.
+    """
+    with K.name_scope(self.name):
+      # Handle laying building (weight creating, input spec locking).
+      if not self.built:
+        # Raise exceptions in case the input is not compatible
+        # with the input_spec specified in the layer constructor.
+        self.assert_input_compatibility(inputs)
+
+        # Collect input shapes to build layer.
+        input_shapes = []
+        for x_elem in _to_list(inputs):
+          input_shapes.append(K.int_shape(x_elem))
+        if len(input_shapes) == 1:
+          self.build(input_shapes[0])
+        else:
+          self.build(input_shapes)
+        self.built = True
+
+        # Load weights that were specified at layer instantiation.
+        if self._initial_weights is not None:
+          self.set_weights(self._initial_weights)
+
+      # Raise exceptions in case the input is not compatible
+      # with the input_spec set at build time.
+      self.assert_input_compatibility(inputs)
+
+      # Handle mask propagation.
+      previous_mask = _collect_previous_mask(inputs)
+      if not _is_all_none(previous_mask):
+        # The previous layer generated a mask.
+        if 'mask' in inspect.getargspec(self.call).args:
+          if 'mask' not in kwargs:
+            # If mask is explicitly passed to __call__,
+            # we should override the default mask.
+            kwargs['mask'] = previous_mask
+
+      # Actually call the layer, collecting output(s), mask(s), and shape(s).
+      output = self.call(inputs, **kwargs)
+      output_mask = self.compute_mask(inputs, previous_mask)
+
+      # Add an inbound node to the layer, so that it keeps track
+      # of the call and of all new variables created during the call.
+      # This also updates the layer history of the output tensor(s).
+      # If the input tensor(s) had not previous Keras history,
+      # this does nothing.
+      self._add_inbound_node(
+          input_tensors=inputs,
+          output_tensors=output,
+          input_masks=previous_mask,
+          output_masks=output_mask,
+          arguments=kwargs)
+
+      # Apply activity regularizer if any:
+      if hasattr(
+          self,
+          'activity_regularizer') and self.activity_regularizer is not None:
+        regularization_losses = [
+            self.activity_regularizer(x) for x in _to_list(output)
+        ]
+        self.add_loss(regularization_losses, _to_list(inputs))
+    return output
+
+  def _add_inbound_node(self,
+                        input_tensors,
+                        output_tensors,
+                        input_masks,
+                        output_masks,
+                        arguments=None):
+    """Internal method to create an inbound node for the layer.
+
+    Arguments:
+        input_tensors: list of input tensors.
+        output_tensors: list of output tensors.
+        input_masks: list of input masks (a mask can be a tensor, or None).
+        output_masks: list of output masks (a mask can be a tensor, or None).
+        arguments: dictionary of keyword arguments that were passed to the
+            `call` method of the layer at the call that created the node.
+    """
+    input_tensors = _to_list(input_tensors)
+    output_tensors = _to_list(output_tensors)
+    input_masks = _to_list(input_masks)
+    output_masks = _to_list(output_masks)
+
+    # Collect input tensor(s) coordinates.
+    inbound_layers = []
+    node_indices = []
+    tensor_indices = []
+    for x in input_tensors:
+      if hasattr(x, '_keras_history'):
+        inbound_layer, node_index, tensor_index = x._keras_history
+        inbound_layers.append(inbound_layer)
+        node_indices.append(node_index)
+        tensor_indices.append(tensor_index)
+      else:
+        inbound_layers.append(None)
+        node_indices.append(None)
+        tensor_indices.append(None)
+
+    # Create node, add it to inbound nodes.
+    Node(
+        self,
+        inbound_layers=inbound_layers,
+        node_indices=node_indices,
+        tensor_indices=tensor_indices,
+        input_tensors=input_tensors,
+        output_tensors=output_tensors,
+        input_masks=input_masks,
+        output_masks=output_masks,
+        arguments=arguments)
+
+    # Update tensor history and `_uses_learning_phase`.
+    for i in range(len(output_tensors)):
+      uses_lp = any(
+          [getattr(x, '_uses_learning_phase', False) for x in input_tensors])
+      uses_lp = getattr(self, 'uses_learning_phase', False) or uses_lp
+      output_tensors[i]._uses_learning_phase = getattr(
+          output_tensors[i], '_uses_learning_phase', False) or uses_lp
+      output_tensors[i]._keras_history = (self, len(self.inbound_nodes) - 1, i)
+
+  def _compute_output_shape(self, input_shape):
+    """Computes the output shape of the layer.
+
+    Assumes that the layer will be built
+    to match that input shape provided.
+
+    Arguments:
+        input_shape: Shape tuple (tuple of integers)
+            or list of shape tuples (one per output tensor of the layer).
+            Shape tuples can include None for free dimensions,
+            instead of an integer.
+
+    Returns:
+        An input shape tuple.
+    """
+    if isinstance(input_shape, list):
+      return [tensor_shape.TensorShape(shape) for shape in input_shape]
+    else:
+      return tensor_shape.TensorShape(input_shape)
+
+  def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
+    """Computes an output mask tensor.
+
+    Arguments:
+        inputs: Tensor or list of tensors.
+        mask: Tensor or list of tensors.
+
+    Returns:
+        None or a tensor (or list of tensors,
+            one per output tensor of the layer).
+    """
+    if not self.supports_masking:
+      if mask is not None:
+        if isinstance(mask, list):
+          if any(m is not None for m in mask):
+            raise TypeError('Layer ' + self.name + ' does not support masking, '
+                            'but was passed an input_mask: ' + str(mask))
+        else:
+          raise TypeError('Layer ' + self.name + ' does not support masking, '
+                          'but was passed an input_mask: ' + str(mask))
+      # masking not explicitly supported: return None as mask
+      return None
+    # if masking is explictly supported, by default
+    # carry over the input mask
+    return mask
+
+  def build(self, input_shape):  # pylint: disable=unused-argument
+    """Creates the layer weights.
+
+    Must be implemented on all layers that have weights.
+
+    Arguments:
+        input_shape: Keras tensor (future input to layer)
+            or list/tuple of Keras tensors to reference
+            for weight shape computations.
+    """
+    self.built = True
+
+  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
+    """Retrieves an attribute (e.g. input_tensors) from a node.
+
+    This is used to implement the methods:
+        - get_input_shape_at
+        - get_output_shape_at
+        - get_input_at
+        etc...
+
+    Arguments:
+        node_index: Integer index of the node from which
+            to retrieve the attribute.
+        attr: Exact node attribute name.
+        attr_name: Human-readable attribute name, for error messages.
+
+    Returns:
+        The layer's attribute `attr` at the node of index `node_index`.
+
+    Raises:
+        RuntimeError: If the layer has no inbound nodes.
+        ValueError: If the index is does not match any node.
+    """
+    if not self.inbound_nodes:
+      raise RuntimeError('The layer has never been called '
+                         'and thus has no defined ' + attr_name + '.')
+    if not len(self.inbound_nodes) > node_index:
+      raise ValueError('Asked to get ' + attr_name + ' at node ' + str(
+          node_index) + ', but the layer has only ' + str(
+              len(self.inbound_nodes)) + ' inbound nodes.')
+    values = getattr(self.inbound_nodes[node_index], attr)
+    if len(values) == 1:
+      return values[0]
+    else:
+      return values
+
+  def get_input_shape_at(self, node_index):
+    """Retrieves the input shape(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A shape tuple
+        (or list of shape tuples if the layer has multiple inputs).
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_shapes',
+                                             'input shape')
+
+  def get_output_shape_at(self, node_index):
+    """Retrieves the output shape(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A shape tuple
+        (or list of shape tuples if the layer has multiple outputs).
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_shapes',
+                                             'output shape')
+
+  def get_input_at(self, node_index):
+    """Retrieves the input tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A tensor (or list of tensors if the layer has multiple inputs).
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_tensors',
+                                             'input')
+
+  def get_output_at(self, node_index):
+    """Retrieves the output tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A tensor (or list of tensors if the layer has multiple outputs).
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_tensors',
+                                             'output')
+
+  def get_input_mask_at(self, node_index):
+    """Retrieves the input mask tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A mask tensor
+        (or list of tensors if the layer has multiple inputs).
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_masks',
+                                             'input mask')
+
+  def get_output_mask_at(self, node_index):
+    """Retrieves the output mask tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A mask tensor
+        (or list of tensors if the layer has multiple outputs).
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_masks',
+                                             'output mask')
+
+  @property
+  def input(self):
+    """Retrieves the input tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one inbound node,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Input tensor or list of input tensors.
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+    """
+    if len(self.inbound_nodes) > 1:
+      raise AttributeError('Layer ' + self.name +
+                           ' has multiple inbound nodes, '
+                           'hence the notion of "layer input" '
+                           'is ill-defined. '
+                           'Use `get_input_at(node_index)` instead.')
+    elif not self.inbound_nodes:
+      raise AttributeError('Layer ' + self.name +
+                           ' is not connected, no input to return.')
+    return self._get_node_attribute_at_index(0, 'input_tensors', 'input')
+
+  @property
+  def output(self):
+    """Retrieves the output tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one inbound node,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Output tensor or list of output tensors.
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+    """
+    if not self.inbound_nodes:
+      raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
+    if len(self.inbound_nodes) > 1:
+      raise AttributeError('Layer ' + self.name +
+                           ' has multiple inbound nodes, '
+                           'hence the notion of "layer output" '
+                           'is ill-defined. '
+                           'Use `get_output_at(node_index)` instead.')
+    return self._get_node_attribute_at_index(0, 'output_tensors', 'output')
+
+  @property
+  def input_mask(self):
+    """Retrieves the input mask tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one inbound node,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Input mask tensor (potentially None) or list of input
+        mask tensors.
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+    """
+    if len(self.inbound_nodes) != 1:
+      raise AttributeError('Layer ' + self.name +
+                           ' has multiple inbound nodes, ' +
+                           'hence the notion of "layer input mask" '
+                           'is ill-defined. '
+                           'Use `get_input_mask_at(node_index)` '
+                           'instead.')
+    return self._get_node_attribute_at_index(0, 'input_masks', 'input mask')
+
+  @property
+  def output_mask(self):
+    """Retrieves the output mask tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one inbound node,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Output mask tensor (potentially None) or list of output
+        mask tensors.
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+    """
+    if len(self.inbound_nodes) != 1:
+      raise AttributeError('Layer ' + self.name +
+                           ' has multiple inbound nodes, '
+                           'hence the notion of "layer output mask" '
+                           'is ill-defined. '
+                           'Use `get_output_mask_at(node_index)` '
+                           'instead.')
+    return self._get_node_attribute_at_index(0, 'output_masks', 'output mask')
+
+  @property
+  def input_shape(self):
+    """Retrieves the input shape tuple(s) of a layer.
+
+    Only applicable if the layer has exactly one inbound node,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Input shape tuple
+        (or list of input shape tuples, one tuple per input tensor).
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+    """
+    if not self.inbound_nodes:
+      raise AttributeError('The layer has never been called '
+                           'and thus has no defined input shape.')
+    all_input_shapes = set(
+        [str(node.input_shapes) for node in self.inbound_nodes])
+    if len(all_input_shapes) == 1:
+      input_shapes = self.inbound_nodes[0].input_shapes
+      if len(input_shapes) == 1:
+        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
+      else:
+        return [
+            tuple(tensor_shape.TensorShape(shape).as_list())
+            for shape in input_shapes
+        ]
+    else:
+      raise AttributeError('The layer "' + str(self.name) +
+                           ' has multiple inbound nodes, '
+                           'with different input shapes. Hence '
+                           'the notion of "input shape" is '
+                           'ill-defined for the layer. '
+                           'Use `get_input_shape_at(node_index)` '
+                           'instead.')
+
+  @property
+  def output_shape(self):
+    """Retrieves the output shape tuple(s) of a layer.
+
+    Only applicable if the layer has one inbound node,
+    or if all inbound nodes have the same output shape.
+
+    Returns:
+        Output shape tuple
+        (or list of input shape tuples, one tuple per output tensor).
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+    """
+    if not self.inbound_nodes:
+      raise AttributeError('The layer has never been called '
+                           'and thus has no defined output shape.')
+    all_output_shapes = set(
+        [str(node.output_shapes) for node in self.inbound_nodes])
+    if len(all_output_shapes) == 1:
+      output_shapes = self.inbound_nodes[0].output_shapes
+      if len(output_shapes) == 1:
+        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
+      else:
+        return [
+            tuple(tensor_shape.TensorShape(shape).as_list())
+            for shape in output_shapes
+        ]
+    else:
+      raise AttributeError('The layer "' + str(self.name) +
+                           ' has multiple inbound nodes, '
+                           'with different output shapes. Hence '
+                           'the notion of "output shape" is '
+                           'ill-defined for the layer. '
+                           'Use `get_output_shape_at(node_index)` '
+                           'instead.')
+
+  def add_loss(self, losses, inputs=None):
+    """Add losses to the layer.
+
+    The loss may potentially be conditional on some inputs tensors,
+    for instance activity losses are conditional on the layer's inputs.
+
+    Arguments:
+        losses: loss tensor or list of loss tensors
+            to add to the layer.
+        inputs: input tensor or list of inputs tensors to mark
+            the losses as conditional on these inputs.
+            If None is passed, the loss is assumed unconditional
+            (e.g. L2 weight regularization, which only depends
+            on the layer's weights variables, not on any inputs tensors).
+    """
+    if losses is None:
+      return
+    # Update self.losses
+    losses = _to_list(losses)
+    if not hasattr(self, 'losses'):
+      self.losses = []
+    try:
+      self.losses += losses
+    except AttributeError:
+      # In case self.losses isn't settable
+      # (i.e. it's a getter method).
+      # In that case the `losses` property is
+      # auto-computed and shouldn't be set.
+      pass
+    # Update self._per_input_updates
+    if not hasattr(self, '_per_input_losses'):
+      self._per_input_losses = {}
+    if inputs is not None:
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      # Updates indexed by None are unconditional
+      # rather than input-dependent
+      inputs_hash = None
+    if inputs_hash not in self._per_input_losses:
+      self._per_input_losses[inputs_hash] = []
+    self._per_input_losses[inputs_hash] += losses
+
+  def add_update(self, updates, inputs=None):
+    """Add updates to the layer.
+
+    The updates may potentially be conditional on some inputs tensors,
+    for instance batch norm updates are conditional on the layer's inputs.
+
+    Arguments:
+        updates: update op or list of update ops
+            to add to the layer.
+        inputs: input tensor or list of inputs tensors to mark
+            the updates as conditional on these inputs.
+            If None is passed, the updates are assumed unconditional.
+    """
+    if updates is None:
+      return
+    # Update self.updates
+    updates = _to_list(updates)
+    if not hasattr(self, 'updates'):
+      self.updates = []
+    try:
+      self.updates += updates
+    except AttributeError:
+      # In case self.updates isn't settable
+      # (i.e. it's a getter method).
+      # In that case the `updates` property is
+      # auto-computed and shouldn't be set.
+      pass
+    # Update self._per_input_updates
+    if not hasattr(self, '_per_input_updates'):
+      self._per_input_updates = {}
+    if inputs is not None:
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      # Updates indexed by None are unconditional
+      # rather than input-dependent
+      inputs_hash = None
+    if inputs_hash not in self._per_input_updates:
+      self._per_input_updates[inputs_hash] = []
+    self._per_input_updates[inputs_hash] += updates
+
+  def get_updates_for(self, inputs):
+    if not hasattr(self, '_per_input_updates'):
+      return []
+    if inputs is not None:
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      inputs_hash = None
+    if inputs_hash in self._per_input_updates:
+      return self._per_input_updates[inputs_hash]
+    return []
+
+  def get_losses_for(self, inputs):
+    if not hasattr(self, '_per_input_losses'):
+      return []
+    if inputs is not None:
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      inputs_hash = None
+    if inputs_hash in self._per_input_losses:
+      return self._per_input_losses[inputs_hash]
+    return []
+
+  @property
+  def weights(self):
+    return self.trainable_weights + self.non_trainable_weights
+
+  def set_weights(self, weights):
+    """Sets the weights of the layer, from Numpy arrays.
+
+    Arguments:
+        weights: a list of Numpy arrays. The number
+            of arrays and their shape must match
+            number of the dimensions of the weights
+            of the layer (i.e. it should match the
+            output of `get_weights`).
+
+    Raises:
+        ValueError: If the provided weights list does not match the
+            layer's specifications.
+    """
+    params = self.weights
+    if len(params) != len(weights):
+      raise ValueError('You called `set_weights(weights)` on layer "' +
+                       self.name + '" with a  weight list of length ' + str(
+                           len(weights)) + ', but the layer was expecting ' +
+                       str(len(params)) + ' weights. Provided weights: ' + str(
+                           weights)[:50] + '...')
+    if not params:
+      return
+    weight_value_tuples = []
+    param_values = K.batch_get_value(params)
+    for pv, p, w in zip(param_values, params, weights):
+      if pv.shape != w.shape:
+        raise ValueError('Layer weight shape ' + str(pv.shape) +
+                         ' not compatible with '
+                         'provided weight shape ' + str(w.shape))
+      weight_value_tuples.append((p, w))
+    K.batch_set_value(weight_value_tuples)
+
+  def get_weights(self):
+    """Returns the current weights of the layer.
+
+    Returns:
+        Weights values as a list of numpy arrays.
+    """
+    params = self.weights
+    return K.batch_get_value(params)
+
+  def get_config(self):
+    """Returns the config of the layer.
+
+    A layer config is a Python dictionary (serializable)
+    containing the configuration of a layer.
+    The same layer can be reinstantiated later
+    (without its trained weights) from this configuration.
+
+    The config of a layer does not include connectivity
+    information, nor the layer class name. These are handled
+    by `Container` (one layer of abstraction above).
+
+    Returns:
+        Python dictionary.
+    """
+    config = {'name': self.name, 'trainable': self.trainable}
+    if hasattr(self, 'batch_input_shape'):
+      config['batch_input_shape'] = self.batch_input_shape
+    if hasattr(self, 'dtype'):
+      config['dtype'] = self.dtype
+    return config
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a layer from its config.
+
+    This method is the reverse of `get_config`,
+    capable of instantiating the same layer from the config
+    dictionary. It does not handle layer connectivity
+    (handled by Container), nor weights (handled by `set_weights`).
+
+    Arguments:
+        config: A Python dictionary, typically the
+            output of get_config.
+
+    Returns:
+        A layer instance.
+    """
+    return cls(**config)
+
+  def count_params(self):
+    """Count the total number of scalars composing the weights.
+
+    Returns:
+        An integer count.
+
+    Raises:
+        RuntimeError: if the layer isn't yet built
+            (in which case its weights aren't yet defined).
+    """
+    if not self.built:
+      if self.__class__.__name__ == 'Sequential':
+        self.build()  # pylint: disable=no-value-for-parameter
+      else:
+        raise RuntimeError('You tried to call `count_params` on ' + self.name +
+                           ', but the layer isn\'t built. '
+                           'You can build it manually via: `' + self.name +
+                           '.build(batch_input_shape)`.')
+    return sum([K.count_params(p) for p in self.weights])
+
+
+class InputLayer(Layer):
+  """Layer to be used as an entry point into a graph.
+
+  It can either wrap an existing tensor (pass an `input_tensor` argument)
+  or create its a placeholder tensor (pass arguments `input_shape`
+  or `batch_input_shape` as well as `dtype`).
+
+  Arguments:
+      input_shape: Shape tuple, not including the batch axis.
+      batch_size: Optional input batch size (integer or None).
+      batch_input_shape: Shape tuple, including the batch axis.
+      dtype: Datatype of the input.
+      input_tensor: Optional tensor to use as layer input
+          instead of creating a placeholder.
+      sparse: Boolean, whether the placeholder created
+          is meant to be sparse.
+      name: Name of the layer (string).
+  """
+
+  def __init__(self,
+               input_shape=None,
+               batch_size=None,
+               batch_input_shape=None,
+               dtype=None,
+               input_tensor=None,
+               sparse=False,
+               name=None):
+    if not name:
+      prefix = 'input'
+      name = prefix + '_' + str(K.get_uid(prefix))
+    super(InputLayer, self).__init__(dtype=dtype, name=name)
+
+    self.trainable = False
+    self.built = True
+    self.sparse = sparse
+
+    if input_shape and batch_input_shape:
+      raise ValueError('Only provide the input_shape OR '
+                       'batch_input_shape argument to '
+                       'InputLayer, not both at the same time.')
+    if input_tensor is not None:
+      # Attempt automatic input shape inference.
+      try:
+        batch_input_shape = K.int_shape(input_tensor)
+      except TypeError:
+        if not input_shape and not batch_input_shape:
+          raise ValueError('InputLayer was provided '
+                           'an input_tensor argument, '
+                           'but its input shape cannot be '
+                           'automatically inferred. '
+                           'You should pass an input_shape or '
+                           'batch_input_shape argument.')
+    if not batch_input_shape:
+      if not input_shape:
+        raise ValueError('An Input layer should be passed either '
+                         'a `batch_input_shape` or an `input_shape`.')
+      else:
+        batch_input_shape = (batch_size,) + tuple(input_shape)
+    else:
+      batch_input_shape = tuple(batch_input_shape)
+
+    if not dtype:
+      if input_tensor is None:
+        dtype = K.floatx()
+      else:
+        dtype = K.dtype(input_tensor)
+
+    self.batch_input_shape = batch_input_shape
+    self.dtype = dtype
+
+    if input_tensor is None:
+      self.is_placeholder = True
+      input_tensor = K.placeholder(
+          shape=batch_input_shape,
+          dtype=dtype,
+          sparse=self.sparse,
+          name=self.name)
+    else:
+      self.is_placeholder = False
+    # Create an input node to add to self.outbound_node
+    # and set output_tensors' _keras_history.
+    input_tensor._uses_learning_phase = False
+    input_tensor._keras_history = (self, 0, 0)
+    Node(
+        self,
+        inbound_layers=[],
+        node_indices=[],
+        tensor_indices=[],
+        input_tensors=[input_tensor],
+        output_tensors=[input_tensor],
+        input_masks=[None],
+        output_masks=[None])
+
+  def get_config(self):
+    config = {
+        'batch_input_shape': self.batch_input_shape,
+        'dtype': self.dtype,
+        'sparse': self.sparse,
+        'name': self.name
+    }
+    return config
+
+
+def Input(  # pylint: disable=invalid-name
+    shape=None,
+    batch_shape=None,
+    name=None,
+    dtype=K.floatx(),
+    sparse=False,
+    tensor=None):
+  """`Input()` is used to instantiate a Keras tensor.
+
+  A Keras tensor is a tensor object from the underlying backend
+  (Theano or TensorFlow), which we augment with certain
+  attributes that allow us to build a Keras model
+  just by knowing the inputs and outputs of the model.
+
+  For instance, if a, b and c and Keras tensors,
+  it becomes possible to do:
+  `model = Model(input=[a, b], output=c)`
+
+  The added Keras attribute is:
+      `_keras_history`: Last layer applied to the tensor.
+          the entire layer graph is retrievable from that layer,
+          recursively.
+
+  Arguments:
+      shape: A shape tuple (integer), not including the batch size.
+          For instance, `shape=(32,)` indicates that the expected input
+          will be batches of 32-dimensional vectors.
+      batch_shape: A shape tuple (integer), including the batch size.
+          For instance, `batch_shape=(10, 32)` indicates that
+          the expected input will be batches of 10 32-dimensional vectors.
+          `batch_shape=(None, 32)` indicates batches of an arbitrary number
+          of 32-dimensional vectors.
+      name: An optional name string for the layer.
+          Should be unique in a model (do not reuse the same name twice).
+          It will be autogenerated if it isn't provided.
+      dtype: The data type expected by the input, as a string
+          (`float32`, `float64`, `int32`...)
+      sparse: A boolean specifying whether the placeholder
+          to be created is sparse.
+      tensor: Optional existing tensor to wrap into the `Input` layer.
+          If set, the layer will not create a placeholder tensor.
+
+  Returns:
+      A tensor.
+
+  Example:
+
+      ```python
+      # this is a logistic regression in Keras
+      x = Input(shape=(32,))
+      y = Dense(16, activation='softmax')(x)
+      model = Model(x, y)
+      ```
+  """
+  if not batch_shape and tensor is None:
+    assert shape, ('Please provide to Input either a `shape`'
+                   ' or a `batch_shape` argument. Note that '
+                   '`shape` does not include the batch '
+                   'dimension.')
+  if shape and not batch_shape:
+    batch_shape = (None,) + tuple(shape)
+  input_layer = InputLayer(
+      batch_input_shape=batch_shape,
+      name=name,
+      dtype=dtype,
+      sparse=sparse,
+      input_tensor=tensor)
+  # Return tensor including `_keras_history`.
+  # Note that in this case train_output and test_output are the same pointer.
+  outputs = input_layer.inbound_nodes[0].output_tensors
+  if len(outputs) == 1:
+    return outputs[0]
+  else:
+    return outputs
+
+
+class Container(Layer):
+  """A Container is a directed acyclic graph of layers.
+
+  It is the topological form of a "model". A Model
+  is simply a Container with added training routines.
+
+  # Properties
+      name
+      inputs
+      outputs
+      input_layers
+      output_layers
+      input_spec (list of class instances)
+          each entry describes one required input:
+              - ndim
+              - dtype
+      trainable (boolean)
+      input_shape
+      output_shape
+      inbound_nodes: list of nodes
+      outbound_nodes: list of nodes
+      trainable_weights (list of variables)
+      non_trainable_weights (list of variables)
+      constraints (list of tuples (weight, constraint))
+
+  # Methods
+      summary
+      get_layer
+      get_weights
+      set_weights
+      get_config
+      get_output_shape_for
+
+  # Class Methods
+      from_config
+  """
+
+  def __init__(self, inputs, outputs, name=None):  # pylint: disable=super-init-not-called
+    # Handle `name` argument.
+    if not name:
+      prefix = self.__class__.__name__.lower()
+      name = prefix + '_' + str(K.get_uid(prefix))
+    self.name = name
+
+    self.supports_masking = False
+    self.trainable = True
+
+    # Container-specific properties.
+    if isinstance(inputs, (list, tuple)):
+      self.inputs = list(inputs)  # Tensor or list of tensors.
+    else:
+      self.inputs = [inputs]
+    if isinstance(outputs, (list, tuple)):
+      self.outputs = list(outputs)
+    else:
+      self.outputs = [outputs]
+
+    # Check for redundancy in inputs.
+    inputs_set = set(self.inputs)
+    if len(inputs_set) != len(self.inputs):
+      raise ValueError('The list of inputs passed to the model '
+                       'is redundant. '
+                       'All inputs should only appear once.'
+                       ' Found: ' + str(self.inputs))
+
+    # List of initial layers (1 to 1 mapping with self.inputs,
+    # hence the same layer might appear twice)
+    self.input_layers = []
+    self.input_layers_node_indices = []
+    self.input_layers_tensor_indices = []
+    # list of layers (1 to 1 mapping with self.inputs,
+    # hence the same layer might appear twice)
+    self.output_layers = []
+    self.output_layers_node_indices = []
+    self.output_layers_tensor_indices = []
+    # all layers in order of horizontal graph traversal.
+    # Entries are unique. Includes input and output layers.
+    self.layers = []
+
+    # This is for performance optimization
+    # when calling the Container on new inputs.
+    # every time the Container is called on a set on input tensors,
+    # we compute the output tensors,
+    # output masks and output shapes in one pass,
+    # then cache them here. When of of these output is queried later,
+    # we retrieve it from there instead of recomputing it.
+    self._output_mask_cache = {}
+    self._output_tensor_cache = {}
+    self._output_shape_cache = {}
+
+    # User-provided arguments validation.
+    for x in self.inputs:
+      # Check that x is a Keras tensor.
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise TypeError('Input tensors to a ' + cls_name + ' ' +
+                        'must be Keras tensors. Found: ' + str(
+                            x) + ' (missing Keras metadata).')
+      # Check that x is an input tensor.
+      layer, node_index, tensor_index = x._keras_history
+      if len(layer.inbound_nodes) > 1 or (
+          layer.inbound_nodes and layer.inbound_nodes[0].inbound_layers):
+        cls_name = self.__class__.__name__
+        warnings.warn(cls_name + ' inputs must come from '
+                      'a Keras Input layer, '
+                      'they cannot be the output of '
+                      'a previous non-Input layer. '
+                      'Here, a tensor specified as '
+                      'input to "' + self.name + '" was not an Input tensor, '
+                      'it was generated by layer ' + layer.name + '.\n'
+                      'Note that input tensors are '
+                      'instantiated via `tensor = Input(shape)`.\n'
+                      'The tensor that caused the issue was: ' + str(x.name))
+    for x in self.outputs:
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise TypeError('Output tensors to a ' + cls_name + ' must be '
+                        'Keras tensors. Found: ' + str(x))
+    # Build self.output_layers:
+    for x in self.outputs:
+      layer, node_index, tensor_index = x._keras_history
+      self.output_layers.append(layer)
+      self.output_layers_node_indices.append(node_index)
+      self.output_layers_tensor_indices.append(tensor_index)
+
+    # Fill in the output mask cache.
+    masks = []
+    for x in self.inputs:
+      layer, node_index, tensor_index = x._keras_history
+      node = layer.inbound_nodes[node_index]
+      mask = node.output_masks[tensor_index]
+      masks.append(mask)
+    mask_cache_key = ','.join([str(id(x)) for x in self.inputs])
+    mask_cache_key += '_' + ','.join([str(id(x)) for x in masks])
+    masks = []
+    for x in self.outputs:
+      layer, node_index, tensor_index = x._keras_history
+      node = layer.inbound_nodes[node_index]
+      mask = node.output_masks[tensor_index]
+      masks.append(mask)
+    if len(masks) == 1:
+      mask = masks[0]
+    else:
+      mask = masks
+    self._output_mask_cache[mask_cache_key] = mask
+
+    # Build self.input_layers:
+    for x in self.inputs:
+      layer, node_index, tensor_index = x._keras_history
+      # It's supposed to be an input layer, so only one node
+      # and one tensor output.
+      assert node_index == 0
+      assert tensor_index == 0
+      self.input_layers.append(layer)
+      self.input_layers_node_indices.append(node_index)
+      self.input_layers_tensor_indices.append(tensor_index)
+
+    # Build self.input_names and self.output_names.
+    self.input_names = []
+    self.output_names = []
+    self._feed_input_names = []
+    self._feed_inputs = []
+    self._feed_input_shapes = []
+    for i, layer in enumerate(self.input_layers):
+      self.input_names.append(layer.name)
+      if layer.is_placeholder:
+        self._feed_input_names.append(layer.name)
+        self._feed_inputs.append(layer.input)
+        self._feed_input_shapes.append(K.int_shape(self.inputs[i]))
+    for layer in self.output_layers:
+      self.output_names.append(layer.name)
+
+    self.internal_input_shapes = [K.int_shape(x) for x in self.inputs]
+    self.internal_output_shapes = [K.int_shape(x) for x in self.outputs]
+
+    # Container_nodes: set of nodes included in the graph
+    # (not all nodes included in the layers
+    # are relevant to the current graph).
+    container_nodes = set()  # ids of all nodes relevant to the Container
+    nodes_depths = {}  # dict {node: depth value}
+    layers_depths = {}  # dict {layer: depth value}
+    layer_indices = {}  # dict {layer: index in traversal}
+
+    def make_node_marker(node, depth):
+      return str(id(node)) + '-' + str(depth)
+
+    def build_map_of_graph(tensor,
+                           seen_nodes=None,
+                           depth=0,
+                           layer=None,
+                           node_index=None,
+                           tensor_index=None):
+      """Builds a map of the graph of layers.
+
+      This recursively updates the maps `nodes_depths`,
+      `layers_depths` and the set `container_nodes`.
+
+      Does not try to detect cycles in the graph.
+
+      Arguments:
+          tensor: Some tensor in a graph.
+          seen_nodes: Set of node ids ("{layer.name}_ib-{node_index}")
+              of nodes seen so far. Useful to prevent infinite loops.
+          depth: Current depth in the graph (0 = last output).
+          layer: Layer from which `tensor` comes from. If not provided,
+              will be obtained from `tensor._keras_history`.
+          node_index: Node index from which `tensor` comes from.
+          tensor_index: Tensor_index from which `tensor` comes from.
+      """
+      seen_nodes = seen_nodes or set()
+      if not layer or node_index is None or tensor_index is None:
+        layer, node_index, tensor_index = tensor._keras_history
+      node = layer.inbound_nodes[node_index]
+
+      # Prevent cycles.
+      seen_nodes.add(make_node_marker(node, depth))
+
+      node_key = layer.name + '_ib-' + str(node_index)
+      # Update container_nodes.
+      container_nodes.add(node_key)
+      # Update nodes_depths.
+      node_depth = nodes_depths.get(node)
+      if node_depth is None:
+        nodes_depths[node] = depth
+      else:
+        nodes_depths[node] = max(depth, node_depth)
+      # Update layers_depths.
+      previously_seen_depth = layers_depths.get(layer)
+      if previously_seen_depth is None:
+        current_depth = depth
+      else:
+        current_depth = max(depth, previously_seen_depth)
+      layers_depths[layer] = current_depth
+      if layer not in layer_indices:
+        layer_indices[layer] = len(layer_indices)
+
+      # Propagate to all previous tensors connected to this node.
+      for i in range(len(node.inbound_layers)):
+        x = node.input_tensors[i]
+        layer = node.inbound_layers[i]
+        node_index = node.node_indices[i]
+        tensor_index = node.tensor_indices[i]
+        next_node = layer.inbound_nodes[node_index]
+        # use node_marker to prevent cycles
+        node_marker = make_node_marker(next_node, current_depth + 1)
+        if node_marker not in seen_nodes:
+          build_map_of_graph(x, seen_nodes, current_depth + 1, layer,
+                             node_index, tensor_index)
+
+    for x in self.outputs:
+      seen_nodes = set()
+      build_map_of_graph(x, seen_nodes, depth=0)
+
+    # Build a dict {depth: list of nodes with this depth}
+    nodes_by_depth = {}
+    for node, depth in nodes_depths.items():
+      if depth not in nodes_by_depth:
+        nodes_by_depth[depth] = []
+      nodes_by_depth[depth].append(node)
+
+    # Build a dict {depth: list of layers with this depth}
+    layers_by_depth = {}
+    for layer, depth in layers_depths.items():
+      if depth not in layers_by_depth:
+        layers_by_depth[depth] = []
+      layers_by_depth[depth].append(layer)
+
+    # Get sorted list of layer depths.
+    depth_keys = list(layers_by_depth.keys())
+    depth_keys.sort(reverse=True)
+
+    # Set self.layers and self.layers_by_depth.
+    layers = []
+    for depth in depth_keys:
+      layers_for_depth = layers_by_depth[depth]
+      # Container.layers needs to have a deterministic order:
+      # here we order them by traversal order.
+      layers_for_depth.sort(key=lambda x: layer_indices[x])
+      for layer in layers_for_depth:
+        layers.append(layer)
+    self.layers = layers
+    self.layers_by_depth = layers_by_depth
+
+    # Get sorted list of node depths.
+    depth_keys = list(nodes_by_depth.keys())
+    depth_keys.sort(reverse=True)
+
+    # Check that all tensors required are computable.
+    # computable_tensors: all tensors in the graph
+    # that can be computed from the inputs provided.
+    computable_tensors = []
+    for x in self.inputs:
+      computable_tensors.append(x)
+
+    layers_with_complete_input = []  # To provide a better error msg.
+    for depth in depth_keys:
+      for node in nodes_by_depth[depth]:
+        layer = node.outbound_layer
+        if layer:
+          for x in node.input_tensors:
+            if x not in computable_tensors:
+              raise RuntimeError('Graph disconnected: '
+                                 'cannot obtain value for tensor ' + str(x) +
+                                 ' at layer "' + layer.name + '". '
+                                 'The following previous layers '
+                                 'were accessed without issue: ' + str(
+                                     layers_with_complete_input))
+          for x in node.output_tensors:
+            computable_tensors.append(x)
+          layers_with_complete_input.append(layer.name)
+
+    # Set self.nodes and self.nodes_by_depth.
+    self.container_nodes = container_nodes
+    self.nodes_by_depth = nodes_by_depth
+
+    # Ensure name unicity, which will be crucial for serialization
+    # (since serialized nodes refer to layers by their name).
+    all_names = [layer.name for layer in self.layers]
+    for name in all_names:
+      if all_names.count(name) != 1:
+        raise RuntimeError('The name "' + name + '" is used ' + str(
+            all_names.count(name)) + ' times in the model. '
+                           'All layer names should be unique.')
+
+    # Layer parameters.
+    # The new container starts with a single inbound node
+    # for its inputs, and no outbound nodes.
+    self.outbound_nodes = []  # Will be appended to by future calls to __call__
+    self.inbound_nodes = [
+    ]  # Will be appended to below, and by future calls to __call__
+    # Create the node linking internal inputs to internal outputs.
+    Node(
+        outbound_layer=self,
+        inbound_layers=[],
+        node_indices=[],
+        tensor_indices=[],
+        input_tensors=self.inputs,
+        output_tensors=self.outputs,
+        # No container-level masking for now.
+        input_masks=[None for _ in self.inputs],
+        output_masks=[None for _ in self.outputs])
+    self.built = True
+
+    # The following are implemented as property functions:
+    # self.constraints
+    # self.trainable_weights
+    # self.non_trainable_weights
+    # self.input_spec
+
+  def get_layer(self, name=None, index=None):
+    """Retrieves a layer based on either its name (unique) or index.
+
+    Indices are based on order of horizontal graph traversal (bottom-up).
+
+    Arguments:
+        name: String, name of layer.
+        index: Integer, index of layer.
+
+    Returns:
+        A layer instance.
+
+    Raises:
+        ValueError: In case of invalid layer name or index.
+    """
+    # It would be unreliable to build a dictionary
+    # based on layer names, because names can potentially
+    # be changed at any point by the user
+    # without the container being notified of it.
+    if index is not None:
+      if len(self.layers) <= index:
+        raise ValueError('Was asked to retrieve layer at index ' +
+                         str(index) + ' but model only has ' + str(
+                             len(self.layers)) + ' layers.')
+      else:
+        return self.layers[index]
+    else:
+      if not name:
+        raise ValueError('Provide either a layer name or layer index.')
+    layer = None
+    for layer in self.layers:
+      if layer.name == name:
+        return layer
+    if not layer:
+      raise ValueError('No such layer: ' + name)
+
+  @property
+  def updates(self):
+    """Retrieve the model's updates.
+
+    Will only include updates that are either
+    inconditional, or conditional on inputs to this model
+    (e.g. will not include updates that depend on tensors
+    that aren't inputs to this model).
+
+    Returns:
+        A list of update ops.
+    """
+    updates = []
+    for layer in self.layers:
+      if hasattr(layer, 'updates'):
+        if len(layer.inbound_nodes) == 1:
+          updates += layer.updates
+        else:
+          # Collect updates that are dependent on inputs
+          # that are part of the model.
+          for node_index, node in enumerate(layer.inbound_nodes):
+            node_key = layer.name + '_ib-' + str(node_index)
+            if node_key in self.container_nodes:
+              # The model owns this layer node.
+              inputs = node.input_tensors
+              updates += layer.get_updates_for(inputs)
+          # Collect unconditional updates.
+          updates += layer.get_updates_for(None)
+    return updates
+
+  @property
+  def losses(self):
+    """Retrieve the model's losses.
+
+    Will only include losses that are either
+    inconditional, or conditional on inputs to this model
+    (e.g. will not include losses that depend on tensors
+    that aren't inputs to this model).
+
+    Returns:
+        A list of loss tensors.
+    """
+    losses = []
+    # Retrieve losses for all internal layers.
+    for layer in self.layers:
+      if hasattr(layer, 'losses'):
+        if len(layer.inbound_nodes) == 1:
+          losses += layer.losses
+        else:
+          # Collect losses that are dependent on inputs
+          # that are part of the model.
+          for node_index, node in enumerate(layer.inbound_nodes):
+            node_key = layer.name + '_ib-' + str(node_index)
+            if node_key in self.container_nodes:
+              # The model owns this layer node.
+              inputs = node.input_tensors
+              losses += layer.get_losses_for(inputs)
+          # Collect unconditional losses.
+          losses += layer.get_losses_for(None)
+    # Add any potential unconditional model-level loss.
+    if hasattr(self, '_per_input_losses'):
+      losses += self._per_input_losses.get(None, [])
+    return losses
+
+  @property
+  def uses_learning_phase(self):
+    return any([x._uses_learning_phase for x in self.outputs])
+
+  @property
+  def stateful(self):
+    return any([(hasattr(layer, 'stateful') and layer.stateful)
+                for layer in self.layers])
+
+  def reset_states(self):
+    for layer in self.layers:
+      if hasattr(layer, 'reset_states') and getattr(layer, 'stateful', False):
+        layer.reset_states()
+
+  @property
+  def state_updates(self):
+    """Returns the `updates` from all layers that are stateful.
+
+    This is useful for separating training updates and
+    state updates, e.g. when we need to update a layer's internal state
+    during prediction.
+
+    Returns:
+        A list of update ops.
+    """
+    state_updates = []
+    for layer in self.layers:
+      if getattr(layer, 'stateful', False):
+        if hasattr(layer, 'updates'):
+          state_updates += layer.updates
+    return state_updates
+
+  @property
+  def constraints(self):
+    cons = {}
+    for layer in self.layers:
+      for key, value in layer.constraints.items():
+        if key in cons and cons[key] != value:
+          raise ValueError('Received multiple constraints '
+                           'for one weight tensor: ' + str(key))
+        cons[key] = value
+    return cons
+
+  @property
+  def trainable_weights(self):
+    if not self.trainable:
+      return []
+    weights = []
+    for layer in self.layers:
+      weights += layer.trainable_weights
+    return weights
+
+  @property
+  def non_trainable_weights(self):
+    weights = []
+    for layer in self.layers:
+      weights += layer.non_trainable_weights
+    if not self.trainable:
+      trainable_weights = []
+      for layer in self.layers:
+        trainable_weights += layer.trainable_weights
+      return trainable_weights + weights
+    return weights
+
+  def get_weights(self):
+    """Retrieves the weights of the model.
+
+    Returns:
+        A flat list of Numpy arrays.
+    """
+    weights = []
+    for layer in self.layers:
+      weights += layer.weights
+    return K.batch_get_value(weights)
+
+  def set_weights(self, weights):
+    """Sets the weights of the model.
+
+    Arguments:
+        weights: A list of Numpy arrays with shapes and types matching
+            the output of `model.get_weights()`.
+    """
+    tuples = []
+    for layer in self.layers:
+      num_param = len(layer.weights)
+      layer_weights = weights[:num_param]
+      for sw, w in zip(layer.weights, layer_weights):
+        tuples.append((sw, w))
+      weights = weights[num_param:]
+    K.batch_set_value(tuples)
+
+  @property
+  def input_spec(self):
+    """Gets the model's input specs.
+
+    Returns:
+        A list of `InputSpec` instances (one per input to the model)
+            or a single instance if the model has only one input.
+    """
+    specs = []
+    for layer in getattr(self, 'input_layers', []):
+      if layer.input_spec is None:
+        specs.append(None)
+      else:
+        if not isinstance(layer.input_spec, list):
+          raise TypeError('Layer ' + layer.name +
+                          ' has an input_spec attribute that '
+                          'is not a list. We expect a list. '
+                          'Found input_spec = ' + str(layer.input_spec))
+        specs += layer.input_spec
+    if len(specs) == 1:
+      return specs[0]
+    return specs
+
+  def call(self, inputs, mask=None):
+    """Call the model on new inputs.
+
+    In this case `call` just reapplies
+    all ops in the graph to the new inputs
+    (e.g. build a new computational graph from the provided inputs).
+
+    A model is callable on non-Keras tensors.
+
+    Arguments:
+        inputs: A tensor or list of tensors.
+        mask: A mask or list of masks. A mask can be
+            either a tensor or None (no mask).
+
+    Returns:
+        A tensor if there is a single output, or
+        a list of tensors if there are more than one outputs.
+    """
+    inputs = _to_list(inputs)
+    if mask is None:
+      masks = [None for _ in range(len(inputs))]
+    else:
+      masks = _to_list(mask)
+    cache_key = ','.join([str(id(x)) for x in inputs])
+    cache_key += '_' + ','.join([str(id(x)) for x in masks])
+    if cache_key in self._output_tensor_cache:
+      return self._output_tensor_cache[cache_key]
+    else:
+      output_tensors, _, _ = self.run_internal_graph(inputs, masks)
+      return output_tensors
+
+  def compute_mask(self, inputs, mask):
+    inputs = _to_list(inputs)
+    if mask is None:
+      masks = [None for _ in range(len(inputs))]
+    else:
+      masks = _to_list(mask)
+    cache_key = ','.join([str(id(x)) for x in inputs])
+    cache_key += '_' + ','.join([str(id(x)) for x in masks])
+    if cache_key in self._output_mask_cache:
+      return self._output_mask_cache[cache_key]
+    else:
+      _, output_masks, _ = self.run_internal_graph(inputs, masks)
+      return output_masks
+
+  def _compute_output_shape(self, input_shape):
+    if isinstance(input_shape, list):
+      input_shapes = []
+      for shape in input_shape:
+        if shape is not None:
+          input_shapes.append(tuple(tensor_shape.TensorShape(shape).as_list()))
+        else:
+          input_shapes.append(None)
+    else:
+      if input_shape is not None:
+        input_shapes = [tuple(tensor_shape.TensorShape(input_shape).as_list())]
+      else:
+        input_shapes = [None]
+
+    if len(input_shapes) != len(self.input_layers):
+      raise ValueError('Invalid input_shape argument ' +
+                       str(input_shape) + ': model has ' + str(
+                           len(self.input_layers)) + ' tensor inputs.')
+
+    cache_key = ','.join([str(x) for x in input_shapes])
+    if cache_key in self._output_shape_cache:
+      output_shapes = self._output_shape_cache[cache_key]
+      if isinstance(output_shapes, list):
+        if len(output_shapes) == 1:
+          return tensor_shape.TensorShape(output_shapes[0])
+        else:
+          return [tensor_shape.TensorShape(shape) for shape in output_shapes]
+      else:
+        return tensor_shape.TensorShape(output_shapes)
+    else:
+      # Bad luck, we have to run the graph manually.
+      layers_to_output_shapes = {}
+      for i in range(len(input_shapes)):
+        layer = self.input_layers[i]
+        input_shape = input_shapes[i]
+        # It's an input layer: get_output_shape_for is identity,
+        # and there is only one node and one tensor output.
+        shape_key = layer.name + '_0_0'
+        layers_to_output_shapes[shape_key] = input_shape
+
+      depth_keys = list(self.nodes_by_depth.keys())
+      depth_keys.sort(reverse=True)
+      # Iterate over nodes, by depth level.
+      if len(depth_keys) > 1:
+        for depth in depth_keys:
+          nodes = self.nodes_by_depth[depth]
+          for node in nodes:
+            # This is always a single layer, never a list.
+            layer = node.outbound_layer
+            if layer in self.input_layers:
+              # We've already covered the input layers
+              # a few lines above.
+              continue
+            # Potentially redundant list,
+            # same size of node.input_tensors.
+            input_shapes = []
+            for j in range(len(node.inbound_layers)):
+              inbound_layer = node.inbound_layers[j]
+              node_index = node.node_indices[j]
+              tensor_index = node.tensor_indices[j]
+              shape_key = inbound_layer.name + '_%s_%s' % (node_index,
+                                                           tensor_index)
+              input_shape = layers_to_output_shapes[shape_key]
+              input_shapes.append(input_shape)
+
+            if len(input_shapes) == 1:
+              output_shape = layer._compute_output_shape(input_shapes[0])
+            else:
+              output_shape = layer._compute_output_shape(input_shapes)
+            if isinstance(output_shape, list):
+              output_shapes = [
+                  tuple(tensor_shape.TensorShape(shape).as_list())
+                  for shape in output_shape
+              ]
+            else:
+              output_shapes = [
+                  tuple(tensor_shape.TensorShape(output_shape).as_list())
+              ]
+
+            node_index = layer.inbound_nodes.index(node)
+            for j in range(len(output_shapes)):
+              shape_key = layer.name + '_%s_%s' % (node_index, j)
+              layers_to_output_shapes[shape_key] = output_shapes[j]
+
+      # Read final output shapes from layers_to_output_shapes.
+      output_shapes = []
+      output_shape_keys = []
+      for i in range(len(self.output_layers)):
+        layer = self.output_layers[i]
+        node_index = self.output_layers_node_indices[i]
+        tensor_index = self.output_layers_tensor_indices[i]
+        shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
+        output_shape_keys.append(shape_key)
+
+      for i, key in enumerate(output_shape_keys):
+        assert key in layers_to_output_shapes
+        output_shapes.append(layers_to_output_shapes[key])
+      # Store in cache.
+      self._output_shape_cache[cache_key] = output_shapes
+      if isinstance(output_shapes, list):
+        if len(output_shapes) == 1:
+          return tensor_shape.TensorShape(output_shapes[0])
+        else:
+          return [tensor_shape.TensorShape(shape) for shape in output_shapes]
+      else:
+        return tensor_shape.TensorShape(output_shapes)
+
+  def run_internal_graph(self, inputs, masks=None):
+    """Computes output tensors for new inputs.
+
+    # Note:
+        - Expects `inputs` to be a list (potentially with 1 element).
+        - Can be run on non-Keras tensors.
+
+    Arguments:
+        inputs: List of tensors
+        masks: List of masks (tensors or None).
+
+    Returns:
+        Three lists: output_tensors, output_masks, output_shapes
+    """
+    if masks is None:
+      masks = [None for _ in range(len(inputs))]
+
+    # Dictionary mapping reference tensors to tuples
+    # (computed tensor, compute mask)
+    # we assume a 1:1 mapping from tensor to mask
+    # TODO(fchollet): raise exception when a `.compute_mask()` call
+    # does not return a list the same size as `call`
+    tensor_map = {}
+    for x, y, mask in zip(self.inputs, inputs, masks):
+      tensor_map[str(id(x))] = (y, mask)
+
+    depth_keys = list(self.nodes_by_depth.keys())
+    depth_keys.sort(reverse=True)
+    for depth in depth_keys:
+      nodes = self.nodes_by_depth[depth]
+      for node in nodes:
+        # This is always a single layer, never a list.
+        layer = node.outbound_layer
+
+        reference_input_tensors = node.input_tensors
+        reference_output_tensors = node.output_tensors
+
+        # If all previous input tensors are available in tensor_map,
+        # then call node.inbound_layer on them.
+        computed_data = []  # List of tuples (input, mask).
+        for x in reference_input_tensors:
+          if str(id(x)) in tensor_map:
+            computed_data.append(tensor_map[str(id(x))])
+        if len(computed_data) == len(reference_input_tensors):
+          # call layer
+          with K.name_scope(layer.name):
+            if node.arguments:
+              kwargs = node.arguments
+            else:
+              kwargs = {}
+            if len(computed_data) == 1:
+              computed_tensor, computed_mask = computed_data[0]
+              if 'mask' in inspect.getargspec(layer.call).args:
+                if 'mask' not in kwargs:
+                  kwargs['mask'] = computed_mask
+              output_tensors = _to_list(layer.call(computed_tensor, **kwargs))
+              output_masks = _to_list(
+                  layer.compute_mask(computed_tensor, computed_mask))
+              computed_tensors = [computed_tensor]
+              computed_masks = [computed_mask]
+            else:
+              computed_tensors = [x[0] for x in computed_data]
+              computed_masks = [x[1] for x in computed_data]
+              if 'mask' in inspect.getargspec(layer.call).args:
+                if 'mask' not in kwargs:
+                  kwargs['mask'] = computed_masks
+              output_tensors = _to_list(layer.call(computed_tensors, **kwargs))
+              output_masks = _to_list(
+                  layer.compute_mask(computed_tensors, computed_masks))
+
+          # Update model updates and losses:
+          layer_inputs = [x[0] for x in computed_data]
+          # Keep track of updates that depend on the inputs
+          # (e.g. BN updates).
+          self.add_update(layer.get_updates_for(layer_inputs), inputs)
+          # Keep track of unconditional updates (e.g. a counter).
+          self.add_update(layer.get_updates_for(None), None)
+          # Keep track of losses that depend on the inputs
+          # (e.g. activity regularizers).
+          self.add_loss(layer.get_losses_for(layer_inputs), inputs)
+          # Keep track of unconditional losses
+          # (e.g. weight regularizers).
+          self.add_loss(layer.get_losses_for(None), None)
+
+          # Update `_uses_learning_phase`.
+          if len(computed_tensors) == 1:
+            uses_learning_phase = getattr(computed_tensors[0],
+                                          '_uses_learning_phase', False)
+          else:
+            uses_learning_phase = any([
+                getattr(x, '_uses_learning_phase', False)
+                for x in computed_tensors
+            ])
+          for x in output_tensors:
+            x._uses_learning_phase = getattr(x, '_uses_learning_phase',
+                                             False) or uses_learning_phase
+
+          # Update tensor_map.
+          for x, y, mask in zip(reference_output_tensors, output_tensors,
+                                output_masks):
+            tensor_map[str(id(x))] = (y, mask)
+
+    output_tensors = []
+    output_masks = []
+    output_shapes = []
+    for x in self.outputs:
+      assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
+      tensor, mask = tensor_map[str(id(x))]
+      output_shapes.append(K.int_shape(x))
+      output_tensors.append(tensor)
+      output_masks.append(mask)
+
+    # Update cache;
+    # keys are based on ids on input tensors and inputs masks.
+    cache_key = ','.join([str(id(x)) for x in inputs])
+    cache_key += '_' + ','.join([str(id(x)) for x in masks])
+
+    if len(output_tensors) == 1:
+      output_tensors = output_tensors[0]
+      self._output_tensor_cache[cache_key] = output_tensors
+    else:
+      self._output_tensor_cache[cache_key] = output_tensors
+
+    if len(output_masks) == 1:
+      output_masks = output_masks[0]
+      self._output_mask_cache[cache_key] = output_masks
+    else:
+      self._output_mask_cache[cache_key] = output_masks
+
+    if output_shapes is not None:
+      input_shapes = [K.int_shape(x) for x in inputs]
+      cache_key = ','.join([str(x) for x in input_shapes])
+      if len(output_shapes) == 1:
+        output_shapes = output_shapes[0]
+        self._output_shape_cache[cache_key] = output_shapes
+      else:
+        self._output_shape_cache[cache_key] = output_shapes
+    return output_tensors, output_masks, output_shapes
+
+  def get_config(self):
+    config = {
+        'name': self.name,
+    }
+    node_conversion_map = {}
+    for layer in self.layers:
+      if issubclass(layer.__class__, Container):
+        # Containers start with a pre-existing node
+        # linking their input to output.
+        kept_nodes = 1
+      else:
+        kept_nodes = 0
+      for original_node_index, node in enumerate(layer.inbound_nodes):
+        node_key = layer.name + '_ib-' + str(original_node_index)
+        if node_key in self.container_nodes:
+          node_conversion_map[node_key] = kept_nodes
+          kept_nodes += 1
+    layer_configs = []
+    for layer in self.layers:  # From the earliest layers on.
+      layer_class_name = layer.__class__.__name__
+      layer_config = layer.get_config()
+      filtered_inbound_nodes = []
+      for original_node_index, node in enumerate(layer.inbound_nodes):
+        node_key = layer.name + '_ib-' + str(original_node_index)
+        if node_key in self.container_nodes:
+          # The node is relevant to the model:
+          # add to filtered_inbound_nodes.
+          if node.arguments:
+            try:
+              json.dumps(node.arguments)
+              kwargs = node.arguments
+            except TypeError:
+              warnings.warn('Layer ' + layer.name +
+                            ' was passed non-serializable keyword arguments: ' +
+                            str(node.arguments) + '. They will not be included '
+                            'in the serialized model (and thus will be missing '
+                            'at deserialization time).')
+              kwargs = {}
+          else:
+            kwargs = {}
+          if node.inbound_layers:
+            node_data = []
+            for i in range(len(node.inbound_layers)):
+              inbound_layer = node.inbound_layers[i]
+              node_index = node.node_indices[i]
+              tensor_index = node.tensor_indices[i]
+              node_key = inbound_layer.name + '_ib-' + str(node_index)
+              new_node_index = node_conversion_map.get(node_key, 0)
+              node_data.append(
+                  [inbound_layer.name, new_node_index, tensor_index, kwargs])
+            filtered_inbound_nodes.append(node_data)
+      layer_configs.append({
+          'name': layer.name,
+          'class_name': layer_class_name,
+          'config': layer_config,
+          'inbound_nodes': filtered_inbound_nodes,
+      })
+    config['layers'] = layer_configs
+
+    # Gather info about inputs and outputs.
+    model_inputs = []
+    for i in range(len(self.input_layers)):
+      layer = self.input_layers[i]
+      node_index = self.input_layers_node_indices[i]
+      node_key = layer.name + '_ib-' + str(node_index)
+      new_node_index = node_conversion_map[node_key]
+      tensor_index = self.input_layers_tensor_indices[i]
+      model_inputs.append([layer.name, new_node_index, tensor_index])
+    config['input_layers'] = model_inputs
+    model_outputs = []
+    for i in range(len(self.output_layers)):
+      layer = self.output_layers[i]
+      node_index = self.output_layers_node_indices[i]
+      node_key = layer.name + '_ib-' + str(node_index)
+      new_node_index = node_conversion_map[node_key]
+      tensor_index = self.output_layers_tensor_indices[i]
+      model_outputs.append([layer.name, new_node_index, tensor_index])
+    config['output_layers'] = model_outputs
+    return copy.deepcopy(config)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Instantiates a Model from its config (output of `get_config()`).
+
+    Arguments:
+        config: Model config dictionary.
+        custom_objects: Optional dictionary mapping names
+            (strings) to custom classes or functions to be
+            considered during deserialization.
+
+    Returns:
+        A model instance.
+
+    Raises:
+        ValueError: In case of improperly formatted config dict.
+    """
+    # layer instances created during
+    # the graph reconstruction process
+    created_layers = {}
+
+    def process_layer(layer_data):
+      """Deserialize a layer, then call it on appropriate inputs.
+
+      Arguments:
+          layer_data: layer config dict.
+
+      Raises:
+          ValueError: In case of improperly formatted `layer_data` dict.
+      """
+      layer_name = layer_data['name']
+
+      # Instantiate layer.
+      from tensorflow.contrib.keras.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+      layer = deserialize_layer(layer_data, custom_objects=custom_objects)
+      created_layers[layer_name] = layer
+
+      # Gather layer inputs.
+      inbound_nodes_data = layer_data['inbound_nodes']
+      for node_data in inbound_nodes_data:
+        input_tensors = []
+        for input_data in node_data:
+          inbound_layer_name = input_data[0]
+          inbound_node_index = input_data[1]
+          inbound_tensor_index = input_data[2]
+          if len(input_data) == 3:
+            kwargs = {}
+          elif len(input_data) == 4:
+            kwargs = input_data[3]
+          else:
+            raise ValueError('Improperly formatted model config.')
+          if inbound_layer_name not in created_layers:
+            raise ValueError('Missing layer: ' + inbound_layer_name)
+          inbound_layer = created_layers[inbound_layer_name]
+          inbound_node = inbound_layer.inbound_nodes[inbound_node_index]
+          input_tensors.append(
+              inbound_node.output_tensors[inbound_tensor_index])
+        # Call layer on its inputs, thus creating the node
+        # and building the layer if needed.
+        if input_tensors:
+          if len(input_tensors) == 1:
+            layer(input_tensors[0], **kwargs)
+          else:
+            layer(input_tensors, **kwargs)
+
+    for layer_data in config['layers']:
+      process_layer(layer_data)
+
+    name = config.get('name')
+    input_tensors = []
+    output_tensors = []
+    for layer_data in config['input_layers']:
+      layer_name, node_index, tensor_index = layer_data
+      assert layer_name in created_layers
+      layer = created_layers[layer_name]
+      layer_output_tensors = layer.inbound_nodes[node_index].output_tensors
+      input_tensors.append(layer_output_tensors[tensor_index])
+    for layer_data in config['output_layers']:
+      layer_name, node_index, tensor_index = layer_data
+      assert layer_name in created_layers
+      layer = created_layers[layer_name]
+      layer_output_tensors = layer.inbound_nodes[node_index].output_tensors
+      output_tensors.append(layer_output_tensors[tensor_index])
+    return cls(inputs=input_tensors, outputs=output_tensors, name=name)
+
+  def save(self, filepath, overwrite=True):
+    """Save the model to a single HDF5 file.
+
+    The savefile includes:
+        - The model architecture, allowing to re-instantiate the model.
+        - The model weights.
+        - The state of the optimizer, allowing to resume training
+            exactly where you left off.
+
+    This allows you to save the entirety of the state of a model
+    in a single file.
+
+    Saved models can be reinstantiated via `keras.models.load_model`.
+    The model returned by `load_model`
+    is a compiled model ready to be used (unless the saved model
+    was never compiled in the first place).
+
+    Arguments:
+        filepath: String, path to the file to save the weights to.
+        overwrite: Whether to silently overwrite any existing file at the
+            target location, or provide the user with a manual prompt.
+
+    Example:
+
+    ```python
+    from keras.models import load_model
+
+    model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
+    del model  # deletes the existing model
+
+    # returns a compiled model
+    # identical to the previous one
+    model = load_model('my_model.h5')
+    ```
+    """
+    from tensorflow.contrib.keras.python.keras.models import save_model  # pylint: disable=g-import-not-at-top
+    save_model(self, filepath, overwrite)
+
+  def save_weights(self, filepath, overwrite=True):
+    """Dumps all layer weights to a HDF5 file.
+
+    The weight file has:
+        - `layer_names` (attribute), a list of strings
+            (ordered names of model layers).
+        - For every layer, a `group` named `layer.name`
+            - For every such layer group, a group attribute `weight_names`,
+                a list of strings
+                (ordered names of weights tensor of the layer).
+            - For every weight in the layer, a dataset
+                storing the weight value, named after the weight tensor.
+
+    Arguments:
+        filepath: String, path to the file to save the weights to.
+        overwrite: Whether to silently overwrite any existing file at the
+            target location, or provide the user with a manual prompt.
+
+    Raises:
+        ImportError: If h5py is not available.
+    """
+    if h5py is None:
+      raise ImportError('`save_weights` requires h5py.')
+    # If file exists and should not be overwritten:
+    if not overwrite and os.path.isfile(filepath):
+      proceed = ask_to_proceed_with_overwrite(filepath)
+      if not proceed:
+        return
+    f = h5py.File(filepath, 'w')
+    save_weights_to_hdf5_group(f, self.layers)
+    f.flush()
+    f.close()
+
+  def load_weights(self, filepath, by_name=False):
+    """Loads all layer weights from a HDF5 save file.
+
+    If `by_name` is False (default) weights are loaded
+    based on the network's topology, meaning the architecture
+    should be the same as when the weights were saved.
+    Note that layers that don't have weights are not taken
+    into account in the topological ordering, so adding or
+    removing layers is fine as long as they don't have weights.
+
+    If `by_name` is True, weights are loaded into layers
+    only if they share the same name. This is useful
+    for fine-tuning or transfer-learning models where
+    some of the layers have changed.
+
+    Arguments:
+        filepath: String, path to the weights file to load.
+        by_name: Boolean, whether to load weights by name
+            or by topological order.
+
+    Raises:
+        ImportError: If h5py is not available.
+    """
+    if h5py is None:
+      raise ImportError('`load_weights` requires h5py.')
+    f = h5py.File(filepath, mode='r')
+    if 'layer_names' not in f.attrs and 'model_weights' in f:
+      f = f['model_weights']
+    if by_name:
+      load_weights_from_hdf5_group_by_name(f, self.layers)
+    else:
+      load_weights_from_hdf5_group(f, self.layers)
+
+    if hasattr(f, 'close'):
+      f.close()
+
+  def _updated_config(self):
+    """Util hared between different serialization methods.
+
+    Returns:
+        Model config with Keras version information added.
+    """
+    from tensorflow.contrib.keras.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+
+    config = self.get_config()
+    model_config = {
+        'class_name': self.__class__.__name__,
+        'config': config,
+        'keras_version': keras_version,
+        'backend': K.backend()
+    }
+    return model_config
+
+  def to_json(self, **kwargs):
+    """Returns a JSON string containing the network configuration.
+
+    To load a network from a JSON save file, use
+    `keras.models.model_from_json(json_string, custom_objects={})`.
+
+    Arguments:
+        **kwargs: Additional keyword arguments
+            to be passed to `json.dumps()`.
+
+    Returns:
+        A JSON string.
+    """
+
+    def get_json_type(obj):
+      # If obj is any numpy type
+      if type(obj).__module__ == np.__name__:
+        return obj.item()
+
+      # If obj is a python 'type'
+      if type(obj).__name__ == type.__name__:
+        return obj.__name__
+
+      raise TypeError('Not JSON Serializable:', obj)
+
+    model_config = self._updated_config()
+    return json.dumps(model_config, default=get_json_type, **kwargs)
+
+  def to_yaml(self, **kwargs):
+    """Returns a yaml string containing the network configuration.
+
+    To load a network from a yaml save file, use
+    `keras.models.model_from_yaml(yaml_string, custom_objects={})`.
+
+    `custom_objects` should be a dictionary mapping
+    the names of custom losses / layers / etc to the corresponding
+    functions / classes.
+
+    Arguments:
+        **kwargs: Additional keyword arguments
+            to be passed to `yaml.dump()`.
+
+    Returns:
+        A YAML string.
+
+    Raises:
+        ImportError: if yaml module is not found.
+    """
+    if yaml is None:
+      raise ImportError('Requires yaml module installed.')
+    return yaml.dump(self._updated_config(), **kwargs)
+
+  def summary(self, line_length=None, positions=None):
+    print_layer_summary(self, line_length=line_length, positions=positions)
+
+
+def get_source_inputs(tensor, layer=None, node_index=None):
+  """Returns the list of input tensors necessary to compute `tensor`.
+
+  Output will always be a list of tensors
+  (potentially with 1 element).
+
+  Arguments:
+      tensor: The tensor to start from.
+      layer: Origin layer of the tensor. Will be
+          determined via tensor._keras_history if not provided.
+      node_index: Origin node index of the tensor.
+
+  Returns:
+      List of input tensors.
+  """
+  if not hasattr(tensor, '_keras_history'):
+    return tensor
+
+  if layer is None or node_index:
+    layer, node_index, _ = tensor._keras_history
+  if not layer.inbound_nodes:
+    return [tensor]
+  else:
+    node = layer.inbound_nodes[node_index]
+    if not node.inbound_layers:
+      # Reached an Input layer, stop recursion.
+      return node.input_tensors
+    else:
+      source_tensors = []
+      for i in range(len(node.inbound_layers)):
+        x = node.input_tensors[i]
+        layer = node.inbound_layers[i]
+        node_index = node.node_indices[i]
+        previous_sources = get_source_inputs(x, layer, node_index)
+        # Avoid input redundancy.
+        for x in previous_sources:
+          if x not in source_tensors:
+            source_tensors.append(x)
+      return source_tensors
+
+
+def _to_list(x):
+  """Normalizes a list/tensor into a list.
+
+  If a tensor is passed, we return
+  a list of size 1 containing the tensor.
+
+  Arguments:
+      x: target object to be normalized.
+
+  Returns:
+      A list.
+  """
+  if isinstance(x, list):
+    return x
+  return [x]
+
+
+def _object_list_uid(object_list):
+  object_list = _to_list(object_list)
+  return ', '.join([str(abs(id(x))) for x in object_list])
+
+
+def _is_all_none(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = iterable_or_element
+  for element in iterable:
+    if element is not None:
+      return False
+  return True
+
+
+def _collect_previous_mask(input_tensors):
+  """Retrieves the output mask(s) of the previous node.
+
+  Arguments:
+      input_tensors: A tensor or list of tensors.
+
+  Returns:
+      A mask tensor or list of mask tensors.
+  """
+  input_tensors = _to_list(input_tensors)
+  masks = []
+  for x in input_tensors:
+    if hasattr(x, '_keras_history'):
+      inbound_layer, node_index, tensor_index = x._keras_history
+      node = inbound_layer.inbound_nodes[node_index]
+      mask = node.output_masks[tensor_index]
+      masks.append(mask)
+    else:
+      masks.append(None)
+  if len(masks) == 1:
+    return masks[0]
+  return masks
+
+
+def _to_snake_case(name):
+  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
+  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
+  # If the class is private the name starts with "_" which is not secure
+  # for creating scopes. We prefix the name with "private" in this case.
+  if insecure[0] != '_':
+    return insecure
+  return 'private' + insecure
+
+
+def _collect_input_shape(input_tensors):
+  """Collects the output shape(s) of a list of Keras tensors.
+
+  Arguments:
+      input_tensors: list of input tensors (or single input tensor).
+
+  Returns:
+      List of shape tuples (or single tuple), one tuple per input.
+  """
+  input_tensors = _to_list(input_tensors)
+  shapes = []
+  for x in input_tensors:
+    shapes.append(K.int_shape(x))
+  if len(shapes) == 1:
+    return shapes[0]
+  return shapes
+
+
+def save_weights_to_hdf5_group(f, layers):
+  from tensorflow.contrib.keras.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+
+  f.attrs['layer_names'] = [layer.name.encode('utf8') for layer in layers]
+  f.attrs['backend'] = K.backend().encode('utf8')
+  f.attrs['keras_version'] = str(keras_version).encode('utf8')
+
+  for layer in layers:
+    g = f.create_group(layer.name)
+    symbolic_weights = layer.weights
+    weight_values = K.batch_get_value(symbolic_weights)
+    weight_names = []
+    for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)):
+      if hasattr(w, 'name') and w.name:
+        name = str(w.name)
+      else:
+        name = 'param_' + str(i)
+      weight_names.append(name.encode('utf8'))
+    g.attrs['weight_names'] = weight_names
+    for name, val in zip(weight_names, weight_values):
+      param_dset = g.create_dataset(name, val.shape, dtype=val.dtype)
+      if not val.shape:
+        # scalar
+        param_dset[()] = val
+      else:
+        param_dset[:] = val
+
+
+def preprocess_weights_for_loading(layer,
+                                   weights,
+                                   original_keras_version=None,
+                                   original_backend=None):
+  """Converts layers weights from Keras 1 format to Keras 2.
+
+  Arguments:
+      layer: Layer instance.
+      weights: List of weights values (Numpy arrays).
+      original_keras_version: Keras version for the weights, as a string.
+      original_backend: Keras backend the weights were trained with,
+          as a string.
+
+  Returns:
+      A list of weights values (Numpy arrays).
+  """
+  if original_keras_version == '1':
+    if layer.__class__.__name__ == 'Conv1D':
+      shape = weights[0].shape
+      # Handle Keras 1.1 format
+      if shape[:2] != (layer.kernel_size[0], 1) or shape[3] != layer.filters:
+        # Legacy shape:
+        # (filters, input_dim, filter_length, 1)
+        assert shape[0] == layer.filters and shape[2:] == (layer.kernel_size[0],
+                                                           1)
+        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
+      weights[0] = weights[0][:, 0, :, :]
+
+    if layer.__class__.__name__ == 'Conv2D':
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, kernel_rows, kernel_cols)
+        # new: (kernel_rows, kernel_cols, stack_size, filters)
+        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
+
+    if layer.__class__.__name__ == 'Conv2DTranspose':
+      if layer.data_format == 'channels_last':
+        # old: (kernel_rows, kernel_cols, stack_size, filters)
+        # new: (kernel_rows, kernel_cols, filters, stack_size)
+        weights[0] = np.transpose(weights[0], (0, 1, 3, 2))
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, kernel_rows, kernel_cols)
+        # new: (kernel_rows, kernel_cols, filters, stack_size)
+        weights[0] = np.transpose(weights[0], (2, 3, 0, 1))
+
+    if layer.__class__.__name__ == 'Conv3D':
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, ...)
+        # new: (..., stack_size, filters)
+        weights[0] = np.transpose(weights[0], (2, 3, 4, 1, 0))
+
+    if layer.__class__.__name__ == 'GRU':
+      if len(weights) == 9:
+        kernel = np.concatenate([weights[0], weights[3], weights[6]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[4], weights[7]], axis=-1)
+        bias = np.concatenate([weights[2], weights[5], weights[8]], axis=-1)
+        weights = [kernel, recurrent_kernel, bias]
+
+    if layer.__class__.__name__ == 'LSTM':
+      if len(weights) == 12:
+        # old: i, c, f, o
+        # new: i, f, c, o
+        kernel = np.concatenate(
+            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
+        bias = np.concatenate(
+            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
+        weights = [kernel, recurrent_kernel, bias]
+
+    if layer.__class__.__name__ == 'ConvLSTM2D':
+      if len(weights) == 12:
+        kernel = np.concatenate(
+            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
+        bias = np.concatenate(
+            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
+        if layer.data_format == 'channels_first':
+          # old: (filters, stack_size, kernel_rows, kernel_cols)
+          # new: (kernel_rows, kernel_cols, stack_size, filters)
+          kernel = np.transpose(kernel, (2, 3, 1, 0))
+          recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0))
+        weights = [kernel, recurrent_kernel, bias]
+
+  if original_backend and K.backend() != original_backend:
+    conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose']
+    if layer.__class__.__name__ in conv_layers:
+      weights[0] = conv_utils.convert_kernel(weights[0])
+    if layer.__class__.__name__ == 'ConvLSTM2D':
+      weights[0] = conv_utils.convert_kernel(weights[0])
+      weights[1] = conv_utils.convert_kernel(weights[1])
+  return weights
+
+
+def load_weights_from_hdf5_group(f, layers):
+  """Implements topological (order-based) weight loading.
+
+  Arguments:
+      f: A pointer to a HDF5 group.
+      layers: a list of target layers.
+
+  Raises:
+      ValueError: in case of mismatch between provided layers
+          and weights file.
+  """
+  if 'keras_version' in f.attrs:
+    original_keras_version = f.attrs['keras_version'].decode('utf8')
+  else:
+    original_keras_version = '1'
+  if 'backend' in f.attrs:
+    original_backend = f.attrs['backend'].decode('utf8')
+  else:
+    original_backend = None
+
+  filtered_layers = []
+  for layer in layers:
+    weights = layer.weights
+    if weights:
+      filtered_layers.append(layer)
+
+  layer_names = [n.decode('utf8') for n in f.attrs['layer_names']]
+  filtered_layer_names = []
+  for name in layer_names:
+    g = f[name]
+    weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
+    if weight_names:
+      filtered_layer_names.append(name)
+  layer_names = filtered_layer_names
+  if len(layer_names) != len(filtered_layers):
+    raise ValueError('You are trying to load a weight file '
+                     'containing ' + str(len(
+                         layer_names)) + ' layers into a model with ' + str(
+                             len(filtered_layers)) + ' layers.')
+
+  # We batch weight value assignments in a single backend call
+  # which provides a speedup in TensorFlow.
+  weight_value_tuples = []
+  for k, name in enumerate(layer_names):
+    g = f[name]
+    weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
+    weight_values = [g[weight_name] for weight_name in weight_names]
+    layer = filtered_layers[k]
+    symbolic_weights = layer.weights
+    weight_values = preprocess_weights_for_loading(
+        layer, weight_values, original_keras_version, original_backend)
+    if len(weight_values) != len(symbolic_weights):
+      raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
+                       '" in the current model) was found to '
+                       'correspond to layer ' + name + ' in the save file. '
+                       'However the new layer ' + layer.name + ' expects ' +
+                       str(len(symbolic_weights)) +
+                       ' weights, but the saved weights have ' + str(
+                           len(weight_values)) + ' elements.')
+    weight_value_tuples += zip(symbolic_weights, weight_values)
+  K.batch_set_value(weight_value_tuples)
+
+
+def load_weights_from_hdf5_group_by_name(f, layers):
+  """Implements name-based weight loading.
+
+  (instead of topological weight loading).
+
+  Layers that have no matching name are skipped.
+
+  Arguments:
+      f: A pointer to a HDF5 group.
+      layers: a list of target layers.
+
+  Raises:
+      ValueError: in case of mismatch between provided layers
+          and weights file.
+  """
+  if 'keras_version' in f.attrs:
+    original_keras_version = f.attrs['keras_version'].decode('utf8')
+  else:
+    original_keras_version = '1'
+  if 'backend' in f.attrs:
+    original_backend = f.attrs['backend'].decode('utf8')
+  else:
+    original_backend = None
+
+  # New file format.
+  layer_names = [n.decode('utf8') for n in f.attrs['layer_names']]
+
+  # Reverse index of layer name to list of layers with name.
+  index = {}
+  for layer in layers:
+    if layer.name:
+      index.setdefault(layer.name, []).append(layer)
+
+  # We batch weight value assignments in a single backend call
+  # which provides a speedup in TensorFlow.
+  weight_value_tuples = []
+  for k, name in enumerate(layer_names):
+    g = f[name]
+    weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
+    weight_values = [g[weight_name] for weight_name in weight_names]
+
+    for layer in index.get(name, []):
+      symbolic_weights = layer.weights
+      weight_values = preprocess_weights_for_loading(
+          layer, weight_values, original_keras_version, original_backend)
+      if len(weight_values) != len(symbolic_weights):
+        raise ValueError('Layer #' + str(
+            k) + ' (named "' + layer.name + '") expects ' + str(
+                len(symbolic_weights)) + ' weight(s), but the saved weights' +
+                         ' have ' + str(len(weight_values)) + ' element(s).')
+      # Set values.
+      for i in range(len(weight_values)):
+        weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
+  K.batch_set_value(weight_value_tuples)
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology_test.py b/tensorflow/contrib/keras/python/keras/engine/topology_test.py
new file mode 100644
index 0000000000..eb095b14a9
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/engine/topology_test.py
@@ -0,0 +1,512 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#,============================================================================
+"""Tests for layer graphs construction & handling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+try:
+  import yaml  # pylint:disable=g-import-not-at-top
+except ImportError:
+  yaml = None
+
+
+class TopologyConstructionTest(test.TestCase):
+
+  def test_get_updates_for(self):
+    a = keras.layers.Input(shape=(2,))
+    dense_layer = keras.layers.Dense(1)
+    dense_layer.add_update(0, inputs=a)
+    dense_layer.add_update(1, inputs=None)
+
+    self.assertListEqual(dense_layer.get_updates_for(a), [0])
+    self.assertListEqual(dense_layer.get_updates_for(None), [1])
+
+  def test_get_losses_for(self):
+    a = keras.layers.Input(shape=(2,))
+    dense_layer = keras.layers.Dense(1)
+    dense_layer.add_loss(0, inputs=a)
+    dense_layer.add_loss(1, inputs=None)
+
+    self.assertListEqual(dense_layer.get_losses_for(a), [0])
+    self.assertListEqual(dense_layer.get_losses_for(None), [1])
+
+  def test_trainable_weights(self):
+    a = keras.layers.Input(shape=(2,))
+    b = keras.layers.Dense(1)(a)
+    model = keras.models.Model(a, b)
+
+    weights = model.weights
+    self.assertListEqual(model.trainable_weights, weights)
+    self.assertListEqual(model.non_trainable_weights, [])
+
+    model.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+    self.assertListEqual(model.non_trainable_weights, weights)
+
+    model.trainable = True
+    self.assertListEqual(model.trainable_weights, weights)
+    self.assertListEqual(model.non_trainable_weights, [])
+
+    model.layers[1].trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+    self.assertListEqual(model.non_trainable_weights, weights)
+
+    # sequential model
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(1, input_dim=2))
+    weights = model.weights
+
+    self.assertListEqual(model.trainable_weights, weights)
+    self.assertListEqual(model.non_trainable_weights, [])
+
+    model.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+    self.assertListEqual(model.non_trainable_weights, weights)
+
+    model.trainable = True
+    self.assertListEqual(model.trainable_weights, weights)
+    self.assertListEqual(model.non_trainable_weights, [])
+
+    model.layers[0].trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+    self.assertListEqual(model.non_trainable_weights, weights)
+
+  def test_learning_phase(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(32,), name='input_a')
+      b = keras.layers.Input(shape=(32,), name='input_b')
+
+      a_2 = keras.layers.Dense(16, name='dense_1')(a)
+      dp = keras.layers.Dropout(0.5, name='dropout')
+      b_2 = dp(b)
+
+      self.assertFalse(a_2._uses_learning_phase)
+      self.assertTrue(b_2._uses_learning_phase)
+
+      # test merge
+      m = keras.layers.concatenate([a_2, b_2])
+      self.assertTrue(m._uses_learning_phase)
+
+      # Test recursion
+      model = keras.models.Model([a, b], [a_2, b_2])
+      self.assertTrue(model.uses_learning_phase)
+
+      c = keras.layers.Input(shape=(32,), name='input_c')
+      d = keras.layers.Input(shape=(32,), name='input_d')
+
+      c_2, b_2 = model([c, d])
+      self.assertTrue(c_2._uses_learning_phase)
+      self.assertTrue(b_2._uses_learning_phase)
+
+      # try actually running graph
+      fn = keras.backend.function(
+          model.inputs + [keras.backend.learning_phase()], model.outputs)
+      input_a_np = np.random.random((10, 32))
+      input_b_np = np.random.random((10, 32))
+      fn_outputs_no_dp = fn([input_a_np, input_b_np, 0])
+      fn_outputs_dp = fn([input_a_np, input_b_np, 1])
+      # output a: nothing changes
+      self.assertEqual(fn_outputs_no_dp[0].sum(), fn_outputs_dp[0].sum())
+      # output b: dropout applied
+      self.assertNotEqual(fn_outputs_no_dp[1].sum(), fn_outputs_dp[1].sum())
+
+  def test_layer_call_arguments(self):
+    # Test the ability to pass and serialize arguments to `call`.
+    inp = keras.layers.Input(shape=(2,))
+    x = keras.layers.Dense(3)(inp)
+    x = keras.layers.Dropout(0.5)(x, training=True)
+    model = keras.models.Model(inp, x)
+    self.assertFalse(model.uses_learning_phase)
+
+    # Test that argument is kept when applying the model
+    inp2 = keras.layers.Input(shape=(2,))
+    out2 = model(inp2)
+    self.assertFalse(out2._uses_learning_phase)
+
+    # Test that argument is kept after loading a model
+    config = model.get_config()
+    model = keras.models.Model.from_config(config)
+    self.assertFalse(model.uses_learning_phase)
+
+  def test_node_construction(self):
+    # test basics
+    a = keras.layers.Input(shape=(32,), name='input_a')
+    b = keras.layers.Input(shape=(32,), name='input_b')
+
+    self.assertListEqual(a.get_shape().as_list(), [None, 32])
+    a_layer, a_node_index, a_tensor_index = a._keras_history
+    b_layer, _, _ = b._keras_history
+    self.assertEqual(len(a_layer.inbound_nodes), 1)
+    self.assertEqual(a_tensor_index, 0)
+    node = a_layer.inbound_nodes[a_node_index]
+    self.assertEqual(node.outbound_layer, a_layer)
+
+    self.assertListEqual(node.inbound_layers, [])
+    self.assertListEqual(node.input_tensors, [a])
+    self.assertListEqual(node.input_masks, [None])
+    self.assertListEqual(node.input_shapes, [(None, 32)])
+    self.assertListEqual(node.output_tensors, [a])
+    self.assertListEqual(node.output_shapes, [(None, 32)])
+    self.assertListEqual(node.output_masks, [None])
+
+    dense = keras.layers.Dense(16, name='dense_1')
+    a_2 = dense(a)
+    b_2 = dense(b)
+
+    self.assertEqual(len(dense.inbound_nodes), 2)
+    self.assertEqual(len(dense.outbound_nodes), 0)
+    self.assertListEqual(dense.inbound_nodes[0].inbound_layers, [a_layer])
+    self.assertEqual(dense.inbound_nodes[0].outbound_layer, dense)
+    self.assertListEqual(dense.inbound_nodes[1].inbound_layers, [b_layer])
+    self.assertEqual(dense.inbound_nodes[1].outbound_layer, dense)
+    self.assertListEqual(dense.inbound_nodes[0].input_tensors, [a])
+    self.assertListEqual(dense.inbound_nodes[1].input_tensors, [b])
+
+    # test layer properties
+    test_layer = keras.layers.Dense(16, name='test_layer')
+    a_test = test_layer(a)
+    self.assertListEqual(test_layer.kernel.get_shape().as_list(), [32, 16])
+    self.assertEqual(test_layer.input, a)
+    self.assertEqual(test_layer.output, a_test)
+    self.assertEqual(test_layer.input_mask, None)
+    self.assertEqual(test_layer.output_mask, None)
+    self.assertEqual(test_layer.input_shape, (None, 32))
+    self.assertEqual(test_layer.output_shape, (None, 16))
+
+    # pylint: disable=pointless-statement
+    with self.assertRaises(Exception):
+      dense.input
+    with self.assertRaises(Exception):
+      dense.output
+    with self.assertRaises(Exception):
+      dense.input_mask
+    with self.assertRaises(Exception):
+      dense.output_mask
+    # pylint: enable=pointless-statement
+
+    self.assertEqual(dense.get_input_at(0), a)
+    self.assertEqual(dense.get_input_at(1), b)
+    self.assertEqual(dense.get_output_at(0), a_2)
+    self.assertEqual(dense.get_output_at(1), b_2)
+    self.assertEqual(dense.get_input_shape_at(0), (None, 32))
+    self.assertEqual(dense.get_input_shape_at(1), (None, 32))
+    self.assertEqual(dense.get_output_shape_at(0), (None, 16))
+    self.assertEqual(dense.get_output_shape_at(1), (None, 16))
+    self.assertEqual(dense.get_input_mask_at(0), None)
+    self.assertEqual(dense.get_input_mask_at(1), None)
+    self.assertEqual(dense.get_output_mask_at(0), None)
+    self.assertEqual(dense.get_output_mask_at(1), None)
+
+  def test_multi_input_layer(self):
+    with self.test_session():
+      # test multi-input layer
+      a = keras.layers.Input(shape=(32,), name='input_a')
+      b = keras.layers.Input(shape=(32,), name='input_b')
+
+      dense = keras.layers.Dense(16, name='dense_1')
+      a_2 = dense(a)
+      b_2 = dense(b)
+
+      merged = keras.layers.concatenate([a_2, b_2], name='merge')
+      self.assertListEqual(merged.get_shape().as_list(), [None, 16 * 2])
+      merge_layer, merge_node_index, merge_tensor_index = merged._keras_history
+
+      self.assertEqual(merge_node_index, 0)
+      self.assertEqual(merge_tensor_index, 0)
+
+      self.assertEqual(len(merge_layer.inbound_nodes), 1)
+      self.assertEqual(len(merge_layer.outbound_nodes), 0)
+
+      self.assertEqual(len(merge_layer.inbound_nodes[0].input_tensors), 2)
+      self.assertEqual(len(merge_layer.inbound_nodes[0].inbound_layers), 2)
+
+      c = keras.layers.Dense(64, name='dense_2')(merged)
+      d = keras.layers.Dense(5, name='dense_3')(c)
+
+      model = keras.models.Model(inputs=[a, b], outputs=[c, d], name='model')
+      self.assertEqual(len(model.layers), 6)
+      output_shapes = model._compute_output_shape([(None, 32), (None, 32)])
+      self.assertListEqual(output_shapes[0].as_list(), [None, 64])
+      self.assertListEqual(output_shapes[1].as_list(), [None, 5])
+      self.assertListEqual(
+          model.compute_mask([a, b], [None, None]), [None, None])
+
+      # we don't check names of first 2 layers (inputs) because
+      # ordering of same-level layers is not fixed
+      self.assertListEqual([l.name for l in model.layers][2:],
+                           ['dense_1', 'merge', 'dense_2', 'dense_3'])
+      self.assertListEqual([l.name for l in model.input_layers],
+                           ['input_a', 'input_b'])
+      self.assertListEqual([l.name for l in model.output_layers],
+                           ['dense_2', 'dense_3'])
+
+      # actually run model
+      fn = keras.backend.function(model.inputs, model.outputs)
+      input_a_np = np.random.random((10, 32))
+      input_b_np = np.random.random((10, 32))
+      fn_outputs = fn([input_a_np, input_b_np])
+      self.assertListEqual([x.shape for x in fn_outputs], [(10, 64), (10, 5)])
+
+      # test get_source_inputs
+      self.assertListEqual(keras.engine.topology.get_source_inputs(c), [a, b])
+
+      # serialization / deserialization
+      json_config = model.to_json()
+      recreated_model = keras.models.model_from_json(json_config)
+      recreated_model.compile('rmsprop', 'mse')
+
+      self.assertListEqual([l.name for l in recreated_model.layers][2:],
+                           ['dense_1', 'merge', 'dense_2', 'dense_3'])
+      self.assertListEqual([l.name for l in recreated_model.input_layers],
+                           ['input_a', 'input_b'])
+      self.assertListEqual([l.name for l in recreated_model.output_layers],
+                           ['dense_2', 'dense_3'])
+
+      fn = keras.backend.function(recreated_model.inputs,
+                                  recreated_model.outputs)
+      input_a_np = np.random.random((10, 32))
+      input_b_np = np.random.random((10, 32))
+      fn_outputs = fn([input_a_np, input_b_np])
+      self.assertListEqual([x.shape for x in fn_outputs], [(10, 64), (10, 5)])
+
+  def test_recursion(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(32,), name='input_a')
+      b = keras.layers.Input(shape=(32,), name='input_b')
+
+      dense = keras.layers.Dense(16, name='dense_1')
+      a_2 = dense(a)
+      b_2 = dense(b)
+      merged = keras.layers.concatenate([a_2, b_2], name='merge')
+      c = keras.layers.Dense(64, name='dense_2')(merged)
+      d = keras.layers.Dense(5, name='dense_3')(c)
+
+      model = keras.models.Model(inputs=[a, b], outputs=[c, d], name='model')
+
+      e = keras.layers.Input(shape=(32,), name='input_e')
+      f = keras.layers.Input(shape=(32,), name='input_f')
+      g, h = model([e, f])
+
+      self.assertListEqual(g.get_shape().as_list(), c.get_shape().as_list())
+      self.assertListEqual(h.get_shape().as_list(), d.get_shape().as_list())
+
+      # test separate manipulation of different layer outputs
+      i = keras.layers.Dense(7, name='dense_4')(h)
+
+      final_model = keras.models.Model(
+          inputs=[e, f], outputs=[i, g], name='final')
+      self.assertEqual(len(final_model.inputs), 2)
+      self.assertEqual(len(final_model.outputs), 2)
+      self.assertEqual(len(final_model.layers), 4)
+
+      # we don't check names of first 2 layers (inputs) because
+      # ordering of same-level layers is not fixed
+      self.assertListEqual([layer.name for layer in final_model.layers][2:],
+                           ['model', 'dense_4'])
+      self.assertListEqual(
+          model.compute_mask([e, f], [None, None]), [None, None])
+      self.assertListEqual(
+          final_model._compute_output_shape([(10, 32), (10, 32)]), [(10, 7),
+                                                                    (10, 64)])
+
+      # run recursive model
+      fn = keras.backend.function(final_model.inputs, final_model.outputs)
+      input_a_np = np.random.random((10, 32))
+      input_b_np = np.random.random((10, 32))
+      fn_outputs = fn([input_a_np, input_b_np])
+      self.assertListEqual([x.shape for x in fn_outputs], [(10, 7), (10, 64)])
+
+      # test serialization
+      model_config = final_model.get_config()
+      recreated_model = keras.models.Model.from_config(model_config)
+
+      fn = keras.backend.function(recreated_model.inputs,
+                                  recreated_model.outputs)
+      input_a_np = np.random.random((10, 32))
+      input_b_np = np.random.random((10, 32))
+      fn_outputs = fn([input_a_np, input_b_np])
+      self.assertListEqual([x.shape for x in fn_outputs], [(10, 7), (10, 64)])
+
+  def test_multi_input_multi_output_recursion(self):
+    with self.test_session():
+      # test multi-input multi-output
+      a = keras.layers.Input(shape=(32,), name='input_a')
+      b = keras.layers.Input(shape=(32,), name='input_b')
+
+      dense = keras.layers.Dense(16, name='dense_1')
+      a_2 = dense(a)
+      b_2 = dense(b)
+      merged = keras.layers.concatenate([a_2, b_2], name='merge')
+      c = keras.layers.Dense(64, name='dense_2')(merged)
+      d = keras.layers.Dense(5, name='dense_3')(c)
+
+      model = keras.models.Model(inputs=[a, b], outputs=[c, d], name='model')
+
+      j = keras.layers.Input(shape=(32,), name='input_j')
+      k = keras.layers.Input(shape=(32,), name='input_k')
+      _, n = model([j, k])
+
+      o = keras.layers.Input(shape=(32,), name='input_o')
+      p = keras.layers.Input(shape=(32,), name='input_p')
+      q, _ = model([o, p])
+
+      self.assertListEqual(n.get_shape().as_list(), [None, 5])
+      self.assertListEqual(q.get_shape().as_list(), [None, 64])
+      s = keras.layers.concatenate([n, q], name='merge_nq')
+      self.assertListEqual(s.get_shape().as_list(), [None, 64 + 5])
+
+      # test with single output as 1-elem list
+      multi_io_model = keras.models.Model([j, k, o, p], [s])
+
+      fn = keras.backend.function(multi_io_model.inputs, multi_io_model.outputs)
+      fn_outputs = fn([
+          np.random.random((10, 32)), np.random.random((10, 32)),
+          np.random.random((10, 32)), np.random.random((10, 32))
+      ])
+      self.assertListEqual([x.shape for x in fn_outputs], [(10, 69)])
+
+      # test with single output as tensor
+      multi_io_model = keras.models.Model([j, k, o, p], s)
+
+      fn = keras.backend.function(multi_io_model.inputs, multi_io_model.outputs)
+      fn_outputs = fn([
+          np.random.random((10, 32)), np.random.random((10, 32)),
+          np.random.random((10, 32)), np.random.random((10, 32))
+      ])
+      # note that the output of the function will still be a 1-elem list
+      self.assertListEqual([x.shape for x in fn_outputs], [(10, 69)])
+
+      # test serialization
+      model_config = multi_io_model.get_config()
+      recreated_model = keras.models.Model.from_config(model_config)
+
+      fn = keras.backend.function(recreated_model.inputs,
+                                  recreated_model.outputs)
+      fn_outputs = fn([
+          np.random.random((10, 32)), np.random.random((10, 32)),
+          np.random.random((10, 32)), np.random.random((10, 32))
+      ])
+      # note that the output of the function will still be a 1-elem list
+      self.assertListEqual([x.shape for x in fn_outputs], [(10, 69)])
+
+      config = model.get_config()
+      keras.models.Model.from_config(config)
+
+      model.summary()
+      json_str = model.to_json()
+      keras.models.model_from_json(json_str)
+
+      if yaml is not None:
+        yaml_str = model.to_yaml()
+        keras.models.model_from_yaml(yaml_str)
+
+  def test_invalid_graphs(self):
+    a = keras.layers.Input(shape=(32,), name='input_a')
+    b = keras.layers.Input(shape=(32,), name='input_b')
+
+    dense = keras.layers.Dense(16, name='dense_1')
+    a_2 = dense(a)
+    b_2 = dense(b)
+    merged = keras.layers.concatenate([a_2, b_2], name='merge')
+    c = keras.layers.Dense(64, name='dense_2')(merged)
+    d = keras.layers.Dense(5, name='dense_3')(c)
+
+    model = keras.models.Model(inputs=[a, b], outputs=[c, d], name='model')
+
+    # input is not an Input tensor
+    j = keras.layers.Input(shape=(32,), name='input_j')
+    j = keras.layers.Dense(32)(j)
+    k = keras.layers.Input(shape=(32,), name='input_k')
+    m, n = model([j, k])
+
+    with self.assertRaises(Exception):
+      keras.models.Model([j, k], [m, n])
+
+    # disconnected graph
+    j = keras.layers.Input(shape=(32,), name='input_j')
+    k = keras.layers.Input(shape=(32,), name='input_k')
+    m, n = model([j, k])
+    with self.assertRaises(Exception):
+      keras.models.Model([j], [m, n])
+
+    # redundant outputs
+    j = keras.layers.Input(shape=(32,), name='input_j')
+    k = keras.layers.Input(shape=(32,), name='input_k')
+    m, n = model([j, k])
+
+    keras.models.Model([j, k], [m, n, n])
+
+    # redundant inputs
+    j = keras.layers.Input(shape=(32,), name='input_j')
+    k = keras.layers.Input(shape=(32,), name='input_k')
+    m, n = model([j, k])
+    with self.assertRaises(Exception):
+      keras.models.Model([j, k, j], [m, n])
+
+    # i have not idea what I'm doing: garbage as inputs/outputs
+    j = keras.layers.Input(shape=(32,), name='input_j')
+    k = keras.layers.Input(shape=(32,), name='input_k')
+    m, n = model([j, k])
+    with self.assertRaises(Exception):
+      keras.models.Model([j, k], [m, n, 0])
+
+  def test_raw_tf_compatibility(self):
+    # test calling layers/models on TF tensors
+    a = keras.layers.Input(shape=(32,), name='input_a')
+    b = keras.layers.Input(shape=(32,), name='input_b')
+
+    dense = keras.layers.Dense(16, name='dense_1')
+    a_2 = dense(a)
+    b_2 = dense(b)
+    merged = keras.layers.concatenate([a_2, b_2], name='merge')
+    c = keras.layers.Dense(64, name='dense_2')(merged)
+    d = keras.layers.Dense(5, name='dense_3')(c)
+
+    model = keras.models.Model(inputs=[a, b], outputs=[c, d], name='model')
+
+    j = keras.layers.Input(shape=(32,), name='input_j')
+    k = keras.layers.Input(shape=(32,), name='input_k')
+    m, n = model([j, k])
+    tf_model = keras.models.Model([j, k], [m, n])
+
+    j_tf = array_ops.placeholder(dtype=dtypes.float32)
+    k_tf = array_ops.placeholder(dtype=dtypes.float32)
+    m_tf, n_tf = tf_model([j_tf, k_tf])
+    self.assertListEqual(m_tf.get_shape().as_list(), [None, 64])
+    self.assertListEqual(n_tf.get_shape().as_list(), [None, 5])
+
+    # test merge
+    keras.layers.concatenate([j_tf, k_tf], axis=1)
+    keras.layers.add([j_tf, k_tf])
+
+    # test tensor input
+    x = array_ops.placeholder(shape=(None, 2), dtype=dtypes.float32)
+    keras.layers.InputLayer(input_tensor=x)
+
+    x = keras.layers.Input(tensor=x)
+    keras.layers.Dense(2)(x)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/engine/training.py b/tensorflow/contrib/keras/python/keras/engine/training.py
new file mode 100644
index 0000000000..efd437f6f6
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/engine/training.py
@@ -0,0 +1,2119 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras training and evaluation routines.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import multiprocessing
+import threading
+import time
+import warnings
+
+import numpy as np
+import six
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import callbacks as cbks
+from tensorflow.contrib.keras.python.keras import losses
+from tensorflow.contrib.keras.python.keras import metrics as metrics_module
+from tensorflow.contrib.keras.python.keras import optimizers
+from tensorflow.contrib.keras.python.keras.engine.topology import Container
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
+
+
+# pylint: disable=g-import-not-at-top
+try:
+  import queue
+except ImportError:
+  import Queue as queue
+# pylint: enable=g-import-not-at-top
+
+
+def _standardize_input_data(data,
+                            names,
+                            shapes=None,
+                            check_batch_axis=True,
+                            exception_prefix=''):
+  """Normalizes inputs and targets provided by users.
+
+  Users may pass data as a list of arrays, dictionary of arrays,
+  or as a single array. We normalize this to an ordered list of
+  arrays (same order as `names`), while checking that the provided
+  arrays have shapes that match the network's expectations.
+
+  Arguments:
+      data: User-provided input data (polymorphic).
+      names: List of expected array names.
+      shapes: Optional list of expected array shapes.
+      check_batch_axis: Boolean; whether to check that
+          the batch axis of the arrays matches the expected
+          value found in `shapes`.
+      exception_prefix: String prefix used for exception formatting.
+
+  Returns:
+      List of standardized input arrays (one array per model input).
+
+  Raises:
+      ValueError: in case of improperly formatted user-provided data.
+  """
+  if data is None:
+    return [None for _ in range(len(names))]
+  if isinstance(data, dict):
+    arrays = []
+    for name in names:
+      if name not in data:
+        raise ValueError('No data provided for "' + name +
+                         '". Need data for each key in: ' + str(names))
+      arrays.append(data[name])
+  elif isinstance(data, list):
+    if len(data) != len(names):
+      if data and hasattr(data[0], 'shape'):
+        raise ValueError('Error when checking ' + exception_prefix +
+                         ': the list of Numpy arrays '
+                         'that you are passing to your model '
+                         'is not the size the model expected. '
+                         'Expected to see ' + str(len(
+                             names)) + ' arrays but instead got '
+                         'the following list of ' + str(len(
+                             data)) + ' arrays: ' + str(data)[:200] + '...')
+      else:
+        if len(names) == 1:
+          data = [np.asarray(data)]
+        else:
+          raise ValueError('Error when checking ' + exception_prefix +
+                           ': you are passing a list as '
+                           'input to your model, '
+                           'but the model expects '
+                           'a list of ' + str(len(
+                               names)) + ' Numpy arrays instead. '
+                           'The list you passed was: ' + str(data)[:200])
+    arrays = data
+  else:
+    if not hasattr(data, 'shape'):
+      raise TypeError('Error when checking ' + exception_prefix +
+                      ': data should be a Numpy array, '
+                      'or list/dict of Numpy arrays. '
+                      'Found: ' + str(data)[:200] + '...')
+    if len(names) != 1:
+      # Case: model expects multiple inputs but only received
+      # a single Numpy array.
+      raise ValueError('The model expects ' + str(len(names)) +
+                       ' input arrays, but only received one array. '
+                       'Found: array with shape ' + str(data.shape))
+    arrays = [data]
+
+  # Make arrays at least 2D.
+  for i in range(len(names)):
+    array = arrays[i]
+    if len(array.shape) == 1:
+      array = np.expand_dims(array, 1)
+      arrays[i] = array
+
+  # Check shapes compatibility.
+  if shapes:
+    for i in range(len(names)):
+      if shapes[i] is None:
+        continue
+      array = arrays[i]
+      if len(array.shape) != len(shapes[i]):
+        raise ValueError(
+            'Error when checking ' + exception_prefix + ': expected ' + names[
+                i] + ' to have ' + str(len(shapes[i])) +
+            ' dimensions, but got array with shape ' + str(array.shape))
+      for j, (dim, ref_dim) in enumerate(zip(array.shape, shapes[i])):
+        if not j and not check_batch_axis:
+          # skip the first axis
+          continue
+        if ref_dim:
+          if ref_dim != dim:
+            raise ValueError('Error when checking ' + exception_prefix +
+                             ': expected ' + names[i] + ' to have shape ' + str(
+                                 shapes[i]) + ' but got array with shape ' +
+                             str(array.shape))
+  return arrays
+
+
+def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
+  """Maps `sample_weight` or `class_weight` to model outputs.
+
+  Arguments:
+      x_weight: User-provided `sample_weight` or `class_weight` argument.
+      output_names: List of output names (strings) in the model.
+      weight_type: A string used purely for exception printing.
+
+  Returns:
+      A list of `sample_weight` or `class_weight` where there are exactly
+          one element per model output.
+
+  Raises:
+      ValueError: In case of invalid user-provided argument.
+  """
+  if x_weight is None or len(x_weight) == 0:  # pylint: disable=g-explicit-length-test
+    return [None for _ in output_names]
+  if len(output_names) == 1:
+    if isinstance(x_weight, list) and len(x_weight) == 1:
+      return x_weight
+    if isinstance(x_weight, dict) and output_names[0] in x_weight:
+      return [x_weight[output_names[0]]]
+    else:
+      return [x_weight]
+  if isinstance(x_weight, list):
+    if len(x_weight) != len(output_names):
+      raise ValueError('Provided `' + weight_type + '` was a list of ' + str(
+          len(x_weight)) + ' elements, but the model has ' + str(
+              len(output_names)) + ' outputs. '
+                       'You should provide one `' + weight_type + '`'
+                       'array per model output.')
+    return x_weight
+  if isinstance(x_weight, dict):
+    x_weights = []
+    for name in output_names:
+      x_weights.append(x_weight.get(name))
+    return x_weights
+  else:
+    raise TypeError('The model has multiple outputs, so `' + weight_type + '` '
+                    'should be either a list of a dict. '
+                    'Provided `' + weight_type + '` type not understood: ' +
+                    str(x_weight))
+
+
+def _standardize_class_weights(class_weight, output_names):
+  return _standardize_sample_or_class_weights(class_weight, output_names,
+                                              'class_weight')
+
+
+def _standardize_sample_weights(sample_weight, output_names):
+  return _standardize_sample_or_class_weights(sample_weight, output_names,
+                                              'sample_weight')
+
+
+def _check_array_lengths(inputs, targets, weights):
+  """Does user input validation for numpy arrays.
+
+  Arguments:
+      inputs: list of Numpy arrays of inputs.
+      targets: list of Numpy arrays of targets.
+      weights: list of Numpy arrays of sample weights.
+
+  Raises:
+      ValueError: in case of incorrectly formatted data.
+  """
+  x_lengths = [x.shape[0] for x in inputs]
+  y_lengths = [y.shape[0] for y in targets]
+  w_lengths = [w.shape[0] for w in weights]
+  set_x = set(x_lengths)
+  if len(set_x) > 1:
+    raise ValueError('All input arrays (x) should have '
+                     'the same number of samples. Got array shapes: ' + str(
+                         [x.shape for x in inputs]))
+  set_y = set(y_lengths)
+  if len(set_y) > 1:
+    raise ValueError('All target arrays (y) should have '
+                     'the same number of samples. Got array shapes: ' + str(
+                         [y.shape for y in targets]))
+  set_w = set(w_lengths)
+  if len(set_w) > 1:
+    raise ValueError('All sample_weight arrays should have '
+                     'the same number of samples. Got array shapes: ' + str(
+                         [w.shape for w in weights]))
+  if set_x and set_y and list(set_x)[0] != list(set_y)[0]:
+    raise ValueError('Input arrays should have '
+                     'the same number of samples as target arrays. '
+                     'Found ' + str(list(set_x)[0]) + ' input samples '
+                     'and ' + str(list(set_y)[0]) + ' target samples.')
+  if set_y and set_w and list(set_y)[0] != list(set_w)[0]:
+    raise ValueError('Sample_weight arrays should have '
+                     'the same number of samples as target arrays. Got ' + str(
+                         list(set_y)[0]) + ' input samples and ' + str(
+                             list(set_w)[0]) + ' target samples.')
+
+
+def _check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
+  """Does validation on the compatiblity of targets and loss functions.
+
+  This helps prevent users from using loss functions incorrectly.
+
+  Arguments:
+      targets: list of Numpy arrays of targets.
+      loss_fns: list of loss functions.
+      output_shapes: list of shapes of model outputs.
+
+  Raises:
+      ValueError: if a loss function or target array
+          is incompatible with an output.
+  """
+  key_losses = {
+      'mean_square_error', 'binary_crossentropy', 'categorical_crossentropy'
+  }
+  for y, loss, shape in zip(targets, loss_fns, output_shapes):
+    if loss is None:
+      continue
+    if loss.__name__ == 'categorical_crossentropy':
+      if y.shape[-1] == 1:
+        raise ValueError('You are passing a target array of shape ' + str(
+            y.shape) + ' while using as loss `categorical_crossentropy`. '
+                         '`categorical_crossentropy` expects '
+                         'targets to be binary matrices (1s and 0s) '
+                         'of shape (samples, classes). '
+                         'If your targets are integer classes, '
+                         'you can convert them to the expected format via:\n'
+                         '```\n'
+                         'from keras.utils.np_utils import to_categorical\n'
+                         'y_binary = to_categorical(y_int)\n'
+                         '```\n'
+                         '\n'
+                         'Alternatively, you can use the loss function '
+                         '`sparse_categorical_crossentropy` instead, '
+                         'which does expect integer targets.')
+    if loss.__name__ in key_losses:
+      for target_dim, out_dim in zip(y.shape[1:], shape[1:]):
+        if out_dim is not None and target_dim != out_dim:
+          raise ValueError('A target array with shape ' + str(
+              y.shape) + ' was passed for an output of shape ' + str(shape) +
+                           ' while using as loss `' + loss.__name__ + '`. '
+                           'This loss expects '
+                           'targets to have the same shape '
+                           'as the output.')
+
+
+def _collect_metrics(metrics, output_names):
+  """Maps metric functions to model outputs.
+
+  Arguments:
+      metrics: a list or dict of metric functions.
+      output_names: a list of the names (strings) of model outputs.
+
+  Returns:
+      A list (one entry per model output) of lists of metric functions.
+      For instance, if the model has 2 outputs, and for the first output
+      we want to compute "binary_accuracy" and "binary_crossentropy",
+      and just "binary_accuracy" for the second output,
+      the list would look like:
+          `[[binary_accuracy, binary_crossentropy], [binary_accuracy]]`
+
+  Raises:
+      TypeError: if an incorrect type is passed for the `metrics` argument.
+  """
+  if not metrics:
+    return [[] for _ in output_names]
+  if isinstance(metrics, list):
+    # we then apply all metrics to all outputs.
+    return [copy.copy(metrics) for _ in output_names]
+  elif isinstance(metrics, dict):
+    nested_metrics = []
+    for name in output_names:
+      output_metrics = metrics.get(name, [])
+      if not isinstance(output_metrics, list):
+        output_metrics = [output_metrics]
+      nested_metrics.append(output_metrics)
+    return nested_metrics
+  else:
+    raise TypeError('Type of `metrics` argument not understood. '
+                    'Expected a list or dictionary, found: ' + str(metrics))
+
+
+def _batch_shuffle(index_array, batch_size):
+  """Shuffles an array in a batch-wise fashion.
+
+  Useful for shuffling HDF5 arrays
+  (where one cannot access arbitrary indices).
+
+  Arguments:
+      index_array: array of indices to be shuffled.
+      batch_size: integer.
+
+  Returns:
+      The `index_array` array, shuffled in a batch-wise fashion.
+  """
+  batch_count = int(len(index_array) / batch_size)
+  # to reshape we need to be cleanly divisible by batch size
+  # we stash extra items and reappend them after shuffling
+  last_batch = index_array[batch_count * batch_size:]
+  index_array = index_array[:batch_count * batch_size]
+  index_array = index_array.reshape((batch_count, batch_size))
+  np.random.shuffle(index_array)
+  index_array = index_array.flatten()
+  return np.append(index_array, last_batch)
+
+
+def _make_batches(size, batch_size):
+  """Returns a list of batch indices (tuples of indices).
+
+  Arguments:
+      size: Integer, total size of the data to slice into batches.
+      batch_size: Integer, batch size.
+
+  Returns:
+      A list of tuples of array indices.
+  """
+  num_batches = int(np.ceil(size / float(batch_size)))
+  return [(i * batch_size, min(size, (i + 1) * batch_size))
+          for i in range(0, num_batches)]
+
+
+def _slice_arrays(arrays, start=None, stop=None):
+  """Slice an array or list of arrays.
+
+  This takes an array-like, or a list of
+  array-likes, and outputs:
+      - arrays[start:stop] if `arrays` is an array-like
+      - [x[start:stop] for x in arrays] if `arrays` is a list
+
+  Can also work on list/array of indices: `_slice_arrays(x, indices)`
+
+  Arguments:
+      arrays: Single array or list of arrays.
+      start: can be an integer index (start index)
+          or a list/array of indices
+      stop: integer (stop index); should be None if
+          `start` was a list.
+
+  Returns:
+      A slice of the array(s).
+  """
+  if isinstance(arrays, list):
+    if hasattr(start, '__len__'):
+      # hdf5 datasets only support list objects as indices
+      if hasattr(start, 'shape'):
+        start = start.tolist()
+      return [x[start] for x in arrays]
+    else:
+      return [x[start:stop] for x in arrays]
+  else:
+    if hasattr(start, '__len__'):
+      if hasattr(start, 'shape'):
+        start = start.tolist()
+      return arrays[start]
+    else:
+      return arrays[start:stop]
+
+
+def _weighted_masked_objective(fn):
+  """Adds support for masking and sample-weighting to an objective function.
+
+  It transforms an objective function `fn(y_true, y_pred)`
+  into a sample-weighted, cost-masked objective function
+  `fn(y_true, y_pred, weights, mask)`.
+
+  Arguments:
+      fn: The objective function to wrap,
+          with signature `fn(y_true, y_pred)`.
+
+  Returns:
+      A function with signature `fn(y_true, y_pred, weights, mask)`.
+  """
+  if fn is None:
+    return None
+
+  def weighted(y_true, y_pred, weights, mask=None):
+    """Wrapper function.
+
+    Arguments:
+        y_true: `y_true` argument of `fn`.
+        y_pred: `y_pred` argument of `fn`.
+        weights: Weights tensor.
+        mask: Mask tensor.
+
+    Returns:
+        Scalar tensor.
+    """
+    # score_array has ndim >= 2
+    score_array = fn(y_true, y_pred)
+    if mask is not None:
+      mask = K.cast(mask, K.floatx())
+      # mask should have the same shape as score_array
+      score_array *= mask
+      #  the loss per batch should be proportional
+      #  to the number of unmasked samples.
+      score_array /= K.mean(mask)
+
+    # reduce score_array to same ndim as weight array
+    ndim = K.ndim(score_array)
+    weight_ndim = K.ndim(weights)
+    score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
+
+    # apply sample weighting
+    if weights is not None:
+      score_array *= weights
+      score_array /= K.mean(K.cast(K.not_equal(weights, 0), K.floatx()))
+    return K.mean(score_array)
+
+  return weighted
+
+
+def _masked_objective(fn):
+  """Adds support for masking to an objective function.
+
+  It transforms an objective function `fn(y_true, y_pred)`
+  into a cost-masked objective function
+  `fn(y_true, y_pred, mask)`.
+
+  Arguments:
+      fn: The objective function to wrap,
+          with signature `fn(y_true, y_pred)`.
+
+  Returns:
+      A function with signature `fn(y_true, y_pred, mask)`.
+  """
+
+  def masked(y_true, y_pred, mask=None):
+    """Wrapper function.
+
+    Arguments:
+        y_true: `y_true` argument of `fn`.
+        y_pred: `y_pred` argument of `fn`.
+        mask: Mask tensor.
+
+    Returns:
+        Scalar tensor.
+    """
+    # score_array has ndim >= 2
+    score_array = fn(y_true, y_pred)
+    if mask is not None:
+      mask = K.cast(mask, K.floatx())
+      # mask should have the same shape as score_array
+      score_array *= mask
+      #  the loss per batch should be proportional
+      #  to the number of unmasked samples.
+      score_array /= K.mean(mask)
+
+    return K.mean(score_array)
+
+  return masked
+
+
+def _standardize_weights(y,
+                         sample_weight=None,
+                         class_weight=None,
+                         sample_weight_mode=None):
+  """Performs sample weight validation and standardization.
+
+  Everything gets normalized to a single sample-wise (or timestep-wise)
+  weight array.
+
+  Arguments:
+      y: Numpy array of model targets to be weighted.
+      sample_weight: User-provided `sample_weight` argument.
+      class_weight: User-provided `class_weight` argument.
+      sample_weight_mode: One of `None` or `"temporal"`.
+          `"temporal"` indicated that we expect 2D weight data
+          that will be applied to the last 2 dimensions of
+          the targets (i.e. we are weighting timesteps, not samples).
+
+  Returns:
+      A numpy array of target weights, one entry per sample to weight.
+
+  Raises:
+      ValueError: In case of invalid user-provided arguments.
+  """
+  if sample_weight_mode is not None:
+    if sample_weight_mode != 'temporal':
+      raise ValueError('"sample_weight_mode '
+                       'should be None or "temporal". '
+                       'Found: ' + str(sample_weight_mode))
+    if len(y.shape) < 3:
+      raise ValueError('Found a sample_weight array for '
+                       'an input with shape ' + str(y.shape) + '. '
+                       'Timestep-wise sample weighting (use of '
+                       'sample_weight_mode="temporal") is restricted to '
+                       'outputs that are at least 3D, i.e. that have '
+                       'a time dimension.')
+    if sample_weight is not None and len(sample_weight.shape) != 2:
+      raise ValueError('Found a sample_weight array with shape ' + str(
+          sample_weight.shape) + '. '
+                       'In order to use timestep-wise sample weighting, '
+                       'you should pass a 2D sample_weight array.')
+  else:
+    if sample_weight is not None and len(sample_weight.shape) != 1:
+      raise ValueError('Found a sample_weight array with shape ' + str(
+          sample_weight.shape) + '. '
+                       'In order to use timestep-wise sample weights, '
+                       'you should specify '
+                       'sample_weight_mode="temporal" '
+                       'in compile(). If you just mean to use '
+                       'sample-wise weights, make sure your '
+                       'sample_weight array is 1D.')
+
+  if sample_weight is not None:
+    if len(sample_weight.shape) > len(y.shape):
+      raise ValueError('Found a sample_weight with shape' + str(
+          sample_weight.shape) + '.'
+                       'Expected sample_weight with rank '
+                       'less than or equal to ' + str(len(y.shape)))
+
+    if y.shape[:sample_weight.ndim] != sample_weight.shape:
+      raise ValueError('Found a sample_weight array with shape ' + str(
+          sample_weight.shape) + ' for an input with shape ' + str(y.shape) +
+                       '. '
+                       'sample_weight cannot be broadcast.')
+    return sample_weight
+  elif isinstance(class_weight, dict):
+    if len(y.shape) > 2:
+      raise ValueError('class_weight not supported for '
+                       '3+ dimensional targets.')
+    if y.shape[1] > 1:
+      y_classes = y.argmax(axis=1)
+    elif y.shape[1] == 1:
+      y_classes = np.reshape(y, y.shape[0])
+    else:
+      y_classes = y
+    weights = np.asarray([class_weight[cls] for cls in y_classes])
+    return weights
+  else:
+    if sample_weight_mode is None:
+      return np.ones((y.shape[0],), dtype=K.floatx())
+    else:
+      return np.ones((y.shape[0], y.shape[1]), dtype=K.floatx())
+
+
+class GeneratorEnqueuer(object):
+  """Builds a queue out of a data generator.
+
+  Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
+
+  Arguments:
+      generator: a generator function which endlessly yields data
+      pickle_safe: use multiprocessing if True, otherwise threading
+  """
+
+  def __init__(self, generator, pickle_safe=False):
+    self._generator = generator
+    self._pickle_safe = pickle_safe
+    self._threads = []
+    self._stop_event = None
+    self.queue = None
+
+  def start(self, workers=1, max_q_size=10, wait_time=0.05):
+    """Kicks off threads which add data from the generator into the queue.
+
+    Arguments:
+        workers: number of worker threads
+        max_q_size: queue size (when full, threads could block on put())
+        wait_time: time to sleep in-between calls to put()
+    """
+
+    def data_generator_task():
+      while not self._stop_event.is_set():
+        try:
+          if self._pickle_safe or self.queue.qsize() < max_q_size:
+            generator_output = next(self._generator)
+            self.queue.put(generator_output)
+          else:
+            time.sleep(wait_time)
+        except Exception:
+          self._stop_event.set()
+          raise
+
+    try:
+      if self._pickle_safe:
+        self.queue = multiprocessing.Queue(maxsize=max_q_size)
+        self._stop_event = multiprocessing.Event()
+      else:
+        self.queue = queue.Queue()
+        self._stop_event = threading.Event()
+
+      for _ in range(workers):
+        if self._pickle_safe:
+          # Reset random seed else all children processes
+          # share the same seed
+          np.random.seed()
+          thread = multiprocessing.Process(target=data_generator_task)
+          thread.daemon = True
+        else:
+          thread = threading.Thread(target=data_generator_task)
+        self._threads.append(thread)
+        thread.start()
+    except:
+      self.stop()
+      raise
+
+  def is_running(self):
+    return self._stop_event is not None and not self._stop_event.is_set()
+
+  def stop(self, timeout=None):
+    """Stop running threads and wait for them to exit, if necessary.
+
+    Should be called by the same thread which called start().
+
+    Arguments:
+        timeout: maximum time to wait on thread.join()
+    """
+    if self.is_running():
+      self._stop_event.set()
+
+    for thread in self._threads:
+      if thread.is_alive():
+        if self._pickle_safe:
+          thread.terminate()
+        else:
+          thread.join(timeout)
+
+    if self._pickle_safe:
+      if self.queue is not None:
+        self.queue.close()
+
+    self._threads = []
+    self._stop_event = None
+    self.queue = None
+
+
+class Model(Container):
+  """The `Model` class adds training & evaluation routines to a `Container`.
+  """
+
+  def compile(self,
+              optimizer,
+              loss,
+              metrics=None,
+              loss_weights=None,
+              sample_weight_mode=None):
+    """Configures the model for training.
+
+    Arguments:
+        optimizer: str (name of optimizer) or optimizer object.
+            See [optimizers](/optimizers).
+        loss: str (name of objective function) or objective function.
+            See [losses](/losses).
+            If the model has multiple outputs, you can use a different loss
+            on each output by passing a dictionary or a list of losses.
+        metrics: list of metrics to be evaluated by the model
+            during training and testing.
+            Typically you will use `metrics=['accuracy']`.
+            To specify different metrics for different outputs of a
+            multi-output model, you could also pass a dictionary,
+            such as `metrics={'output_a': 'accuracy'}`.
+        loss_weights: Optional list or dictionary specifying scalar
+            coefficients (Python floats) to weight the loss contributions
+            of different model outputs.
+            If a list, it is expected to have a 1:1 mapping
+            to the model's outputs. If a tensor, it is expected to map
+            output names (strings) to scalar coefficients.
+        sample_weight_mode: if you need to do timestep-wise
+            sample weighting (2D weights), set this to `"temporal"`.
+            `None` defaults to sample-wise weights (1D).
+            If the model has multiple outputs, you can use a different
+            `sample_weight_mode` on each output by passing a
+            dictionary or a list of modes.
+
+    Raises:
+        ValueError: In case of invalid arguments for
+            `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
+        RuntimeError: If the model has no loss to optimize.
+    """
+    loss = loss or {}
+    self.optimizer = optimizers.get(optimizer)
+    self.sample_weight_mode = sample_weight_mode
+    self.loss = loss
+    self.loss_weights = loss_weights
+
+    # Prepare loss functions.
+    if isinstance(loss, dict):
+      for name in loss:
+        if name not in self.output_names:
+          raise ValueError('Unknown entry in loss '
+                           'dictionary: "' + name + '". '
+                           'Only expected the following keys: ' + str(
+                               self.output_names))
+      loss_functions = []
+      for name in self.output_names:
+        if name not in loss:
+          warnings.warn('Output "' + name + '" missing from loss dictionary. '
+                        'We assume this was done on purpose, '
+                        'and we will not be expecting '
+                        'any data to be passed to "' + name +
+                        '" during training.')
+        loss_functions.append(losses.get(loss.get(name)))
+    elif isinstance(loss, list):
+      if len(loss) != len(self.outputs):
+        raise ValueError('When passing a list as loss, '
+                         'it should have one entry per model outputs. '
+                         'The model has ' + str(len(self.outputs)) +
+                         ' outputs, but you passed loss=' + str(loss))
+      loss_functions = [losses.get(l) for l in loss]
+    else:
+      loss_function = losses.get(loss)
+      loss_functions = [loss_function for _ in range(len(self.outputs))]
+    self.loss_functions = loss_functions
+    weighted_losses = [_weighted_masked_objective(fn) for fn in loss_functions]
+    skip_indices = []
+    self._feed_outputs = []
+    self._feed_output_names = []
+    self._feed_output_shapes = []
+    self._feed_loss_fns = []
+    for i in range(len(weighted_losses)):
+      if weighted_losses[i] is None:
+        skip_indices.append(i)
+      else:
+        self._feed_outputs.append(self.outputs[i])
+        self._feed_output_names.append(self.output_names[i])
+        self._feed_output_shapes.append(self.internal_output_shapes[i])
+        self._feed_loss_fns.append(self.loss_functions[i])
+
+    # Prepare output masks.
+    masks = self.compute_mask(self.inputs, mask=None)
+    if masks is None:
+      masks = [None for _ in self.outputs]
+    if not isinstance(masks, list):
+      masks = [masks]
+
+    # Prepare loss weights.
+    if loss_weights is None:
+      loss_weights_list = [1. for _ in range(len(self.outputs))]
+    elif isinstance(loss_weights, dict):
+      for name in loss_weights:
+        if name not in self.output_names:
+          raise ValueError('Unknown entry in loss_weights '
+                           'dictionary: "' + name + '". '
+                           'Only expected the following keys: ' + str(
+                               self.output_names))
+      loss_weights_list = []
+      for name in self.output_names:
+        loss_weights_list.append(loss_weights.get(name, 1.))
+    elif isinstance(loss_weights, list):
+      if len(loss_weights) != len(self.outputs):
+        raise ValueError('When passing a list as loss_weights, '
+                         'it should have one entry per model outputs. '
+                         'The model has ' + str(len(self.outputs)) +
+                         ' outputs, but you passed loss_weights=' + str(
+                             loss_weights))
+      loss_weights_list = loss_weights
+    else:
+      raise TypeError('Could not interpret loss_weights argument: ' + str(
+          loss_weights) + ' - expected a list of dicts.')
+
+    # Prepare sample weights.
+    sample_weights = []
+    sample_weight_modes = []
+    if isinstance(sample_weight_mode, dict):
+      for name in sample_weight_mode:
+        if name not in self.output_names:
+          raise ValueError('Unknown entry in '
+                           'sample_weight_mode dictionary: "' + name + '". '
+                           'Only expected the following keys: ' + str(
+                               self.output_names))
+      for i, name in enumerate(self.output_names):
+        if i in skip_indices:
+          weight = None
+          sample_weight_modes.append(None)
+        else:
+          if name not in sample_weight_mode:
+            raise ValueError('Output "' + name +
+                             '" missing from sample_weight_modes '
+                             'dictionary')
+          if sample_weight_mode.get(name) == 'temporal':
+            weight = K.placeholder(ndim=2, name=name + '_sample_weights')
+            sample_weight_modes.append('temporal')
+          else:
+            weight = K.placeholder(ndim=1, name=name + '_sample_weights')
+            sample_weight_modes.append(None)
+        sample_weights.append(weight)
+    elif isinstance(sample_weight_mode, list):
+      if len(sample_weight_mode) != len(self.outputs):
+        raise ValueError('When passing a list as sample_weight_mode, '
+                         'it should have one entry per model outputs. '
+                         'The model has ' + str(len(self.outputs)) +
+                         ' outputs, but you passed '
+                         'sample_weight_mode=' + str(sample_weight_mode))
+      for i in range(len(self.output_names)):
+        if i in skip_indices:
+          weight = None
+          sample_weight_modes.append(None)
+        else:
+          mode = sample_weight_mode[i]
+          name = self.output_names[i]
+          if mode == 'temporal':
+            weight = K.placeholder(ndim=2, name=name + '_sample_weights')
+            sample_weight_modes.append('temporal')
+          else:
+            weight = K.placeholder(ndim=1, name=name + '_sample_weights')
+            sample_weight_modes.append(None)
+        sample_weights.append(weight)
+    else:
+      for i, name in enumerate(self.output_names):
+        if i in skip_indices:
+          sample_weight_modes.append(None)
+          sample_weights.append(None)
+        else:
+          if sample_weight_mode == 'temporal':
+            sample_weights.append(
+                K.placeholder(ndim=2, name=name + '_sample_weights'))
+            sample_weight_modes.append('temporal')
+          else:
+            sample_weights.append(
+                K.placeholder(ndim=1, name=name + '_sample_weights'))
+            sample_weight_modes.append(None)
+    self.sample_weight_modes = sample_weight_modes
+    self._feed_sample_weight_modes = []
+    for i in range(len(self.outputs)):
+      if i not in skip_indices:
+        self._feed_sample_weight_modes.append(self.sample_weight_modes[i])
+
+    # Prepare targets of model.
+    self.targets = []
+    self._feed_targets = []
+    for i in range(len(self.outputs)):
+      if i in skip_indices:
+        self.targets.append(None)
+      else:
+        shape = self.internal_output_shapes[i]
+        name = self.output_names[i]
+        target = K.placeholder(
+            ndim=len(shape),
+            name=name + '_target',
+            sparse=K.is_sparse(self.outputs[i]),
+            dtype=K.dtype(self.outputs[i]))
+        self.targets.append(target)
+        self._feed_targets.append(target)
+
+    # Prepare metrics.
+    self.metrics = metrics
+    self.metrics_names = ['loss']
+    self.metrics_tensors = []
+
+    # Compute total loss.
+    total_loss = None
+    for i in range(len(self.outputs)):
+      if i in skip_indices:
+        continue
+      y_true = self.targets[i]
+      y_pred = self.outputs[i]
+      weighted_loss = weighted_losses[i]
+      sample_weight = sample_weights[i]
+      mask = masks[i]
+      loss_weight = loss_weights_list[i]
+      output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
+      if len(self.outputs) > 1:
+        self.metrics_tensors.append(output_loss)
+        self.metrics_names.append(self.output_names[i] + '_loss')
+      if total_loss is None:
+        total_loss = loss_weight * output_loss
+      else:
+        total_loss += loss_weight * output_loss
+    if total_loss is None:
+      if not self.losses:
+        raise RuntimeError('The model cannot be compiled '
+                           'because it has no loss to optimize.')
+      else:
+        total_loss = 0.
+
+    # Add regularization penalties
+    # and other layer-specific losses.
+    for loss_tensor in self.losses:
+      total_loss += loss_tensor
+
+    # List of same size as output_names.
+    # contains tuples (metrics for output, names of metrics).
+    nested_metrics = _collect_metrics(metrics, self.output_names)
+
+    def append_metric(layer_num, metric_name, metric_tensor):
+      """Helper function used in loop below."""
+      if len(self.output_names) > 1:
+        metric_name = self.output_layers[layer_num].name + '_' + metric_name
+      self.metrics_names.append(metric_name)
+      self.metrics_tensors.append(metric_tensor)
+
+    for i in range(len(self.outputs)):
+      if i in skip_indices:
+        continue
+      y_true = self.targets[i]
+      y_pred = self.outputs[i]
+      output_metrics = nested_metrics[i]
+      for metric in output_metrics:
+        if metric == 'accuracy' or metric == 'acc':
+          # custom handling of accuracy
+          # (because of class mode duality)
+          output_shape = self.internal_output_shapes[i]
+          acc_fn = None
+          if output_shape[-1] == 1 or self.loss_functions[
+              i] == losses.binary_crossentropy:
+            # case: binary accuracy
+            acc_fn = metrics_module.binary_accuracy
+          elif self.loss_functions[i] == losses.sparse_categorical_crossentropy:
+            # case: categorical accuracy with sparse targets
+            acc_fn = metrics_module.sparse_categorical_accuracy
+          else:
+            acc_fn = metrics_module.categorical_accuracy
+
+          masked_fn = _masked_objective(acc_fn)
+          append_metric(i, 'acc', masked_fn(y_true, y_pred, mask=masks[i]))
+        else:
+          metric_fn = metrics_module.get(metric)
+          masked_metric_fn = _masked_objective(metric_fn)
+          metric_result = masked_metric_fn(y_true, y_pred, mask=masks[i])
+          metric_result = {metric_fn.__name__: metric_result}
+          for name, tensor in six.iteritems(metric_result):
+            append_metric(i, name, tensor)
+
+    # Prepare gradient updates and state updates.
+    self.total_loss = total_loss
+    self.sample_weights = sample_weights
+    self._feed_sample_weights = []
+    for i in range(len(self.sample_weights)):
+      if i not in skip_indices:
+        self._feed_sample_weights.append(sample_weights[i])
+
+    # Functions for train, test and predict will
+    # be compiled lazily when required.
+    # This saves time when the user is not using all functions.
+    self.train_function = None
+    self.test_function = None
+    self.predict_function = None
+
+    # Collected trainable weights and sort them deterministically.
+    trainable_weights = self.trainable_weights
+    # Sort weights by name.
+    if trainable_weights:
+      trainable_weights.sort(key=lambda x: x.name)
+    self._collected_trainable_weights = trainable_weights
+
+  def _make_train_function(self):
+    if not hasattr(self, 'train_function'):
+      raise RuntimeError('You must compile your model before using it.')
+    if self.train_function is None:
+      inputs = (
+          self._feed_inputs + self._feed_targets + self._feed_sample_weights)
+      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+        inputs += [K.learning_phase()]
+
+      training_updates = self.optimizer.get_updates(
+          self._collected_trainable_weights, self.constraints, self.total_loss)
+      updates = self.updates + training_updates
+      # Gets loss and metrics. Updates weights at each call.
+      self.train_function = K.function(
+          inputs, [self.total_loss] + self.metrics_tensors, updates=updates)
+
+  def _make_test_function(self):
+    if not hasattr(self, 'test_function'):
+      raise RuntimeError('You must compile your model before using it.')
+    if self.test_function is None:
+      inputs = (
+          self._feed_inputs + self._feed_targets + self._feed_sample_weights)
+      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+        inputs += [K.learning_phase()]
+      # Return loss and metrics, no gradient updates.
+      # Does update the network states.
+      self.test_function = K.function(
+          inputs, [self.total_loss] + self.metrics_tensors,
+          updates=self.state_updates)
+
+  def _make_predict_function(self):
+    if not hasattr(self, 'predict_function'):
+      self.predict_function = None
+    if self.predict_function is None:
+      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+        inputs = self._feed_inputs + [K.learning_phase()]
+      else:
+        inputs = self._feed_inputs
+      # Gets network outputs. Does not update weights.
+      # Does update the network states.
+      self.predict_function = K.function(
+          inputs, self.outputs, updates=self.state_updates)
+
+  def _fit_loop(self,
+                f,
+                ins,
+                out_labels=None,
+                batch_size=32,
+                epochs=100,
+                verbose=1,
+                callbacks=None,
+                val_f=None,
+                val_ins=None,
+                shuffle=True,
+                callback_metrics=None,
+                initial_epoch=0):
+    """Abstract fit function for `f(ins)`.
+
+    Assume that f returns a list, labeled by out_labels.
+
+    Arguments:
+        f: Keras function returning a list of tensors
+        ins: list of tensors to be fed to `f`
+        out_labels: list of strings, display names of
+            the outputs of `f`
+        batch_size: integer batch size
+        epochs: number of times to iterate over the data
+        verbose: verbosity mode, 0, 1 or 2
+        callbacks: list of callbacks to be called during training
+        val_f: Keras function to call for validation
+        val_ins: list of tensors to be fed to `val_f`
+        shuffle: whether to shuffle the data at the beginning of each epoch
+        callback_metrics: list of strings, the display names of the metrics
+            passed to the callbacks. They should be the
+            concatenation of list the display names of the outputs of
+             `f` and the list of display names of the outputs of `f_val`.
+        initial_epoch: epoch at which to start training
+            (useful for resuming a previous training run)
+
+    Returns:
+        `History` object.
+    """
+    do_validation = False
+    if val_f and val_ins:
+      do_validation = True
+      if verbose:
+        print('Train on %d samples, validate on %d samples' %
+              (ins[0].shape[0], val_ins[0].shape[0]))
+
+    if ins and hasattr(ins[0], 'shape'):
+      num_train_samples = ins[0].shape[0]
+    else:
+      # May happen if we are running `fit` without Numpy input data,
+      # i.e. if all inputs to the models are data tensors
+      # instead of placeholders.
+      # In that case we will run `fit` over a single batch.
+      num_train_samples = batch_size
+      verbose = 2
+    index_array = np.arange(num_train_samples)
+
+    self.history = cbks.History()
+    callbacks = [cbks.BaseLogger()] + (callbacks or []) + [self.history]
+    if verbose:
+      callbacks += [cbks.ProgbarLogger()]
+    callbacks = cbks.CallbackList(callbacks)
+    out_labels = out_labels or []
+
+    # it's possible to callback a different model than self
+    # (used by Sequential models)
+    if hasattr(self, 'callback_model') and self.callback_model:
+      callback_model = self.callback_model
+    else:
+      callback_model = self
+
+    callbacks.set_model(callback_model)
+    callbacks.set_params({
+        'batch_size': batch_size,
+        'epochs': epochs,
+        'samples': num_train_samples,
+        'verbose': verbose,
+        'do_validation': do_validation,
+        'metrics': callback_metrics or [],
+    })
+    callbacks.on_train_begin()
+    callback_model.stop_training = False
+    for cbk in callbacks:
+      cbk.validation_data = val_ins
+
+    for epoch in range(initial_epoch, epochs):
+      callbacks.on_epoch_begin(epoch)
+      if shuffle == 'batch':
+        index_array = _batch_shuffle(index_array, batch_size)
+      elif shuffle:
+        np.random.shuffle(index_array)
+
+      batches = _make_batches(num_train_samples, batch_size)
+      epoch_logs = {}
+      for batch_index, (batch_start, batch_end) in enumerate(batches):
+        batch_ids = index_array[batch_start:batch_end]
+        try:
+          if isinstance(ins[-1], float):
+            # do not slice the training phase flag
+            ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+          else:
+            ins_batch = _slice_arrays(ins, batch_ids)
+        except TypeError:
+          raise TypeError('TypeError while preparing batch. '
+                          'If using HDF5 input data, '
+                          'pass shuffle="batch".')
+        batch_logs = {}
+        batch_logs['batch'] = batch_index
+        batch_logs['size'] = len(batch_ids)
+        callbacks.on_batch_begin(batch_index, batch_logs)
+        outs = f(ins_batch)
+        if not isinstance(outs, list):
+          outs = [outs]
+        for l, o in zip(out_labels, outs):
+          batch_logs[l] = o
+
+        callbacks.on_batch_end(batch_index, batch_logs)
+
+        if batch_index == len(batches) - 1:  # last batch
+          # validation
+          if do_validation:
+            # replace with self._evaluate
+            val_outs = self._test_loop(
+                val_f, val_ins, batch_size=batch_size, verbose=0)
+            if not isinstance(val_outs, list):
+              val_outs = [val_outs]
+            # same labels assumed
+            for l, o in zip(out_labels, val_outs):
+              epoch_logs['val_' + l] = o
+      callbacks.on_epoch_end(epoch, epoch_logs)
+      if callback_model.stop_training:
+        break
+    callbacks.on_train_end()
+    return self.history
+
+  def _predict_loop(self, f, ins, batch_size=32, verbose=0):
+    """Abstract method to loop over some data in batches.
+
+    Arguments:
+        f: Keras function returning a list of tensors.
+        ins: list of tensors to be fed to `f`.
+        batch_size: integer batch size.
+        verbose: verbosity mode.
+
+    Returns:
+        Array of predictions (if the model has a single output)
+        or list of arrays of predictions
+        (if the model has multiple outputs).
+    """
+    if ins and hasattr(ins[0], 'shape'):
+      samples = ins[0].shape[0]
+    else:
+      # May happen if we are running `predict` without Numpy input data,
+      # i.e. if all inputs to the models are data tensors
+      # instead of placeholders.
+      # In that case we will run `predict` over a single batch.
+      samples = batch_size
+      verbose = 2
+    outs = []
+    if verbose == 1:
+      progbar = Progbar(target=samples)
+    batches = _make_batches(samples, batch_size)
+    index_array = np.arange(samples)
+    for batch_index, (batch_start, batch_end) in enumerate(batches):
+      batch_ids = index_array[batch_start:batch_end]
+      if ins and isinstance(ins[-1], float):
+        # do not slice the training phase flag
+        ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+      else:
+        ins_batch = _slice_arrays(ins, batch_ids)
+
+      batch_outs = f(ins_batch)
+      if not isinstance(batch_outs, list):
+        batch_outs = [batch_outs]
+      if batch_index == 0:
+        for batch_out in batch_outs:
+          shape = (samples,) + batch_out.shape[1:]
+          outs.append(np.zeros(shape, dtype=K.floatx()))
+
+      for i, batch_out in enumerate(batch_outs):
+        outs[i][batch_start:batch_end] = batch_out
+      if verbose == 1:
+        progbar.update(batch_end)
+    if len(outs) == 1:
+      return outs[0]
+    return outs
+
+  def _test_loop(self, f, ins, batch_size=32, verbose=0):
+    """Abstract method to loop over some data in batches.
+
+    Arguments:
+        f: Keras function returning a list of tensors.
+        ins: list of tensors to be fed to `f`.
+        batch_size: integer batch size.
+        verbose: verbosity mode.
+
+    Returns:
+        Scalar loss (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+    """
+    if ins and hasattr(ins[0], 'shape'):
+      samples = ins[0].shape[0]
+    else:
+      # May happen if we are running `evaluate` without Numpy input data,
+      # i.e. if all inputs to the models are data tensors
+      # instead of placeholders.
+      # In that case we will run `evaluate` over a single batch.
+      samples = batch_size
+      verbose = 2
+
+    outs = []
+    if verbose == 1:
+      progbar = Progbar(target=samples)
+    batches = _make_batches(samples, batch_size)
+    index_array = np.arange(samples)
+    for batch_index, (batch_start, batch_end) in enumerate(batches):
+      batch_ids = index_array[batch_start:batch_end]
+      if isinstance(ins[-1], float):
+        # do not slice the training phase flag
+        ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+      else:
+        ins_batch = _slice_arrays(ins, batch_ids)
+
+      batch_outs = f(ins_batch)
+      if isinstance(batch_outs, list):
+        if batch_index == 0:
+          for batch_out in enumerate(batch_outs):
+            outs.append(0.)
+        for i, batch_out in enumerate(batch_outs):
+          outs[i] += batch_out * len(batch_ids)
+      else:
+        if batch_index == 0:
+          outs.append(0.)
+        outs[0] += batch_outs * len(batch_ids)
+
+      if verbose == 1:
+        progbar.update(batch_end)
+    for i in range(len(outs)):
+      outs[i] /= samples
+    if len(outs) == 1:
+      return outs[0]
+    return outs
+
+  def _standardize_user_data(self,
+                             x,
+                             y,
+                             sample_weight=None,
+                             class_weight=None,
+                             check_batch_axis=True,
+                             batch_size=None):
+    if not hasattr(self, 'optimizer'):
+      raise RuntimeError('You must compile a model before '
+                         'training/testing. '
+                         'Use `model.compile(optimizer, loss)`.')
+
+    output_shapes = []
+    for output_shape, loss_fn in zip(self._feed_output_shapes,
+                                     self._feed_loss_fns):
+      if loss_fn.__name__ == 'sparse_categorical_crossentropy':
+        output_shapes.append(output_shape[:-1] + (1,))
+      elif getattr(losses, loss_fn.__name__, None) is None:
+        output_shapes.append(None)
+      else:
+        output_shapes.append(output_shape)
+    x = _standardize_input_data(
+        x,
+        self._feed_input_names,
+        self._feed_input_shapes,
+        check_batch_axis=False,
+        exception_prefix='model input')
+    y = _standardize_input_data(
+        y,
+        self._feed_output_names,
+        output_shapes,
+        check_batch_axis=False,
+        exception_prefix='model target')
+    sample_weights = _standardize_sample_weights(sample_weight,
+                                                 self._feed_output_names)
+    class_weights = _standardize_class_weights(class_weight,
+                                               self._feed_output_names)
+    sample_weights = [
+        _standardize_weights(ref, sw, cw, mode)
+        for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights,
+                                       self._feed_sample_weight_modes)
+    ]
+    _check_array_lengths(x, y, sample_weights)
+    _check_loss_and_target_compatibility(y, self._feed_loss_fns,
+                                         self._feed_output_shapes)
+    if self.stateful and batch_size:
+      if x[0].shape[0] % batch_size != 0:
+        raise ValueError('In a stateful network, '
+                         'you should only pass inputs with '
+                         'a number of samples that can be '
+                         'divided by the batch size. Found: ' + str(
+                             x[0].shape[0]) + ' samples')
+    return x, y, sample_weights
+
+  def fit(self,
+          x=None,
+          y=None,
+          batch_size=32,
+          epochs=1,
+          verbose=1,
+          callbacks=None,
+          validation_split=0.,
+          validation_data=None,
+          shuffle=True,
+          class_weight=None,
+          sample_weight=None,
+          initial_epoch=0):
+    """Trains the model for a fixed number of epochs (iterations on a dataset).
+
+    Arguments:
+        x: Numpy array of training data,
+            or list of Numpy arrays if the model has multiple inputs.
+            If all inputs in the model are named,
+            you can also pass a dictionary
+            mapping input names to Numpy arrays.
+        y: Numpy array of target data,
+            or list of Numpy arrays if the model has multiple outputs.
+            If all outputs in the model are named,
+            you can also pass a dictionary
+            mapping output names to Numpy arrays.
+        batch_size: integer. Number of samples per gradient update.
+        epochs: integer, the number of times to iterate
+            over the training data arrays.
+            verbose: 0, 1, or 2. Verbosity mode.
+            0 = silent, 1 = verbose, 2 = one log line per epoch.
+        callbacks: list of callbacks to be called during training.
+            See [callbacks](/callbacks).
+        validation_split: float between 0 and 1:
+            fraction of the training data to be used as validation data.
+            The model will set apart this fraction of the training data,
+            will not train on it, and will evaluate
+            the loss and any model metrics
+            on this data at the end of each epoch.
+        validation_data: data on which to evaluate
+            the loss and any model metrics
+            at the end of each epoch. The model will not
+            be trained on this data.
+            This could be a tuple (x_val, y_val)
+            or a tuple (x_val, y_val, val_sample_weights).
+        shuffle: boolean, whether to shuffle the training data
+            before each epoch.
+        class_weight: optional dictionary mapping
+            class indices (integers) to
+            a weight (float) to apply to the model's loss for the samples
+            from this class during training.
+            This can be useful to tell the model to "pay more attention" to
+            samples from an under-represented class.
+        sample_weight: optional array of the same length as x, containing
+            weights to apply to the model's loss for each sample.
+            In the case of temporal data, you can pass a 2D array
+            with shape (samples, sequence_length),
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            sample_weight_mode="temporal" in compile().
+        initial_epoch: epoch at which to start training
+            (useful for resuming a previous training run)
+
+    Returns:
+        A `History` instance. Its `history` attribute contains
+        all information collected during training.
+
+    Raises:
+        ValueError: In case of mismatch between the provided input data
+            and what the model expects.
+    """
+    # validate user data
+    x, y, sample_weights = self._standardize_user_data(
+        x,
+        y,
+        sample_weight=sample_weight,
+        class_weight=class_weight,
+        check_batch_axis=False,
+        batch_size=batch_size)
+    # prepare validation data
+    if validation_data:
+      do_validation = True
+      if len(validation_data) == 2:
+        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
+        val_sample_weight = None
+      elif len(validation_data) == 3:
+        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
+      else:
+        raise ValueError('When passing validation_data, '
+                         'it must contain 2 (x_val, y_val) '
+                         'or 3 (x_val, y_val, val_sample_weights) '
+                         'items, however it contains %d items' %
+                         len(validation_data))
+
+      val_x, val_y, val_sample_weights = self._standardize_user_data(
+          val_x,
+          val_y,
+          sample_weight=val_sample_weight,
+          check_batch_axis=False,
+          batch_size=batch_size)
+      self._make_test_function()
+      val_f = self.test_function
+      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+        val_ins = val_x + val_y + val_sample_weights + [0.]
+      else:
+        val_ins = val_x + val_y + val_sample_weights
+
+    elif validation_split and 0. < validation_split < 1.:
+      do_validation = True
+      split_at = int(len(x[0]) * (1. - validation_split))
+      x, val_x = (_slice_arrays(x, 0, split_at), _slice_arrays(x, split_at))
+      y, val_y = (_slice_arrays(y, 0, split_at), _slice_arrays(y, split_at))
+      sample_weights, val_sample_weights = (_slice_arrays(
+          sample_weights, 0, split_at), _slice_arrays(sample_weights, split_at))
+      self._make_test_function()
+      val_f = self.test_function
+      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+        val_ins = val_x + val_y + val_sample_weights + [0.]
+      else:
+        val_ins = val_x + val_y + val_sample_weights
+    else:
+      do_validation = False
+      val_f = None
+      val_ins = None
+
+    # prepare input arrays and training function
+    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+      ins = x + y + sample_weights + [1.]
+    else:
+      ins = x + y + sample_weights
+    self._make_train_function()
+    f = self.train_function
+
+    # prepare display labels
+    out_labels = self.metrics_names
+
+    # rename duplicated metrics name
+    # (can happen with an output layer shared among multiple dataflows)
+    deduped_out_labels = []
+    for i, label in enumerate(out_labels):
+      new_label = label
+      if out_labels.count(label) > 1:
+        dup_idx = out_labels[:i].count(label)
+        new_label += '_' + str(dup_idx + 1)
+      deduped_out_labels.append(new_label)
+    out_labels = deduped_out_labels
+
+    if do_validation:
+      callback_metrics = copy.copy(out_labels) + [
+          'val_' + n for n in out_labels
+      ]
+    else:
+      callback_metrics = copy.copy(out_labels)
+
+    # delegate logic to _fit_loop
+    return self._fit_loop(
+        f,
+        ins,
+        out_labels=out_labels,
+        batch_size=batch_size,
+        epochs=epochs,
+        verbose=verbose,
+        callbacks=callbacks,
+        val_f=val_f,
+        val_ins=val_ins,
+        shuffle=shuffle,
+        callback_metrics=callback_metrics,
+        initial_epoch=initial_epoch)
+
+  def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):
+    """Returns the loss value & metrics values for the model in test mode.
+
+    Computation is done in batches.
+
+    Arguments:
+        x: Numpy array of test data,
+            or list of Numpy arrays if the model has multiple inputs.
+            If all inputs in the model are named,
+            you can also pass a dictionary
+            mapping input names to Numpy arrays.
+        y: Numpy array of target data,
+            or list of Numpy arrays if the model has multiple outputs.
+            If all outputs in the model are named,
+            you can also pass a dictionary
+            mapping output names to Numpy arrays.
+        batch_size: integer. Number of samples per gradient update.
+        verbose: verbosity mode, 0 or 1.
+        sample_weight: Array of weights to weight the contribution
+            of different samples to the loss and metrics.
+
+    Returns:
+        Scalar test loss (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+    """
+    # validate user data
+    x, y, sample_weights = self._standardize_user_data(
+        x,
+        y,
+        sample_weight=sample_weight,
+        check_batch_axis=False,
+        batch_size=batch_size)
+    # prepare inputs, delegate logic to _test_loop
+    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+      ins = x + y + sample_weights + [0.]
+    else:
+      ins = x + y + sample_weights
+    self._make_test_function()
+    f = self.test_function
+    return self._test_loop(f, ins, batch_size=batch_size, verbose=verbose)
+
+  def predict(self, x, batch_size=32, verbose=0):
+    """Generates output predictions for the input samples.
+
+    Computation is done in batches.
+
+    Arguments:
+        x: the input data, as a Numpy array
+            (or list of Numpy arrays if the model has multiple outputs).
+        batch_size: integer.
+        verbose: verbosity mode, 0 or 1.
+
+    Returns:
+        Numpy array(s) of predictions.
+
+    Raises:
+        ValueError: In case of mismatch between the provided
+            input data and the model's expectations,
+            or in case a stateful model receives a number of samples
+            that is not a multiple of the batch size.
+    """
+    # validate user data
+    x = _standardize_input_data(
+        x,
+        self._feed_input_names,
+        self._feed_input_shapes,
+        check_batch_axis=False)
+    if self.stateful:
+      if x[0].shape[0] > batch_size and x[0].shape[0] % batch_size != 0:
+        raise ValueError('In a stateful network, '
+                         'you should only pass inputs with '
+                         'a number of samples that can be '
+                         'divided by the batch size. Found: ' + str(
+                             x[0].shape[0]) + ' samples. '
+                         'Batch size: ' + str(batch_size) + '.')
+
+    # prepare inputs, delegate logic to _predict_loop
+    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+      ins = x + [0.]
+    else:
+      ins = x
+    self._make_predict_function()
+    f = self.predict_function
+    return self._predict_loop(f, ins, batch_size=batch_size, verbose=verbose)
+
+  def train_on_batch(self, x, y, sample_weight=None, class_weight=None):
+    """Runs a single gradient update on a single batch of data.
+
+    Arguments:
+        x: Numpy array of training data,
+            or list of Numpy arrays if the model has multiple inputs.
+            If all inputs in the model are named,
+            you can also pass a dictionary
+            mapping input names to Numpy arrays.
+        y: Numpy array of target data,
+            or list of Numpy arrays if the model has multiple outputs.
+            If all outputs in the model are named,
+            you can also pass a dictionary
+            mapping output names to Numpy arrays.
+        sample_weight: optional array of the same length as x, containing
+            weights to apply to the model's loss for each sample.
+            In the case of temporal data, you can pass a 2D array
+            with shape (samples, sequence_length),
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            sample_weight_mode="temporal" in compile().
+        class_weight: optional dictionary mapping
+            class indices (integers) to
+            a weight (float) to apply to the model's loss for the samples
+            from this class during training.
+            This can be useful to tell the model to "pay more attention" to
+            samples from an under-represented class.
+
+    Returns:
+        Scalar training loss
+        (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+    """
+    x, y, sample_weights = self._standardize_user_data(
+        x,
+        y,
+        sample_weight=sample_weight,
+        class_weight=class_weight,
+        check_batch_axis=True)
+    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+      ins = x + y + sample_weights + [1.]
+    else:
+      ins = x + y + sample_weights
+    self._make_train_function()
+    outputs = self.train_function(ins)
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
+
+  def test_on_batch(self, x, y, sample_weight=None):
+    """Test the model on a single batch of samples.
+
+    Arguments:
+        x: Numpy array of test data,
+            or list of Numpy arrays if the model has multiple inputs.
+            If all inputs in the model are named,
+            you can also pass a dictionary
+            mapping input names to Numpy arrays.
+        y: Numpy array of target data,
+            or list of Numpy arrays if the model has multiple outputs.
+            If all outputs in the model are named,
+            you can also pass a dictionary
+            mapping output names to Numpy arrays.
+        sample_weight: optional array of the same length as x, containing
+            weights to apply to the model's loss for each sample.
+            In the case of temporal data, you can pass a 2D array
+            with shape (samples, sequence_length),
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            sample_weight_mode="temporal" in compile().
+
+    Returns:
+        Scalar test loss (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+    """
+    x, y, sample_weights = self._standardize_user_data(
+        x, y, sample_weight=sample_weight, check_batch_axis=True)
+    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+      ins = x + y + sample_weights + [0.]
+    else:
+      ins = x + y + sample_weights
+    self._make_test_function()
+    outputs = self.test_function(ins)
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
+
+  def predict_on_batch(self, x):
+    """Returns predictions for a single batch of samples.
+
+    Arguments:
+        x: Input samples, as a Numpy array.
+
+    Returns:
+        Numpy array(s) of predictions.
+    """
+    x = _standardize_input_data(x, self._feed_input_names,
+                                self._feed_input_shapes)
+    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+      ins = x + [0.]
+    else:
+      ins = x
+    self._make_predict_function()
+    outputs = self.predict_function(ins)
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
+
+  def fit_generator(self,
+                    generator,
+                    steps_per_epoch,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    validation_data=None,
+                    validation_steps=None,
+                    class_weight=None,
+                    max_q_size=10,
+                    workers=1,
+                    pickle_safe=False,
+                    initial_epoch=0):
+    """Fits the model on data yielded batch-by-batch by a Python generator.
+
+    The generator is run in parallel to the model, for efficiency.
+    For instance, this allows you to do real-time data augmentation
+    on images on CPU in parallel to training your model on GPU.
+
+    Arguments:
+        generator: a generator.
+            The output of the generator must be either
+            - a tuple (inputs, targets)
+            - a tuple (inputs, targets, sample_weights).
+            All arrays should contain the same number of samples.
+            The generator is expected to loop over its data
+            indefinitely. An epoch finishes when `samples_per_epoch`
+            samples have been seen by the model.
+        steps_per_epoch: Total number of steps (batches of samples)
+            to yield from `generator` before declaring one epoch
+            finished and starting the next epoch. It should typically
+            be equal to the number of unique samples if your dataset
+            divided by the batch size.
+        epochs: integer, total number of iterations on the data.
+        verbose: verbosity mode, 0, 1, or 2.
+        callbacks: list of callbacks to be called during training.
+        validation_data: this can be either
+            - a generator for the validation data
+            - a tuple (inputs, targets)
+            - a tuple (inputs, targets, sample_weights).
+        validation_steps: Only relevant if `validation_data`
+            is a generator. Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+        class_weight: dictionary mapping class indices to a weight
+            for the class.
+        max_q_size: maximum size for the generator queue
+        workers: maximum number of processes to spin up
+            when using process based threading
+        pickle_safe: if True, use process based threading.
+            Note that because
+            this implementation relies on multiprocessing,
+            you should not pass
+            non picklable arguments to the generator
+            as they can't be passed
+            easily to children processes.
+        initial_epoch: epoch at which to start training
+            (useful for resuming a previous training run)
+
+    Returns:
+        A `History` object.
+
+    Example:
+
+    ```python
+        def generate_arrays_from_file(path):
+            while 1:
+                f = open(path)
+                for line in f:
+                    # create numpy arrays of input data
+                    # and labels, from each line in the file
+                    x1, x2, y = process_line(line)
+                    yield ({'input_1': x1, 'input_2': x2}, {'output': y})
+                f.close()
+
+        model.fit_generator(generate_arrays_from_file('/my_file.txt'),
+                            samples_per_epoch=10000, epochs=10)
+    ```
+
+    Raises:
+        ValueError: In case the generator yields
+            data in an invalid format.
+    """
+    wait_time = 0.01  # in seconds
+    epoch = initial_epoch
+
+    do_validation = bool(validation_data)
+    self._make_train_function()
+    if do_validation:
+      self._make_test_function()
+
+    # python 2 has 'next', 3 has '__next__'
+    # avoid any explicit version checks
+    val_gen = (hasattr(validation_data, 'next') or
+               hasattr(validation_data, '__next__'))
+    if val_gen and not validation_steps:
+      raise ValueError('When using a generator for validation data, '
+                       'you must specify a value for '
+                       '`validation_steps`.')
+
+    out_labels = self.metrics_names
+    callback_metrics = out_labels + ['val_' + n for n in out_labels]
+
+    # prepare callbacks
+    self.history = cbks.History()
+    callbacks = [cbks.BaseLogger()] + (callbacks or []) + [self.history]
+    if verbose:
+      callbacks += [cbks.ProgbarLogger(count_mode='steps')]
+    callbacks = cbks.CallbackList(callbacks)
+
+    # it's possible to callback a different model than self:
+    if hasattr(self, 'callback_model') and self.callback_model:
+      callback_model = self.callback_model
+    else:
+      callback_model = self
+    callbacks.set_model(callback_model)
+    callbacks.set_params({
+        'epochs': epochs,
+        'steps': steps_per_epoch,
+        'verbose': verbose,
+        'do_validation': do_validation,
+        'metrics': callback_metrics,
+    })
+    callbacks.on_train_begin()
+
+    if do_validation and not val_gen:
+      if len(validation_data) == 2:
+        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
+        val_sample_weight = None
+      elif len(validation_data) == 3:
+        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
+      else:
+        raise ValueError('validation_data should be a tuple '
+                         '`(val_x, val_y, val_sample_weight)` '
+                         'or `(val_x, val_y)`. Found: ' + str(validation_data))
+      val_x, val_y, val_sample_weights = self._standardize_user_data(
+          val_x, val_y, val_sample_weight)
+      for cbk in callbacks:
+        cbk.validation_data = val_x + [val_y, val_sample_weights]
+    enqueuer = None
+
+    try:
+      enqueuer = GeneratorEnqueuer(generator, pickle_safe=pickle_safe)
+      enqueuer.start(max_q_size=max_q_size, workers=workers)
+
+      callback_model.stop_training = False
+      while epoch < epochs:
+        callbacks.on_epoch_begin(epoch)
+        steps_done = 0
+        batch_index = 0
+        while steps_done < steps_per_epoch:
+          generator_output = None
+          while enqueuer.is_running():
+            if not enqueuer.queue.empty():
+              generator_output = enqueuer.queue.get()
+              break
+            else:
+              time.sleep(wait_time)
+
+          if not hasattr(generator_output, '__len__'):
+            raise ValueError('output of generator should be '
+                             'a tuple `(x, y, sample_weight)` '
+                             'or `(x, y)`. Found: ' + str(generator_output))
+          if len(generator_output) == 2:
+            x, y = generator_output  # pylint: disable=unpacking-non-sequence
+            sample_weight = None
+          elif len(generator_output) == 3:
+            x, y, sample_weight = generator_output  # pylint: disable=unpacking-non-sequence
+          else:
+            raise ValueError('output of generator should be '
+                             'a tuple `(x, y, sample_weight)` '
+                             'or `(x, y)`. Found: ' + str(generator_output))
+          # build batch logs
+          batch_logs = {}
+          if isinstance(x, list):
+            batch_size = x[0].shape[0]
+          elif isinstance(x, dict):
+            batch_size = list(x.values())[0].shape[0]
+          else:
+            batch_size = x.shape[0]
+          batch_logs['batch'] = batch_index
+          batch_logs['size'] = batch_size
+          callbacks.on_batch_begin(batch_index, batch_logs)
+
+          outs = self.train_on_batch(
+              x, y, sample_weight=sample_weight, class_weight=class_weight)
+
+          if not isinstance(outs, list):
+            outs = [outs]
+          for l, o in zip(out_labels, outs):
+            batch_logs[l] = o
+
+          callbacks.on_batch_end(batch_index, batch_logs)
+
+          # Construct epoch logs.
+          epoch_logs = {}
+          batch_index += 1
+          steps_done += 1
+
+          # Epoch finished.
+          if steps_done >= steps_per_epoch and do_validation:
+            if val_gen:
+              val_outs = self.evaluate_generator(
+                  validation_data,
+                  validation_steps,
+                  max_q_size=max_q_size,
+                  workers=workers,
+                  pickle_safe=pickle_safe)
+            else:
+              # No need for try/except because
+              # data has already been validated.
+              val_outs = self.evaluate(
+                  val_x,
+                  val_y,
+                  batch_size=batch_size,
+                  sample_weight=val_sample_weights,
+                  verbose=0)
+            if not isinstance(val_outs, list):
+              val_outs = [val_outs]
+            # Same labels assumed.
+            for l, o in zip(out_labels, val_outs):
+              epoch_logs['val_' + l] = o
+
+        callbacks.on_epoch_end(epoch, epoch_logs)
+        epoch += 1
+        if callback_model.stop_training:
+          break
+
+    finally:
+      if enqueuer is not None:
+        enqueuer.stop()
+
+    callbacks.on_train_end()
+    return self.history
+
+  def evaluate_generator(self,
+                         generator,
+                         steps,
+                         max_q_size=10,
+                         workers=1,
+                         pickle_safe=False):
+    """Evaluates the model on a data generator.
+
+    The generator should return the same kind of data
+    as accepted by `test_on_batch`.
+
+    Arguments:
+        generator: Generator yielding tuples (inputs, targets)
+            or (inputs, targets, sample_weights)
+        steps: Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+        max_q_size: maximum size for the generator queue
+        workers: maximum number of processes to spin up
+            when using process based threading
+        pickle_safe: if True, use process based threading.
+            Note that because
+            this implementation relies on multiprocessing,
+            you should not pass
+            non picklable arguments to the generator
+            as they can't be passed
+            easily to children processes.
+
+    Returns:
+        Scalar test loss (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+
+    Raises:
+        ValueError: In case the generator yields
+            data in an invalid format.
+    """
+    self._make_test_function()
+
+    steps_done = 0
+    wait_time = 0.01
+    all_outs = []
+    batch_sizes = []
+    enqueuer = None
+
+    try:
+      enqueuer = GeneratorEnqueuer(generator, pickle_safe=pickle_safe)
+      enqueuer.start(workers=workers, max_q_size=max_q_size)
+
+      while steps_done < steps:
+        generator_output = None
+        while enqueuer.is_running():
+          if not enqueuer.queue.empty():
+            generator_output = enqueuer.queue.get()
+            break
+          else:
+            time.sleep(wait_time)
+
+        if not hasattr(generator_output, '__len__'):
+          raise ValueError('output of generator should be a tuple '
+                           '(x, y, sample_weight) '
+                           'or (x, y). Found: ' + str(generator_output))
+        if len(generator_output) == 2:
+          x, y = generator_output  # pylint: disable=unpacking-non-sequence
+          sample_weight = None
+        elif len(generator_output) == 3:
+          x, y, sample_weight = generator_output  # pylint: disable=unpacking-non-sequence
+        else:
+          raise ValueError('output of generator should be a tuple '
+                           '(x, y, sample_weight) '
+                           'or (x, y). Found: ' + str(generator_output))
+        outs = self.test_on_batch(x, y, sample_weight=sample_weight)
+
+        if isinstance(x, list):
+          batch_size = len(x[0])
+        elif isinstance(x, dict):
+          batch_size = len(list(x.values())[0])
+        else:
+          batch_size = len(x)
+        all_outs.append(outs)
+
+        steps_done += 1
+        batch_sizes.append(batch_size)
+
+    finally:
+      if enqueuer is not None:
+        enqueuer.stop()
+
+    if not isinstance(outs, list):
+      return np.average(np.asarray(all_outs), weights=batch_sizes)
+    else:
+      averages = []
+      for i in range(len(outs)):
+        averages.append(
+            np.average([out[i] for out in all_outs], weights=batch_sizes))
+      return averages
+
+  def predict_generator(self,
+                        generator,
+                        steps,
+                        max_q_size=10,
+                        workers=1,
+                        pickle_safe=False):
+    """Generates predictions for the input samples from a data generator.
+
+    The generator should return the same kind of data as accepted by
+    `predict_on_batch`.
+
+    Arguments:
+        generator: Generator yielding batches of input samples.
+        steps: Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+        max_q_size: Maximum size for the generator queue.
+        workers: Maximum number of processes to spin up
+            when using process based threading
+        pickle_safe: If `True`, use process based threading.
+            Note that because
+            this implementation relies on multiprocessing,
+            you should not pass
+            non picklable arguments to the generator
+            as they can't be passed
+            easily to children processes.
+
+    Returns:
+        Numpy array(s) of predictions.
+
+    Raises:
+        ValueError: In case the generator yields
+            data in an invalid format.
+    """
+    self._make_predict_function()
+
+    steps_done = 0
+    wait_time = 0.01
+    all_outs = []
+    enqueuer = None
+
+    try:
+      enqueuer = GeneratorEnqueuer(generator, pickle_safe=pickle_safe)
+      enqueuer.start(workers=workers, max_q_size=max_q_size)
+
+      while steps_done < steps:
+        generator_output = None
+        while enqueuer.is_running():
+          if not enqueuer.queue.empty():
+            generator_output = enqueuer.queue.get()
+            break
+          else:
+            time.sleep(wait_time)
+
+        if isinstance(generator_output, tuple):
+          # Compatibility with the generators
+          # used for training.
+          if len(generator_output) == 2:
+            x, _ = generator_output  # pylint: disable=unpacking-non-sequence
+          elif len(generator_output) == 3:
+            x, _, _ = generator_output  # pylint: disable=unpacking-non-sequence
+          else:
+            raise ValueError('output of generator should be '
+                             'a tuple `(x, y, sample_weight)` '
+                             'or `(x, y)`. Found: ' + str(generator_output))
+        else:
+          # Assumes a generator that only
+          # yields inputs (not targets and sample weights).
+          x = generator_output
+
+        outs = self.predict_on_batch(x)
+        if not isinstance(outs, list):
+          outs = [outs]
+
+        if not all_outs:
+          for out in outs:
+            all_outs.append([])
+
+        for i, out in enumerate(outs):
+          all_outs[i].append(out)
+        steps_done += 1
+
+    finally:
+      if enqueuer is not None:
+        enqueuer.stop()
+
+    if len(all_outs) == 1:
+      if steps_done == 1:
+        return all_outs[0][0]
+      else:
+        return np.concatenate(all_outs[0])
+    if steps_done == 1:
+      return [out for out in all_outs]
+    else:
+      return [np.concatenate(out) for out in all_outs]
diff --git a/tensorflow/contrib/keras/python/keras/engine/training_test.py b/tensorflow/contrib/keras/python/keras/engine/training_test.py
new file mode 100644
index 0000000000..a23838f7b4
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/engine/training_test.py
@@ -0,0 +1,695 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training routines."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.contrib.keras.python.keras.engine.training import _weighted_masked_objective
+from tensorflow.python.platform import test
+
+
+class TrainingTest(test.TestCase):
+
+  def test_fit_on_arrays(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(3,), name='input_a')
+      b = keras.layers.Input(shape=(3,), name='input_b')
+
+      dense = keras.layers.Dense(4, name='dense')
+      c = dense(a)
+      d = dense(b)
+      e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+      model = keras.models.Model([a, b], [d, e])
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      loss_weights = [1., 0.5]
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+
+      input_a_np = np.random.random((10, 3))
+      input_b_np = np.random.random((10, 3))
+
+      output_d_np = np.random.random((10, 4))
+      output_e_np = np.random.random((10, 4))
+
+      # Test fit at different verbosity
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          epochs=1,
+          batch_size=5,
+          verbose=1)
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          epochs=2,
+          batch_size=5,
+          verbose=2)
+      model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
+
+      # Test with validation data
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                      output_e_np]),
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                      output_e_np]),
+          epochs=2,
+          batch_size=5,
+          verbose=1)
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                      output_e_np]),
+          epochs=2,
+          batch_size=5,
+          verbose=2)
+      # Test with validation split
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          epochs=2,
+          batch_size=5,
+          verbose=0,
+          validation_split=0.2)
+
+      # Test with dictionary inputs
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {'dense': output_d_np,
+              'dropout': output_e_np},
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {'dense': output_d_np,
+              'dropout': output_e_np},
+          epochs=1,
+          batch_size=5,
+          verbose=1)
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {'dense': output_d_np,
+              'dropout': output_e_np},
+          validation_data=({
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          }),
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+      model.train_on_batch({
+          'input_a': input_a_np,
+          'input_b': input_b_np
+      }, {'dense': output_d_np,
+          'dropout': output_e_np})
+
+      # Test with lists for loss, metrics
+      loss = ['mae', 'mse']
+      metrics = ['acc', 'mae']
+      model.compile(optimizer, loss, metrics=metrics)
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+
+      # Test with dictionaries for loss, metrics, loss weights
+      loss = {'dense': 'mse', 'dropout': 'mae'}
+      loss_weights = {'dense': 1., 'dropout': 0.5}
+      metrics = {'dense': 'mse', 'dropout': 'mae'}
+      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+
+  def test_evaluate_predict_on_arrays(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(3,), name='input_a')
+      b = keras.layers.Input(shape=(3,), name='input_b')
+
+      dense = keras.layers.Dense(4, name='dense')
+      c = dense(a)
+      d = dense(b)
+      e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+      model = keras.models.Model([a, b], [d, e])
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      loss_weights = [1., 0.5]
+      metrics = ['mae']
+      model.compile(
+          optimizer,
+          loss,
+          metrics=metrics,
+          loss_weights=loss_weights,
+          sample_weight_mode=None)
+
+      input_a_np = np.random.random((10, 3))
+      input_b_np = np.random.random((10, 3))
+
+      output_d_np = np.random.random((10, 4))
+      output_e_np = np.random.random((10, 4))
+
+      # Test evaluate at different verbosity
+      out = model.evaluate(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          batch_size=5,
+          verbose=0)
+      self.assertEqual(len(out), 5)
+      out = model.evaluate(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          batch_size=5,
+          verbose=1)
+      self.assertEqual(len(out), 5)
+      out = model.evaluate(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          batch_size=5,
+          verbose=2)
+      self.assertEqual(len(out), 5)
+      out = model.test_on_batch([input_a_np, input_b_np],
+                                [output_d_np, output_e_np])
+      self.assertEqual(len(out), 5)
+
+      # Test evaluate with dictionary inputs
+      model.evaluate(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {'dense': output_d_np,
+              'dropout': output_e_np},
+          batch_size=5,
+          verbose=0)
+      model.evaluate(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {'dense': output_d_np,
+              'dropout': output_e_np},
+          batch_size=5,
+          verbose=1)
+
+      # Test predict
+      out = model.predict([input_a_np, input_b_np], batch_size=5)
+      self.assertEqual(len(out), 2)
+      out = model.predict({'input_a': input_a_np, 'input_b': input_b_np})
+      self.assertEqual(len(out), 2)
+      out = model.predict_on_batch({
+          'input_a': input_a_np,
+          'input_b': input_b_np
+      })
+      self.assertEqual(len(out), 2)
+
+
+class LossWeightingTest(test.TestCase):
+
+  def test_class_weights(self):
+    num_classes = 5
+    batch_size = 5
+    epochs = 5
+    weighted_class = 3
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+      model.add(keras.layers.Activation('relu'))
+      model.add(keras.layers.Dense(num_classes))
+      model.add(keras.layers.Activation('softmax'))
+      model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=test_samples,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      int_y_test = y_test.copy()
+      int_y_train = y_train.copy()
+      # convert class vectors to binary class matrices
+      y_train = keras.utils.to_categorical(y_train, num_classes)
+      y_test = keras.utils.to_categorical(y_test, num_classes)
+      test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+      class_weight = dict([(i, 1.) for i in range(num_classes)])
+      class_weight[weighted_class] = 2.
+
+      sample_weight = np.ones((y_train.shape[0]))
+      sample_weight[int_y_train == weighted_class] = 2.
+
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=batch_size,
+          epochs=epochs // 3,
+          verbose=0,
+          class_weight=class_weight,
+          validation_data=(x_train, y_train, sample_weight))
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=batch_size,
+          epochs=epochs // 2,
+          verbose=0,
+          class_weight=class_weight)
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=batch_size,
+          epochs=epochs // 2,
+          verbose=0,
+          class_weight=class_weight,
+          validation_split=0.1)
+
+      model.train_on_batch(
+          x_train[:batch_size], y_train[:batch_size], class_weight=class_weight)
+      ref_score = model.evaluate(x_test, y_test, verbose=0)
+      score = model.evaluate(
+          x_test[test_ids, :], y_test[test_ids, :], verbose=0)
+      self.assertLess(score, ref_score)
+
+  def test_sample_weights(self):
+    num_classes = 5
+    batch_size = 5
+    epochs = 5
+    weighted_class = 3
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+      model.add(keras.layers.Activation('relu'))
+      model.add(keras.layers.Dense(num_classes))
+      model.add(keras.layers.Activation('softmax'))
+      model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=test_samples,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      int_y_test = y_test.copy()
+      int_y_train = y_train.copy()
+      # convert class vectors to binary class matrices
+      y_train = keras.utils.to_categorical(y_train, num_classes)
+      y_test = keras.utils.to_categorical(y_test, num_classes)
+      test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+      class_weight = dict([(i, 1.) for i in range(num_classes)])
+      class_weight[weighted_class] = 2.
+
+      sample_weight = np.ones((y_train.shape[0]))
+      sample_weight[int_y_train == weighted_class] = 2.
+
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=batch_size,
+          epochs=epochs // 3,
+          verbose=0,
+          sample_weight=sample_weight)
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=batch_size,
+          epochs=epochs // 3,
+          verbose=0,
+          sample_weight=sample_weight,
+          validation_split=0.1)
+
+      model.train_on_batch(
+          x_train[:batch_size],
+          y_train[:batch_size],
+          sample_weight=sample_weight[:batch_size])
+      model.test_on_batch(
+          x_train[:batch_size],
+          y_train[:batch_size],
+          sample_weight=sample_weight[:batch_size])
+      ref_score = model.evaluate(x_test, y_test, verbose=0)
+      score = model.evaluate(
+          x_test[test_ids, :], y_test[test_ids, :], verbose=0)
+      self.assertLess(score, ref_score)
+
+  def test_temporal_sample_weights(self):
+    num_classes = 5
+    batch_size = 5
+    epochs = 5
+    weighted_class = 3
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+    timesteps = 3
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Dense(num_classes),
+              input_shape=(timesteps, input_dim)))
+      model.add(keras.layers.Activation('softmax'))
+
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=test_samples,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      int_y_test = y_test.copy()
+      int_y_train = y_train.copy()
+      # convert class vectors to binary class matrices
+      y_train = keras.utils.to_categorical(y_train, num_classes)
+      y_test = keras.utils.to_categorical(y_test, num_classes)
+      test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+      class_weight = dict([(i, 1.) for i in range(num_classes)])
+      class_weight[weighted_class] = 2.
+
+      sample_weight = np.ones((y_train.shape[0]))
+      sample_weight[int_y_train == weighted_class] = 2.
+
+      temporal_x_train = np.reshape(x_train, (len(x_train), 1,
+                                              x_train.shape[1]))
+      temporal_x_train = np.repeat(temporal_x_train, timesteps, axis=1)
+      temporal_x_test = np.reshape(x_test, (len(x_test), 1, x_test.shape[1]))
+      temporal_x_test = np.repeat(temporal_x_test, timesteps, axis=1)
+
+      temporal_y_train = np.reshape(y_train, (len(y_train), 1,
+                                              y_train.shape[1]))
+      temporal_y_train = np.repeat(temporal_y_train, timesteps, axis=1)
+      temporal_y_test = np.reshape(y_test, (len(y_test), 1, y_test.shape[1]))
+      temporal_y_test = np.repeat(temporal_y_test, timesteps, axis=1)
+
+      temporal_sample_weight = np.reshape(sample_weight, (len(sample_weight),
+                                                          1))
+      temporal_sample_weight = np.repeat(
+          temporal_sample_weight, timesteps, axis=1)
+
+      model.compile(
+          loss='binary_crossentropy',
+          optimizer='rmsprop',
+          sample_weight_mode='temporal')
+
+      model.fit(
+          temporal_x_train,
+          temporal_y_train,
+          batch_size=batch_size,
+          epochs=epochs // 3,
+          verbose=0,
+          sample_weight=temporal_sample_weight)
+      model.fit(
+          temporal_x_train,
+          temporal_y_train,
+          batch_size=batch_size,
+          epochs=epochs // 3,
+          verbose=0,
+          sample_weight=temporal_sample_weight,
+          validation_split=0.1)
+
+      model.train_on_batch(
+          temporal_x_train[:batch_size],
+          temporal_y_train[:batch_size],
+          sample_weight=temporal_sample_weight[:batch_size])
+      model.test_on_batch(
+          temporal_x_train[:batch_size],
+          temporal_y_train[:batch_size],
+          sample_weight=temporal_sample_weight[:batch_size])
+      ref_score = model.evaluate(temporal_x_test, temporal_y_test, verbose=0)
+      score = model.evaluate(
+          temporal_x_test[test_ids], temporal_y_test[test_ids], verbose=0)
+      self.assertLess(score, ref_score)
+
+
+class LossMaskingTest(test.TestCase):
+
+  def test_masking(self):
+    with self.test_session():
+      np.random.seed(1337)
+      x = np.array([[[1], [1]], [[0], [0]]])
+      model = keras.models.Sequential()
+      model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1)))
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Dense(1, kernel_initializer='one')))
+      model.compile(loss='mse', optimizer='sgd')
+      y = np.array([[[1], [1]], [[1], [1]]])
+      loss = model.train_on_batch(x, y)
+      self.assertEqual(loss, 0)
+
+  def test_loss_masking(self):
+    with self.test_session():
+      weighted_loss = _weighted_masked_objective(keras.losses.get('mae'))
+      shape = (3, 4, 2)
+      x = np.arange(24).reshape(shape)
+      y = 2 * x
+
+      # Normally the trailing 1 is added by standardize_weights
+      weights = np.ones((3,))
+      mask = np.ones((3, 4))
+      mask[1, 0] = 0
+
+      keras.backend.eval(
+          weighted_loss(
+              keras.backend.variable(x),
+              keras.backend.variable(y),
+              keras.backend.variable(weights), keras.backend.variable(mask)))
+
+
+class TestDynamicTrainability(test.TestCase):
+
+  def test_trainable_argument(self):
+    with self.test_session():
+      x = np.random.random((5, 3))
+      y = np.random.random((5, 2))
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_dim=3, trainable=False))
+      model.compile('rmsprop', 'mse')
+      out = model.predict(x)
+      model.train_on_batch(x, y)
+      out_2 = model.predict(x)
+      self.assertAllClose(out, out_2)
+
+      # test with nesting
+      inputs = keras.layers.Input(shape=(3,))
+      output = model(inputs)
+      model = keras.models.Model(inputs, output)
+      model.compile('rmsprop', 'mse')
+      out = model.predict(x)
+      model.train_on_batch(x, y)
+      out_2 = model.predict(x)
+      self.assertAllClose(out, out_2)
+
+  def test_layer_trainability_switch(self):
+    with self.test_session():
+      # with constructor argument, in Sequential
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, trainable=False, input_dim=1))
+      self.assertListEqual(model.trainable_weights, [])
+
+      # by setting the `trainable` argument, in Sequential
+      model = keras.models.Sequential()
+      layer = keras.layers.Dense(2, input_dim=1)
+      model.add(layer)
+      self.assertListEqual(model.trainable_weights, layer.trainable_weights)
+      layer.trainable = False
+      self.assertListEqual(model.trainable_weights, [])
+
+      # with constructor argument, in Model
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(2, trainable=False)(x)
+      model = keras.models.Model(x, y)
+      self.assertListEqual(model.trainable_weights, [])
+
+      # by setting the `trainable` argument, in Model
+      x = keras.layers.Input(shape=(1,))
+      layer = keras.layers.Dense(2)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      self.assertListEqual(model.trainable_weights, layer.trainable_weights)
+      layer.trainable = False
+      self.assertListEqual(model.trainable_weights, [])
+
+  def test_model_trainability_switch(self):
+    with self.test_session():
+      # a non-trainable model has no trainable weights
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(2)(x)
+      model = keras.models.Model(x, y)
+      model.trainable = False
+      self.assertListEqual(model.trainable_weights, [])
+
+      # same for Sequential
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_dim=1))
+      model.trainable = False
+      self.assertListEqual(model.trainable_weights, [])
+
+  def test_nested_model_trainability(self):
+    with self.test_session():
+      # a Sequential inside a Model
+      inner_model = keras.models.Sequential()
+      inner_model.add(keras.layers.Dense(2, input_dim=1))
+
+      x = keras.layers.Input(shape=(1,))
+      y = inner_model(x)
+      outer_model = keras.models.Model(x, y)
+      self.assertListEqual(outer_model.trainable_weights,
+                           inner_model.trainable_weights)
+      inner_model.trainable = False
+      self.assertListEqual(outer_model.trainable_weights, [])
+      inner_model.trainable = True
+      inner_model.layers[-1].trainable = False
+      self.assertListEqual(outer_model.trainable_weights, [])
+
+      # a Sequential inside a Sequential
+      inner_model = keras.models.Sequential()
+      inner_model.add(keras.layers.Dense(2, input_dim=1))
+      outer_model = keras.models.Sequential()
+      outer_model.add(inner_model)
+      self.assertListEqual(outer_model.trainable_weights,
+                           inner_model.trainable_weights)
+      inner_model.trainable = False
+      self.assertListEqual(outer_model.trainable_weights, [])
+      inner_model.trainable = True
+      inner_model.layers[-1].trainable = False
+      self.assertListEqual(outer_model.trainable_weights, [])
+
+      # a Model inside a Model
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(2)(x)
+      inner_model = keras.models.Model(x, y)
+      x = keras.layers.Input(shape=(1,))
+      y = inner_model(x)
+      outer_model = keras.models.Model(x, y)
+      self.assertListEqual(outer_model.trainable_weights,
+                           inner_model.trainable_weights)
+      inner_model.trainable = False
+      self.assertListEqual(outer_model.trainable_weights, [])
+      inner_model.trainable = True
+      inner_model.layers[-1].trainable = False
+      self.assertListEqual(outer_model.trainable_weights, [])
+
+      # a Model inside a Sequential
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(2)(x)
+      inner_model = keras.models.Model(x, y)
+      outer_model = keras.models.Sequential()
+      outer_model.add(inner_model)
+      self.assertListEqual(outer_model.trainable_weights,
+                           inner_model.trainable_weights)
+      inner_model.trainable = False
+      self.assertListEqual(outer_model.trainable_weights, [])
+      inner_model.trainable = True
+      inner_model.layers[-1].trainable = False
+      self.assertListEqual(outer_model.trainable_weights, [])
+
+
+class TestGeneratorMethods(test.TestCase):
+
+  def test_generator_methods(self):
+    arr_data = np.random.randint(0, 256, (50, 2))
+    arr_labels = np.random.randint(0, 2, 50)
+
+    def custom_generator():
+      batch_size = 10
+      n_samples = 50
+      while True:
+        batch_index = np.random.randint(0, n_samples - batch_size)
+        start = batch_index
+        end = start + batch_size
+        x = arr_data[start: end]
+        y = arr_labels[start: end]
+        yield x, y
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(1, input_shape=(2,)))
+    model.compile(loss='mse', optimizer='sgd')
+
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_q_size=10,
+                        workers=4,
+                        pickle_safe=True)
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_q_size=10,
+                        pickle_safe=False)
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_q_size=10,
+                        pickle_safe=False,
+                        validation_data=custom_generator(),
+                        validation_steps=10)
+    model.predict_generator(custom_generator(),
+                            steps=5,
+                            max_q_size=10,
+                            workers=2,
+                            pickle_safe=True)
+    model.predict_generator(custom_generator(),
+                            steps=5,
+                            max_q_size=10,
+                            pickle_safe=False)
+    model.evaluate_generator(custom_generator(),
+                             steps=5,
+                             max_q_size=10,
+                             workers=2,
+                             pickle_safe=True)
+    model.evaluate_generator(custom_generator(),
+                             steps=5,
+                             max_q_size=10,
+                             pickle_safe=False)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/initializers.py b/tensorflow/contrib/keras/python/keras/initializers.py
new file mode 100644
index 0000000000..621069f424
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/initializers.py
@@ -0,0 +1,468 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras initializer classes (soon to be replaced with core TF initializers).
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+import six
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.framework import tensor_shape
+
+
+class Initializer(object):
+  """Initializer base class: all initializers inherit from this class.
+  """
+
+  def __call__(self, shape, dtype=None):
+    raise NotImplementedError
+
+  def get_config(self):
+    return {}
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
+
+
+class Zeros(Initializer):
+  """Initializer that generates tensors initialized to 0."""
+
+  def __call__(self, shape, dtype=None):
+    return K.constant(0, shape=shape, dtype=dtype)
+
+
+class Ones(Initializer):
+  """Initializer that generates tensors initialized to 1."""
+
+  def __call__(self, shape, dtype=None):
+    return K.constant(1, shape=shape, dtype=dtype)
+
+
+class Constant(Initializer):
+  """Initializer that generates tensors initialized to a constant value.
+
+  Arguments:
+      value: float; the value of the generator tensors.
+  """
+
+  def __init__(self, value=0):
+    self.value = value
+
+  def __call__(self, shape, dtype=None):
+    return K.constant(self.value, shape=shape, dtype=dtype)
+
+  def get_config(self):
+    return {'value': self.value}
+
+
+class RandomNormal(Initializer):
+  """Initializer that generates tensors with a normal distribution.
+
+  Arguments:
+      mean: a python scalar or a scalar tensor. Mean of the random values
+        to generate.
+      stddev: a python scalar or a scalar tensor. Standard deviation of the
+        random values to generate.
+      seed: A Python integer. Used to seed the random generator.
+  """
+
+  def __init__(self, mean=0., stddev=0.05, seed=None):
+    self.mean = mean
+    self.stddev = stddev
+    self.seed = seed
+
+  def __call__(self, shape, dtype=None):
+    return K.random_normal(
+        shape, self.mean, self.stddev, dtype=dtype, seed=self.seed)
+
+  def get_config(self):
+    return {'mean': self.mean, 'stddev': self.stddev, 'seed': self.seed}
+
+
+class RandomUniform(Initializer):
+  """Initializer that generates tensors with a uniform distribution.
+
+  Arguments:
+      minval: A python scalar or a scalar tensor. Lower bound of the range
+        of random values to generate.
+      maxval: A python scalar or a scalar tensor. Upper bound of the range
+        of random values to generate.  Defaults to 1 for float types.
+      seed: A Python integer. Used to seed the random generator.
+  """
+
+  def __init__(self, minval=-0.05, maxval=0.05, seed=None):
+    self.minval = minval
+    self.maxval = maxval
+    self.seed = seed
+
+  def __call__(self, shape, dtype=None):
+    return K.random_uniform(
+        shape, self.minval, self.maxval, dtype=dtype, seed=self.seed)
+
+  def get_config(self):
+    return {
+        'minval': self.minval,
+        'maxval': self.maxval,
+        'seed': self.seed,
+    }
+
+
+class TruncatedNormal(Initializer):
+  """Initializer that generates a truncated normal distribution.
+
+  These values are similar to values from a `random_normal_initializer`
+  except that values more than two standard deviations from the mean
+  are discarded and re-drawn. This is the recommended initializer for
+  neural network weights and filters.
+
+  Arguments:
+      mean: a python scalar or a scalar tensor. Mean of the random values
+        to generate.
+      stddev: a python scalar or a scalar tensor. Standard deviation of the
+        random values to generate.
+      seed: A Python integer. Used to seed the random generator.
+  """
+
+  def __init__(self, mean=0., stddev=0.05, seed=None):
+    self.mean = mean
+    self.stddev = stddev
+    self.seed = seed
+
+  def __call__(self, shape, dtype=None):
+    return K.truncated_normal(
+        shape, self.mean, self.stddev, dtype=dtype, seed=self.seed)
+
+  def get_config(self):
+    return {'mean': self.mean, 'stddev': self.stddev, 'seed': self.seed}
+
+
+class VarianceScaling(Initializer):
+  """Initializer capable of adapting its scale to the shape of weights.
+
+  With `distribution="normal"`, samples are drawn from a truncated normal
+  distribution centered on zero, with `stddev = sqrt(scale / n)` where n is:
+      - number of input units in the weight tensor, if mode = "fan_in"
+      - number of output units, if mode = "fan_out"
+      - average of the numbers of input and output units, if mode = "fan_avg"
+
+  With `distribution="uniform"`,
+  samples are drawn from a uniform distribution
+  within [-limit, limit], with `limit = sqrt(3 * scale / n)`.
+
+  Arguments:
+      scale: Scaling factor (positive float).
+      mode: One of "fan_in", "fan_out", "fan_avg".
+      distribution: Random distribution to use. One of "normal", "uniform".
+      seed: A Python integer. Used to seed the random generator.
+
+  Raises:
+      ValueError: In case of an invalid value for the "scale", mode" or
+        "distribution" arguments.
+  """
+
+  def __init__(self, scale=1.0, mode='fan_in', distribution='normal',
+               seed=None):
+    if scale <= 0.:
+      raise ValueError('`scale` must be a positive float. Got:', scale)
+    mode = mode.lower()
+    if mode not in {'fan_in', 'fan_out', 'fan_avg'}:
+      raise ValueError('Invalid `mode` argument: '
+                       'expected on of {"fan_in", "fan_out", "fan_avg"} '
+                       'but got', mode)
+    distribution = distribution.lower()
+    if distribution not in {'normal', 'uniform'}:
+      raise ValueError('Invalid `distribution` argument: '
+                       'expected one of {"normal", "uniform"} '
+                       'but got', distribution)
+    self.scale = scale
+    self.mode = mode
+    self.distribution = distribution
+    self.seed = seed
+
+  def __call__(self, shape, dtype=None):
+    fan_in, fan_out = _compute_fans(shape)
+    scale = self.scale
+    if self.mode == 'fan_in':
+      scale /= max(1., fan_in)
+    elif self.mode == 'fan_out':
+      scale /= max(1., fan_out)
+    else:
+      scale /= max(1., float(fan_in + fan_out) / 2)
+    if self.distribution == 'normal':
+      stddev = math.sqrt(scale)
+      return K.truncated_normal(shape, 0., stddev, dtype=dtype, seed=self.seed)
+    else:
+      limit = math.sqrt(3. * scale)
+      return K.random_uniform(shape, -limit, limit, dtype=dtype, seed=self.seed)
+
+  def get_config(self):
+    return {
+        'scale': self.scale,
+        'mode': self.mode,
+        'distribution': self.distribution,
+        'seed': self.seed
+    }
+
+
+class Orthogonal(Initializer):
+  """Initializer that generates a random orthogonal matrix.
+
+  Arguments:
+      gain: Multiplicative factor to apply to the orthogonal matrix.
+      seed: A Python integer. Used to seed the random generator.
+
+  References:
+      Saxe et al., http://arxiv.org/abs/1312.6120
+  """
+
+  def __init__(self, gain=1., seed=None):
+    self.gain = gain
+    self.seed = seed
+
+  def __call__(self, shape, dtype=None):
+    num_rows = 1
+    for dim in shape[:-1]:
+      num_rows *= dim
+    num_cols = shape[-1]
+    flat_shape = (num_rows, num_cols)
+    if self.seed is not None:
+      np.random.seed(self.seed)
+    a = np.random.normal(0.0, 1.0, flat_shape)
+    u, _, v = np.linalg.svd(a, full_matrices=False)
+    # Pick the one with the correct shape.
+    q = u if u.shape == flat_shape else v
+    q = q.reshape(shape)
+    return self.gain * q[:shape[0], :shape[1]]
+
+  def get_config(self):
+    return {'gain': self.gain, 'seed': self.seed}
+
+
+class Identity(Initializer):
+  """Initializer that generates the identity matrix.
+
+  Only use for square 2D matrices.
+
+  Arguments:
+      gain: Multiplicative factor to apply to the identity matrix.
+  """
+
+  def __init__(self, gain=1.):
+    self.gain = gain
+
+  def __call__(self, shape, dtype=None):
+    if len(shape) != 2 or shape[0] != shape[1]:
+      raise ValueError('Identity matrix initializer can only be used '
+                       'for 2D square matrices.')
+    else:
+      return self.gain * np.identity(shape[0])
+
+  def get_config(self):
+    return {'gain': self.gain}
+
+
+def lecun_uniform(seed=None):
+  """LeCun uniform initializer.
+
+  It draws samples from a uniform distribution within [-limit, limit]
+  where `limit` is `sqrt(3 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      LeCun 98, Efficient Backprop,
+      http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
+  """
+  return VarianceScaling(
+      scale=1., mode='fan_in', distribution='uniform', seed=seed)
+
+
+def glorot_normal(seed=None):
+  """Glorot normal initializer, also called Xavier normal initializer.
+
+  It draws samples from a truncated normal distribution centered on 0
+  with `stddev = sqrt(2 / (fan_in + fan_out))`
+  where `fan_in` is the number of input units in the weight tensor
+  and `fan_out` is the number of output units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      Glorot & Bengio, AISTATS 2010
+      http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
+  """
+  return VarianceScaling(
+      scale=1., mode='fan_avg', distribution='normal', seed=seed)
+
+
+def glorot_uniform(seed=None):
+  """Glorot uniform initializer, also called Xavier uniform initializer.
+
+  It draws samples from a uniform distribution within [-limit, limit]
+  where `limit` is `sqrt(6 / (fan_in + fan_out))`
+  where `fan_in` is the number of input units in the weight tensor
+  and `fan_out` is the number of output units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      Glorot & Bengio, AISTATS 2010
+      http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
+  """
+  return VarianceScaling(
+      scale=1., mode='fan_avg', distribution='uniform', seed=seed)
+
+
+def he_normal(seed=None):
+  """He normal initializer.
+
+  It draws samples from a truncated normal distribution centered on 0
+  with `stddev = sqrt(2 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      He et al., http://arxiv.org/abs/1502.01852
+  """
+  return VarianceScaling(
+      scale=2., mode='fan_in', distribution='normal', seed=seed)
+
+
+def he_uniform(seed=None):
+  """He uniform variance scaling initializer.
+
+  It draws samples from a uniform distribution within [-limit, limit]
+  where `limit` is `sqrt(6 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      He et al., http://arxiv.org/abs/1502.01852
+  """
+  return VarianceScaling(
+      scale=2., mode='fan_in', distribution='uniform', seed=seed)
+
+
+# Compatibility aliases
+
+# pylint: disable=invalid-name
+zero = zeros = Zeros
+one = ones = Ones
+constant = Constant
+uniform = random_uniform = RandomUniform
+normal = random_normal = RandomNormal
+truncated_normal = TruncatedNormal
+identity = Identity
+orthogonal = Orthogonal
+
+# pylint: enable=invalid-name
+
+# Utility functions
+
+
+def _compute_fans(shape, data_format='channels_last'):
+  """Computes the number of input and output units for a weight shape.
+
+  Arguments:
+      shape: Integer shape tuple.
+      data_format: Image data format to use for convolution kernels.
+          Note that all kernels in Keras are standardized on the
+          `channels_last` ordering (even when inputs are set
+          to `channels_first`).
+
+  Returns:
+      A tuple of scalars, `(fan_in, fan_out)`.
+
+  Raises:
+      ValueError: in case of invalid `data_format` argument.
+  """
+  shape = tensor_shape.TensorShape(shape).as_list()
+  if len(shape) == 2:
+    fan_in = shape[0]
+    fan_out = shape[1]
+  elif len(shape) in {3, 4, 5}:
+    # Assuming convolution kernels (1D, 2D or 3D).
+    # TH kernel shape: (depth, input_depth, ...)
+    # TF kernel shape: (..., input_depth, depth)
+    if data_format == 'channels_first':
+      receptive_field_size = np.prod(shape[2:])
+      fan_in = shape[1] * receptive_field_size
+      fan_out = shape[0] * receptive_field_size
+    elif data_format == 'channels_last':
+      receptive_field_size = np.prod(shape[:2])
+      fan_in = shape[-2] * receptive_field_size
+      fan_out = shape[-1] * receptive_field_size
+    else:
+      raise ValueError('Invalid data_format: ' + data_format)
+  else:
+    # No specific assumptions.
+    fan_in = math.sqrt(np.prod(shape))
+    fan_out = math.sqrt(np.prod(shape))
+  return fan_in, fan_out
+
+
+def serialize(initializer):
+  return serialize_keras_object(initializer)
+
+
+def deserialize(config, custom_objects=None):
+  return deserialize_keras_object(
+      config,
+      module_objects=globals(),
+      custom_objects=custom_objects,
+      printable_module_name='initializer')
+
+
+def get(identifier):
+  if isinstance(identifier, dict):
+    return deserialize(identifier)
+  elif isinstance(identifier, six.string_types):
+    config = {'class_name': str(identifier), 'config': {}}
+    return deserialize(config)
+  elif callable(identifier):
+    return identifier
+  else:
+    raise ValueError('Could not interpret initializer identifier:', identifier)
diff --git a/tensorflow/contrib/keras/python/keras/initializers_test.py b/tensorflow/contrib/keras/python/keras/initializers_test.py
new file mode 100644
index 0000000000..7436fbb390
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/initializers_test.py
@@ -0,0 +1,142 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras initializers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+def _runner(init, shape, target_mean=None, target_std=None,
+            target_max=None, target_min=None):
+  variable = keras.backend.variable(init(shape))
+  output = keras.backend.get_value(variable)
+  lim = 3e-2
+  if target_std is not None:
+    assert abs(output.std() - target_std) < lim, output.std()
+  if target_mean is not None:
+    assert abs(output.mean() - target_mean) < lim, output.mean()
+  if target_max is not None:
+    assert abs(output.max() - target_max) < lim, output.max()
+  if target_min is not None:
+    assert abs(output.min() - target_min) < lim, output.min()
+
+
+class KerasInitializersTest(test.TestCase):
+
+  def test_uniform(self):
+    tensor_shape = (9, 6, 7)
+    with self.test_session():
+      _runner(keras.initializers.RandomUniform(minval=-1, maxval=1, seed=124),
+              tensor_shape,
+              target_mean=0., target_max=1, target_min=-1)
+
+  def test_normal(self):
+    tensor_shape = (8, 12, 99)
+    with self.test_session():
+      _runner(keras.initializers.RandomNormal(mean=0, stddev=1, seed=153),
+              tensor_shape,
+              target_mean=0., target_std=1)
+
+  def test_truncated_normal(self):
+    tensor_shape = (12, 99, 7)
+    with self.test_session():
+      _runner(keras.initializers.TruncatedNormal(mean=0, stddev=1, seed=126),
+              tensor_shape,
+              target_mean=0., target_std=None, target_max=2)
+
+  def test_constant(self):
+    tensor_shape = (5, 6, 4)
+    with self.test_session():
+      _runner(keras.initializers.Constant(2), tensor_shape,
+              target_mean=2, target_max=2, target_min=2)
+
+  def test_lecun_uniform(self):
+    tensor_shape = (5, 6, 4, 2)
+    with self.test_session():
+      fan_in, _ = keras.initializers._compute_fans(tensor_shape)
+      scale = np.sqrt(3. / fan_in)
+      _runner(keras.initializers.lecun_uniform(seed=123), tensor_shape,
+              target_mean=0., target_max=scale, target_min=-scale)
+
+  def test_glorot_uniform(self):
+    tensor_shape = (5, 6, 4, 2)
+    with self.test_session():
+      fan_in, fan_out = keras.initializers._compute_fans(tensor_shape)
+      scale = np.sqrt(6. / (fan_in + fan_out))
+      _runner(keras.initializers.glorot_uniform(seed=123), tensor_shape,
+              target_mean=0., target_max=scale, target_min=-scale)
+
+  def test_he_uniform(self):
+    tensor_shape = (5, 6, 4, 2)
+    with self.test_session():
+      fan_in, _ = keras.initializers._compute_fans(tensor_shape)
+      scale = np.sqrt(6. / fan_in)
+      _runner(keras.initializers.he_uniform(seed=123), tensor_shape,
+              target_mean=0., target_max=scale, target_min=-scale)
+
+  def test_glorot_normal(self):
+    tensor_shape = (5, 6, 4, 2)
+    with self.test_session():
+      fan_in, fan_out = keras.initializers._compute_fans(tensor_shape)
+      scale = np.sqrt(2. / (fan_in + fan_out))
+      _runner(keras.initializers.glorot_normal(seed=123), tensor_shape,
+              target_mean=0., target_std=None, target_max=2 * scale)
+
+  def test_he_normal(self):
+    tensor_shape = (5, 6, 4, 2)
+    with self.test_session():
+      fan_in, _ = keras.initializers._compute_fans(tensor_shape)
+      scale = np.sqrt(2. / fan_in)
+      _runner(keras.initializers.he_normal(seed=123), tensor_shape,
+              target_mean=0., target_std=None, target_max=2 * scale)
+
+  def test_orthogonal(self):
+    tensor_shape = (7, 8)
+    with self.test_session():
+      _runner(keras.initializers.orthogonal(seed=123), tensor_shape,
+              target_mean=0.)
+
+  def test_identity(self):
+    with self.test_session():
+      tensor_shape = (3, 4, 5)
+      with self.assertRaises(ValueError):
+        _runner(keras.initializers.identity(), tensor_shape,
+                target_mean=1. / tensor_shape[0], target_max=1.)
+
+      tensor_shape = (3, 3)
+      _runner(keras.initializers.identity(), tensor_shape,
+              target_mean=1. / tensor_shape[0], target_max=1.)
+
+  def test_zero(self):
+    tensor_shape = (4, 5)
+    with self.test_session():
+      _runner(keras.initializers.zeros(), tensor_shape,
+              target_mean=0., target_max=0.)
+
+  def test_one(self):
+    tensor_shape = (4, 5)
+    with self.test_session():
+      _runner(keras.initializers.ones(), tensor_shape,
+              target_mean=1., target_max=1.)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/integration_test.py b/tensorflow/contrib/keras/python/keras/integration_test.py
new file mode 100644
index 0000000000..f42f81b286
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/integration_test.py
@@ -0,0 +1,166 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Integration tests for Keras."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class KerasIntegrationTest(test.TestCase):
+
+  def test_vector_classification_declarative(self):
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=200,
+          test_samples=100,
+          input_shape=(8,),
+          num_classes=2)
+      y_train = keras.utils.to_categorical(y_train)
+      y_test = keras.utils.to_categorical(y_test)
+
+      model = keras.models.Sequential([
+          keras.layers.Dense(8,
+                             activation='relu',
+                             input_shape=x_train.shape[1:]),
+          keras.layers.Dropout(0.1),
+          keras.layers.Dense(y_train.shape[-1], activation='softmax')
+      ])
+      model.compile(loss='categorical_crossentropy',
+                    optimizer='rmsprop',
+                    metrics=['accuracy'])
+      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
+                          validation_data=(x_test, y_test),
+                          verbose=2)
+      self.assertTrue(history.history['val_acc'][-1] > 0.85)
+
+  def test_vector_classification_functional(self):
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=200,
+          test_samples=100,
+          input_shape=(8,),
+          num_classes=2)
+      y_train = keras.utils.to_categorical(y_train)
+      y_test = keras.utils.to_categorical(y_test)
+
+      inputs = keras.layers.Input(shape=x_train.shape[1:])
+      x = keras.layers.Dense(8, activation='relu')(inputs)
+      x = keras.layers.Dropout(0.1)(x)
+      outputs = keras.layers.Dense(y_train.shape[-1], activation='softmax')(x)
+
+      model = keras.models.Model(inputs, outputs)
+      model.compile(loss='categorical_crossentropy',
+                    optimizer='rmsprop',
+                    metrics=['accuracy'])
+      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
+                          validation_data=(x_test, y_test),
+                          verbose=2)
+      self.assertTrue(history.history['val_acc'][-1] > 0.85)
+
+  def test_temporal_classification_declarative(self):
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=200,
+          test_samples=100,
+          input_shape=(4, 8),
+          num_classes=2)
+      y_train = keras.utils.to_categorical(y_train)
+      y_test = keras.utils.to_categorical(y_test)
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.LSTM(3, return_sequences=True,
+                                  input_shape=x_train.shape[1:]))
+      model.add(keras.layers.GRU(y_train.shape[-1], activation='softmax'))
+      model.compile(loss='categorical_crossentropy',
+                    optimizer='adam',
+                    metrics=['accuracy'])
+      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
+                          validation_data=(x_test, y_test),
+                          verbose=2)
+      self.assertTrue(history.history['val_acc'][-1] > 0.85)
+
+  def test_image_classification_declarative(self):
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=200,
+          test_samples=100,
+          input_shape=(8, 8, 3),
+          num_classes=2)
+      y_train = keras.utils.to_categorical(y_train)
+      y_test = keras.utils.to_categorical(y_test)
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.Conv2D(
+          8, 3,
+          activation='relu',
+          input_shape=x_train.shape[1:]))
+      model.add(keras.layers.BatchNormalization())
+      model.add(keras.layers.Conv2D(
+          8, 3,
+          padding='same',
+          activation='relu'))
+      model.add(keras.layers.GlobalMaxPooling2D())
+      model.add(keras.layers.Dense(y_train.shape[-1], activation='softmax'))
+      model.compile(loss='categorical_crossentropy',
+                    optimizer='adam',
+                    metrics=['accuracy'])
+      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
+                          validation_data=(x_test, y_test),
+                          verbose=2)
+      self.assertTrue(history.history['val_acc'][-1] > 0.85)
+
+  def test_video_classification_functional(self):
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=200,
+          test_samples=100,
+          input_shape=(4, 8, 8, 3),
+          num_classes=3)
+      y_train = keras.utils.to_categorical(y_train)
+      y_test = keras.utils.to_categorical(y_test)
+
+      inputs = keras.layers.Input(shape=x_train.shape[1:])
+      x = keras.layers.TimeDistributed(
+          keras.layers.Conv2D(4, 3, activation='relu'))(inputs)
+      x = keras.layers.BatchNormalization()(x)
+      x = keras.layers.TimeDistributed(keras.layers.GlobalMaxPooling2D())(x)
+      x = keras.layers.Conv1D(8, 3, activation='relu')(x)
+      x = keras.layers.Flatten()(x)
+      outputs = keras.layers.Dense(y_train.shape[-1], activation='softmax')(x)
+
+      model = keras.models.Model(inputs, outputs)
+      model.compile(loss='categorical_crossentropy',
+                    optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
+                    metrics=['accuracy'])
+      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
+                          validation_data=(x_test, y_test),
+                          verbose=2)
+      self.assertTrue(history.history['val_acc'][-1] > 0.85)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/__init__.py b/tensorflow/contrib/keras/python/keras/layers/__init__.py
new file mode 100644
index 0000000000..9a428f3114
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/__init__.py
@@ -0,0 +1,40 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras layers module.
+"""
+# pylint: disable=wildcard-import
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.engine import Input
+from tensorflow.contrib.keras.python.keras.engine import InputLayer
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.contrib.keras.python.keras.layers.advanced_activations import *
+from tensorflow.contrib.keras.python.keras.layers.convolutional import *
+from tensorflow.contrib.keras.python.keras.layers.convolutional_recurrent import *
+from tensorflow.contrib.keras.python.keras.layers.core import *
+from tensorflow.contrib.keras.python.keras.layers.embeddings import *
+from tensorflow.contrib.keras.python.keras.layers.local import *
+from tensorflow.contrib.keras.python.keras.layers.merge import *
+from tensorflow.contrib.keras.python.keras.layers.noise import *
+from tensorflow.contrib.keras.python.keras.layers.normalization import *
+from tensorflow.contrib.keras.python.keras.layers.pooling import *
+from tensorflow.contrib.keras.python.keras.layers.recurrent import *
+from tensorflow.contrib.keras.python.keras.layers.serialization import deserialize
+from tensorflow.contrib.keras.python.keras.layers.serialization import serialize
+from tensorflow.contrib.keras.python.keras.layers.wrappers import *
+
diff --git a/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py b/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
new file mode 100644
index 0000000000..b3abfc29d2
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
@@ -0,0 +1,222 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Layers that act as activation functions.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.python.framework import tensor_shape
+
+
+class LeakyReLU(Layer):
+  """Leaky version of a Rectified Linear Unit.
+
+  It allows a small gradient when the unit is not active:
+  `f(x) = alpha * x for x < 0`,
+  `f(x) = x for x >= 0`.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as the input.
+
+  Arguments:
+      alpha: float >= 0. Negative slope coefficient.
+
+  """
+
+  def __init__(self, alpha=0.3, **kwargs):
+    super(LeakyReLU, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.alpha = K.cast_to_floatx(alpha)
+
+  def call(self, inputs):
+    return K.relu(inputs, alpha=self.alpha)
+
+  def get_config(self):
+    config = {'alpha': self.alpha}
+    base_config = super(LeakyReLU, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class PReLU(Layer):
+  """Parametric Rectified Linear Unit.
+
+  It follows:
+  `f(x) = alpha * x for x < 0`,
+  `f(x) = x for x >= 0`,
+  where `alpha` is a learned array with the same shape as x.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as the input.
+
+  Arguments:
+      alpha_initializer: initializer function for the weights.
+      alpha_regularizer: regularizer for the weights.
+      alpha_constraint: constraint for the weights.
+      shared_axes: the axes along which to share learnable
+          parameters for the activation function.
+          For example, if the incoming feature maps
+          are from a 2D convolution
+          with output shape `(batch, height, width, channels)`,
+          and you wish to share parameters across space
+          so that each filter only has one set of parameters,
+          set `shared_axes=[1, 2]`.
+
+  """
+
+  def __init__(self,
+               alpha_initializer='zeros',
+               alpha_regularizer=None,
+               alpha_constraint=None,
+               shared_axes=None,
+               **kwargs):
+    super(PReLU, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.alpha_initializer = initializers.get(alpha_initializer)
+    self.alpha_regularizer = regularizers.get(alpha_regularizer)
+    self.alpha_constraint = constraints.get(alpha_constraint)
+    if shared_axes is None:
+      self.shared_axes = None
+    elif not isinstance(shared_axes, (list, tuple)):
+      self.shared_axes = [shared_axes]
+    else:
+      self.shared_axes = list(shared_axes)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    param_shape = input_shape[1:]
+    self.param_broadcast = [False] * len(param_shape)
+    if self.shared_axes is not None:
+      for i in self.shared_axes:
+        param_shape[i - 1] = 1
+        self.param_broadcast[i - 1] = True
+    self.alpha = self.add_weight(
+        param_shape,
+        name='alpha',
+        initializer=self.alpha_initializer,
+        regularizer=self.alpha_regularizer,
+        constraint=self.alpha_constraint)
+    # Set input spec
+    axes = {}
+    if self.shared_axes:
+      for i in range(1, len(input_shape)):
+        if i not in self.shared_axes:
+          axes[i] = input_shape[i]
+    self.input_spec = InputSpec(ndim=len(input_shape), axes=axes)
+    self.built = True
+
+  def call(self, inputs, mask=None):
+    pos = K.relu(inputs)
+    if K.backend() == 'theano':
+      neg = (K.pattern_broadcast(self.alpha, self.param_broadcast) *
+             (inputs - K.abs(inputs)) * 0.5)
+    else:
+      neg = -self.alpha * K.relu(-inputs)
+    return pos + neg
+
+  def get_config(self):
+    config = {
+        'alpha_initializer': initializers.serialize(self.alpha_initializer),
+        'alpha_regularizer': regularizers.serialize(self.alpha_regularizer),
+        'alpha_constraint': constraints.serialize(self.alpha_constraint),
+        'shared_axes': self.shared_axes
+    }
+    base_config = super(PReLU, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class ELU(Layer):
+  """Exponential Linear Unit.
+
+  It follows:
+  `f(x) =  alpha * (exp(x) - 1.) for x < 0`,
+  `f(x) = x for x >= 0`.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as the input.
+
+  Arguments:
+      alpha: scale for the negative factor.
+
+  """
+
+  def __init__(self, alpha=1.0, **kwargs):
+    super(ELU, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.alpha = K.cast_to_floatx(alpha)
+
+  def call(self, inputs):
+    return K.elu(inputs, self.alpha)
+
+  def get_config(self):
+    config = {'alpha': float(self.alpha)}
+    base_config = super(ELU, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class ThresholdedReLU(Layer):
+  """Thresholded Rectified Linear Unit.
+
+  It follows:
+  `f(x) = x for x > theta`,
+  `f(x) = 0 otherwise`.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as the input.
+
+  Arguments:
+      theta: float >= 0. Threshold location of activation.
+
+  """
+
+  def __init__(self, theta=1.0, **kwargs):
+    super(ThresholdedReLU, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.theta = K.cast_to_floatx(theta)
+
+  def call(self, inputs, mask=None):
+    return inputs * K.cast(inputs > self.theta, K.floatx())
+
+  def get_config(self):
+    config = {'theta': float(self.theta)}
+    base_config = super(ThresholdedReLU, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/advanced_activations_test.py b/tensorflow/contrib/keras/python/keras/layers/advanced_activations_test.py
new file mode 100644
index 0000000000..1be56123d8
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/advanced_activations_test.py
@@ -0,0 +1,61 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for advanced activation layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class AdvancedActivationsTest(test.TestCase):
+
+  def test_leaky_relu(self):
+    with self.test_session():
+      for alpha in [0., .5, -1.]:
+        testing_utils.layer_test(keras.layers.LeakyReLU,
+                                 kwargs={'alpha': alpha},
+                                 input_shape=(2, 3, 4))
+
+  def test_prelu(self):
+    with self.test_session():
+      testing_utils.layer_test(keras.layers.PReLU, kwargs={},
+                               input_shape=(2, 3, 4))
+
+  def test_prelu_share(self):
+    with self.test_session():
+      testing_utils.layer_test(keras.layers.PReLU,
+                               kwargs={'shared_axes': 1},
+                               input_shape=(2, 3, 4))
+
+  def test_elu(self):
+    with self.test_session():
+      for alpha in [0., .5, -1.]:
+        testing_utils.layer_test(keras.layers.ELU,
+                                 kwargs={'alpha': alpha},
+                                 input_shape=(2, 3, 4))
+
+  def test_thresholded_relu(self):
+    with self.test_session():
+      testing_utils.layer_test(keras.layers.ThresholdedReLU,
+                               kwargs={'theta': 0.5},
+                               input_shape=(2, 3, 4))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional.py b/tensorflow/contrib/keras/python/keras/layers/convolutional.py
new file mode 100644
index 0000000000..1a28399a28
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional.py
@@ -0,0 +1,1792 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras convolution layers and image transformation layers.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras import activations
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine import Layer
+# imports for backwards namespace compatibility
+# pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.layers.pooling import AveragePooling1D
+from tensorflow.contrib.keras.python.keras.layers.pooling import AveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers.pooling import AveragePooling3D
+from tensorflow.contrib.keras.python.keras.layers.pooling import MaxPooling1D
+from tensorflow.contrib.keras.python.keras.layers.pooling import MaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers.pooling import MaxPooling3D
+# pylint: enable=unused-import
+from tensorflow.contrib.keras.python.keras.utils import conv_utils
+from tensorflow.python.framework import tensor_shape
+
+
+class _Conv(Layer):
+  """Abstract nD convolution layer (private, used as implementation base).
+
+  This layer creates a convolution kernel that is convolved
+  with the layer input to produce a tensor of outputs.
+  If `use_bias` is True, a bias vector is created and added to the outputs.
+  Finally, if `activation` is not `None`,
+  it is applied to the outputs as well.
+
+  Arguments:
+      rank: An integer, the rank of the convolution,
+          e.g. "2" for 2D convolution.
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      kernel_size: An integer or tuple/list of n integers, specifying the
+          dimensions of the convolution window.
+      strides: An integer or tuple/list of n integers,
+          specifying the strides of the convolution.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, ..., channels)` while `channels_first` corresponds to
+          inputs with shape `(batch, channels, ...)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      dilation_rate: An integer or tuple/list of n integers, specifying
+          the dilation rate to use for dilated convolution.
+          Currently, specifying any `dilation_rate` value != 1 is
+          incompatible with specifying any `strides` value != 1.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to the kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+  """
+
+  def __init__(self,
+               rank,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(_Conv, self).__init__(**kwargs)
+    self.rank = rank
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(kernel_size, rank,
+                                                  'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, rank, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, rank,
+                                                    'dilation_rate')
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+    self.input_spec = InputSpec(ndim=self.rank + 2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = input_shape[channel_axis]
+    kernel_shape = self.kernel_size + (input_dim, self.filters)
+
+    self.kernel = self.add_weight(
+        kernel_shape,
+        initializer=self.kernel_initializer,
+        name='kernel',
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+    if self.use_bias:
+      self.bias = self.add_weight(
+          (self.filters,),
+          initializer=self.bias_initializer,
+          name='bias',
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    # Set input spec.
+    self.input_spec = InputSpec(
+        ndim=self.rank + 2, axes={channel_axis: input_dim})
+    self.built = True
+
+  def call(self, inputs):
+    if self.rank == 1:
+      outputs = K.conv1d(
+          inputs,
+          self.kernel,
+          strides=self.strides[0],
+          padding=self.padding,
+          data_format=self.data_format,
+          dilation_rate=self.dilation_rate[0])
+    if self.rank == 2:
+      outputs = K.conv2d(
+          inputs,
+          self.kernel,
+          strides=self.strides,
+          padding=self.padding,
+          data_format=self.data_format,
+          dilation_rate=self.dilation_rate)
+    if self.rank == 3:
+      outputs = K.conv3d(
+          inputs,
+          self.kernel,
+          strides=self.strides,
+          padding=self.padding,
+          data_format=self.data_format,
+          dilation_rate=self.dilation_rate)
+
+    if self.use_bias:
+      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_last':
+      space = input_shape[1:-1]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = conv_utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0]] + new_space +
+                                      [self.filters])
+    else:
+      space = input_shape[2:]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = conv_utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0], self.filters] +
+                                      new_space)
+
+  def get_config(self):
+    config = {
+        'rank':
+            self.rank,
+        'filters':
+            self.filters,
+        'kernel_size':
+            self.kernel_size,
+        'strides':
+            self.strides,
+        'padding':
+            self.padding,
+        'data_format':
+            self.data_format,
+        'dilation_rate':
+            self.dilation_rate,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'bias_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(_Conv, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Conv1D(_Conv):
+  """1D convolution layer (e.g.
+
+  temporal convolution).
+
+  This layer creates a convolution kernel that is convolved
+  with the layer input over a single spatial (or temporal) dimension
+  to produce a tensor of outputs.
+  If `use_bias` is True, a bias vector is created and added to the outputs.
+  Finally, if `activation` is not `None`,
+  it is applied to the outputs as well.
+
+  When using this layer as the first layer in a model,
+  provide an `input_shape` argument
+  (tuple of integers or `None`, e.g.
+  `(10, 128)` for sequences of 10 vectors of 128-dimensional vectors,
+  or `(None, 128)` for variable-length sequences of 128-dimensional vectors.
+
+  Arguments:
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      kernel_size: An integer or tuple/list of a single integer,
+          specifying the length of the 1D convolution window.
+      strides: An integer or tuple/list of a single integer,
+          specifying the stride length of the convolution.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: One of `"valid"`, `"causal"` or `"same"` (case-insensitive).
+          `"causal"` results in causal (dilated) convolutions, e.g. output[t]
+          depends solely on input[:t-1]. Useful when modeling temporal data
+          where the model should not violate the temporal order.
+          See [WaveNet: A Generative Model for Raw Audio, section
+            2.1](https://arxiv.org/abs/1609.03499).
+      dilation_rate: an integer or tuple/list of a single integer, specifying
+          the dilation rate to use for dilated convolution.
+          Currently, specifying any `dilation_rate` value != 1 is
+          incompatible with specifying any `strides` value != 1.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to the kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+      3D tensor with shape: `(batch_size, steps, input_dim)`
+
+  Output shape:
+      3D tensor with shape: `(batch_size, new_steps, filters)`
+      `steps` value might have changed due to padding or strides.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               dilation_rate=1,
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(Conv1D, self).__init__(
+        rank=1,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format='channels_last',
+        dilation_rate=dilation_rate,
+        activation=activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        bias_constraint=bias_constraint,
+        **kwargs)
+    self.input_spec = InputSpec(ndim=3)
+
+  def get_config(self):
+    config = super(Conv1D, self).get_config()
+    config.pop('rank')
+    config.pop('data_format')
+    return config
+
+
+class Conv2D(_Conv):
+  """2D convolution layer (e.g. spatial convolution over images).
+
+  This layer creates a convolution kernel that is convolved
+  with the layer input to produce a tensor of
+  outputs. If `use_bias` is True,
+  a bias vector is created and added to the outputs. Finally, if
+  `activation` is not `None`, it is applied to the outputs as well.
+
+  When using this layer as the first layer in a model,
+  provide the keyword argument `input_shape`
+  (tuple of integers, does not include the sample axis),
+  e.g. `input_shape=(128, 128, 3)` for 128x128 RGB pictures
+  in `data_format="channels_last"`.
+
+  Arguments:
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      kernel_size: An integer or tuple/list of 2 integers, specifying the
+          width and height of the 2D convolution window.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+      strides: An integer or tuple/list of 2 integers,
+          specifying the strides of the convolution along the width and height.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: one of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      dilation_rate: an integer or tuple/list of 2 integers, specifying
+          the dilation rate to use for dilated convolution.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Currently, specifying any `dilation_rate` value != 1 is
+          incompatible with specifying any stride value != 1.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to the kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+      4D tensor with shape:
+      `(samples, channels, rows, cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+  Output shape:
+      4D tensor with shape:
+      `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
+      `rows` and `cols` values might have changed due to padding.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format=None,
+               dilation_rate=(1, 1),
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(Conv2D, self).__init__(
+        rank=2,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        bias_constraint=bias_constraint,
+        **kwargs)
+    self.input_spec = InputSpec(ndim=4)
+
+  def get_config(self):
+    config = super(Conv2D, self).get_config()
+    config.pop('rank')
+    return config
+
+
+class Conv3D(_Conv):
+  """3D convolution layer (e.g.
+
+  spatial convolution over volumes).
+
+  This layer creates a convolution kernel that is convolved
+  with the layer input to produce a tensor of
+  outputs. If `use_bias` is True,
+  a bias vector is created and added to the outputs. Finally, if
+  `activation` is not `None`, it is applied to the outputs as well.
+
+  When using this layer as the first layer in a model,
+  provide the keyword argument `input_shape`
+  (tuple of integers, does not include the sample axis),
+  e.g. `input_shape=(128, 128, 128, 3)` for 128x128x128 volumes
+  with a single channel,
+  in `data_format="channels_last"`.
+
+  Arguments:
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      kernel_size: An integer or tuple/list of 3 integers, specifying the
+          width and height of the 3D convolution window.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+      strides: An integer or tuple/list of 3 integers,
+          specifying the strides of the convolution along each spatial
+            dimension.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: one of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+          while `channels_first` corresponds to inputs with shape
+          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      dilation_rate: an integer or tuple/list of 3 integers, specifying
+          the dilation rate to use for dilated convolution.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Currently, specifying any `dilation_rate` value != 1 is
+          incompatible with specifying any stride value != 1.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to the kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+      5D tensor with shape:
+      `(samples, channels, conv_dim1, conv_dim2, conv_dim3)` if
+        data_format='channels_first'
+      or 5D tensor with shape:
+      `(samples, conv_dim1, conv_dim2, conv_dim3, channels)` if
+        data_format='channels_last'.
+
+  Output shape:
+      5D tensor with shape:
+      `(samples, filters, new_conv_dim1, new_conv_dim2, new_conv_dim3)` if
+        data_format='channels_first'
+      or 5D tensor with shape:
+      `(samples, new_conv_dim1, new_conv_dim2, new_conv_dim3, filters)` if
+        data_format='channels_last'.
+      `new_conv_dim1`, `new_conv_dim2` and `new_conv_dim3` values might have
+        changed due to padding.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1, 1),
+               padding='valid',
+               data_format=None,
+               dilation_rate=(1, 1, 1),
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(Conv3D, self).__init__(
+        rank=3,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        bias_constraint=bias_constraint,
+        **kwargs)
+    self.input_spec = InputSpec(ndim=5)
+
+  def get_config(self):
+    config = super(Conv3D, self).get_config()
+    config.pop('rank')
+    return config
+
+
+class Conv2DTranspose(Conv2D):
+  """Transposed convolution layer (sometimes called Deconvolution).
+
+  The need for transposed convolutions generally arises
+  from the desire to use a transformation going in the opposite direction
+  of a normal convolution, i.e., from something that has the shape of the
+  output of some convolution to something that has the shape of its input
+  while maintaining a connectivity pattern that is compatible with
+  said convolution.
+
+  When using this layer as the first layer in a model,
+  provide the keyword argument `input_shape`
+  (tuple of integers, does not include the sample axis),
+  e.g. `input_shape=(128, 128, 3)` for 128x128 RGB pictures
+  in `data_format="channels_last"`.
+
+  Arguments:
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      kernel_size: An integer or tuple/list of 2 integers, specifying the
+          width and height of the 2D convolution window.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+      strides: An integer or tuple/list of 2 integers,
+          specifying the strides of the convolution along the width and height.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: one of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      dilation_rate: an integer or tuple/list of 2 integers, specifying
+          the dilation rate to use for dilated convolution.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Currently, specifying any `dilation_rate` value != 1 is
+          incompatible with specifying any stride value != 1.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to the kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+      4D tensor with shape:
+      `(batch, channels, rows, cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(batch, rows, cols, channels)` if data_format='channels_last'.
+
+  Output shape:
+      4D tensor with shape:
+      `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
+      `rows` and `cols` values might have changed due to padding.
+
+  References:
+      - [A guide to convolution arithmetic for deep
+        learning](https://arxiv.org/abs/1603.07285v1)
+      - [Deconvolutional
+        Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf)
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format='channels_last',
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(Conv2DTranspose, self).__init__(
+        filters,
+        kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        activation=activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        bias_constraint=bias_constraint,
+        **kwargs)
+    self.input_spec = InputSpec(ndim=4)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if len(input_shape) != 4:
+      raise ValueError(
+          'Inputs should have rank ' + str(4) + '; Received input shape:',
+          str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = input_shape[channel_axis]
+    kernel_shape = self.kernel_size + (self.filters, input_dim)
+
+    self.kernel = self.add_weight(
+        kernel_shape,
+        initializer=self.kernel_initializer,
+        name='kernel',
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+    if self.use_bias:
+      self.bias = self.add_weight(
+          (self.filters,),
+          initializer=self.bias_initializer,
+          name='bias',
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    # Set input spec.
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    self.built = True
+
+  def call(self, inputs):
+    input_shape = K.shape(inputs)
+    batch_size = input_shape[0]
+    if self.data_format == 'channels_first':
+      h_axis, w_axis = 2, 3
+    else:
+      h_axis, w_axis = 1, 2
+
+    height, width = input_shape[h_axis], input_shape[w_axis]
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    # Infer the dynamic output shape:
+    out_height = conv_utils.deconv_length(height, stride_h, kernel_h,
+                                          self.padding)
+    out_width = conv_utils.deconv_length(width, stride_w, kernel_w,
+                                         self.padding)
+    if self.data_format == 'channels_first':
+      output_shape = (batch_size, self.filters, out_height, out_width)
+    else:
+      output_shape = (batch_size, out_height, out_width, self.filters)
+
+    outputs = K.conv2d_transpose(
+        inputs,
+        self.kernel,
+        output_shape,
+        self.strides,
+        padding=self.padding,
+        data_format=self.data_format)
+
+    if self.bias:
+      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      c_axis, h_axis, w_axis = 3, 1, 2
+
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[h_axis] = conv_utils.deconv_length(
+        output_shape[h_axis], stride_h, kernel_h, self.padding)
+    output_shape[w_axis] = conv_utils.deconv_length(
+        output_shape[w_axis], stride_w, kernel_w, self.padding)
+    return tensor_shape.TensorShape(output_shape)
+
+  def get_config(self):
+    config = super(Conv2DTranspose, self).get_config()
+    config.pop('dilation_rate')
+    return config
+
+
+class SeparableConv2D(Conv2D):
+  """Depthwise separable 2D convolution.
+
+  Separable convolutions consist in first performing
+  a depthwise spatial convolution
+  (which acts on each input channel separately)
+  followed by a pointwise convolution which mixes together the resulting
+  output channels. The `depth_multiplier` argument controls how many
+  output channels are generated per input channel in the depthwise step.
+
+  Intuitively, separable convolutions can be understood as
+  a way to factorize a convolution kernel into two smaller kernels,
+  or as an extreme version of an Inception block.
+
+  Arguments:
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      kernel_size: An integer or tuple/list of 2 integers, specifying the
+          width and height of the 2D convolution window.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+      strides: An integer or tuple/list of 2 integers,
+          specifying the strides of the convolution along the width and height.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: one of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      depth_multiplier: The number of depthwise convolution output channels
+          for each input channel.
+          The total number of depthwise convolution output
+          channels will be equal to `filterss_in * depth_multiplier`.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      depthwise_initializer: Initializer for the depthwise kernel matrix.
+      pointwise_initializer: Initializer for the pointwise kernel matrix.
+      bias_initializer: Initializer for the bias vector.
+      depthwise_regularizer: Regularizer function applied to
+          the depthwise kernel matrix.
+      pointwise_regularizer: Regularizer function applied to
+          the depthwise kernel matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      depthwise_constraint: Constraint function applied to
+          the depthwise kernel matrix.
+      pointwise_constraint: Constraint function applied to
+          the pointwise kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+      4D tensor with shape:
+      `(batch, channels, rows, cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(batch, rows, cols, channels)` if data_format='channels_last'.
+
+  Output shape:
+      4D tensor with shape:
+      `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
+      `rows` and `cols` values might have changed due to padding.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format=None,
+               depth_multiplier=1,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               pointwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               pointwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               pointwise_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(SeparableConv2D, self).__init__(
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        activation=activation,
+        use_bias=use_bias,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        bias_constraint=bias_constraint,
+        **kwargs)
+    self.depth_multiplier = depth_multiplier
+    self.depthwise_initializer = initializers.get(depthwise_initializer)
+    self.pointwise_initializer = initializers.get(pointwise_initializer)
+    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+    self.pointwise_regularizer = regularizers.get(pointwise_regularizer)
+    self.depthwise_constraint = constraints.get(depthwise_constraint)
+    self.pointwise_constraint = constraints.get(pointwise_constraint)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if len(input_shape) < 4:
+      raise ValueError('Inputs to `SeparableConv2D` should have rank 4. '
+                       'Received input shape:', str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = 3
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs to '
+                       '`SeparableConv2D` '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    depthwise_kernel_shape = (self.kernel_size[0], self.kernel_size[1],
+                              input_dim, self.depth_multiplier)
+    pointwise_kernel_shape = (1, 1, self.depth_multiplier * input_dim,
+                              self.filters)
+
+    self.depthwise_kernel = self.add_weight(
+        depthwise_kernel_shape,
+        initializer=self.depthwise_initializer,
+        name='depthwise_kernel',
+        regularizer=self.depthwise_regularizer,
+        constraint=self.depthwise_constraint)
+    self.pointwise_kernel = self.add_weight(
+        pointwise_kernel_shape,
+        initializer=self.pointwise_initializer,
+        name='pointwise_kernel',
+        regularizer=self.pointwise_regularizer,
+        constraint=self.pointwise_constraint)
+
+    if self.use_bias:
+      self.bias = self.add_weight(
+          (self.filters,),
+          initializer=self.bias_initializer,
+          name='bias',
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    # Set input spec.
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    self.built = True
+
+  def call(self, inputs):
+    outputs = K.separable_conv2d(
+        inputs,
+        self.depthwise_kernel,
+        self.pointwise_kernel,
+        data_format=self.data_format,
+        strides=self.strides,
+        padding=self.padding)
+
+    if self.bias:
+      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+    else:
+      rows = input_shape[1]
+      cols = input_shape[2]
+
+    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
+                                         self.padding, self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
+                                         self.padding, self.strides[1])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], self.filters, rows, cols])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], rows, cols, self.filters])
+
+  def get_config(self):
+    config = super(SeparableConv2D, self).get_config()
+    config.pop('kernel_initializer')
+    config.pop('kernel_regularizer')
+    config.pop('kernel_constraint')
+    config['depth_multiplier'] = self.depth_multiplier
+    config['depthwise_initializer'] = initializers.serialize(
+        self.depthwise_initializer)
+    config['pointwise_initializer'] = initializers.serialize(
+        self.pointwise_initializer)
+    config['depthwise_regularizer'] = regularizers.serialize(
+        self.depthwise_regularizer)
+    config['pointwise_regularizer'] = regularizers.serialize(
+        self.pointwise_regularizer)
+    config['depthwise_constraint'] = constraints.serialize(
+        self.depthwise_constraint)
+    config['pointwise_constraint'] = constraints.serialize(
+        self.pointwise_constraint)
+    return config
+
+
+class UpSampling1D(Layer):
+  """Upsampling layer for 1D inputs.
+
+  Repeats each temporal step `size` times along the time axis.
+
+  Arguments:
+      size: integer. Upsampling factor.
+
+  Input shape:
+      3D tensor with shape: `(batch, steps, features)`.
+
+  Output shape:
+      3D tensor with shape: `(batch, upsampled_steps, features)`.
+  """
+
+  def __init__(self, size=2, **kwargs):
+    super(UpSampling1D, self).__init__(**kwargs)
+    self.size = int(size)
+    self.input_spec = InputSpec(ndim=3)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    size = self.size * input_shape[1] if input_shape[1] is not None else None
+    return tensor_shape.TensorShape([input_shape[0], size, input_shape[2]])
+
+  def call(self, inputs):
+    output = K.repeat_elements(inputs, self.size, axis=1)
+    return output
+
+  def get_config(self):
+    config = {'size': self.size}
+    base_config = super(UpSampling1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class UpSampling2D(Layer):
+  """Upsampling layer for 2D inputs.
+
+  Repeats the rows and columns of the data
+  by size[0] and size[1] respectively.
+
+  Arguments:
+      size: int, or tuple of 2 integers.
+          The upsampling factors for rows and columns.
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      4D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, rows, cols, channels)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, channels, rows, cols)`
+
+  Output shape:
+      4D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, upsampled_rows, upsampled_cols, channels)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, channels, upsampled_rows, upsampled_cols)`
+  """
+
+  def __init__(self, size=(2, 2), data_format=None, **kwargs):
+    super(UpSampling2D, self).__init__(**kwargs)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.size = conv_utils.normalize_tuple(size, 2, 'size')
+    self.input_spec = InputSpec(ndim=4)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      height = self.size[0] * input_shape[2] if input_shape[
+          2] is not None else None
+      width = self.size[1] * input_shape[3] if input_shape[
+          3] is not None else None
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], height, width])
+    else:
+      height = self.size[0] * input_shape[1] if input_shape[
+          1] is not None else None
+      width = self.size[1] * input_shape[2] if input_shape[
+          2] is not None else None
+      return tensor_shape.TensorShape(
+          [input_shape[0], height, width, input_shape[3]])
+
+  def call(self, inputs):
+    return K.resize_images(inputs, self.size[0], self.size[1], self.data_format)
+
+  def get_config(self):
+    config = {'size': self.size, 'data_format': self.data_format}
+    base_config = super(UpSampling2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class UpSampling3D(Layer):
+  """Upsampling layer for 3D inputs.
+
+  Repeats the 1st, 2nd and 3rd dimensions
+  of the data by size[0], size[1] and size[2] respectively.
+
+  Arguments:
+      size: int, or tuple of 3 integers.
+          The upsampling factors for dim1, dim2 and dim3.
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+          while `channels_first` corresponds to inputs with shape
+          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      5D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, dim1, dim2, dim3, channels)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, channels, dim1, dim2, dim3)`
+
+  Output shape:
+      5D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, upsampled_dim1, upsampled_dim2, upsampled_dim3, channels)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, channels, upsampled_dim1, upsampled_dim2, upsampled_dim3)`
+  """
+
+  def __init__(self, size=(2, 2, 2), data_format=None, **kwargs):
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.size = conv_utils.normalize_tuple(size, 3, 'size')
+    self.input_spec = InputSpec(ndim=5)
+    super(UpSampling3D, self).__init__(**kwargs)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      dim1 = self.size[0] * input_shape[2] if input_shape[
+          2] is not None else None
+      dim2 = self.size[1] * input_shape[3] if input_shape[
+          3] is not None else None
+      dim3 = self.size[2] * input_shape[4] if input_shape[
+          4] is not None else None
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], dim1, dim2, dim3])
+    else:
+      dim1 = self.size[0] * input_shape[1] if input_shape[
+          1] is not None else None
+      dim2 = self.size[1] * input_shape[2] if input_shape[
+          2] is not None else None
+      dim3 = self.size[2] * input_shape[3] if input_shape[
+          3] is not None else None
+      return tensor_shape.TensorShape(
+          [input_shape[0], dim1, dim2, dim3, input_shape[4]])
+
+  def call(self, inputs):
+    return K.resize_volumes(inputs, self.size[0], self.size[1], self.size[2],
+                            self.data_format)
+
+  def get_config(self):
+    config = {'size': self.size, 'data_format': self.data_format}
+    base_config = super(UpSampling3D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class ZeroPadding1D(Layer):
+  """Zero-padding layer for 1D input (e.g. temporal sequence).
+
+  Arguments:
+      padding: int, or tuple of int (length 2), or dictionary.
+          - If int:
+          How many zeros to add at the beginning and end of
+          the padding dimension (axis 1).
+          - If tuple of int (length 2):
+          How many zeros to add at the beginning and at the end of
+          the padding dimension (`(left_pad, right_pad)`).
+
+  Input shape:
+      3D tensor with shape `(batch, axis_to_pad, features)`
+
+  Output shape:
+      3D tensor with shape `(batch, padded_axis, features)`
+  """
+
+  def __init__(self, padding=1, **kwargs):
+    super(ZeroPadding1D, self).__init__(**kwargs)
+    self.padding = conv_utils.normalize_tuple(padding, 2, 'padding')
+    self.input_spec = InputSpec(ndim=3)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    length = input_shape[1] + self.padding[0] + self.padding[1] if input_shape[
+        1] is not None else None
+    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
+
+  def call(self, inputs):
+    return K.temporal_padding(inputs, padding=self.padding)
+
+  def get_config(self):
+    config = {'padding': self.padding}
+    base_config = super(ZeroPadding1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class ZeroPadding2D(Layer):
+  """Zero-padding layer for 2D input (e.g. picture).
+
+  This layer can add rows and columns or zeros
+  at the top, bottom, left and right side of an image tensor.
+
+  Arguments:
+      padding: int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints.
+          - If int: the same symmetric padding
+              is applied to width and height.
+          - If tuple of 2 ints:
+              interpreted as two different
+              symmetric padding values for height and width:
+              `(symmetric_height_pad, symmetrc_width_pad)`.
+          - If tuple of 2 tuples of 2 ints:
+              interpreted as
+              `((top_pad, bottom_pad), (left_pad, right_pad))`
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      4D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, rows, cols, channels)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, channels, rows, cols)`
+
+  Output shape:
+      4D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, padded_rows, padded_cols, channels)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, channels, padded_rows, padded_cols)`
+  """
+
+  def __init__(self, padding=(1, 1), data_format=None, **kwargs):
+    super(ZeroPadding2D, self).__init__(**kwargs)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    if isinstance(padding, int):
+      self.padding = ((padding, padding), (padding, padding))
+    elif hasattr(padding, '__len__'):
+      if len(padding) != 2:
+        raise ValueError('`padding` should have two elements. '
+                         'Found: ' + str(padding))
+      height_padding = conv_utils.normalize_tuple(padding[0], 2,
+                                                  '1st entry of padding')
+      width_padding = conv_utils.normalize_tuple(padding[1], 2,
+                                                 '2nd entry of padding')
+      self.padding = (height_padding, width_padding)
+    else:
+      raise ValueError('`padding` should be either an int, '
+                       'a tuple of 2 ints '
+                       '(symmetric_height_pad, symmetric_width_pad), '
+                       'or a tuple of 2 tuples of 2 ints '
+                       '((top_pad, bottom_pad), (left_pad, right_pad)). '
+                       'Found: ' + str(padding))
+    self.input_spec = InputSpec(ndim=4)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[2] + self.padding[0][0] + self.padding[0][
+          1] if input_shape[2] is not None else None
+      cols = input_shape[3] + self.padding[1][0] + self.padding[1][
+          1] if input_shape[3] is not None else None
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], rows, cols])
+    else:
+      rows = input_shape[1] + self.padding[0][0] + self.padding[0][
+          1] if input_shape[1] is not None else None
+      cols = input_shape[2] + self.padding[1][0] + self.padding[1][
+          1] if input_shape[2] is not None else None
+      return tensor_shape.TensorShape(
+          [input_shape[0], rows, cols, input_shape[3]])
+
+  def call(self, inputs):
+    return K.spatial_2d_padding(
+        inputs, padding=self.padding, data_format=self.data_format)
+
+  def get_config(self):
+    config = {'padding': self.padding, 'data_format': self.data_format}
+    base_config = super(ZeroPadding2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class ZeroPadding3D(Layer):
+  """Zero-padding layer for 3D data (spatial or spatio-temporal).
+
+  Arguments:
+      padding: int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints.
+          - If int: the same symmetric padding
+              is applied to width and height.
+          - If tuple of 2 ints:
+              interpreted as two different
+              symmetric padding values for height and width:
+              `(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad)`.
+          - If tuple of 2 tuples of 2 ints:
+              interpreted as
+              `((left_dim1_pad, right_dim1_pad), (left_dim2_pad,
+                right_dim2_pad), (left_dim3_pad, right_dim3_pad))`
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+          while `channels_first` corresponds to inputs with shape
+          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      5D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, first_axis_to_pad, second_axis_to_pad, third_axis_to_pad,
+            depth)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, depth, first_axis_to_pad, second_axis_to_pad,
+            third_axis_to_pad)`
+
+  Output shape:
+      5D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, first_padded_axis, second_padded_axis, third_axis_to_pad,
+            depth)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, depth, first_padded_axis, second_padded_axis,
+            third_axis_to_pad)`
+  """
+
+  def __init__(self, padding=(1, 1, 1), data_format=None, **kwargs):
+    super(ZeroPadding3D, self).__init__(**kwargs)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    if isinstance(padding, int):
+      self.padding = ((padding, padding), (padding, padding), (padding,
+                                                               padding))
+    elif hasattr(padding, '__len__'):
+      if len(padding) != 3:
+        raise ValueError('`padding` should have 3 elements. '
+                         'Found: ' + str(padding))
+      dim1_padding = conv_utils.normalize_tuple(padding[0], 2,
+                                                '1st entry of padding')
+      dim2_padding = conv_utils.normalize_tuple(padding[1], 2,
+                                                '2nd entry of padding')
+      dim3_padding = conv_utils.normalize_tuple(padding[2], 2,
+                                                '3rd entry of padding')
+      self.padding = (dim1_padding, dim2_padding, dim3_padding)
+    else:
+      raise ValueError(
+          '`padding` should be either an int, '
+          'a tuple of 3 ints '
+          '(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad), '
+          'or a tuple of 3 tuples of 2 ints '
+          '((left_dim1_pad, right_dim1_pad),'
+          ' (left_dim2_pad, right_dim2_pad),'
+          ' (left_dim3_pad, right_dim2_pad)). '
+          'Found: ' + str(padding))
+    self.input_spec = InputSpec(ndim=5)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      dim1 = input_shape[2] + 2 * self.padding[0][0] if input_shape[
+          2] is not None else None
+      dim2 = input_shape[3] + 2 * self.padding[1][0] if input_shape[
+          3] is not None else None
+      dim3 = input_shape[4] + 2 * self.padding[2][0] if input_shape[
+          4] is not None else None
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], dim1, dim2, dim3])
+    else:
+      dim1 = input_shape[1] + 2 * self.padding[0][1] if input_shape[
+          1] is not None else None
+      dim2 = input_shape[2] + 2 * self.padding[1][1] if input_shape[
+          2] is not None else None
+      dim3 = input_shape[3] + 2 * self.padding[2][1] if input_shape[
+          3] is not None else None
+      return tensor_shape.TensorShape(
+          [input_shape[0], dim1, dim2, dim3, input_shape[4]])
+
+  def call(self, inputs):
+    return K.spatial_3d_padding(
+        inputs, padding=self.padding, data_format=self.data_format)
+
+  def get_config(self):
+    config = {'padding': self.padding, 'data_format': self.data_format}
+    base_config = super(ZeroPadding3D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Cropping1D(Layer):
+  """Cropping layer for 1D input (e.g. temporal sequence).
+
+  It crops along the time dimension (axis 1).
+
+  Arguments:
+      cropping: int or tuple of int (length 2)
+          How many units should be trimmed off at the beginning and end of
+          the cropping dimension (axis 1).
+          If a single int is provided,
+          the same value will be used for both.
+
+  Input shape:
+      3D tensor with shape `(batch, axis_to_crop, features)`
+
+  Output shape:
+      3D tensor with shape `(batch, cropped_axis, features)`
+  """
+
+  def __init__(self, cropping=(1, 1), **kwargs):
+    super(Cropping1D, self).__init__(**kwargs)
+    self.cropping = conv_utils.normalize_tuple(cropping, 2, 'cropping')
+    self.input_spec = InputSpec(ndim=3)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if input_shape[1] is not None:
+      length = input_shape[1] - self.cropping[0] - self.cropping[1]
+    else:
+      length = None
+    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
+
+  def call(self, inputs):
+    if self.cropping[1] == 0:
+      return inputs[:, self.cropping[0]:, :]
+    else:
+      return inputs[:, self.cropping[0]:-self.cropping[1], :]
+
+  def get_config(self):
+    config = {'cropping': self.cropping}
+    base_config = super(Cropping1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Cropping2D(Layer):
+  """Cropping layer for 2D input (e.g. picture).
+
+  It crops along spatial dimensions, i.e. width and height.
+
+  Arguments:
+      cropping: int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints.
+          - If int: the same symmetric cropping
+              is applied to width and height.
+          - If tuple of 2 ints:
+              interpreted as two different
+              symmetric cropping values for height and width:
+              `(symmetric_height_crop, symmetrc_width_crop)`.
+          - If tuple of 2 tuples of 2 ints:
+              interpreted as
+              `((top_crop, bottom_crop), (left_crop, right_crop))`
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      4D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, rows, cols, channels)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, channels, rows, cols)`
+
+  Output shape:
+      4D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, cropped_rows, cropped_cols, channels)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, channels, cropped_rows, cropped_cols)`
+
+  Examples:
+
+  ```python
+      # Crop the input 2D images or feature maps
+      model = Sequential()
+      model.add(Cropping2D(cropping=((2, 2), (4, 4)),
+                           input_shape=(28, 28, 3)))
+      # now model.output_shape == (None, 24, 20, 3)
+      model.add(Conv2D(64, (3, 3), padding='same))
+      model.add(Cropping2D(cropping=((2, 2), (2, 2))))
+      # now model.output_shape == (None, 20, 16. 64)
+  ```
+  """
+
+  def __init__(self, cropping=((0, 0), (0, 0)), data_format=None, **kwargs):
+    super(Cropping2D, self).__init__(**kwargs)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    if isinstance(cropping, int):
+      self.cropping = ((cropping, cropping), (cropping, cropping))
+    elif hasattr(cropping, '__len__'):
+      if len(cropping) != 2:
+        raise ValueError('`cropping` should have two elements. '
+                         'Found: ' + str(cropping))
+      height_cropping = conv_utils.normalize_tuple(cropping[0], 2,
+                                                   '1st entry of cropping')
+      width_cropping = conv_utils.normalize_tuple(cropping[1], 2,
+                                                  '2nd entry of cropping')
+      self.cropping = (height_cropping, width_cropping)
+    else:
+      raise ValueError('`cropping` should be either an int, '
+                       'a tuple of 2 ints '
+                       '(symmetric_height_crop, symmetric_width_crop), '
+                       'or a tuple of 2 tuples of 2 ints '
+                       '((top_crop, bottom_crop), (left_crop, right_crop)). '
+                       'Found: ' + str(cropping))
+    self.input_spec = InputSpec(ndim=4)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    # pylint: disable=invalid-unary-operand-type
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape([
+          input_shape[0], input_shape[1],
+          input_shape[2] - self.cropping[0][0] - self.cropping[0][1]
+          if input_shape[2] else None,
+          input_shape[3] - self.cropping[1][0] - self.cropping[1][1]
+          if input_shape[3] else None
+      ])
+    else:
+      return tensor_shape.TensorShape([
+          input_shape[0],
+          input_shape[1] - self.cropping[0][0] - self.cropping[0][1]
+          if input_shape[1] else None,
+          input_shape[2] - self.cropping[1][0] - self.cropping[1][1]
+          if input_shape[2] else None, input_shape[3]
+      ])
+    # pylint: enable=invalid-unary-operand-type
+
+  def call(self, inputs):
+    # pylint: disable=invalid-unary-operand-type
+    if self.data_format == 'channels_first':
+      if self.cropping[0][1] == self.cropping[1][1] == 0:
+        return inputs[:, :, self.cropping[0][0]:, self.cropping[1][0]:]
+      elif self.cropping[0][1] == 0:
+        return inputs[:, :, self.cropping[0][0]:, self.cropping[1][0]:
+                      -self.cropping[1][1]]
+      elif self.cropping[1][1] == 0:
+        return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1],
+                      self.cropping[1][0]:]
+      return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1],
+                    self.cropping[1][0]:-self.cropping[1][1]]
+    else:
+      if self.cropping[0][1] == self.cropping[1][1] == 0:
+        return inputs[:, self.cropping[0][0]:, self.cropping[1][0]:, :]
+      elif self.cropping[0][1] == 0:
+        return inputs[:, self.cropping[0][0]:, self.cropping[1][0]:
+                      -self.cropping[1][1], :]
+      elif self.cropping[1][1] == 0:
+        return inputs[:, self.cropping[0][0]:-self.cropping[0][1],
+                      self.cropping[1][0]:, :]
+      return inputs[:, self.cropping[0][0]:-self.cropping[0][1], self.cropping[
+          1][0]:-self.cropping[1][1], :]  # pylint: disable=invalid-unary-operand-type
+    # pylint: enable=invalid-unary-operand-type
+
+  def get_config(self):
+    config = {'cropping': self.cropping, 'data_format': self.data_format}
+    base_config = super(Cropping2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Cropping3D(Layer):
+  """Cropping layer for 3D data (e.g.
+
+  spatial or spatio-temporal).
+
+  Arguments:
+      cropping: int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints.
+          - If int: the same symmetric cropping
+              is applied to width and height.
+          - If tuple of 2 ints:
+              interpreted as two different
+              symmetric cropping values for height and width:
+              `(symmetric_dim1_crop, symmetric_dim2_crop, symmetric_dim3_crop)`.
+          - If tuple of 2 tuples of 2 ints:
+              interpreted as
+              `((left_dim1_crop, right_dim1_crop), (left_dim2_crop,
+                right_dim2_crop), (left_dim3_crop, right_dim3_crop))`
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+          while `channels_first` corresponds to inputs with shape
+          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      5D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, first_axis_to_crop, second_axis_to_crop, third_axis_to_crop,
+            depth)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, depth, first_axis_to_crop, second_axis_to_crop,
+            third_axis_to_crop)`
+
+  Output shape:
+      5D tensor with shape:
+      - If `data_format` is `"channels_last"`:
+          `(batch, first_cropped_axis, second_cropped_axis, third_cropped_axis,
+            depth)`
+      - If `data_format` is `"channels_first"`:
+          `(batch, depth, first_cropped_axis, second_cropped_axis,
+            third_cropped_axis)`
+  """
+
+  def __init__(self,
+               cropping=((1, 1), (1, 1), (1, 1)),
+               data_format=None,
+               **kwargs):
+    super(Cropping3D, self).__init__(**kwargs)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    if isinstance(cropping, int):
+      self.cropping = ((cropping, cropping), (cropping, cropping), (cropping,
+                                                                    cropping))
+    elif hasattr(cropping, '__len__'):
+      if len(cropping) != 3:
+        raise ValueError('`cropping` should have 3 elements. '
+                         'Found: ' + str(cropping))
+      dim1_cropping = conv_utils.normalize_tuple(cropping[0], 2,
+                                                 '1st entry of cropping')
+      dim2_cropping = conv_utils.normalize_tuple(cropping[1], 2,
+                                                 '2nd entry of cropping')
+      dim3_cropping = conv_utils.normalize_tuple(cropping[2], 2,
+                                                 '3rd entry of cropping')
+      self.cropping = (dim1_cropping, dim2_cropping, dim3_cropping)
+    else:
+      raise ValueError(
+          '`cropping` should be either an int, '
+          'a tuple of 3 ints '
+          '(symmetric_dim1_crop, symmetric_dim2_crop, symmetric_dim3_crop), '
+          'or a tuple of 3 tuples of 2 ints '
+          '((left_dim1_crop, right_dim1_crop),'
+          ' (left_dim2_crop, right_dim2_crop),'
+          ' (left_dim3_crop, right_dim2_crop)). '
+          'Found: ' + str(cropping))
+    self.input_spec = InputSpec(ndim=5)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    # pylint: disable=invalid-unary-operand-type
+    if self.data_format == 'channels_first':
+      dim1 = input_shape[2] - self.cropping[0][0] - self.cropping[0][
+          1] if input_shape[2] is not None else None
+      dim2 = input_shape[3] - self.cropping[1][0] - self.cropping[1][
+          1] if input_shape[3] is not None else None
+      dim3 = input_shape[4] - self.cropping[2][0] - self.cropping[2][
+          1] if input_shape[4] is not None else None
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], dim1, dim2, dim3])
+    else:
+      dim1 = input_shape[1] - self.cropping[0][0] - self.cropping[0][
+          1] if input_shape[1] is not None else None
+      dim2 = input_shape[2] - self.cropping[1][0] - self.cropping[1][
+          1] if input_shape[2] is not None else None
+      dim3 = input_shape[3] - self.cropping[2][0] - self.cropping[2][
+          1] if input_shape[3] is not None else None
+      return tensor_shape.TensorShape(
+          [input_shape[0], dim1, dim2, dim3, input_shape[4]])
+    # pylint: enable=invalid-unary-operand-type
+
+  def call(self, inputs):
+    # pylint: disable=invalid-unary-operand-type
+    if self.data_format == 'channels_first':
+      if self.cropping[0][1] == self.cropping[1][1] == self.cropping[2][1] == 0:
+        return inputs[:, :, self.cropping[0][0]:, self.cropping[1][0]:,
+                      self.cropping[2][0]:]
+      elif self.cropping[0][1] == self.cropping[1][1] == 0:
+        return inputs[:, :, self.cropping[0][0]:, self.cropping[1][0]:,
+                      self.cropping[2][0]:-self.cropping[2][1]]
+      elif self.cropping[1][1] == self.cropping[2][1] == 0:
+        return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1],
+                      self.cropping[1][0]:, self.cropping[2][0]:]
+      elif self.cropping[0][1] == self.cropping[2][1] == 0:
+        return inputs[:, :, self.cropping[0][0]:, self.cropping[1][0]:
+                      -self.cropping[1][1], self.cropping[2][0]:]
+      elif self.cropping[0][1] == 0:
+        return inputs[:, :, self.cropping[0][0]:, self.cropping[1][
+            0]:-self.cropping[1][1], self.cropping[2][0]:-self.cropping[2][1]]
+      elif self.cropping[1][1] == 0:
+        return inputs[:, :, self.cropping[0][0]:-self.cropping[0][
+            1], self.cropping[1][0]:, self.cropping[2][0]:-self.cropping[2][1]]
+      elif self.cropping[2][1] == 0:
+        return inputs[:, :, self.cropping[0][0]:-self.cropping[0][
+            1], self.cropping[1][0]:-self.cropping[1][1], self.cropping[2][0]:]
+      return inputs[:, :, self.cropping[0][0]:-self.cropping[0][
+          1], self.cropping[1][0]:-self.cropping[1][1], self.cropping[2][0]:
+                    -self.cropping[2][1]]
+    else:
+      if self.cropping[0][1] == self.cropping[1][1] == self.cropping[2][1] == 0:
+        return inputs[:, self.cropping[0][0]:, self.cropping[1][0]:,
+                      self.cropping[2][0]:, :]
+      elif self.cropping[0][1] == self.cropping[1][1] == 0:
+        return inputs[:, self.cropping[0][0]:, self.cropping[1][0]:,
+                      self.cropping[2][0]:-self.cropping[2][1], :]
+      elif self.cropping[1][1] == self.cropping[2][1] == 0:
+        return inputs[:, self.cropping[0][0]:-self.cropping[0][1],
+                      self.cropping[1][0]:, self.cropping[2][0]:, :]
+      elif self.cropping[0][1] == self.cropping[2][1] == 0:
+        return inputs[:, self.cropping[0][0]:, self.cropping[1][0]:
+                      -self.cropping[1][1], self.cropping[2][0]:, :]
+      elif self.cropping[0][1] == 0:
+        return inputs[:, self.cropping[0][0]:, self.cropping[1][
+            0]:-self.cropping[1][1], self.cropping[2][0]:-self.cropping[2][
+                1], :]
+      elif self.cropping[1][1] == 0:
+        return inputs[:, self.cropping[0][0]:-self.cropping[0][
+            1], self.cropping[1][0]:, self.cropping[2][0]:-self.cropping[2][
+                1], :]
+      elif self.cropping[2][1] == 0:
+        return inputs[:, self.cropping[0][0]:-self.cropping[0][
+            1], self.cropping[1][0]:-self.cropping[1][1], self.cropping[2][
+                0]:, :]
+      return inputs[:, self.cropping[0][0]:-self.cropping[0][1], self.cropping[
+          1][0]:-self.cropping[1][1], self.cropping[2][0]:-self.cropping[2][  # pylint: disable=invalid-unary-operand-type
+              1], :]
+    # pylint: enable=invalid-unary-operand-type
+
+  def get_config(self):
+    config = {'cropping': self.cropping, 'data_format': self.data_format}
+    base_config = super(Cropping3D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+# Aliases
+
+Convolution1D = Conv1D
+Convolution2D = Conv2D
+Convolution3D = Conv3D
+SeparableConvolution2D = SeparableConv2D
+Convolution2DTranspose = Conv2DTranspose
+Deconvolution2D = Deconv2D = Conv2DTranspose
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
new file mode 100644
index 0000000000..4ed5046dc3
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
@@ -0,0 +1,578 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Convolutional-recurrent layers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras import activations
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.layers.recurrent import Recurrent
+from tensorflow.contrib.keras.python.keras.utils import conv_utils
+from tensorflow.python.framework import tensor_shape
+
+
+class ConvRecurrent2D(Recurrent):
+  """Abstract base class for convolutional recurrent layers.
+
+  Do not use in a model -- it's not a functional layer!
+
+  Arguments:
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      kernel_size: An integer or tuple/list of n integers, specifying the
+          dimensions of the convolution window.
+      strides: An integer or tuple/list of n integers,
+          specifying the strides of the convolution.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, time, ..., channels)`
+          while `channels_first` corresponds to
+          inputs with shape `(batch, time, channels, ...)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      dilation_rate: An integer or tuple/list of n integers, specifying
+          the dilation rate to use for dilated convolution.
+          Currently, specifying any `dilation_rate` value != 1 is
+          incompatible with specifying any `strides` value != 1.
+      return_sequences: Boolean. Whether to return the last output
+          in the output sequence, or the full sequence.
+      go_backwards: Boolean (default False).
+          If True, rocess the input sequence backwards.
+      stateful: Boolean (default False). If True, the last state
+          for each sample at index i in a batch will be used as initial
+          state for the sample of index i in the following batch.
+
+  Input shape:
+      5D tensor with shape `(num_samples, timesteps, channels, rows, cols)`.
+
+  Output shape:
+      - if `return_sequences`: 5D tensor with shape
+          `(num_samples, timesteps, channels, rows, cols)`.
+      - else, 4D tensor with shape `(num_samples, channels, rows, cols)`.
+
+  # Masking
+      This layer supports masking for input data with a variable number
+      of timesteps. To introduce masks to your data,
+      use an `Embedding` layer with the `mask_zero` parameter
+      set to `True`.
+      **Note:** for the time being, masking is only supported with Theano.
+
+  # Note on using statefulness in RNNs
+      You can set RNN layers to be 'stateful', which means that the states
+      computed for the samples in one batch will be reused as initial states
+      for the samples in the next batch.
+      This assumes a one-to-one mapping between
+      samples in different successive batches.
+
+      To enable statefulness:
+          - specify `stateful=True` in the layer constructor.
+          - specify a fixed batch size for your model, by passing
+              a `batch_input_size=(...)` to the first layer in your model.
+              This is the expected shape of your inputs *including the batch
+              size*.
+              It should be a tuple of integers, e.g. `(32, 10, 100)`.
+
+      To reset the states of your model, call `.reset_states()` on either
+      a specific layer, or on your entire model.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format=None,
+               dilation_rate=(1, 1),
+               return_sequences=False,
+               go_backwards=False,
+               stateful=False,
+               **kwargs):
+    super(ConvRecurrent2D, self).__init__(**kwargs)
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2,
+                                                    'dilation_rate')
+    self.return_sequences = return_sequences
+    self.go_backwards = go_backwards
+    self.stateful = stateful
+    self.input_spec = InputSpec(ndim=5)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[3]
+      cols = input_shape[4]
+    elif self.data_format == 'channels_last':
+      rows = input_shape[2]
+      cols = input_shape[3]
+    rows = conv_utils.conv_output_length(
+        rows,
+        self.kernel_size[0],
+        padding=self.padding,
+        stride=self.strides[0],
+        dilation=self.dilation_rate[0])
+    cols = conv_utils.conv_output_length(
+        cols,
+        self.kernel_size[1],
+        padding=self.padding,
+        stride=self.strides[1],
+        dilation=self.dilation_rate[1])
+    if self.return_sequences:
+      if self.data_format == 'channels_first':
+        return tensor_shape.TensorShape(
+            [input_shape[0], input_shape[1], self.filters, rows, cols])
+      elif self.data_format == 'channels_last':
+        return tensor_shape.TensorShape(
+            [input_shape[0], input_shape[1], rows, cols, self.filters])
+    else:
+      if self.data_format == 'channels_first':
+        return tensor_shape.TensorShape(
+            [input_shape[0], self.filters, rows, cols])
+      elif self.data_format == 'channels_last':
+        return tensor_shape.TensorShape(
+            [input_shape[0], rows, cols, self.filters])
+
+  def get_config(self):
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
+        'return_sequences': self.return_sequences,
+        'go_backwards': self.go_backwards,
+        'stateful': self.stateful
+    }
+    base_config = super(ConvRecurrent2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class ConvLSTM2D(ConvRecurrent2D):
+  """Convolutional LSTM.
+
+  It is similar to an LSTM layer, but the input transformations
+  and recurrent transformations are both convolutional.
+
+  Arguments:
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      kernel_size: An integer or tuple/list of n integers, specifying the
+          dimensions of the convolution window.
+      strides: An integer or tuple/list of n integers,
+          specifying the strides of the convolution.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, time, ..., channels)`
+          while `channels_first` corresponds to
+          inputs with shape `(batch, time, channels, ...)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      dilation_rate: An integer or tuple/list of n integers, specifying
+          the dilation rate to use for dilated convolution.
+          Currently, specifying any `dilation_rate` value != 1 is
+          incompatible with specifying any `strides` value != 1.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      recurrent_activation: Activation function to use
+          for the recurrent step.
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix,
+          used for the linear transformation of the inputs..
+      recurrent_initializer: Initializer for the `recurrent_kernel`
+          weights matrix,
+          used for the linear transformation of the recurrent state..
+      bias_initializer: Initializer for the bias vector.
+      unit_forget_bias: Boolean.
+          If True, add 1 to the bias of the forget gate at initialization.
+          Use in combination with `bias_initializer="zeros"`.
+          This is recommended in [Jozefowicz et
+            al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      recurrent_regularizer: Regularizer function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to
+          the `kernel` weights matrix.
+      recurrent_constraint: Constraint function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+      return_sequences: Boolean. Whether to return the last output
+          in the output sequence, or the full sequence.
+      go_backwards: Boolean (default False).
+          If True, rocess the input sequence backwards.
+      stateful: Boolean (default False). If True, the last state
+          for each sample at index i in a batch will be used as initial
+          state for the sample of index i in the following batch.
+      dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the inputs.
+      recurrent_dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the recurrent state.
+
+  Input shape:
+      - if data_format='channels_first'
+          5D tensor with shape:
+          `(samples,time, channels, rows, cols)`
+      - if data_format='channels_last'
+          5D tensor with shape:
+          `(samples,time, rows, cols, channels)`
+
+   Output shape:
+      - if `return_sequences`
+           - if data_format='channels_first'
+              5D tensor with shape:
+              `(samples, time, filters, output_row, output_col)`
+           - if data_format='channels_last'
+              5D tensor with shape:
+              `(samples, time, output_row, output_col, filters)`
+      - else
+          - if data_format ='channels_first'
+              4D tensor with shape:
+              `(samples, filters, output_row, output_col)`
+          - if data_format='channels_last'
+              4D tensor with shape:
+              `(samples, output_row, output_col, filters)`
+          where o_row and o_col depend on the shape of the filter and
+          the padding
+
+  Raises:
+      ValueError: in case of invalid constructor arguments.
+
+  References:
+      - [Convolutional LSTM Network: A Machine Learning Approach for
+      Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1)
+      The current implementation does not include the feedback loop on the
+      cells output
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format=None,
+               dilation_rate=(1, 1),
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               return_sequences=False,
+               go_backwards=False,
+               stateful=False,
+               dropout=0.,
+               recurrent_dropout=0.,
+               **kwargs):
+    super(ConvLSTM2D, self).__init__(
+        filters,
+        kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        return_sequences=return_sequences,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        **kwargs)
+    self.activation = activations.get(activation)
+    self.recurrent_activation = activations.get(recurrent_activation)
+    self.use_bias = use_bias
+
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.recurrent_initializer = initializers.get(recurrent_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.unit_forget_bias = unit_forget_bias
+
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.recurrent_constraint = constraints.get(recurrent_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+    self.dropout = min(1., max(0., dropout))
+    self.recurrent_dropout = min(1., max(0., recurrent_dropout))
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    # TODO(fchollet): better handling of input spec
+    self.input_spec = InputSpec(shape=input_shape)
+
+    if self.stateful:
+      self.reset_states()
+    else:
+      # initial states: 2 all-zero tensor of shape (filters)
+      self.states = [None, None]
+
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = input_shape[channel_axis]
+    kernel_shape = self.kernel_size + (input_dim, self.filters * 4)
+    self.kernel_shape = kernel_shape
+    recurrent_kernel_shape = self.kernel_size + (self.filters, self.filters * 4)
+
+    self.kernel = self.add_weight(
+        kernel_shape,
+        initializer=self.kernel_initializer,
+        name='kernel',
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+    self.recurrent_kernel = self.add_weight(
+        recurrent_kernel_shape,
+        initializer=self.recurrent_initializer,
+        name='recurrent_kernel',
+        regularizer=self.recurrent_regularizer,
+        constraint=self.recurrent_constraint)
+    if self.use_bias:
+      self.bias = self.add_weight(
+          (self.filters * 4,),
+          initializer=self.bias_initializer,
+          name='bias',
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+      if self.unit_forget_bias:
+        bias_value = np.zeros((self.filters * 4,))
+        bias_value[self.filters:self.filters * 2] = 1.
+        K.set_value(self.bias, bias_value)
+    else:
+      self.bias = None
+
+    self.kernel_i = self.kernel[:, :, :, :self.filters]
+    self.recurrent_kernel_i = self.recurrent_kernel[:, :, :, :self.filters]
+    self.kernel_f = self.kernel[:, :, :, self.filters:self.filters * 2]
+    self.recurrent_kernel_f = self.recurrent_kernel[:, :, :, self.filters:
+                                                    self.filters * 2]
+    self.kernel_c = self.kernel[:, :, :, self.filters * 2:self.filters * 3]
+    self.recurrent_kernel_c = self.recurrent_kernel[:, :, :, self.filters * 2:
+                                                    self.filters * 3]
+    self.kernel_o = self.kernel[:, :, :, self.filters * 3:]
+    self.recurrent_kernel_o = self.recurrent_kernel[:, :, :, self.filters * 3:]
+
+    if self.use_bias:
+      self.bias_i = self.bias[:self.filters]
+      self.bias_f = self.bias[self.filters:self.filters * 2]
+      self.bias_c = self.bias[self.filters * 2:self.filters * 3]
+      self.bias_o = self.bias[self.filters * 3:]
+    else:
+      self.bias_i = None
+      self.bias_f = None
+      self.bias_c = None
+      self.bias_o = None
+    self.built = True
+
+  def get_initial_states(self, inputs):
+    # (samples, timesteps, rows, cols, filters)
+    initial_state = K.zeros_like(inputs)
+    # (samples, rows, cols, filters)
+    initial_state = K.sum(initial_state, axis=1)
+    shape = list(self.kernel_shape)
+    shape[-1] = self.filters
+    initial_state = self.input_conv(
+        initial_state, K.zeros(tuple(shape)), padding=self.padding)
+
+    initial_states = [initial_state for _ in range(2)]
+    return initial_states
+
+  def reset_states(self):
+    if not self.stateful:
+      raise RuntimeError('Layer must be stateful.')
+    input_shape = self.input_spec.shape
+    output_shape = self._compute_output_shape(input_shape)
+    if not input_shape[0]:
+      raise ValueError('If a RNN is stateful, a complete '
+                       'input_shape must be provided '
+                       '(including batch size). '
+                       'Got input shape: ' + str(input_shape))
+
+    if self.return_sequences:
+      out_row, out_col, out_filter = output_shape[2:]
+    else:
+      out_row, out_col, out_filter = output_shape[1:]
+
+    if hasattr(self, 'states'):
+      K.set_value(self.states[0],
+                  np.zeros((input_shape[0], out_row, out_col, out_filter)))
+      K.set_value(self.states[1],
+                  np.zeros((input_shape[0], out_row, out_col, out_filter)))
+    else:
+      self.states = [
+          K.zeros((input_shape[0], out_row, out_col, out_filter)), K.zeros(
+              (input_shape[0], out_row, out_col, out_filter))
+      ]
+
+  def get_constants(self, inputs, training=None):
+    constants = []
+    if self.implementation == 0 and 0 < self.dropout < 1:
+      ones = K.zeros_like(inputs)
+      ones = K.sum(ones, axis=1)
+      ones += 1
+
+      def dropped_inputs():
+        return K.dropout(ones, self.dropout)
+
+      dp_mask = [
+          K.in_train_phase(dropped_inputs, ones, training=training)
+          for _ in range(4)
+      ]
+      constants.append(dp_mask)
+    else:
+      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
+
+    if 0 < self.recurrent_dropout < 1:
+      shape = list(self.kernel_shape)
+      shape[-1] = self.filters
+      ones = K.zeros_like(inputs)
+      ones = K.sum(ones, axis=1)
+      ones = self.input_conv(ones, K.zeros(shape), padding=self.padding)
+      ones += 1.
+
+      def dropped_inputs():  # pylint: disable=function-redefined
+        return K.dropout(ones, self.recurrent_dropout)
+
+      rec_dp_mask = [
+          K.in_train_phase(dropped_inputs, ones, training=training)
+          for _ in range(4)
+      ]
+      constants.append(rec_dp_mask)
+    else:
+      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
+    return constants
+
+  def input_conv(self, x, w, b=None, padding='valid'):
+    conv_out = K.conv2d(
+        x,
+        w,
+        strides=self.strides,
+        padding=padding,
+        data_format=self.data_format,
+        dilation_rate=self.dilation_rate)
+    if b is not None:
+      conv_out = K.bias_add(conv_out, b, data_format=self.data_format)
+    return conv_out
+
+  def reccurent_conv(self, x, w):
+    conv_out = K.conv2d(
+        x, w, strides=(1, 1), padding='same', data_format=self.data_format)
+    return conv_out
+
+  def step(self, inputs, states):
+    assert len(states) == 4
+    h_tm1 = states[0]
+    c_tm1 = states[1]
+    dp_mask = states[2]
+    rec_dp_mask = states[3]
+
+    x_i = self.input_conv(
+        inputs * dp_mask[0], self.kernel_i, self.bias_i, padding=self.padding)
+    x_f = self.input_conv(
+        inputs * dp_mask[1], self.kernel_f, self.bias_f, padding=self.padding)
+    x_c = self.input_conv(
+        inputs * dp_mask[2], self.kernel_c, self.bias_c, padding=self.padding)
+    x_o = self.input_conv(
+        inputs * dp_mask[3], self.kernel_o, self.bias_o, padding=self.padding)
+    h_i = self.reccurent_conv(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_i)
+    h_f = self.reccurent_conv(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_f)
+    h_c = self.reccurent_conv(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c)
+    h_o = self.reccurent_conv(h_tm1 * rec_dp_mask[3], self.recurrent_kernel_o)
+
+    i = self.recurrent_activation(x_i + h_i)
+    f = self.recurrent_activation(x_f + h_f)
+    c = f * c_tm1 + i * self.activation(x_c + h_c)
+    o = self.recurrent_activation(x_o + h_o)
+    h = o * self.activation(c)
+    return h, [h, c]
+
+  def get_config(self):
+    config = {
+        'activation':
+            activations.serialize(self.activation),
+        'recurrent_activation':
+            activations.serialize(self.recurrent_activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'unit_forget_bias':
+            self.unit_forget_bias,
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout
+    }
+    base_config = super(ConvLSTM2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent_test.py
new file mode 100644
index 0000000000..06b2be6b68
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent_test.py
@@ -0,0 +1,165 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convolutional recurrent layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class ConvLSTMTest(test.TestCase):
+
+  def test_conv_lstm(self):
+    num_row = 3
+    num_col = 3
+    filters = 2
+    num_samples = 1
+    input_channel = 2
+    input_num_row = 5
+    input_num_col = 5
+    sequence_len = 2
+    for data_format in ['channels_first', 'channels_last']:
+      if data_format == 'channels_first':
+        inputs = np.random.rand(num_samples, sequence_len,
+                                input_channel,
+                                input_num_row, input_num_col)
+      else:
+        inputs = np.random.rand(num_samples, sequence_len,
+                                input_num_row, input_num_col,
+                                input_channel)
+
+      for return_sequences in [True, False]:
+        # test for output shape:
+        with self.test_session():
+          testing_utils.layer_test(
+              keras.layers.ConvLSTM2D,
+              kwargs={'data_format': data_format,
+                      'return_sequences': return_sequences,
+                      'filters': filters,
+                      'kernel_size': (num_row, num_col),
+                      'padding': 'valid'},
+              input_shape=inputs.shape)
+
+  def test_conv_lstm_statefulness(self):
+    # Tests for statefulness
+    num_row = 3
+    num_col = 3
+    filters = 2
+    num_samples = 1
+    input_channel = 2
+    input_num_row = 5
+    input_num_col = 5
+    sequence_len = 2
+    inputs = np.random.rand(num_samples, sequence_len,
+                            input_num_row, input_num_col,
+                            input_channel)
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      kwargs = {'data_format': 'channels_last',
+                'return_sequences': False,
+                'filters': filters,
+                'kernel_size': (num_row, num_col),
+                'stateful': True,
+                'batch_input_shape': inputs.shape,
+                'padding': 'same'}
+      layer = keras.layers.ConvLSTM2D(**kwargs)
+
+      model.add(layer)
+      model.compile(optimizer='sgd', loss='mse')
+      out1 = model.predict(np.ones_like(inputs))
+
+      # train once so that the states change
+      model.train_on_batch(np.ones_like(inputs),
+                           np.random.random(out1.shape))
+      out2 = model.predict(np.ones_like(inputs))
+
+      # if the state is not reset, output should be different
+      self.assertNotEqual(out1.max(), out2.max())
+
+      # check that output changes after states are reset
+      # (even though the model itself didn't change)
+      layer.reset_states()
+      out3 = model.predict(np.ones_like(inputs))
+      self.assertNotEqual(out3.max(), out2.max())
+
+      # check that container-level reset_states() works
+      model.reset_states()
+      out4 = model.predict(np.ones_like(inputs))
+      self.assertAllClose(out3, out4, atol=1e-5)
+
+      # check that the call to `predict` updated the states
+      out5 = model.predict(np.ones_like(inputs))
+      self.assertNotEqual(out4.max(), out5.max())
+
+  def test_conv_lstm_regularizers(self):
+    # check regularizers
+    num_row = 3
+    num_col = 3
+    filters = 2
+    num_samples = 1
+    input_channel = 2
+    input_num_row = 5
+    input_num_col = 5
+    sequence_len = 2
+    inputs = np.random.rand(num_samples, sequence_len,
+                            input_num_row, input_num_col,
+                            input_channel)
+
+    with self.test_session():
+      kwargs = {'data_format': 'channels_last',
+                'return_sequences': False,
+                'kernel_size': (num_row, num_col),
+                'stateful': True,
+                'filters': filters,
+                'batch_input_shape': inputs.shape,
+                'kernel_regularizer': keras.regularizers.L1L2(l1=0.01),
+                'recurrent_regularizer': keras.regularizers.L1L2(l1=0.01),
+                'activity_regularizer': 'l2',
+                'bias_regularizer': 'l2',
+                'kernel_constraint': 'max_norm',
+                'recurrent_constraint': 'max_norm',
+                'bias_constraint': 'max_norm',
+                'padding': 'same'}
+
+      layer = keras.layers.ConvLSTM2D(**kwargs)
+      layer.build(inputs.shape)
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones(inputs.shape)))
+      self.assertEqual(len(layer.losses), 4)
+
+  def test_conv_lstm_dropout(self):
+    # check dropout
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.ConvLSTM2D,
+          kwargs={'data_format': 'channels_last',
+                  'return_sequences': False,
+                  'filters': 2,
+                  'kernel_size': (3, 3),
+                  'padding': 'same',
+                  'dropout': 0.1,
+                  'recurrent_dropout': 0.1},
+          input_shape=(1, 2, 5, 5, 2))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
new file mode 100644
index 0000000000..845e9eee12
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
@@ -0,0 +1,737 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convolutional layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class Convolution1DTest(test.TestCase):
+
+  def test_causal_dilated_conv1d(self):
+    # Causal:
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Conv1D,
+          input_data=np.reshape(np.arange(4, dtype='float32'), (1, 4, 1)),
+          kwargs={
+              'filters': 1,
+              'kernel_size': 2,
+              'dilation_rate': 1,
+              'padding': 'causal',
+              'kernel_initializer': 'ones',
+              'use_bias': False,
+          },
+          expected_output=[[[0], [1], [3], [5]]])
+
+  def test_dilated_conv1d(self):
+    # Non-causal:
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Conv1D,
+          input_data=np.reshape(np.arange(4, dtype='float32'), (1, 4, 1)),
+          kwargs={
+              'filters': 1,
+              'kernel_size': 2,
+              'dilation_rate': 1,
+              'padding': 'valid',
+              'kernel_initializer': 'ones',
+              'use_bias': False,
+          },
+          expected_output=[[[1], [3], [5]]])
+
+  def test_conv_1d(self):
+    batch_size = 2
+    steps = 8
+    input_dim = 2
+    kernel_size = 3
+    filters = 3
+
+    for padding in ['valid', 'same']:
+      for strides in [1, 2]:
+        if padding == 'same' and strides != 1:
+          continue
+
+        with self.test_session():
+          testing_utils.layer_test(
+              keras.layers.Conv1D,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': kernel_size,
+                  'padding': padding,
+                  'strides': strides
+              },
+              input_shape=(batch_size, steps, input_dim))
+
+  def test_conv_1d_regularization(self):
+    # regularizers
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.test_session():
+      layer = keras.layers.Conv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(len(layer.losses), 2)
+      layer(keras.backend.variable(np.ones((1, 5, 2))))
+      self.assertEqual(len(layer.losses), 3)
+
+    # constraints
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_constraint': 'max_norm',
+        'bias_constraint': 'max_norm',
+        'strides': 1
+    }
+    with self.test_session():
+      layer = keras.layers.Conv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(len(layer.constraints), 2)
+
+
+class Conv2DTest(test.TestCase):
+
+  def test_convolution_2d(self):
+    num_samples = 2
+    filters = 2
+    stack_size = 3
+    kernel_size = (3, 2)
+    num_row = 7
+    num_col = 6
+
+    for padding in ['valid', 'same']:
+      for strides in [(1, 1), (2, 2)]:
+        if padding == 'same' and strides != (1, 1):
+          continue
+
+        with self.test_session():
+          testing_utils.layer_test(
+              keras.layers.Conv2D,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': kernel_size,
+                  'padding': padding,
+                  'strides': strides,
+                  'data_format': 'channels_first'
+              },
+              input_shape=(num_samples, stack_size, num_row, num_col))
+
+  def test_convolution_2d_regularization(self):
+    # regularizers
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.test_session():
+      layer = keras.layers.Conv2D(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 2)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 3)
+
+    # constraints
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_constraint': 'max_norm',
+        'bias_constraint': 'max_norm',
+        'strides': 1
+    }
+    with self.test_session():
+      layer = keras.layers.Conv2D(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(len(layer.constraints), 2)
+
+  def test_dilated_conv_2d(self):
+    num_samples = 2
+    filters = 2
+    stack_size = 3
+    kernel_size = (3, 2)
+    num_row = 7
+    num_col = 6
+
+    # Test dilation
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Conv2D,
+          kwargs={
+              'filters': filters,
+              'kernel_size': kernel_size,
+              'dilation_rate': (2, 2)
+          },
+          input_shape=(num_samples, num_row, num_col, stack_size))
+
+
+class Conv2DTransposeTest(test.TestCase):
+
+  def test_conv2d_transpose(self):
+    num_samples = 2
+    filters = 2
+    stack_size = 3
+    num_row = 5
+    num_col = 6
+
+    for padding in ['valid', 'same']:
+      for strides in [(1, 1), (2, 2)]:
+        if padding == 'same' and strides != (1, 1):
+          continue
+
+        with self.test_session():
+          testing_utils.layer_test(
+              keras.layers.Conv2DTranspose,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': 3,
+                  'padding': padding,
+                  'strides': strides,
+                  'data_format': 'channels_last'
+              },
+              input_shape=(num_samples, num_row, num_col, stack_size))
+
+  def test_conv2dtranspose_regularization(self):
+    # regularizers
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.test_session():
+      layer = keras.layers.Conv2DTranspose(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 2)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 3)
+
+    # constraints
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_constraint': 'max_norm',
+        'bias_constraint': 'max_norm',
+        'strides': 1
+    }
+    with self.test_session():
+      layer = keras.layers.Conv2DTranspose(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(len(layer.constraints), 2)
+
+
+class SeparableConv2DTest(test.TestCase):
+
+  def test_separable_conv_2d(self):
+    num_samples = 2
+    filters = 6
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+
+    for padding in ['valid', 'same']:
+      for strides in [(1, 1), (2, 2)]:
+        for multiplier in [1, 2]:
+          if padding == 'same' and strides != (1, 1):
+            continue
+
+          with self.test_session():
+            testing_utils.layer_test(
+                keras.layers.SeparableConv2D,
+                kwargs={
+                    'filters': filters,
+                    'kernel_size': (3, 3),
+                    'padding': padding,
+                    'strides': strides,
+                    'depth_multiplier': multiplier
+                },
+                input_shape=(num_samples, num_row, num_col, stack_size))
+
+  def test_separable_conv2d_regularization(self):
+    # regularizers
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'depthwise_regularizer': 'l2',
+        'pointwise_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.test_session():
+      layer = keras.layers.SeparableConv2D(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+    # constraints
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'pointwise_constraint': 'unit_norm',
+        'depthwise_constraint': 'unit_norm',
+        'strides': 1
+    }
+    with self.test_session():
+      layer = keras.layers.SeparableConv2D(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(len(layer.constraints), 2)
+
+
+class Conv3DTest(test.TestCase):
+
+  def test_convolution_3d(self):
+    num_samples = 2
+    filters = 2
+    stack_size = 3
+
+    input_len_dim1 = 9
+    input_len_dim2 = 8
+    input_len_dim3 = 8
+
+    for padding in ['valid', 'same']:
+      for strides in [(1, 1, 1), (2, 2, 2)]:
+        if padding == 'same' and strides != (1, 1, 1):
+          continue
+
+        with self.test_session():
+          testing_utils.layer_test(
+              keras.layers.Convolution3D,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': 3,
+                  'padding': padding,
+                  'strides': strides
+              },
+              input_shape=(num_samples, input_len_dim1, input_len_dim2,
+                           input_len_dim3, stack_size))
+
+  def test_convolution_3d_regularization(self):
+    # regularizers
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.test_session():
+      layer = keras.layers.Conv3D(**kwargs)
+      layer.build((None, 5, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 2)
+      self.assertEqual(len(layer.losses), 2)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 3)
+
+    # constraints
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_constraint': 'max_norm',
+        'bias_constraint': 'max_norm',
+        'strides': 1
+    }
+    with self.test_session():
+      layer = keras.layers.Conv3D(**kwargs)
+      layer.build((None, 5, 5, 5, 2))
+      self.assertEqual(len(layer.constraints), 2)
+
+
+class ZeroPaddingTest(test.TestCase):
+
+  def test_zero_padding_1d(self):
+    num_samples = 2
+    input_dim = 2
+    num_steps = 5
+    shape = (num_samples, num_steps, input_dim)
+    inputs = np.ones(shape)
+
+    # basic test
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.ZeroPadding1D,
+          kwargs={'padding': 2},
+          input_shape=inputs.shape)
+      testing_utils.layer_test(
+          keras.layers.ZeroPadding1D,
+          kwargs={'padding': (1, 2)},
+          input_shape=inputs.shape)
+
+    # correctness test
+    with self.test_session():
+      layer = keras.layers.ZeroPadding1D(padding=2)
+      layer.build(shape)
+      output = layer(keras.backend.variable(inputs))
+      np_output = keras.backend.eval(output)
+      for offset in [0, 1, -1, -2]:
+        np.testing.assert_allclose(np_output[:, offset, :], 0.)
+      np.testing.assert_allclose(np_output[:, 2:-2, :], 1.)
+
+      layer = keras.layers.ZeroPadding1D(padding=(1, 2))
+      layer.build(shape)
+      output = layer(keras.backend.variable(inputs))
+      np_output = keras.backend.eval(output)
+      for left_offset in [0]:
+        np.testing.assert_allclose(np_output[:, left_offset, :], 0.)
+      for right_offset in [-1, -2]:
+        np.testing.assert_allclose(np_output[:, right_offset, :], 0.)
+      np.testing.assert_allclose(np_output[:, 1:-2, :], 1.)
+      layer.get_config()
+
+  def test_zero_padding_2d(self):
+    num_samples = 2
+    stack_size = 2
+    input_num_row = 4
+    input_num_col = 5
+    for data_format in ['channels_first', 'channels_last']:
+      inputs = np.ones((num_samples, input_num_row, input_num_col, stack_size))
+      inputs = np.ones((num_samples, stack_size, input_num_row, input_num_col))
+
+      # basic test
+      with self.test_session():
+        testing_utils.layer_test(
+            keras.layers.ZeroPadding2D,
+            kwargs={'padding': (2, 2),
+                    'data_format': data_format},
+            input_shape=inputs.shape)
+        testing_utils.layer_test(
+            keras.layers.ZeroPadding2D,
+            kwargs={'padding': ((1, 2), (3, 4)),
+                    'data_format': data_format},
+            input_shape=inputs.shape)
+
+      # correctness test
+      with self.test_session():
+        layer = keras.layers.ZeroPadding2D(
+            padding=(2, 2), data_format=data_format)
+        layer.build(inputs.shape)
+        output = layer(keras.backend.variable(inputs))
+        np_output = keras.backend.eval(output)
+        if data_format == 'channels_last':
+          for offset in [0, 1, -1, -2]:
+            np.testing.assert_allclose(np_output[:, offset, :, :], 0.)
+            np.testing.assert_allclose(np_output[:, :, offset, :], 0.)
+          np.testing.assert_allclose(np_output[:, 2:-2, 2:-2, :], 1.)
+        elif data_format == 'channels_first':
+          for offset in [0, 1, -1, -2]:
+            np.testing.assert_allclose(np_output[:, :, offset, :], 0.)
+            np.testing.assert_allclose(np_output[:, :, :, offset], 0.)
+          np.testing.assert_allclose(np_output[:, 2:-2, 2:-2, :], 1.)
+
+        layer = keras.layers.ZeroPadding2D(
+            padding=((1, 2), (3, 4)), data_format=data_format)
+        layer.build(inputs.shape)
+        output = layer(keras.backend.variable(inputs))
+        np_output = keras.backend.eval(output)
+        if data_format == 'channels_last':
+          for top_offset in [0]:
+            np.testing.assert_allclose(np_output[:, top_offset, :, :], 0.)
+          for bottom_offset in [-1, -2]:
+            np.testing.assert_allclose(np_output[:, bottom_offset, :, :], 0.)
+          for left_offset in [0, 1, 2]:
+            np.testing.assert_allclose(np_output[:, :, left_offset, :], 0.)
+          for right_offset in [-1, -2, -3, -4]:
+            np.testing.assert_allclose(np_output[:, :, right_offset, :], 0.)
+          np.testing.assert_allclose(np_output[:, 1:-2, 3:-4, :], 1.)
+        elif data_format == 'channels_first':
+          for top_offset in [0]:
+            np.testing.assert_allclose(np_output[:, :, top_offset, :], 0.)
+          for bottom_offset in [-1, -2]:
+            np.testing.assert_allclose(np_output[:, :, bottom_offset, :], 0.)
+          for left_offset in [0, 1, 2]:
+            np.testing.assert_allclose(np_output[:, :, :, left_offset], 0.)
+          for right_offset in [-1, -2, -3, -4]:
+            np.testing.assert_allclose(np_output[:, :, :, right_offset], 0.)
+          np.testing.assert_allclose(np_output[:, :, 1:-2, 3:-4], 1.)
+
+  def test_zero_padding_3d(self):
+    num_samples = 2
+    stack_size = 2
+    input_len_dim1 = 4
+    input_len_dim2 = 5
+    input_len_dim3 = 3
+
+    inputs = np.ones((num_samples, input_len_dim1, input_len_dim2,
+                      input_len_dim3, stack_size))
+
+    # basic test
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.ZeroPadding3D,
+          kwargs={'padding': (2, 2, 2)},
+          input_shape=inputs.shape)
+
+    # correctness test
+    with self.test_session():
+      layer = keras.layers.ZeroPadding3D(padding=(2, 2, 2))
+      layer.build(inputs.shape)
+      output = layer(keras.backend.variable(inputs))
+      np_output = keras.backend.eval(output)
+      for offset in [0, 1, -1, -2]:
+        np.testing.assert_allclose(np_output[:, offset, :, :, :], 0.)
+        np.testing.assert_allclose(np_output[:, :, offset, :, :], 0.)
+        np.testing.assert_allclose(np_output[:, :, :, offset, :], 0.)
+      np.testing.assert_allclose(np_output[:, 2:-2, 2:-2, 2:-2, :], 1.)
+
+
+class UpSamplingTest(test.TestCase):
+
+  def test_upsampling_1d(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.UpSampling1D, kwargs={'size': 2}, input_shape=(3, 5, 4))
+
+  def test_upsampling_2d(self):
+    num_samples = 2
+    stack_size = 2
+    input_num_row = 11
+    input_num_col = 12
+
+    for data_format in ['channels_first', 'channels_last']:
+      if data_format == 'channels_first':
+        inputs = np.random.rand(num_samples, stack_size, input_num_row,
+                                input_num_col)
+      else:
+        inputs = np.random.rand(num_samples, input_num_row, input_num_col,
+                                stack_size)
+
+      # basic test
+      with self.test_session():
+        testing_utils.layer_test(
+            keras.layers.UpSampling2D,
+            kwargs={'size': (2, 2),
+                    'data_format': data_format},
+            input_shape=inputs.shape)
+
+        for length_row in [2]:
+          for length_col in [2, 3]:
+            layer = keras.layers.UpSampling2D(
+                size=(length_row, length_col), data_format=data_format)
+            layer.build(inputs.shape)
+            output = layer(keras.backend.variable(inputs))
+            np_output = keras.backend.eval(output)
+            if data_format == 'channels_first':
+              assert np_output.shape[2] == length_row * input_num_row
+              assert np_output.shape[3] == length_col * input_num_col
+            else:  # tf
+              assert np_output.shape[1] == length_row * input_num_row
+              assert np_output.shape[2] == length_col * input_num_col
+
+            # compare with numpy
+            if data_format == 'channels_first':
+              expected_out = np.repeat(inputs, length_row, axis=2)
+              expected_out = np.repeat(expected_out, length_col, axis=3)
+            else:  # tf
+              expected_out = np.repeat(inputs, length_row, axis=1)
+              expected_out = np.repeat(expected_out, length_col, axis=2)
+
+            np.testing.assert_allclose(np_output, expected_out)
+
+  def test_upsampling_3d(self):
+    num_samples = 2
+    stack_size = 2
+    input_len_dim1 = 10
+    input_len_dim2 = 11
+    input_len_dim3 = 12
+
+    for data_format in ['channels_first', 'channels_last']:
+      if data_format == 'channels_first':
+        inputs = np.random.rand(num_samples, stack_size, input_len_dim1,
+                                input_len_dim2, input_len_dim3)
+      else:
+        inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2,
+                                input_len_dim3, stack_size)
+
+      # basic test
+      with self.test_session():
+        testing_utils.layer_test(
+            keras.layers.UpSampling3D,
+            kwargs={'size': (2, 2, 2),
+                    'data_format': data_format},
+            input_shape=inputs.shape)
+
+        for length_dim1 in [2, 3]:
+          for length_dim2 in [2]:
+            for length_dim3 in [3]:
+              layer = keras.layers.UpSampling3D(
+                  size=(length_dim1, length_dim2, length_dim3),
+                  data_format=data_format)
+              layer.build(inputs.shape)
+              output = layer(keras.backend.variable(inputs))
+              np_output = keras.backend.eval(output)
+              if data_format == 'channels_first':
+                assert np_output.shape[2] == length_dim1 * input_len_dim1
+                assert np_output.shape[3] == length_dim2 * input_len_dim2
+                assert np_output.shape[4] == length_dim3 * input_len_dim3
+              else:  # tf
+                assert np_output.shape[1] == length_dim1 * input_len_dim1
+                assert np_output.shape[2] == length_dim2 * input_len_dim2
+                assert np_output.shape[3] == length_dim3 * input_len_dim3
+
+              # compare with numpy
+              if data_format == 'channels_first':
+                expected_out = np.repeat(inputs, length_dim1, axis=2)
+                expected_out = np.repeat(expected_out, length_dim2, axis=3)
+                expected_out = np.repeat(expected_out, length_dim3, axis=4)
+              else:  # tf
+                expected_out = np.repeat(inputs, length_dim1, axis=1)
+                expected_out = np.repeat(expected_out, length_dim2, axis=2)
+                expected_out = np.repeat(expected_out, length_dim3, axis=3)
+
+              np.testing.assert_allclose(np_output, expected_out)
+
+
+class CroppingTest(test.TestCase):
+
+  def test_cropping_1d(self):
+    num_samples = 2
+    time_length = 4
+    input_len_dim1 = 2
+    inputs = np.random.rand(num_samples, time_length, input_len_dim1)
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Cropping1D,
+          kwargs={'cropping': (2, 2)},
+          input_shape=inputs.shape)
+
+  def test_cropping_2d(self):
+    num_samples = 2
+    stack_size = 2
+    input_len_dim1 = 9
+    input_len_dim2 = 9
+    cropping = ((2, 2), (3, 3))
+
+    for data_format in ['channels_first', 'channels_last']:
+      if data_format == 'channels_first':
+        inputs = np.random.rand(num_samples, stack_size, input_len_dim1,
+                                input_len_dim2)
+      else:
+        inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2,
+                                stack_size)
+      # basic test
+      with self.test_session():
+        testing_utils.layer_test(
+            keras.layers.Cropping2D,
+            kwargs={'cropping': cropping,
+                    'data_format': data_format},
+            input_shape=inputs.shape)
+      # correctness test
+      with self.test_session():
+        layer = keras.layers.Cropping2D(
+            cropping=cropping, data_format=data_format)
+        layer.build(inputs.shape)
+        output = layer(keras.backend.variable(inputs))
+        np_output = keras.backend.eval(output)
+        # compare with numpy
+        if data_format == 'channels_first':
+          expected_out = inputs[:, :, cropping[0][0]:-cropping[0][1], cropping[
+              1][0]:-cropping[1][1]]
+        else:
+          expected_out = inputs[:, cropping[0][0]:-cropping[0][1], cropping[1][
+              0]:-cropping[1][1], :]
+        np.testing.assert_allclose(np_output, expected_out)
+
+    for data_format in ['channels_first', 'channels_last']:
+      if data_format == 'channels_first':
+        inputs = np.random.rand(num_samples, stack_size, input_len_dim1,
+                                input_len_dim2)
+      else:
+        inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2,
+                                stack_size)
+      # another correctness test (no cropping)
+      with self.test_session():
+        cropping = ((0, 0), (0, 0))
+        layer = keras.layers.Cropping2D(
+            cropping=cropping, data_format=data_format)
+        layer.build(inputs.shape)
+        output = layer(keras.backend.variable(inputs))
+        np_output = keras.backend.eval(output)
+        # compare with input
+        np.testing.assert_allclose(np_output, inputs)
+
+  def test_cropping_3d(self):
+    num_samples = 2
+    stack_size = 2
+    input_len_dim1 = 8
+    input_len_dim2 = 8
+    input_len_dim3 = 8
+    cropping = ((2, 2), (1, 1), (2, 3))
+
+    for data_format in ['channels_last', 'channels_first']:
+      if data_format == 'channels_first':
+        inputs = np.random.rand(num_samples, stack_size, input_len_dim1,
+                                input_len_dim2, input_len_dim3)
+      else:
+        inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2,
+                                input_len_dim3, stack_size)
+      # basic test
+      with self.test_session():
+        testing_utils.layer_test(
+            keras.layers.Cropping3D,
+            kwargs={'cropping': cropping,
+                    'data_format': data_format},
+            input_shape=inputs.shape)
+      # correctness test
+      with self.test_session():
+        layer = keras.layers.Cropping3D(
+            cropping=cropping, data_format=data_format)
+        layer.build(inputs.shape)
+        output = layer(keras.backend.variable(inputs))
+        np_output = keras.backend.eval(output)
+        # compare with numpy
+        if data_format == 'channels_first':
+          expected_out = inputs[:, :,
+                                cropping[0][0]:-cropping[0][1],
+                                cropping[1][0]:-cropping[1][1],
+                                cropping[2][0]:-cropping[2][1]]
+        else:
+          expected_out = inputs[:,
+                                cropping[0][0]:-cropping[0][1],
+                                cropping[1][0]:-cropping[1][1],
+                                cropping[2][0]:-cropping[2][1], :]
+        print(expected_out.shape)
+        np.testing.assert_allclose(np_output, expected_out)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/core.py b/tensorflow/contrib/keras/python/keras/layers/core.py
new file mode 100644
index 0000000000..1207cc119f
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/core.py
@@ -0,0 +1,820 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Core Keras layers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import inspect
+import types as python_types
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras import activations
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import func_dump
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import func_load
+from tensorflow.python.framework import tensor_shape
+
+
+class Masking(Layer):
+  """Masks a sequence by using a mask value to skip timesteps.
+
+  For each timestep in the input tensor (dimension #1 in the tensor),
+  if all values in the input tensor at that timestep
+  are equal to `mask_value`, then the timestep will be masked (skipped)
+  in all downstream layers (as long as they support masking).
+
+  If any downstream layer does not support masking yet receives such
+  an input mask, an exception will be raised.
+
+  Example:
+
+  Consider a Numpy data array `x` of shape `(samples, timesteps, features)`,
+  to be fed to a LSTM layer.
+  You want to mask timestep #3 and #5 because you lack data for
+  these timesteps. You can:
+
+      - set `x[:, 3, :] = 0.` and `x[:, 5, :] = 0.`
+      - insert a `Masking` layer with `mask_value=0.` before the LSTM layer:
+
+  ```python
+      model = Sequential()
+      model.add(Masking(mask_value=0., input_shape=(timesteps, features)))
+      model.add(LSTM(32))
+  ```
+  """
+
+  def __init__(self, mask_value=0., **kwargs):
+    super(Masking, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.mask_value = mask_value
+
+  def compute_mask(self, inputs, mask=None):
+    return K.any(K.not_equal(inputs, self.mask_value), axis=-1)
+
+  def call(self, inputs):
+    boolean_mask = K.any(
+        K.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
+    return inputs * K.cast(boolean_mask, K.floatx())
+
+  def get_config(self):
+    config = {'mask_value': self.mask_value}
+    base_config = super(Masking, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Dropout(Layer):
+  """Applies Dropout to the input.
+
+  Dropout consists in randomly setting
+  a fraction `p` of input units to 0 at each update during training time,
+  which helps prevent overfitting.
+
+  Arguments:
+      rate: float between 0 and 1. Fraction of the input units to drop.
+      noise_shape: 1D integer tensor representing the shape of the
+          binary dropout mask that will be multiplied with the input.
+          For instance, if your inputs have shape
+          `(batch_size, timesteps, features)` and
+          you want the dropout mask to be the same for all timesteps,
+          you can use `noise_shape=(batch_size, 1, features)`.
+      seed: A Python integer to use as random seed.
+  """
+
+  def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
+    super(Dropout, self).__init__(**kwargs)
+    self.rate = min(1., max(0., rate))
+    self.noise_shape = noise_shape
+    self.seed = seed
+    self.supports_masking = True
+
+  def _get_noise_shape(self, _):
+    return self.noise_shape
+
+  def call(self, inputs, training=None):
+    if 0. < self.rate < 1.:
+      noise_shape = self._get_noise_shape(inputs)
+
+      def dropped_inputs():
+        return K.dropout(inputs, self.rate, noise_shape, seed=self.seed)
+
+      return K.in_train_phase(dropped_inputs, inputs, training=training)
+    return inputs
+
+  def get_config(self):
+    config = {'rate': self.rate}
+    base_config = super(Dropout, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class SpatialDropout1D(Dropout):
+  """Spatial 1D version of Dropout.
+
+  This version performs the same function as Dropout, however it drops
+  entire 1D feature maps instead of individual elements. If adjacent frames
+  within feature maps are strongly correlated (as is normally the case in
+  early convolution layers) then regular dropout will not regularize the
+  activations and will otherwise just result in an effective learning rate
+  decrease. In this case, SpatialDropout1D will help promote independence
+  between feature maps and should be used instead.
+
+  Arguments:
+      p: float between 0 and 1. Fraction of the input units to drop.
+
+  Input shape:
+      3D tensor with shape:
+      `(samples, timesteps, channels)`
+
+  Output shape:
+      Same as input
+
+  References:
+      - [Efficient Object Localization Using Convolutional
+        Networks](https://arxiv.org/abs/1411.4280)
+  """
+
+  def __init__(self, rate, **kwargs):
+    super(SpatialDropout1D, self).__init__(rate, **kwargs)
+    self.input_spec = InputSpec(ndim=3)
+
+  def _get_noise_shape(self, inputs):
+    input_shape = K.shape(inputs)
+    noise_shape = (input_shape[0], 1, input_shape[2])
+    return noise_shape
+
+
+class SpatialDropout2D(Dropout):
+  """Spatial 2D version of Dropout.
+
+  This version performs the same function as Dropout, however it drops
+  entire 2D feature maps instead of individual elements. If adjacent pixels
+  within feature maps are strongly correlated (as is normally the case in
+  early convolution layers) then regular dropout will not regularize the
+  activations and will otherwise just result in an effective learning rate
+  decrease. In this case, SpatialDropout2D will help promote independence
+  between feature maps and should be used instead.
+
+  Arguments:
+      rate: float between 0 and 1. Fraction of the input units to drop.
+      data_format: 'channels_first' or 'channels_last'.
+          In 'channels_first' mode, the channels dimension
+          (the depth) is at index 1,
+          in 'channels_last' mode is it at index 3.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      4D tensor with shape:
+      `(samples, channels, rows, cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+  Output shape:
+      Same as input
+
+  References:
+      - [Efficient Object Localization Using Convolutional
+        Networks](https://arxiv.org/abs/1411.4280)
+  """
+
+  def __init__(self, rate, data_format=None, **kwargs):
+    super(SpatialDropout2D, self).__init__(rate, **kwargs)
+    if data_format is None:
+      data_format = K.image_data_format()
+    if data_format not in {'channels_last', 'channels_first'}:
+      raise ValueError('data_format must be in '
+                       '{"channels_last", "channels_first"}')
+    self.data_format = data_format
+    self.input_spec = InputSpec(ndim=4)
+
+  def _get_noise_shape(self, inputs):
+    input_shape = K.shape(inputs)
+    if self.data_format == 'channels_first':
+      noise_shape = (input_shape[0], input_shape[1], 1, 1)
+    elif self.data_format == 'channels_last':
+      noise_shape = (input_shape[0], 1, 1, input_shape[3])
+    else:
+      raise ValueError('Invalid data_format:', self.data_format)
+    return noise_shape
+
+
+class SpatialDropout3D(Dropout):
+  """Spatial 3D version of Dropout.
+
+  This version performs the same function as Dropout, however it drops
+  entire 3D feature maps instead of individual elements. If adjacent voxels
+  within feature maps are strongly correlated (as is normally the case in
+  early convolution layers) then regular dropout will not regularize the
+  activations and will otherwise just result in an effective learning rate
+  decrease. In this case, SpatialDropout3D will help promote independence
+  between feature maps and should be used instead.
+
+  Arguments:
+      rate: float between 0 and 1. Fraction of the input units to drop.
+      data_format: 'channels_first' or 'channels_last'.
+          In 'channels_first' mode, the channels dimension (the depth)
+          is at index 1, in 'channels_last' mode is it at index 4.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      5D tensor with shape:
+      `(samples, channels, dim1, dim2, dim3)` if data_format='channels_first'
+      or 5D tensor with shape:
+      `(samples, dim1, dim2, dim3, channels)` if data_format='channels_last'.
+
+  Output shape:
+      Same as input
+
+  References:
+      - [Efficient Object Localization Using Convolutional
+        Networks](https://arxiv.org/abs/1411.4280)
+  """
+
+  def __init__(self, rate, data_format=None, **kwargs):
+    super(SpatialDropout3D, self).__init__(rate, **kwargs)
+    if data_format is None:
+      data_format = K.image_data_format()
+    if data_format not in {'channels_last', 'channels_first'}:
+      raise ValueError('data_format must be in '
+                       '{"channels_last", "channels_first"}')
+    self.data_format = data_format
+    self.input_spec = InputSpec(ndim=5)
+
+  def _get_noise_shape(self, inputs):
+    input_shape = K.shape(inputs)
+    if self.data_format == 'channels_first':
+      noise_shape = (input_shape[0], input_shape[1], 1, 1, 1)
+    elif self.data_format == 'channels_last':
+      noise_shape = (input_shape[0], 1, 1, 1, input_shape[4])
+    else:
+      raise ValueError('Invalid data_format:', self.data_format)
+    return noise_shape
+
+
+class Activation(Layer):
+  """Applies an activation function to an output.
+
+  Arguments:
+      activation: name of activation function to use
+          or alternatively, a Theano or TensorFlow operation.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as input.
+  """
+
+  def __init__(self, activation, **kwargs):
+    super(Activation, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.activation = activations.get(activation)
+
+  def call(self, inputs):
+    return self.activation(inputs)
+
+  def get_config(self):
+    config = {'activation': activations.serialize(self.activation)}
+    base_config = super(Activation, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Reshape(Layer):
+  """Reshapes an output to a certain shape.
+
+  Arguments:
+      target_shape: target shape. Tuple of integers,
+          does not include the samples dimension (batch size).
+
+  Input shape:
+      Arbitrary, although all dimensions in the input shaped must be fixed.
+      Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      `(batch_size,) + target_shape`
+
+  Example:
+
+  ```python
+      # as first layer in a Sequential model
+      model = Sequential()
+      model.add(Reshape((3, 4), input_shape=(12,)))
+      # now: model.output_shape == (None, 3, 4)
+      # note: `None` is the batch dimension
+
+      # as intermediate layer in a Sequential model
+      model.add(Reshape((6, 2)))
+      # now: model.output_shape == (None, 6, 2)
+
+      # also supports shape inference using `-1` as dimension
+      model.add(Reshape((-1, 2, 2)))
+      # now: model.output_shape == (None, 3, 2, 2)
+  ```
+  """
+
+  def __init__(self, target_shape, **kwargs):
+    super(Reshape, self).__init__(**kwargs)
+    self.target_shape = tuple(target_shape)
+
+  def _fix_unknown_dimension(self, input_shape, output_shape):
+    """Find and replace a missing dimension in an output shape.
+
+    This is a near direct port of the internal Numpy function
+    `_fix_unknown_dimension` in `numpy/core/src/multiarray/shape.c`
+
+    Arguments:
+        input_shape: shape of array being reshaped
+        output_shape: desired shape of the array with at most
+            a single -1 which indicates a dimension that should be
+            derived from the input shape.
+
+    Returns:
+        The new output shape with a -1 replaced with its computed value.
+
+        Raises a ValueError if the total array size of the output_shape is
+        different then the input_shape, or more then one unknown dimension
+        is specified.
+
+    Raises:
+        ValueError: in case of invalid values
+            for `input_shape` or `input_shape`.
+    """
+    output_shape = list(output_shape)
+    msg = 'total size of new array must be unchanged'
+
+    known, unknown = 1, None
+    for index, dim in enumerate(output_shape):
+      if dim < 0:
+        if unknown is None:
+          unknown = index
+        else:
+          raise ValueError('Can only specify one unknown dimension.')
+      else:
+        known *= dim
+
+    original = np.prod(input_shape, dtype=int)
+    if unknown is not None:
+      if known == 0 or original % known != 0:
+        raise ValueError(msg)
+      output_shape[unknown] = original // known
+    elif original != known:
+      raise ValueError(msg)
+    return output_shape
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = [input_shape[0]]
+    output_shape += self._fix_unknown_dimension(input_shape[1:],
+                                                self.target_shape)
+    return tensor_shape.TensorShape(output_shape)
+
+  def call(self, inputs):
+    # In case the target shape is not fully defined,
+    # we need access to the shape of x.
+    target_shape = self.target_shape
+    if -1 in target_shape:
+      # target shape not fully defined
+      target_shape = self._compute_output_shape(inputs.get_shape())
+      target_shape = target_shape.as_list()[1:]
+    return K.reshape(inputs, (-1,) + tuple(target_shape))
+
+  def get_config(self):
+    config = {'target_shape': self.target_shape}
+    base_config = super(Reshape, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Permute(Layer):
+  """Permutes the dimensions of the input according to a given pattern.
+
+  Useful for e.g. connecting RNNs and convnets together.
+
+  Example:
+
+  ```python
+      model = Sequential()
+      model.add(Permute((2, 1), input_shape=(10, 64)))
+      # now: model.output_shape == (None, 64, 10)
+      # note: `None` is the batch dimension
+  ```
+
+  Arguments:
+      dims: Tuple of integers. Permutation pattern, does not include the
+          samples dimension. Indexing starts at 1.
+          For instance, `(2, 1)` permutes the first and second dimension
+          of the input.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same as the input shape, but with the dimensions re-ordered according
+      to the specified pattern.
+  """
+
+  def __init__(self, dims, **kwargs):
+    super(Permute, self).__init__(**kwargs)
+    self.dims = tuple(dims)
+    self.input_spec = InputSpec(ndim=len(self.dims) + 1)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = copy.copy(input_shape)
+    for i, dim in enumerate(self.dims):
+      target_dim = input_shape[dim]
+      output_shape[i + 1] = target_dim
+    return tensor_shape.TensorShape(output_shape)
+
+  def call(self, inputs):
+    return K.permute_dimensions(inputs, (0,) + self.dims)
+
+  def get_config(self):
+    config = {'dims': self.dims}
+    base_config = super(Permute, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Flatten(Layer):
+  """Flattens the input. Does not affect the batch size.
+
+  Example:
+
+  ```python
+      model = Sequential()
+      model.add(Convolution2D(64, 3, 3,
+                              border_mode='same',
+                              input_shape=(3, 32, 32)))
+      # now: model.output_shape == (None, 64, 32, 32)
+
+      model.add(Flatten())
+      # now: model.output_shape == (None, 65536)
+  ```
+  """
+
+  def __init__(self, **kwargs):
+    super(Flatten, self).__init__(**kwargs)
+    self.input_spec = InputSpec(min_ndim=3)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if not all(input_shape[1:]):
+      raise ValueError('The shape of the input to "Flatten" '
+                       'is not fully defined '
+                       '(got ' + str(input_shape[1:]) + '. '
+                       'Make sure to pass a complete "input_shape" '
+                       'or "batch_input_shape" argument to the first '
+                       'layer in your model.')
+    return tensor_shape.TensorShape([input_shape[0], np.prod(input_shape[1:])])
+
+  def call(self, inputs):
+    outputs = K.batch_flatten(inputs)
+    outputs.set_shape(self._compute_output_shape(inputs.get_shape()))
+    return outputs
+
+
+class RepeatVector(Layer):
+  """Repeats the input n times.
+
+  Example:
+
+  ```python
+      model = Sequential()
+      model.add(Dense(32, input_dim=32))
+      # now: model.output_shape == (None, 32)
+      # note: `None` is the batch dimension
+
+      model.add(RepeatVector(3))
+      # now: model.output_shape == (None, 3, 32)
+  ```
+
+  Arguments:
+      n: integer, repetition factor.
+
+  Input shape:
+      2D tensor of shape `(num_samples, features)`.
+
+  Output shape:
+      3D tensor of shape `(num_samples, n, features)`.
+  """
+
+  def __init__(self, n, **kwargs):
+    super(RepeatVector, self).__init__(**kwargs)
+    self.n = n
+    self.input_spec = InputSpec(ndim=2)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    return tensor_shape.TensorShape([input_shape[0], self.n, input_shape[1]])
+
+  def call(self, inputs):
+    return K.repeat(inputs, self.n)
+
+  def get_config(self):
+    config = {'n': self.n}
+    base_config = super(RepeatVector, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Lambda(Layer):
+  """Wraps arbitrary expression as a `Layer` object.
+
+  Examples:
+
+  ```python
+      # add a x -> x^2 layer
+      model.add(Lambda(lambda x: x ** 2))
+  ```
+  ```python
+      # add a layer that returns the concatenation
+      # of the positive part of the input and
+      # the opposite of the negative part
+
+      def antirectifier(x):
+          x -= K.mean(x, axis=1, keepdims=True)
+          x = K.l2_normalize(x, axis=1)
+          pos = K.relu(x)
+          neg = K.relu(-x)
+          return K.concatenate([pos, neg], axis=1)
+
+      model.add(Lambda(antirectifier))
+  ```
+
+  Arguments:
+      function: The function to be evaluated.
+          Takes input tensor as first argument.
+      arguments: optional dictionary of keyword arguments to be passed
+          to the function.
+
+  Input shape:
+      Arbitrary. Use the keyword argument input_shape
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Specified by `output_shape` argument
+      (or auto-inferred when using TensorFlow).
+  """
+
+  def __init__(self, function, mask=None, arguments=None, **kwargs):
+    super(Lambda, self).__init__(**kwargs)
+    self.function = function
+    self.arguments = arguments if arguments else {}
+    if mask is not None:
+      self.supports_masking = True
+    self.mask = mask
+
+  def call(self, inputs, mask=None):
+    arguments = self.arguments
+    arg_spec = inspect.getargspec(self.function)
+    if 'mask' in arg_spec.args:
+      arguments['mask'] = mask
+    return self.function(inputs, **arguments)
+
+  def compute_mask(self, inputs, mask=None):
+    if callable(self.mask):
+      return self.mask(inputs, mask)
+    return self.mask
+
+  def get_config(self):
+    if isinstance(self.function, python_types.LambdaType):
+      function = func_dump(self.function)
+      function_type = 'lambda'
+    else:
+      function = self.function.__name__
+      function_type = 'function'
+
+    config = {
+        'function': function,
+        'function_type': function_type,
+        'arguments': self.arguments
+    }
+    base_config = super(Lambda, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    globs = globals()
+    if custom_objects:
+      globs = dict(list(globs.items()) + list(custom_objects.items()))
+    function_type = config.pop('function_type')
+    if function_type == 'function':
+      # Simple lookup in custom objects
+      function = deserialize_keras_object(
+          config['function'],
+          custom_objects=custom_objects,
+          printable_module_name='function in Lambda layer')
+    elif function_type == 'lambda':
+      # Unsafe deserialization from bytecode
+      function = func_load(config['function'], globs=globs)
+    else:
+      raise TypeError('Unknown function type:', function_type)
+
+    config['function'] = function
+    return cls(**config)
+
+
+class Dense(Layer):
+  """Just your regular densely-connected NN layer.
+
+  `Dense` implements the operation:
+  `output = activation(dot(input, kernel) + bias)`
+  where `activation` is the element-wise activation function
+  passed as the `activation` argument, `kernel` is a weights matrix
+  created by the layer, and `bias` is a bias vector created by the layer
+  (only applicable if `use_bias` is `True`).
+
+  Note: if the input to the layer has a rank greater than 2, then
+  it is flattened prior to the initial dot product with `kernel`.
+
+  Example:
+
+  ```python
+      # as first layer in a sequential model:
+      model = Sequential()
+      model.add(Dense(32, input_shape=(16,)))
+      # now the model will take as input arrays of shape (*, 16)
+      # and output arrays of shape (*, 32)
+
+      # after the first layer, you don't need to specify
+      # the size of the input anymore:
+      model.add(Dense(32))
+  ```
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to
+          the `kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+      nD tensor with shape: `(batch_size, ..., input_dim)`.
+      The most common situation would be
+      a 2D input with shape `(batch_size, input_dim)`.
+
+  Output shape:
+      nD tensor with shape: `(batch_size, ..., units)`.
+      For instance, for a 2D input with shape `(batch_size, input_dim)`,
+      the output would have shape `(batch_size, units)`.
+  """
+
+  def __init__(self,
+               units,
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    if 'input_shape' not in kwargs and 'input_dim' in kwargs:
+      kwargs['input_shape'] = (kwargs.pop('input_dim'),)
+    super(Dense, self).__init__(**kwargs)
+    self.units = units
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+    self.input_spec = InputSpec(min_ndim=2)
+    self.supports_masking = True
+
+  def build(self, input_shape):
+    assert len(input_shape) >= 2
+    input_dim = input_shape[-1]
+
+    self.kernel = self.add_weight(
+        (input_dim, self.units),
+        initializer=self.kernel_initializer,
+        name='kernel',
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+    if self.use_bias:
+      self.bias = self.add_weight(
+          (self.units,),
+          initializer=self.bias_initializer,
+          name='bias',
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
+    self.built = True
+
+  def call(self, inputs):
+    output = K.dot(inputs, self.kernel)
+    if self.use_bias:
+      output = K.bias_add(output, self.bias)
+    if self.activation is not None:
+      output = self.activation(output)
+    return output
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    assert input_shape and len(input_shape) >= 2
+    assert input_shape[-1]
+    output_shape = list(input_shape)
+    output_shape[-1] = self.units
+    return tensor_shape.TensorShape(output_shape)
+
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'bias_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(Dense, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class ActivityRegularization(Layer):
+  """Layer that applies an update to the cost function based input activity.
+
+  Arguments:
+      l1: L1 regularization factor (positive float).
+      l2: L2 regularization factor (positive float).
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as input.
+  """
+
+  def __init__(self, l1=0., l2=0., **kwargs):
+    super(ActivityRegularization, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.l1 = l1
+    self.l2 = l2
+    self.activity_regularizer = regularizers.L1L2(l1=l1, l2=l2)
+
+  def get_config(self):
+    config = {'l1': self.l1, 'l2': self.l2}
+    base_config = super(ActivityRegularization, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/core_test.py b/tensorflow/contrib/keras/python/keras/layers/core_test.py
new file mode 100644
index 0000000000..d7aa8413bb
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/core_test.py
@@ -0,0 +1,189 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras core layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class CoreLayersTest(test.TestCase):
+
+  def test_masking(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Masking, kwargs={}, input_shape=(3, 2, 3))
+
+  def test_dropout(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Dropout, kwargs={'rate': 0.5}, input_shape=(3, 2))
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Dropout,
+          kwargs={'rate': 0.5,
+                  'noise_shape': [3, 1]},
+          input_shape=(3, 2))
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.SpatialDropout1D,
+          kwargs={'rate': 0.5},
+          input_shape=(2, 3, 4))
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.SpatialDropout2D,
+          kwargs={'rate': 0.5},
+          input_shape=(2, 3, 4, 5))
+
+  def test_activation(self):
+    # with string argument
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Activation,
+          kwargs={'activation': 'relu'},
+          input_shape=(3, 2))
+
+    # with function argument
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Activation,
+          kwargs={'activation': keras.backend.relu},
+          input_shape=(3, 2))
+
+  def test_reshape(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Reshape,
+          kwargs={'target_shape': (8, 1)},
+          input_shape=(3, 2, 4))
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Reshape,
+          kwargs={'target_shape': (-1, 1)},
+          input_shape=(3, 2, 4))
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Reshape,
+          kwargs={'target_shape': (1, -1)},
+          input_shape=(3, 2, 4))
+
+  def test_permute(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Permute, kwargs={'dims': (2, 1)}, input_shape=(3, 2, 4))
+
+  def test_flatten(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Flatten, kwargs={}, input_shape=(3, 2, 4))
+
+  def test_repeat_vector(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2))
+
+  def test_lambda(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Lambda,
+          kwargs={'function': lambda x: x + 1},
+          input_shape=(3, 2))
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Lambda,
+          kwargs={
+              'function': lambda x, a, b: x * a + b,
+              'arguments': {
+                  'a': 0.6,
+                  'b': 0.4
+              }
+          },
+          input_shape=(3, 2))
+
+    with self.test_session():
+      # test serialization with function
+      def f(x):
+        return x + 1
+
+      ld = keras.layers.Lambda(f)
+      config = ld.get_config()
+      ld = keras.layers.deserialize({
+          'class_name': 'Lambda',
+          'config': config
+      })
+
+      # test with lambda
+      ld = keras.layers.Lambda(
+          lambda x: keras.backend.concatenate([keras.backend.square(x), x]))
+      config = ld.get_config()
+      ld = keras.layers.Lambda.from_config(config)
+
+  def test_dense(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 2))
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 2))
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Dense, kwargs={'units': 3}, input_shape=(None, None, 2))
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 5, 2))
+
+    # Test regularization
+    with self.test_session():
+      layer = keras.layers.Dense(
+          3,
+          kernel_regularizer=keras.regularizers.l1(0.01),
+          bias_regularizer='l1',
+          activity_regularizer='l2')
+      layer.build((None, 4))
+      assert len(layer.losses) == 2
+      layer(keras.backend.variable(np.ones((2, 4))))
+      assert len(layer.losses) == 3
+
+    # Test constraints
+    with self.test_session():
+      layer = keras.layers.Dense(
+          3, kernel_constraint='max_norm', bias_constraint='max_norm')
+      layer.build((None, 4))
+      assert len(layer.constraints) == 2
+
+  def test_activity_regularization(self):
+    with self.test_session():
+      layer = keras.layers.ActivityRegularization(l1=0.1)
+      layer(keras.backend.variable(np.ones((2, 4))))
+      assert len(layer.losses) == 1
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/embeddings.py b/tensorflow/contrib/keras/python/keras/layers/embeddings.py
new file mode 100644
index 0000000000..5ba7d7db8a
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/embeddings.py
@@ -0,0 +1,166 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Embedding layer.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.python.framework import tensor_shape
+
+
+class Embedding(Layer):
+  """Turns positive integers (indexes) into dense vectors of fixed size.
+
+  eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
+
+  This layer can only be used as the first layer in a model.
+
+  Example:
+
+  ```python
+    model = Sequential()
+    model.add(Embedding(1000, 64, input_length=10))
+    # the model will take as input an integer matrix of size (batch,
+    input_length).
+    # the largest integer (i.e. word index) in the input should be no larger
+    than 999 (vocabulary size).
+    # now model.output_shape == (None, 10, 64), where None is the batch
+    dimension.
+
+    input_array = np.random.randint(1000, size=(32, 10))
+
+    model.compile('rmsprop', 'mse')
+    output_array = model.predict(input_array)
+    assert output_array.shape == (32, 10, 64)
+  ```
+
+  Arguments:
+    input_dim: int > 0. Size of the vocabulary, ie.
+        1 + maximum integer index occurring in the input data.
+    output_dim: int >= 0. Dimension of the dense embedding.
+    embeddings_initializer: Initializer for the `embeddings` matrix.
+    embeddings_regularizer: Regularizer function applied to
+          the `embeddings` matrix.
+    embeddings_constraint: Constraint function applied to
+          the `embeddings` matrix.
+    mask_zero: Whether or not the input value 0 is a special "padding"
+        value that should be masked out.
+        This is useful when using recurrent layers,
+        which may take variable length inputs.
+        If this is `True` then all subsequent layers
+        in the model need to support masking or an exception will be raised.
+        If mask_zero is set to True, as a consequence, index 0 cannot be
+        used in the vocabulary (input_dim should equal `|vocabulary| + 2`).
+    input_length: Length of input sequences, when it is constant.
+        This argument is required if you are going to connect
+        `Flatten` then `Dense` layers upstream
+        (without it, the shape of the dense outputs cannot be computed).
+
+  Input shape:
+      2D tensor with shape: `(batch_size, sequence_length)`.
+
+  Output shape:
+      3D tensor with shape: `(batch_size, sequence_length, output_dim)`.
+
+  References:
+      - [A Theoretically Grounded Application of Dropout in Recurrent Neural
+        Networks](http://arxiv.org/abs/1512.05287)
+  """
+
+  def __init__(self,
+               input_dim,
+               output_dim,
+               embeddings_initializer='uniform',
+               embeddings_regularizer=None,
+               activity_regularizer=None,
+               embeddings_constraint=None,
+               mask_zero=False,
+               input_length=None,
+               **kwargs):
+    kwargs['dtype'] = 'int32'
+    if 'input_shape' not in kwargs:
+      if input_length:
+        kwargs['input_shape'] = (input_length,)
+      else:
+        kwargs['input_shape'] = (None,)
+    super(Embedding, self).__init__(**kwargs)
+
+    self.input_dim = input_dim
+    self.output_dim = output_dim
+    self.embeddings_initializer = initializers.get(embeddings_initializer)
+    self.embeddings_regularizer = regularizers.get(embeddings_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+    self.embeddings_constraint = constraints.get(embeddings_constraint)
+    self.mask_zero = mask_zero
+    self.input_length = input_length
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    self.embeddings = self.add_weight(
+        (self.input_dim, self.output_dim),
+        initializer=self.embeddings_initializer,
+        name='embeddings',
+        regularizer=self.embeddings_regularizer,
+        constraint=self.embeddings_constraint)
+    self.built = True
+
+  def compute_mask(self, inputs, mask=None):
+    if not self.mask_zero:
+      return None
+    else:
+      return K.not_equal(inputs, 0)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if not self.input_length:
+      input_length = input_shape[1]
+    else:
+      input_length = self.input_length
+    return tensor_shape.TensorShape(
+        [input_shape[0], input_length, self.output_dim])
+
+  def call(self, inputs):
+    if K.dtype(inputs) != 'int32':
+      inputs = K.cast(inputs, 'int32')
+    out = K.gather(self.embeddings, inputs)
+    return out
+
+  def get_config(self):
+    config = {
+        'input_dim':
+            self.input_dim,
+        'output_dim':
+            self.output_dim,
+        'embeddings_initializer':
+            initializers.serialize(self.embeddings_initializer),
+        'embeddings_regularizer':
+            regularizers.serialize(self.embeddings_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'embeddings_constraint':
+            constraints.serialize(self.embeddings_constraint),
+        'mask_zero':
+            self.mask_zero,
+        'input_length':
+            self.input_length
+    }
+    base_config = super(Embedding, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py b/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py
new file mode 100644
index 0000000000..ca7ca3efd8
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py
@@ -0,0 +1,51 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for embedding layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class EmbeddingTest(test.TestCase):
+
+  def test_embedding(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Embedding,
+          kwargs={'output_dim': 4,
+                  'input_dim': 10,
+                  'input_length': 2},
+          input_shape=(3, 2),
+          input_dtype='int32',
+          expected_output_dtype='float32')
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Embedding,
+          kwargs={'output_dim': 4,
+                  'input_dim': 10,
+                  'mask_zero': True},
+          input_shape=(3, 2),
+          input_dtype='int32',
+          expected_output_dtype='float32')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/gru_test.py b/tensorflow/contrib/keras/python/keras/layers/gru_test.py
new file mode 100644
index 0000000000..327d2b05a1
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/gru_test.py
@@ -0,0 +1,194 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for GRU layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class GRULayerTest(test.TestCase):
+
+  def test_return_sequences_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.GRU,
+          kwargs={'units': units,
+                  'return_sequences': True},
+          input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_dynamic_behavior_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      layer = keras.layers.GRU(units, input_shape=(None, embedding_dim))
+      model = keras.models.Sequential()
+      model.add(layer)
+      model.compile('sgd', 'mse')
+      x = np.random.random((num_samples, timesteps, embedding_dim))
+      y = np.random.random((num_samples, units))
+      model.train_on_batch(x, y)
+
+  def test_dropout_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.GRU,
+          kwargs={'units': units,
+                  'dropout': 0.1,
+                  'recurrent_dropout': 0.1},
+          input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_implementation_mode_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      for mode in [0, 1, 2]:
+        testing_utils.layer_test(
+            keras.layers.GRU,
+            kwargs={'units': units,
+                    'implementation': mode},
+            input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_statefulness_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = keras.layers.GRU
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Embedding(
+              4,
+              embedding_dim,
+              mask_zero=True,
+              input_length=timesteps,
+              batch_input_shape=(num_samples, timesteps)))
+      layer = layer_class(
+          units, return_sequences=False, stateful=True, weights=None)
+      model.add(layer)
+      model.compile(optimizer='sgd', loss='mse')
+      out1 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertEqual(out1.shape, (num_samples, units))
+
+      # train once so that the states change
+      model.train_on_batch(
+          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+      out2 = model.predict(np.ones((num_samples, timesteps)))
+
+      # if the state is not reset, output should be different
+      self.assertNotEqual(out1.max(), out2.max())
+
+      # check that output changes after states are reset
+      # (even though the model itself didn't change)
+      layer.reset_states()
+      out3 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out2.max(), out3.max())
+
+      # check that container-level reset_states() works
+      model.reset_states()
+      out4 = model.predict(np.ones((num_samples, timesteps)))
+      np.testing.assert_allclose(out3, out4, atol=1e-5)
+
+      # check that the call to `predict` updated the states
+      out5 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out4.max(), out5.max())
+
+      # Check masking
+      layer.reset_states()
+
+      left_padded_input = np.ones((num_samples, timesteps))
+      left_padded_input[0, :1] = 0
+      left_padded_input[1, :2] = 0
+      out6 = model.predict(left_padded_input)
+
+      layer.reset_states()
+
+      right_padded_input = np.ones((num_samples, timesteps))
+      right_padded_input[0, -1:] = 0
+      right_padded_input[1, -2:] = 0
+      out7 = model.predict(right_padded_input)
+
+      np.testing.assert_allclose(out7, out6, atol=1e-5)
+
+  def test_regularization_GRU(self):
+    embedding_dim = 4
+    layer_class = keras.layers.GRU
+    with self.test_session():
+      layer = layer_class(
+          5,
+          return_sequences=False,
+          weights=None,
+          input_shape=(None, embedding_dim),
+          kernel_regularizer=keras.regularizers.l1(0.01),
+          recurrent_regularizer=keras.regularizers.l1(0.01),
+          bias_regularizer='l2',
+          activity_regularizer='l1')
+      layer.build((None, None, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((2, 3, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+      layer = layer_class(
+          5,
+          return_sequences=False,
+          weights=None,
+          input_shape=(None, embedding_dim),
+          kernel_constraint=keras.constraints.max_norm(0.01),
+          recurrent_constraint=keras.constraints.max_norm(0.01),
+          bias_constraint='max_norm')
+      layer.build((None, None, embedding_dim))
+      self.assertEqual(len(layer.constraints), 3)
+
+  def test_with_masking_layer_GRU(self):
+    layer_class = keras.layers.GRU
+    with self.test_session():
+      inputs = np.random.random((2, 3, 4))
+      targets = np.abs(np.random.random((2, 3, 5)))
+      targets /= targets.sum(axis=-1, keepdims=True)
+      model = keras.models.Sequential()
+      model.add(keras.layers.Masking(input_shape=(3, 4)))
+      model.add(layer_class(units=5, return_sequences=True, unroll=False))
+      model.compile(loss='categorical_crossentropy', optimizer='adam')
+      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  def test_from_config_GRU(self):
+    layer_class = keras.layers.GRU
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/local.py b/tensorflow/contrib/keras/python/keras/layers/local.py
new file mode 100644
index 0000000000..3bf5ee4f0f
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/local.py
@@ -0,0 +1,476 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Locally-connected layers.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras import activations
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.contrib.keras.python.keras.utils import conv_utils
+from tensorflow.python.framework import tensor_shape
+
+
+class LocallyConnected1D(Layer):
+  """Locally-connected layer for 1D inputs.
+
+  The `LocallyConnected1D` layer works similarly to
+  the `Conv1D` layer, except that weights are unshared,
+  that is, a different set of filters is applied at each different patch
+  of the input.
+
+  Example:
+  ```python
+      # apply a unshared weight convolution 1d of length 3 to a sequence with
+      # 10 timesteps, with 64 output filters
+      model = Sequential()
+      model.add(LocallyConnected1D(64, 3, input_shape=(10, 32)))
+      # now model.output_shape == (None, 8, 64)
+      # add a new conv1d on top
+      model.add(LocallyConnected1D(32, 3))
+      # now model.output_shape == (None, 6, 32)
+  ```
+
+  Arguments:
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      kernel_size: An integer or tuple/list of a single integer,
+          specifying the length of the 1D convolution window.
+      strides: An integer or tuple/list of a single integer,
+          specifying the stride length of the convolution.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to the kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+      3D tensor with shape: `(batch_size, steps, input_dim)`
+
+  Output shape:
+      3D tensor with shape: `(batch_size, new_steps, filters)`
+      `steps` value might have changed due to padding or strides.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(LocallyConnected1D, self).__init__(**kwargs)
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 1, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    if self.padding != 'valid':
+      raise ValueError('Invalid border mode for LocallyConnected1D '
+                       '(only "valid" is supported): ' + padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+    self.input_spec = InputSpec(ndim=3)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    input_dim = input_shape[2]
+    if input_dim is None:
+      raise ValueError('Axis 2 of input should be fully-defined. '
+                       'Found shape:', input_shape)
+    output_length = conv_utils.conv_output_length(
+        input_shape[1], self.kernel_size[0], self.padding, self.strides[0])
+    self.kernel_shape = (output_length, self.kernel_size[0] * input_dim,
+                         self.filters)
+    self.kernel = self.add_weight(
+        self.kernel_shape,
+        initializer=self.kernel_initializer,
+        name='kernel',
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+    if self.use_bias:
+      self.bias = self.add_weight(
+          (output_length, self.filters),
+          initializer=self.bias_initializer,
+          name='bias',
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    self.input_spec = InputSpec(ndim=3, axes={2: input_dim})
+    self.built = True
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    length = conv_utils.conv_output_length(input_shape[1], self.kernel_size[0],
+                                           self.padding, self.strides[0])
+    return tensor_shape.TensorShape([input_shape[0], length, self.filters])
+
+  def call(self, inputs):
+    stride = self.strides[0]
+    output_length, feature_dim, filters = self.kernel_shape
+
+    xs = []
+    for i in range(output_length):
+      slice_length = slice(i * stride, i * stride + self.kernel_size[0])
+      xs.append(K.reshape(inputs[:, slice_length, :], (1, -1, feature_dim)))
+    x_aggregate = K.concatenate(xs, axis=0)
+    # Shape: `(output_length, batch_size, filters)`.
+    output = K.batch_dot(x_aggregate, self.kernel)
+    output = K.permute_dimensions(output, (1, 0, 2))
+
+    if self.use_bias:
+      output += K.reshape(self.bias, (1, output_length, filters))
+    if self.activation is not None:
+      output = self.activation(output)
+    return output
+
+  def get_config(self):
+    config = {
+        'filters':
+            self.filters,
+        'kernel_size':
+            self.kernel_size,
+        'strides':
+            self.strides,
+        'padding':
+            self.padding,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'bias_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(LocallyConnected1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class LocallyConnected2D(Layer):
+  """Locally-connected layer for 2D inputs.
+
+  The `LocallyConnected2D` layer works similarly
+  to the `Conv2D` layer, except that weights are unshared,
+  that is, a different set of filters is applied at each
+  different patch of the input.
+
+  Examples:
+  ```python
+      # apply a 3x3 unshared weights convolution with 64 output filters on a
+      32x32 image
+      # with `data_format="channels_last"`:
+      model = Sequential()
+      model.add(LocallyConnected2D(64, (3, 3), input_shape=(32, 32, 3)))
+      # now model.output_shape == (None, 30, 30, 64)
+      # notice that this layer will consume (30*30)*(3*3*3*64) + (30*30)*64
+      parameters
+
+      # add a 3x3 unshared weights convolution on top, with 32 output filters:
+      model.add(LocallyConnected2D(32, (3, 3)))
+      # now model.output_shape == (None, 28, 28, 32)
+  ```
+
+  Arguments:
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      kernel_size: An integer or tuple/list of 2 integers, specifying the
+          width and height of the 2D convolution window.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+      strides: An integer or tuple/list of 2 integers,
+          specifying the strides of the convolution along the width and height.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: one of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to the kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+      4D tensor with shape:
+      `(samples, channels, rows, cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+  Output shape:
+      4D tensor with shape:
+      `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
+      `rows` and `cols` values might have changed due to padding.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format=None,
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(LocallyConnected2D, self).__init__(**kwargs)
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    if self.padding != 'valid':
+      raise ValueError('Invalid border mode for LocallyConnected2D '
+                       '(only "valid" is supported): ' + padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+    self.input_spec = InputSpec(ndim=4)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_last':
+      input_row, input_col = input_shape[1:-1]
+      input_filter = input_shape[3]
+    else:
+      input_row, input_col = input_shape[2:]
+      input_filter = input_shape[1]
+    if input_row is None or input_col is None:
+      raise ValueError('The spatial dimensions of the inputs to '
+                       ' a LocallyConnected2D layer '
+                       'should be fully-defined, but layer received '
+                       'the inputs shape ' + str(input_shape))
+
+    output_row = conv_utils.conv_output_length(input_row, self.kernel_size[0],
+                                               self.padding, self.strides[0])
+    output_col = conv_utils.conv_output_length(input_col, self.kernel_size[1],
+                                               self.padding, self.strides[1])
+    self.output_row = output_row
+    self.output_col = output_col
+    self.kernel_shape = (output_row * output_col, self.kernel_size[0] *
+                         self.kernel_size[1] * input_filter, self.filters)
+    self.kernel = self.add_weight(
+        self.kernel_shape,
+        initializer=self.kernel_initializer,
+        name='kernel',
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+    if self.use_bias:
+      self.bias = self.add_weight(
+          (output_row, output_col, self.filters),
+          initializer=self.bias_initializer,
+          name='bias',
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    if self.data_format == 'channels_first':
+      self.input_spec = InputSpec(ndim=4, axes={1: input_filter})
+    else:
+      self.input_spec = InputSpec(ndim=4, axes={-1: input_filter})
+    self.built = True
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+    elif self.data_format == 'channels_last':
+      rows = input_shape[1]
+      cols = input_shape[2]
+    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
+                                         self.padding, self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
+                                         self.padding, self.strides[1])
+
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], self.filters, rows, cols])
+    elif self.data_format == 'channels_last':
+      return tensor_shape.TensorShape(
+          [input_shape[0], rows, cols, self.filters])
+
+  def call(self, inputs):
+    stride_row, stride_col = self.strides
+    _, feature_dim, filters = self.kernel_shape
+
+    if self.data_format == 'channels_first':
+      if K.backend() == 'theano':
+        output = []
+        for i in range(self.output_row):
+          for j in range(self.output_col):
+            slice_row = slice(i * stride_row,
+                              i * stride_row + self.kernel_size[0])
+            slice_col = slice(j * stride_col,
+                              j * stride_col + self.kernel_size[1])
+            x_flatten = K.reshape(inputs[:, :, slice_row, slice_col],
+                                  (1, -1, feature_dim))
+            output.append(
+                K.dot(x_flatten, self.kernel[i * self.output_col + j, :, :]))
+        output = K.concatenate(output, axis=0)
+      else:
+        xs = []
+        for i in range(self.output_row):
+          for j in range(self.output_col):
+            slice_row = slice(i * stride_row,
+                              i * stride_row + self.kernel_size[0])
+            slice_col = slice(j * stride_col,
+                              j * stride_col + self.kernel_size[1])
+            xs.append(
+                K.reshape(inputs[:, :, slice_row, slice_col], (1, -1,
+                                                               feature_dim)))
+        x_aggregate = K.concatenate(xs, axis=0)
+        output = K.batch_dot(x_aggregate, self.kernel)
+      output = K.reshape(output, (self.output_row, self.output_col, -1,
+                                  filters))
+      output = K.permute_dimensions(output, (2, 3, 0, 1))
+
+    elif self.data_format == 'channels_last':
+      xs = []
+      for i in range(self.output_row):
+        for j in range(self.output_col):
+          slice_row = slice(i * stride_row,
+                            i * stride_row + self.kernel_size[0])
+          slice_col = slice(j * stride_col,
+                            j * stride_col + self.kernel_size[1])
+          xs.append(
+              K.reshape(inputs[:, slice_row, slice_col, :], (1, -1, feature_dim
+                                                            )))
+      x_aggregate = K.concatenate(xs, axis=0)
+      output = K.batch_dot(x_aggregate, self.kernel)
+      output = K.reshape(output, (self.output_row, self.output_col, -1,
+                                  filters))
+      output = K.permute_dimensions(output, (2, 0, 1, 3))
+
+    if self.use_bias:
+      if self.data_format == 'channels_first':
+        output += K.reshape(self.bias, (1, filters, self.output_row,
+                                        self.output_col))
+      elif self.data_format == 'channels_last':
+        output += K.reshape(self.bias, (1, self.output_row, self.output_col,
+                                        filters))
+    output = self.activation(output)
+    return output
+
+  def get_config(self):
+    config = {
+        'filters':
+            self.filters,
+        'kernel_size':
+            self.kernel_size,
+        'strides':
+            self.strides,
+        'padding':
+            self.padding,
+        'data_format':
+            self.data_format,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'bias_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(LocallyConnected2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/local_test.py b/tensorflow/contrib/keras/python/keras/layers/local_test.py
new file mode 100644
index 0000000000..f84f4e91d5
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/local_test.py
@@ -0,0 +1,165 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for locally-connected layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class LocallyConnectedLayersTest(test.TestCase):
+
+  def test_locallyconnected_1d(self):
+    num_samples = 2
+    num_steps = 8
+    input_dim = 5
+    filter_length = 3
+    filters = 4
+
+    for padding in ['valid']:
+      for strides in [1]:
+        if padding == 'same' and strides != 1:
+          continue
+
+        with self.test_session():
+          testing_utils.layer_test(
+              keras.layers.LocallyConnected1D,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': filter_length,
+                  'padding': padding,
+                  'strides': strides
+              },
+              input_shape=(num_samples, num_steps, input_dim))
+
+  def test_locallyconnected_1d_regularization(self):
+    num_samples = 2
+    num_steps = 8
+    input_dim = 5
+    filter_length = 3
+    filters = 4
+    kwargs = {
+        'filters': filters,
+        'kernel_size': filter_length,
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+    }
+
+    with self.test_session():
+      layer = keras.layers.LocallyConnected1D(**kwargs)
+      layer.build((num_samples, num_steps, input_dim))
+      self.assertEqual(len(layer.losses), 2)
+      layer(
+          keras.backend.variable(np.ones((num_samples, num_steps, input_dim))))
+      self.assertEqual(len(layer.losses), 3)
+
+    kwargs = {
+        'filters': filters,
+        'kernel_size': filter_length,
+        'kernel_constraint': 'max_norm',
+        'bias_constraint': 'max_norm',
+    }
+    with self.test_session():
+      layer = keras.layers.LocallyConnected1D(**kwargs)
+      layer.build((num_samples, num_steps, input_dim))
+      self.assertEqual(len(layer.constraints), 2)
+
+  def test_locallyconnected_2d(self):
+    num_samples = 8
+    filters = 3
+    stack_size = 4
+    num_row = 6
+    num_col = 10
+
+    for padding in ['valid']:
+      for strides in [(1, 1), (2, 2)]:
+        if padding == 'same' and strides != (1, 1):
+          continue
+
+        with self.test_session():
+          testing_utils.layer_test(
+              keras.layers.LocallyConnected2D,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': 3,
+                  'padding': padding,
+                  'kernel_regularizer': 'l2',
+                  'bias_regularizer': 'l2',
+                  'activity_regularizer': 'l2',
+                  'strides': strides,
+                  'data_format': 'channels_last'
+              },
+              input_shape=(num_samples, num_row, num_col, stack_size))
+
+  def test_locallyconnected_2d_channels_first(self):
+    num_samples = 8
+    filters = 3
+    stack_size = 4
+    num_row = 6
+    num_col = 10
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.LocallyConnected2D,
+          kwargs={
+              'filters': filters,
+              'kernel_size': 3,
+              'data_format': 'channels_first'
+          },
+          input_shape=(num_samples, num_row, num_col, stack_size))
+
+  def test_locallyconnected_2d_regularization(self):
+    num_samples = 8
+    filters = 3
+    stack_size = 4
+    num_row = 6
+    num_col = 10
+    kwargs = {
+        'filters': filters,
+        'kernel_size': 3,
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+    }
+    with self.test_session():
+      layer = keras.layers.LocallyConnected2D(**kwargs)
+      layer.build((num_samples, num_row, num_col, stack_size))
+      self.assertEqual(len(layer.losses), 2)
+      layer(
+          keras.backend.variable(
+              np.ones((num_samples, num_row, num_col, stack_size))))
+      self.assertEqual(len(layer.losses), 3)
+
+    kwargs = {
+        'filters': filters,
+        'kernel_size': 3,
+        'kernel_constraint': 'max_norm',
+        'bias_constraint': 'max_norm',
+    }
+    with self.test_session():
+      layer = keras.layers.LocallyConnected2D(**kwargs)
+      layer.build((num_samples, num_row, num_col, stack_size))
+      self.assertEqual(len(layer.constraints), 2)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/lstm_test.py b/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
new file mode 100644
index 0000000000..0e1d148bd8
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
@@ -0,0 +1,194 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for LSTM layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class LSTMLayerTest(test.TestCase):
+
+  def test_return_sequences_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.LSTM,
+          kwargs={'units': units,
+                  'return_sequences': True},
+          input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_dynamic_behavior_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      layer = keras.layers.LSTM(units, input_shape=(None, embedding_dim))
+      model = keras.models.Sequential()
+      model.add(layer)
+      model.compile('sgd', 'mse')
+      x = np.random.random((num_samples, timesteps, embedding_dim))
+      y = np.random.random((num_samples, units))
+      model.train_on_batch(x, y)
+
+  def test_dropout_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.LSTM,
+          kwargs={'units': units,
+                  'dropout': 0.1,
+                  'recurrent_dropout': 0.1},
+          input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_implementation_mode_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      for mode in [0, 1, 2]:
+        testing_utils.layer_test(
+            keras.layers.LSTM,
+            kwargs={'units': units,
+                    'implementation': mode},
+            input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_statefulness_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = keras.layers.LSTM
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Embedding(
+              4,
+              embedding_dim,
+              mask_zero=True,
+              input_length=timesteps,
+              batch_input_shape=(num_samples, timesteps)))
+      layer = layer_class(
+          units, return_sequences=False, stateful=True, weights=None)
+      model.add(layer)
+      model.compile(optimizer='sgd', loss='mse')
+      out1 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertEqual(out1.shape, (num_samples, units))
+
+      # train once so that the states change
+      model.train_on_batch(
+          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+      out2 = model.predict(np.ones((num_samples, timesteps)))
+
+      # if the state is not reset, output should be different
+      self.assertNotEqual(out1.max(), out2.max())
+
+      # check that output changes after states are reset
+      # (even though the model itself didn't change)
+      layer.reset_states()
+      out3 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out2.max(), out3.max())
+
+      # check that container-level reset_states() works
+      model.reset_states()
+      out4 = model.predict(np.ones((num_samples, timesteps)))
+      np.testing.assert_allclose(out3, out4, atol=1e-5)
+
+      # check that the call to `predict` updated the states
+      out5 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out4.max(), out5.max())
+
+      # Check masking
+      layer.reset_states()
+
+      left_padded_input = np.ones((num_samples, timesteps))
+      left_padded_input[0, :1] = 0
+      left_padded_input[1, :2] = 0
+      out6 = model.predict(left_padded_input)
+
+      layer.reset_states()
+
+      right_padded_input = np.ones((num_samples, timesteps))
+      right_padded_input[0, -1:] = 0
+      right_padded_input[1, -2:] = 0
+      out7 = model.predict(right_padded_input)
+
+      np.testing.assert_allclose(out7, out6, atol=1e-5)
+
+  def test_regularization_LSTM(self):
+    embedding_dim = 4
+    layer_class = keras.layers.LSTM
+    with self.test_session():
+      layer = layer_class(
+          5,
+          return_sequences=False,
+          weights=None,
+          input_shape=(None, embedding_dim),
+          kernel_regularizer=keras.regularizers.l1(0.01),
+          recurrent_regularizer=keras.regularizers.l1(0.01),
+          bias_regularizer='l2',
+          activity_regularizer='l1')
+      layer.build((None, None, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((2, 3, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+      layer = layer_class(
+          5,
+          return_sequences=False,
+          weights=None,
+          input_shape=(None, embedding_dim),
+          kernel_constraint=keras.constraints.max_norm(0.01),
+          recurrent_constraint=keras.constraints.max_norm(0.01),
+          bias_constraint='max_norm')
+      layer.build((None, None, embedding_dim))
+      self.assertEqual(len(layer.constraints), 3)
+
+  def test_with_masking_layer_LSTM(self):
+    layer_class = keras.layers.LSTM
+    with self.test_session():
+      inputs = np.random.random((2, 3, 4))
+      targets = np.abs(np.random.random((2, 3, 5)))
+      targets /= targets.sum(axis=-1, keepdims=True)
+      model = keras.models.Sequential()
+      model.add(keras.layers.Masking(input_shape=(3, 4)))
+      model.add(layer_class(units=5, return_sequences=True, unroll=False))
+      model.compile(loss='categorical_crossentropy', optimizer='adam')
+      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  def test_from_config_LSTM(self):
+    layer_class = keras.layers.LSTM
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/merge.py b/tensorflow/contrib/keras/python/keras/layers/merge.py
new file mode 100644
index 0000000000..eea4313d31
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/merge.py
@@ -0,0 +1,431 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=not-callable
+# pylint: disable=redefined-builtin
+"""Layers can merge several input tensors into a single output tensor.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.engine.topology import Layer
+from tensorflow.python.framework import tensor_shape
+
+
+class _Merge(Layer):
+  """Generic merge layer for elementwise merge functions.
+
+  Used to implement `Sum`, `Average`, etc.
+
+  Arguments:
+      **kwargs: standard layer keyword arguments.
+  """
+
+  def __init__(self, **kwargs):
+    super(_Merge, self).__init__(**kwargs)
+    self.supports_masking = True
+
+  def _merge_function(self, inputs):
+    raise NotImplementedError
+
+  def build(self, input_shape):
+    # Used purely for shape validation.
+    if not isinstance(input_shape, list):
+      raise ValueError('A merge layer should be called ' 'on a list of inputs.')
+    if len(input_shape) < 2:
+      raise ValueError('A merge layer should be called '
+                       'on a list of at least 2 inputs. '
+                       'Got ' + str(len(input_shape)) + ' inputs.')
+    if all([shape is None for shape in input_shape]):
+      return
+    input_shapes = [
+        tuple(tensor_shape.TensorShape(shape).as_list())
+        for shape in input_shape
+    ]
+    # TODO(fchollet): handle shapes with None entries.
+    input_shapes_set = set(input_shapes)
+    if None in input_shapes_set:
+      input_shapes_set.remove(None)
+    if len(input_shapes_set) > 1:
+      raise ValueError('Only tensors of same shape can '
+                       'be merged by layer' + self.name +
+                       ' Got input shapes: %s' % input_shapes)
+
+  def call(self, inputs):
+    return self._merge_function(inputs)
+
+  def compute_mask(self, inputs, mask=None):
+    if mask is None:
+      return None
+    if not isinstance(mask, list):
+      raise ValueError('`mask` should be a list.')
+    if not isinstance(inputs, list):
+      raise ValueError('`inputs` should be a list.')
+    if len(mask) != len(inputs):
+      raise ValueError('The lists `inputs` and `mask` '
+                       'should have the same length.')
+    if all([m is None for m in mask]):
+      return None
+    masks = [K.expand_dims(m, 0) for m in mask if m is not None]
+    return K.all(K.concatenate(masks, axis=0), axis=0, keepdims=False)
+
+
+class Add(_Merge):
+  """Layer that adds a list of inputs.
+
+  It takes as input a list of tensors,
+  all of the same shape, and returns
+  a single tensor (also of the same shape).
+  """
+
+  def _merge_function(self, inputs):
+    output = inputs[0]
+    for i in range(1, len(inputs)):
+      output += inputs[i]
+    return output
+
+
+class Multiply(_Merge):
+  """Layer that multiplies (element-wise) a list of inputs.
+
+  It takes as input a list of tensors,
+  all of the same shape, and returns
+  a single tensor (also of the same shape).
+  """
+
+  def _merge_function(self, inputs):
+    output = inputs[0]
+    for i in range(1, len(inputs)):
+      output *= inputs[i]
+    return output
+
+
+class Average(_Merge):
+  """Layer that averages a list of inputs.
+
+  It takes as input a list of tensors,
+  all of the same shape, and returns
+  a single tensor (also of the same shape).
+  """
+
+  def _merge_function(self, inputs):
+    output = inputs[0]
+    for i in range(1, len(inputs)):
+      output += inputs[i]
+    return output / len(inputs)
+
+
+class Maximum(_Merge):
+  """Layer that computes the maximum (element-wise) a list of inputs.
+
+  It takes as input a list of tensors,
+  all of the same shape, and returns
+  a single tensor (also of the same shape).
+  """
+
+  def _merge_function(self, inputs):
+    output = inputs[0]
+    for i in range(1, len(inputs)):
+      output = K.maximum(output, inputs[i])
+    return output
+
+
+class Concatenate(_Merge):
+  """Layer that concatenates a list of inputs.
+
+  It takes as input a list of tensors,
+  all of the same shape expect for the concatenation axis,
+  and returns a single tensor, the concatenation of all inputs.
+
+  Arguments:
+      axis: Axis along which to concatenate.
+      **kwargs: standard layer keyword arguments.
+  """
+
+  def __init__(self, axis=-1, **kwargs):
+    super(Concatenate, self).__init__(**kwargs)
+    self.axis = axis
+    self.supports_masking = True
+
+  def build(self, input_shape):
+    # Used purely for shape validation.
+    if not isinstance(input_shape, list):
+      raise ValueError('`Concatenate` layer should be called '
+                       'on a list of inputs')
+    if all([shape is None for shape in input_shape]):
+      return
+    reduced_inputs_shapes = [
+        tensor_shape.TensorShape(shape).as_list() for shape in input_shape
+    ]
+    shape_set = set()
+    for i in range(len(reduced_inputs_shapes)):
+      del reduced_inputs_shapes[i][self.axis]
+      shape_set.add(tuple(reduced_inputs_shapes[i]))
+    if len(shape_set) > 1:
+      raise ValueError('`Concatenate` layer requires '
+                       'inputs with matching shapes '
+                       'except for the concat axis. '
+                       'Got inputs shapes: %s' % (input_shape))
+
+  def call(self, inputs):
+    if not isinstance(inputs, list):
+      raise ValueError('A `Concatenate` layer should be called '
+                       'on a list of inputs.')
+    return K.concatenate(inputs, axis=self.axis)
+
+  def _compute_output_shape(self, input_shape):
+    if not isinstance(input_shape, list):
+      raise ValueError('A `Concatenate` layer should be called '
+                       'on a list of inputs.')
+    input_shapes = input_shape
+    output_shape = tensor_shape.TensorShape(input_shapes[0]).as_list()
+    for shape in input_shapes[1:]:
+      shape = tensor_shape.TensorShape(shape).as_list()
+      if output_shape[self.axis] is None or shape[self.axis] is None:
+        output_shape[self.axis] = None
+        break
+      output_shape[self.axis] += shape[self.axis]
+    return tensor_shape.TensorShape(output_shape)
+
+  def compute_mask(self, inputs, mask=None):
+    if mask is None:
+      return None
+    if not isinstance(mask, list):
+      raise ValueError('`mask` should be a list.')
+    if not isinstance(inputs, list):
+      raise ValueError('`inputs` should be a list.')
+    if len(mask) != len(inputs):
+      raise ValueError('The lists `inputs` and `mask` '
+                       'should have the same length.')
+    if all([m is None for m in mask]):
+      return None
+    # Make a list of masks while making sure
+    # the dimensionality of each mask
+    # is the same as the corresponding input.
+    masks = []
+    for input_i, mask_i in zip(inputs, mask):
+      if mask_i is None:
+        # Input is unmasked. Append all 1s to masks,
+        # but cast it to uint8 first
+        masks.append(K.cast(K.ones_like(input_i), 'uint8'))
+      elif K.ndim(mask_i) < K.ndim(input_i):
+        # Mask is smaller than the input, expand it
+        masks.append(K.expand_dims(mask_i))
+      else:
+        masks.append(mask_i)
+    concatenated = K.concatenate(masks, axis=self.axis)
+    return K.all(concatenated, axis=-1, keepdims=False)
+
+  def get_config(self):
+    config = {
+        'axis': self.axis,
+    }
+    base_config = super(Concatenate, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Dot(_Merge):
+  """Layer that computes a dot product between samples in two tensors.
+
+  E.g. if applied to two tensors `a` and `b` of shape `(batch_size, n)`,
+  the output will be a tensor of shape `(batch_size, 1)`
+  where each entry `i` will be the dot product between
+  `a[i]` and `b[i]`.
+
+  Arguments:
+      axes: Integer or tuple of integers,
+          axis or axes along which to take the dot product.
+      normalize: Whether to L2-normalize samples along the
+          dot product axis before taking the dot product.
+          If set to True, then the output of the dot product
+          is the cosine proximity between the two samples.
+      **kwargs: Standard layer keyword arguments.
+  """
+
+  def __init__(self, axes, normalize=False, **kwargs):
+    super(Dot, self).__init__(**kwargs)
+    if not isinstance(axes, int):
+      if not isinstance(axes, (list, tuple)):
+        raise TypeError('Invalid type for `axes` - '
+                        'should be a list or an int.')
+      if len(axes) != 2:
+        raise ValueError('Invalid format for `axes` - '
+                         'should contain two elements.')
+      if not isinstance(axes[0], int) or not isinstance(axes[1], int):
+        raise ValueError('Invalid format for `axes` - '
+                         'list elements should be "int".')
+    self.axes = axes
+    self.normalize = normalize
+    self.supports_masking = True
+
+  def build(self, input_shape):
+    # Used purely for shape validation.
+    if not isinstance(input_shape, list) or len(input_shape) != 2:
+      raise ValueError('A `Dot` layer should be called '
+                       'on a list of 2 inputs.')
+    shape1 = tensor_shape.TensorShape(input_shape[0]).as_list()
+    shape2 = tensor_shape.TensorShape(input_shape[1]).as_list()
+    if shape1 is None or shape2 is None:
+      return
+    if isinstance(self.axes, int):
+      if self.axes < 0:
+        axes = [self.axes % len(shape1), self.axes % len(shape2)]
+      else:
+        axes = [self.axes] * 2
+    else:
+      axes = self.axes
+    if shape1[axes[0]] != shape2[axes[1]]:
+      raise ValueError('Dimension incompatibility '
+                       '%s != %s. ' % (shape1[axes[0]], shape2[axes[1]]) +
+                       'Layer shapes: %s, %s' % (shape1, shape2))
+
+  def call(self, inputs):
+    x1 = inputs[0]
+    x2 = inputs[1]
+    if isinstance(self.axes, int):
+      if self.axes < 0:
+        axes = [self.axes % K.ndim(x1), self.axes % K.ndim(x2)]
+      else:
+        axes = [self.axes] * 2
+    else:
+      axes = []
+      for i in range(len(self.axes)):
+        if self.axes[i] < 0:
+          axes.append(self.axes[i] % K.ndim(inputs[i]))
+        else:
+          axes.append(self.axes[i])
+    if self.normalize:
+      x1 = K.l2_normalize(x1, axis=axes[0])
+      x2 = K.l2_normalize(x2, axis=axes[1])
+    output = K.batch_dot(x1, x2, axes)
+    return output
+
+  def _compute_output_shape(self, input_shape):
+    if not isinstance(input_shape, list) or len(input_shape) != 2:
+      raise ValueError('A `Dot` layer should be called '
+                       'on a list of 2 inputs.')
+    shape1 = tensor_shape.TensorShape(input_shape[0]).as_list()
+    shape2 = tensor_shape.TensorShape(input_shape[1]).as_list()
+    if isinstance(self.axes, int):
+      if self.axes < 0:
+        axes = [self.axes % len(shape1), self.axes % len(shape2)]
+      else:
+        axes = [self.axes] * 2
+    else:
+      axes = self.axes
+    shape1.pop(axes[0])
+    shape2.pop(axes[1])
+    shape2.pop(0)
+    output_shape = shape1 + shape2
+    if len(output_shape) == 1:
+      output_shape += [1]
+    return tensor_shape.TensorShape(output_shape)
+
+  def compute_mask(self, inputs, mask=None):
+    return None
+
+  def get_config(self):
+    config = {
+        'axes': self.axes,
+        'normalize': self.normalize,
+    }
+    base_config = super(Dot, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+def add(inputs, **kwargs):
+  """Functional interface to the `Add` layer.
+
+  Arguments:
+      inputs: A list of input tensors (at least 2).
+      **kwargs: Standard layer keyword arguments.
+
+  Returns:
+      A tensor, the sum of the inputs.
+  """
+  return Add(**kwargs)(inputs)
+
+
+def multiply(inputs, **kwargs):
+  """Functional interface to the `Multiply` layer.
+
+  Arguments:
+      inputs: A list of input tensors (at least 2).
+      **kwargs: Standard layer keyword arguments.
+
+  Returns:
+      A tensor, the element-wise product of the inputs.
+  """
+  return Multiply(**kwargs)(inputs)
+
+
+def average(inputs, **kwargs):
+  """Functional interface to the `Average` layer.
+
+  Arguments:
+      inputs: A list of input tensors (at least 2).
+      **kwargs: Standard layer keyword arguments.
+
+  Returns:
+      A tensor, the average of the inputs.
+  """
+  return Average(**kwargs)(inputs)
+
+
+def maximum(inputs, **kwargs):
+  """Functional interface to the `Maximum` layer.
+
+  Arguments:
+      inputs: A list of input tensors (at least 2).
+      **kwargs: Standard layer keyword arguments.
+
+  Returns:
+      A tensor, the element-wise maximum of the inputs.
+  """
+  return Maximum(**kwargs)(inputs)
+
+
+def concatenate(inputs, axis=-1, **kwargs):
+  """Functional interface to the `Concatenate` layer.
+
+  Arguments:
+      inputs: A list of input tensors (at least 2).
+      axis: Concatenation axis.
+      **kwargs: Standard layer keyword arguments.
+
+  Returns:
+      A tensor, the concatenation of the inputs alongside axis `axis`.
+  """
+  return Concatenate(axis=axis, **kwargs)(inputs)
+
+
+def dot(inputs, axes, normalize=False, **kwargs):
+  """Functional interface to the `Dot` layer.
+
+  Arguments:
+      inputs: A list of input tensors (at least 2).
+      axes: Integer or tuple of integers,
+          axis or axes along which to take the dot product.
+      normalize: Whether to L2-normalize samples along the
+          dot product axis before taking the dot product.
+          If set to True, then the output of the dot product
+          is the cosine proximity between the two samples.
+      **kwargs: Standard layer keyword arguments.
+
+  Returns:
+      A tensor, the dot product of the samples from the inputs.
+  """
+  return Dot(axes=axes, normalize=normalize, **kwargs)(inputs)
diff --git a/tensorflow/contrib/keras/python/keras/layers/merge_test.py b/tensorflow/contrib/keras/python/keras/layers/merge_test.py
new file mode 100644
index 0000000000..2887fb851b
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/merge_test.py
@@ -0,0 +1,178 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for merge layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class MergeLayersTest(test.TestCase):
+
+  def test_merge_add(self):
+    with self.test_session():
+      i1 = keras.layers.Input(shape=(4, 5))
+      i2 = keras.layers.Input(shape=(4, 5))
+      i3 = keras.layers.Input(shape=(4, 5))
+
+      o = keras.layers.add([i1, i2, i3])
+      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+      model = keras.models.Model([i1, i2, i3], o)
+
+      x1 = np.random.random((2, 4, 5))
+      x2 = np.random.random((2, 4, 5))
+      x3 = np.random.random((2, 4, 5))
+      out = model.predict([x1, x2, x3])
+      self.assertEqual(out.shape, (2, 4, 5))
+      self.assertAllClose(out, x1 + x2 + x3, atol=1e-4)
+
+      # test masking
+      i1 = keras.layers.Input(shape=(4, 5))
+      i2 = keras.layers.Input(shape=(4, 5))
+      m1 = keras.layers.Masking()(i1)
+      layer = keras.layers.Add()
+      o = layer([m1, i2])
+      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+      mask = layer.output_mask
+      self.assertListEqual(mask.get_shape().as_list(), [None, 4])
+
+  def test_merge_elementwise_errors(self):
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 6))
+    with self.assertRaises(ValueError):
+      keras.layers.add([i1, i2])
+    with self.assertRaises(ValueError):
+      keras.layers.add(i1)
+    with self.assertRaises(ValueError):
+      keras.layers.add([i1])
+
+  def test_merge_multiply(self):
+    with self.test_session():
+      i1 = keras.layers.Input(shape=(4, 5))
+      i2 = keras.layers.Input(shape=(4, 5))
+      i3 = keras.layers.Input(shape=(4, 5))
+      o = keras.layers.multiply([i1, i2, i3])
+      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+      model = keras.models.Model([i1, i2, i3], o)
+
+      x1 = np.random.random((2, 4, 5))
+      x2 = np.random.random((2, 4, 5))
+      x3 = np.random.random((2, 4, 5))
+      out = model.predict([x1, x2, x3])
+      self.assertEqual(out.shape, (2, 4, 5))
+      self.assertAllClose(out, x1 * x2 * x3, atol=1e-4)
+
+  def test_merge_average(self):
+    with self.test_session():
+      i1 = keras.layers.Input(shape=(4, 5))
+      i2 = keras.layers.Input(shape=(4, 5))
+      o = keras.layers.average([i1, i2])
+      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+      model = keras.models.Model([i1, i2], o)
+
+      x1 = np.random.random((2, 4, 5))
+      x2 = np.random.random((2, 4, 5))
+      out = model.predict([x1, x2])
+      self.assertEqual(out.shape, (2, 4, 5))
+      self.assertAllClose(out, 0.5 * (x1 + x2), atol=1e-4)
+
+  def test_merge_maximum(self):
+    with self.test_session():
+      i1 = keras.layers.Input(shape=(4, 5))
+      i2 = keras.layers.Input(shape=(4, 5))
+      o = keras.layers.maximum([i1, i2])
+      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+      model = keras.models.Model([i1, i2], o)
+
+      x1 = np.random.random((2, 4, 5))
+      x2 = np.random.random((2, 4, 5))
+      out = model.predict([x1, x2])
+      self.assertEqual(out.shape, (2, 4, 5))
+      self.assertAllClose(out, np.maximum(x1, x2), atol=1e-4)
+
+  def test_merge_concatenate(self):
+    with self.test_session():
+      i1 = keras.layers.Input(shape=(4, 5))
+      i2 = keras.layers.Input(shape=(4, 5))
+      o = keras.layers.concatenate([i1, i2], axis=1)
+      self.assertListEqual(o.get_shape().as_list(), [None, 8, 5])
+      model = keras.models.Model([i1, i2], o)
+
+      x1 = np.random.random((2, 4, 5))
+      x2 = np.random.random((2, 4, 5))
+      out = model.predict([x1, x2])
+      self.assertEqual(out.shape, (2, 8, 5))
+      self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4)
+
+  def test_concatenate_errors(self):
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(3, 5))
+    with self.assertRaises(ValueError):
+      keras.layers.concatenate([i1, i2], axis=-1)
+    with self.assertRaises(ValueError):
+      keras.layers.concatenate(i1, axis=-1)
+    with self.assertRaises(ValueError):
+      keras.layers.concatenate([i1], axis=-1)
+
+  def test_merge_dot(self):
+    with self.test_session():
+      i1 = keras.layers.Input(shape=(4,))
+      i2 = keras.layers.Input(shape=(4,))
+      o = keras.layers.dot([i1, i2], axes=1)
+      self.assertListEqual(o.get_shape().as_list(), [None, 1])
+      model = keras.models.Model([i1, i2], o)
+
+      x1 = np.random.random((2, 4))
+      x2 = np.random.random((2, 4))
+      out = model.predict([x1, x2])
+      self.assertEqual(out.shape, (2, 1))
+      expected = np.zeros((2, 1))
+      expected[0, 0] = np.dot(x1[0], x2[0])
+      expected[1, 0] = np.dot(x1[1], x2[1])
+      self.assertAllClose(out, expected, atol=1e-4)
+
+      # Test with negative tuple of axes.
+      o = keras.layers.dot([i1, i2], axes=(-1, -1))
+      self.assertListEqual(o.get_shape().as_list(), [None, 1])
+      model = keras.models.Model([i1, i2], o)
+      out = model.predict([x1, x2])
+      self.assertEqual(out.shape, (2, 1))
+      self.assertAllClose(out, expected, atol=1e-4)
+
+      # test _compute_output_shape
+      layer = keras.layers.Dot(axes=-1)
+      self.assertEqual(layer._compute_output_shape([(4, 5), (4, 5)]), (4, 1))
+
+  def test_dot_errors(self):
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 6))
+    i3 = keras.layers.Input(shape=(4, 6))
+    with self.assertRaises(ValueError):
+      keras.layers.dot([i1, i2], axes=-1)
+    with self.assertRaises(ValueError):
+      keras.layers.dot(i1, axes=-1)
+    with self.assertRaises(ValueError):
+      keras.layers.dot([i1], axes=-1)
+    with self.assertRaises(ValueError):
+      keras.layers.dot([i1, i2, i3], axes=-1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/noise.py b/tensorflow/contrib/keras/python/keras/layers/noise.py
new file mode 100644
index 0000000000..adc88a4fce
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/noise.py
@@ -0,0 +1,111 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Layers for regularization models via the addition of noise.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.engine import Layer
+
+
+class GaussianNoise(Layer):
+  """Apply additive zero-centered Gaussian noise.
+
+  This is useful to mitigate overfitting
+  (you could see it as a form of random data augmentation).
+  Gaussian Noise (GS) is a natural choice as corruption process
+  for real valued inputs.
+
+  As it is a regularization layer, it is only active at training time.
+
+  Arguments:
+      stddev: float, standard deviation of the noise distribution.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as input.
+  """
+
+  def __init__(self, stddev, **kwargs):
+    super(GaussianNoise, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.stddev = stddev
+
+  def call(self, inputs, training=None):
+
+    def noised():
+      return inputs + K.random_normal(
+          shape=K.shape(inputs), mean=0., stddev=self.stddev)
+
+    return K.in_train_phase(noised, inputs, training=training)
+
+  def get_config(self):
+    config = {'stddev': self.stddev}
+    base_config = super(GaussianNoise, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class GaussianDropout(Layer):
+  """Apply multiplicative 1-centered Gaussian noise.
+
+  As it is a regularization layer, it is only active at training time.
+
+  Arguments:
+      rate: float, drop probability (as with `Dropout`).
+          The multiplicative noise will have
+          standard deviation `sqrt(rate / (1 - rate))`.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as input.
+
+  References:
+      - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting
+        Srivastava, Hinton, et al.
+        2014](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)
+  """
+
+  def __init__(self, rate, **kwargs):
+    super(GaussianDropout, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.rate = rate
+
+  def call(self, inputs, training=None):
+    if 0 < self.rate < 1:
+
+      def noised():
+        stddev = np.sqrt(self.rate / (1.0 - self.rate))
+        return inputs * K.random_normal(
+            shape=K.shape(inputs), mean=1.0, stddev=stddev)
+
+      return K.in_train_phase(noised, inputs, training=training)
+    return inputs
+
+  def get_config(self):
+    config = {'rate': self.rate}
+    base_config = super(GaussianDropout, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/noise_test.py b/tensorflow/contrib/keras/python/keras/layers/noise_test.py
new file mode 100644
index 0000000000..b0257b167a
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/noise_test.py
@@ -0,0 +1,44 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for noise layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class NoiseLayersTest(test.TestCase):
+
+  def test_GaussianNoise(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.GaussianNoise,
+          kwargs={'stddev': 1.},
+          input_shape=(3, 2, 3))
+
+  def test_GaussianDropout(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.GaussianDropout,
+          kwargs={'rate': 0.5},
+          input_shape=(3, 2, 3))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/normalization.py b/tensorflow/contrib/keras/python/keras/layers/normalization.py
new file mode 100644
index 0000000000..41c618cc79
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/normalization.py
@@ -0,0 +1,233 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Normalization layers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.python.framework import tensor_shape
+
+
+class BatchNormalization(Layer):
+  """Batch normalization layer (Ioffe and Szegedy, 2014).
+
+  Normalize the activations of the previous layer at each batch,
+  i.e. applies a transformation that maintains the mean activation
+  close to 0 and the activation standard deviation close to 1.
+
+  Arguments:
+      axis: Integer, the axis that should be normalized
+          (typically the features axis).
+          For instance, after a `Conv2D` layer with
+          `data_format="channels_first"`,
+          set `axis=1` in `BatchNormalization`.
+      momentum: Momentum for the moving average.
+      epsilon: Small float added to variance to avoid dividing by zero.
+      center: If True, add offset of `beta` to normalized tensor.
+          If False, `beta` is ignored.
+      scale: If True, multiply by `gamma`.
+          If False, `gamma` is not used.
+          When the next layer is linear (also e.g. `nn.relu`),
+          this can be disabled since the scaling
+          will be done by the next layer.
+      beta_initializer: Initializer for the beta weight.
+      gamma_initializer: Initializer for the gamma weight.
+      moving_mean_initializer: Initializer for the moving mean.
+      moving_variance_initializer: Initializer for the moving variance.
+      beta_regularizer: Optional regularizer for the beta weight.
+      gamma_regularizer: Optional regularizer for the gamma weight.
+      beta_constraint: Optional constraint for the beta weight.
+      gamma_constraint: Optional constraint for the gamma weight.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as input.
+
+  References:
+      - [Batch Normalization: Accelerating Deep Network Training by Reducing
+        Internal Covariate Shift](https://arxiv.org/abs/1502.03167)
+  """
+
+  def __init__(self,
+               axis=-1,
+               momentum=0.99,
+               epsilon=1e-3,
+               center=True,
+               scale=True,
+               beta_initializer='zeros',
+               gamma_initializer='ones',
+               moving_mean_initializer='zeros',
+               moving_variance_initializer='ones',
+               beta_regularizer=None,
+               gamma_regularizer=None,
+               beta_constraint=None,
+               gamma_constraint=None,
+               **kwargs):
+    super(BatchNormalization, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.axis = axis
+    self.momentum = momentum
+    self.epsilon = epsilon
+    self.center = center
+    self.scale = scale
+    self.beta_initializer = initializers.get(beta_initializer)
+    self.gamma_initializer = initializers.get(gamma_initializer)
+    self.moving_mean_initializer = initializers.get(moving_mean_initializer)
+    self.moving_variance_initializer = initializers.get(
+        moving_variance_initializer)
+    self.beta_regularizer = regularizers.get(beta_regularizer)
+    self.gamma_regularizer = regularizers.get(gamma_regularizer)
+    self.beta_constraint = constraints.get(beta_constraint)
+    self.gamma_constraint = constraints.get(gamma_constraint)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    dim = input_shape[self.axis]
+    if dim is None:
+      raise ValueError('Axis ' + str(self.axis) + ' of '
+                       'input tensor should have a defined dimension '
+                       'but the layer received an input with shape ' + str(
+                           input_shape) + '.')
+    self.input_spec = InputSpec(ndim=len(input_shape), axes={self.axis: dim})
+    shape = (dim,)
+
+    if self.scale:
+      self.gamma = self.add_weight(
+          shape,
+          name='gamma',
+          initializer=self.gamma_initializer,
+          regularizer=self.gamma_regularizer,
+          constraint=self.gamma_constraint)
+    else:
+      self.gamma = None
+    if self.center:
+      self.beta = self.add_weight(
+          shape,
+          name='beta',
+          initializer=self.beta_initializer,
+          regularizer=self.beta_regularizer,
+          constraint=self.beta_constraint)
+    else:
+      self.beta = None
+    self.moving_mean = self.add_weight(
+        shape,
+        name='moving_mean',
+        initializer=self.moving_mean_initializer,
+        trainable=False)
+    self.moving_variance = self.add_weight(
+        shape,
+        name='moving_variance',
+        initializer=self.moving_variance_initializer,
+        trainable=False)
+    self.built = True
+
+  def call(self, inputs, training=None):
+    input_shape = inputs.get_shape().as_list()
+    # Prepare broadcasting shape.
+    ndim = len(input_shape)
+    reduction_axes = list(range(len(input_shape)))
+    del reduction_axes[self.axis]
+    broadcast_shape = [1] * len(input_shape)
+    broadcast_shape[self.axis] = input_shape[self.axis]
+
+    # Determines whether broadcasting is needed.
+    needs_broadcasting = (sorted(reduction_axes) != range(ndim)[:-1])
+
+    normed, mean, variance = K.normalize_batch_in_training(
+        inputs, self.gamma, self.beta, reduction_axes, epsilon=self.epsilon)
+
+    if training in {0, False}:
+      return normed
+    else:
+      self.add_update([
+          K.moving_average_update(self.moving_mean, mean, self.momentum),
+          K.moving_average_update(self.moving_variance, variance, self.momentum)
+      ], inputs)
+
+      def normalize_inference():
+        if needs_broadcasting:
+          # In this case we must explictly broadcast all parameters.
+          broadcast_moving_mean = K.reshape(self.moving_mean, broadcast_shape)
+          broadcast_moving_variance = K.reshape(self.moving_variance,
+                                                broadcast_shape)
+          if self.center:
+            broadcast_beta = K.reshape(self.beta, broadcast_shape)
+          else:
+            broadcast_beta = None
+          if self.scale:
+            broadcast_gamma = K.reshape(self.gamma, broadcast_shape)
+          else:
+            broadcast_gamma = None
+          return K.batch_normalization(
+              inputs,
+              broadcast_moving_mean,
+              broadcast_moving_variance,
+              broadcast_beta,
+              broadcast_gamma,
+              epsilon=self.epsilon)
+        else:
+          return K.batch_normalization(
+              inputs,
+              self.moving_mean,
+              self.moving_variance,
+              self.beta,
+              self.gamma,
+              epsilon=self.epsilon)
+
+    # Pick the normalized form corresponding to the training phase.
+    return K.in_train_phase(normed, normalize_inference, training=training)
+
+  def get_config(self):
+    config = {
+        'axis':
+            self.axis,
+        'momentum':
+            self.momentum,
+        'epsilon':
+            self.epsilon,
+        'center':
+            self.center,
+        'scale':
+            self.scale,
+        'beta_initializer':
+            initializers.serialize(self.beta_initializer),
+        'gamma_initializer':
+            initializers.serialize(self.gamma_initializer),
+        'moving_mean_initializer':
+            initializers.serialize(self.moving_mean_initializer),
+        'moving_variance_initializer':
+            initializers.serialize(self.moving_variance_initializer),
+        'beta_regularizer':
+            regularizers.serialize(self.beta_regularizer),
+        'gamma_regularizer':
+            regularizers.serialize(self.gamma_regularizer),
+        'beta_constraint':
+            constraints.serialize(self.beta_constraint),
+        'gamma_constraint':
+            constraints.serialize(self.gamma_constraint)
+    }
+    base_config = super(BatchNormalization, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/normalization_test.py b/tensorflow/contrib/keras/python/keras/layers/normalization_test.py
new file mode 100644
index 0000000000..51e23b8494
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/normalization_test.py
@@ -0,0 +1,142 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for normalization layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class NoiseLayersTest(test.TestCase):
+
+  def basic_batchnorm_test(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.BatchNormalization,
+          kwargs={
+              'momentum': 0.9,
+              'epsilon': 0.1,
+              'gamma_regularizer': keras.regularizers.l2(0.01),
+              'beta_regularizer': keras.regularizers.l2(0.01)
+          },
+          input_shape=(3, 4, 2))
+      testing_utils.layer_test(
+          keras.layers.BatchNormalization,
+          kwargs={
+              'gamma_initializer': 'ones',
+              'beta_initializer': 'ones',
+              'moving_mean_initializer': 'zeros',
+              'moving_variance_initializer': 'ones'
+          },
+          input_shape=(3, 4, 2))
+      testing_utils.layer_test(
+          keras.layers.BatchNormalization,
+          kwargs={'scale': False,
+                  'center': False},
+          input_shape=(3, 3))
+
+  def batchnorm_weights_test(self):
+    with self.test_session():
+      layer = keras.layers.BatchNormalization(scale=False, center=False)
+      layer.build((None, 3, 4))
+      self.assertEqual(len(layer.trainable_weights), 0)
+      self.assertEqual(len(layer.weights), 2)
+
+      layer = keras.layers.BatchNormalization()
+      layer.build((None, 3, 4))
+      self.assertEqual(len(layer.trainable_weights), 2)
+      self.assertEqual(len(layer.weights), 4)
+
+  def batchnorm_regularization_test(self):
+    with self.test_session():
+      layer = keras.layers.BatchNormalization(
+          gamma_regularizer='l1', beta_regularizer='l1')
+      layer.build((None, 3, 4))
+      self.assertEqual(len(layer.losses), 2)
+      layer = keras.layers.BatchNormalization(
+          gamma_constraint='l1', beta_constraint='l1')
+      layer.build((None, 3, 4))
+      self.assertEqual(len(layer.constraints), 2)
+
+  def test_batchnorm_correctness(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
+      model.add(norm)
+      model.compile(loss='mse', optimizer='sgd')
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
+      model.fit(x, x, epochs=4, verbose=0)
+      out = model.predict(x)
+      out -= keras.backend.eval(norm.beta)
+      out /= keras.backend.eval(norm.gamma)
+
+      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
+  def test_batchnorm_convnet(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      norm = keras.layers.BatchNormalization(
+          axis=1, input_shape=(3, 4, 4), momentum=0.8)
+      model.add(norm)
+      model.compile(loss='mse', optimizer='sgd')
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4))
+      model.fit(x, x, epochs=4, verbose=0)
+      out = model.predict(x)
+      out -= np.reshape(keras.backend.eval(norm.beta), (1, 3, 1, 1))
+      out /= np.reshape(keras.backend.eval(norm.gamma), (1, 3, 1, 1))
+
+      np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
+      np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
+
+  def test_shared_batchnorm(self):
+    """Test that a BN layer can be shared across different data streams.
+    """
+    with self.test_session():
+      # Test single layer reuse
+      bn = keras.layers.BatchNormalization(input_shape=(10,))
+      x1 = keras.layers.Input(shape=(10,))
+      bn(x1)
+
+      x2 = keras.layers.Input(shape=(10,))
+      y2 = bn(x2)
+
+      x = np.random.normal(loc=5.0, scale=10.0, size=(2, 10))
+      model = keras.models.Model(x2, y2)
+      assert len(model.updates) == 2
+      model.compile('sgd', 'mse')
+      model.train_on_batch(x, x)
+
+      # Test model-level reuse
+      x3 = keras.layers.Input(shape=(10,))
+      y3 = model(x3)
+      new_model = keras.models.Model(x3, y3)
+      assert len(model.updates) == 2
+      new_model.compile('sgd', 'mse')
+      new_model.train_on_batch(x, x)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/pooling.py b/tensorflow/contrib/keras/python/keras/layers/pooling.py
new file mode 100644
index 0000000000..e31caed3ec
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/pooling.py
@@ -0,0 +1,715 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Pooling layers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.contrib.keras.python.keras.utils import conv_utils
+from tensorflow.python.framework import tensor_shape
+
+
+class _Pooling1D(Layer):
+  """Abstract class for different pooling 1D layers.
+  """
+
+  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
+    super(_Pooling1D, self).__init__(**kwargs)
+    if strides is None:
+      strides = pool_size
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 1, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.input_spec = InputSpec(ndim=3)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    length = conv_utils.conv_output_length(input_shape[1], self.pool_size[0],
+                                           self.padding, self.strides[0])
+    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
+
+  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
+    raise NotImplementedError
+
+  def call(self, inputs):
+    inputs = K.expand_dims(inputs, 2)  # add dummy last dimension
+    output = self._pooling_function(
+        inputs=inputs,
+        pool_size=self.pool_size + (1,),
+        strides=self.strides + (1,),
+        padding=self.padding,
+        data_format='channels_last')
+    return K.squeeze(output, 2)  # remove dummy last dimension
+
+  def get_config(self):
+    config = {
+        'strides': self.strides,
+        'pool_size': self.pool_size,
+        'padding': self.padding
+    }
+    base_config = super(_Pooling1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class MaxPooling1D(_Pooling1D):
+  """Max pooling operation for temporal data.
+
+  Arguments:
+      pool_size: Integer, size of the max pooling windows.
+      strides: Integer, or None. Factor by which to downscale.
+          E.g. 2 will halve the input.
+          If None, it will default to `pool_size`.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+
+  Input shape:
+      3D tensor with shape: `(batch_size, steps, features)`.
+
+  Output shape:
+      3D tensor with shape: `(batch_size, downsampled_steps, features)`.
+  """
+
+  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
+    super(MaxPooling1D, self).__init__(pool_size, strides, padding, **kwargs)
+
+  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
+    output = K.pool2d(
+        inputs, pool_size, strides, padding, data_format, pool_mode='max')
+    return output
+
+
+class AveragePooling1D(_Pooling1D):
+  """Average pooling for temporal data.
+
+  Arguments:
+      pool_size: Integer, size of the max pooling windows.
+      strides: Integer, or None. Factor by which to downscale.
+          E.g. 2 will halve the input.
+          If None, it will default to `pool_size`.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+
+  Input shape:
+      3D tensor with shape: `(batch_size, steps, features)`.
+
+  Output shape:
+      3D tensor with shape: `(batch_size, downsampled_steps, features)`.
+  """
+
+  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
+    super(AveragePooling1D, self).__init__(pool_size, strides, padding,
+                                           **kwargs)
+
+  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
+    output = K.pool2d(
+        inputs, pool_size, strides, padding, data_format, pool_mode='avg')
+    return output
+
+
+class _Pooling2D(Layer):
+  """Abstract class for different pooling 2D layers.
+  """
+
+  def __init__(self,
+               pool_size=(2, 2),
+               strides=None,
+               padding='valid',
+               data_format=None,
+               **kwargs):
+    super(_Pooling2D, self).__init__(**kwargs)
+    data_format = conv_utils.normalize_data_format(data_format)
+    if strides is None:
+      strides = pool_size
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 2, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=4)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+    else:
+      rows = input_shape[1]
+      cols = input_shape[2]
+    rows = conv_utils.conv_output_length(rows, self.pool_size[0], self.padding,
+                                         self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.pool_size[1], self.padding,
+                                         self.strides[1])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], rows, cols])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], rows, cols, input_shape[3]])
+
+  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
+    raise NotImplementedError
+
+  def call(self, inputs):
+    output = self._pooling_function(
+        inputs=inputs,
+        pool_size=self.pool_size,
+        strides=self.strides,
+        padding=self.padding,
+        data_format=self.data_format)
+    return output
+
+  def get_config(self):
+    config = {
+        'pool_size': self.pool_size,
+        'padding': self.padding,
+        'strides': self.strides,
+        'data_format': self.data_format
+    }
+    base_config = super(_Pooling2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class MaxPooling2D(_Pooling2D):
+  """Max pooling operation for spatial data.
+
+  Arguments:
+      pool_size: integer or tuple of 2 integers,
+          factors by which to downscale (vertical, horizontal).
+          (2, 2) will halve the input in both spatial dimension.
+          If only one integer is specified, the same window length
+          will be used for both dimensions.
+      strides: Integer, tuple of 2 integers, or None.
+          Strides values.
+          If None, it will default to `pool_size`.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      - If `data_format='channels_last'`:
+          4D tensor with shape:
+          `(batch_size, rows, cols, channels)`
+      - If `data_format='channels_first'`:
+          4D tensor with shape:
+          `(batch_size, channels, rows, cols)`
+
+  Output shape:
+      - If `data_format='channels_last'`:
+          4D tensor with shape:
+          `(batch_size, pooled_rows, pooled_cols, channels)`
+      - If `data_format='channels_first'`:
+          4D tensor with shape:
+          `(batch_size, channels, pooled_rows, pooled_cols)`
+  """
+
+  def __init__(self,
+               pool_size=(2, 2),
+               strides=None,
+               padding='valid',
+               data_format=None,
+               **kwargs):
+    super(MaxPooling2D, self).__init__(pool_size, strides, padding, data_format,
+                                       **kwargs)
+
+  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
+    output = K.pool2d(
+        inputs, pool_size, strides, padding, data_format, pool_mode='max')
+    return output
+
+
+class AveragePooling2D(_Pooling2D):
+  """Average pooling operation for spatial data.
+
+  Arguments:
+      pool_size: integer or tuple of 2 integers,
+          factors by which to downscale (vertical, horizontal).
+          (2, 2) will halve the input in both spatial dimension.
+          If only one integer is specified, the same window length
+          will be used for both dimensions.
+      strides: Integer, tuple of 2 integers, or None.
+          Strides values.
+          If None, it will default to `pool_size`.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      - If `data_format='channels_last'`:
+          4D tensor with shape:
+          `(batch_size, rows, cols, channels)`
+      - If `data_format='channels_first'`:
+          4D tensor with shape:
+          `(batch_size, channels, rows, cols)`
+
+  Output shape:
+      - If `data_format='channels_last'`:
+          4D tensor with shape:
+          `(batch_size, pooled_rows, pooled_cols, channels)`
+      - If `data_format='channels_first'`:
+          4D tensor with shape:
+          `(batch_size, channels, pooled_rows, pooled_cols)`
+  """
+
+  def __init__(self,
+               pool_size=(2, 2),
+               strides=None,
+               padding='valid',
+               data_format=None,
+               **kwargs):
+    super(AveragePooling2D, self).__init__(pool_size, strides, padding,
+                                           data_format, **kwargs)
+
+  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
+    output = K.pool2d(
+        inputs, pool_size, strides, padding, data_format, pool_mode='avg')
+    return output
+
+
+class _Pooling3D(Layer):
+  """Abstract class for different pooling 3D layers.
+  """
+
+  def __init__(self,
+               pool_size=(2, 2, 2),
+               strides=None,
+               padding='valid',
+               data_format=None,
+               **kwargs):
+    super(_Pooling3D, self).__init__(**kwargs)
+    if strides is None:
+      strides = pool_size
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 3, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 3, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=5)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      len_dim1 = input_shape[2]
+      len_dim2 = input_shape[3]
+      len_dim3 = input_shape[4]
+    else:
+      len_dim1 = input_shape[1]
+      len_dim2 = input_shape[2]
+      len_dim3 = input_shape[3]
+    len_dim1 = conv_utils.conv_output_length(len_dim1, self.pool_size[0],
+                                             self.padding, self.strides[0])
+    len_dim2 = conv_utils.conv_output_length(len_dim2, self.pool_size[1],
+                                             self.padding, self.strides[1])
+    len_dim3 = conv_utils.conv_output_length(len_dim3, self.pool_size[2],
+                                             self.padding, self.strides[2])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
+
+  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
+    raise NotImplementedError
+
+  def call(self, inputs):
+    output = self._pooling_function(
+        inputs=inputs,
+        pool_size=self.pool_size,
+        strides=self.strides,
+        padding=self.padding,
+        data_format=self.data_format)
+    return output
+
+  def get_config(self):
+    config = {
+        'pool_size': self.pool_size,
+        'padding': self.padding,
+        'strides': self.strides,
+        'data_format': self.data_format
+    }
+    base_config = super(_Pooling3D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class MaxPooling3D(_Pooling3D):
+  """Max pooling operation for 3D data (spatial or spatio-temporal).
+
+  Arguments:
+      pool_size: tuple of 3 integers,
+          factors by which to downscale (dim1, dim2, dim3).
+          (2, 2, 2) will halve the size of the 3D input in each dimension.
+      strides: tuple of 3 integers, or None. Strides values.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+          while `channels_first` corresponds to inputs with shape
+          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      - If `data_format='channels_last'`:
+          5D tensor with shape:
+          `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      - If `data_format='channels_first'`:
+          5D tensor with shape:
+          `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
+
+  Output shape:
+      - If `data_format='channels_last'`:
+          5D tensor with shape:
+          `(batch_size, pooled_dim1, pooled_dim2, pooled_dim3, channels)`
+      - If `data_format='channels_first'`:
+          5D tensor with shape:
+          `(batch_size, channels, pooled_dim1, pooled_dim2, pooled_dim3)`
+  """
+
+  def __init__(self,
+               pool_size=(2, 2, 2),
+               strides=None,
+               padding='valid',
+               data_format=None,
+               **kwargs):
+    super(MaxPooling3D, self).__init__(pool_size, strides, padding, data_format,
+                                       **kwargs)
+
+  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
+    output = K.pool3d(
+        inputs, pool_size, strides, padding, data_format, pool_mode='max')
+    return output
+
+
+class AveragePooling3D(_Pooling3D):
+  """Average pooling operation for 3D data (spatial or spatio-temporal).
+
+  Arguments:
+      pool_size: tuple of 3 integers,
+          factors by which to downscale (dim1, dim2, dim3).
+          (2, 2, 2) will halve the size of the 3D input in each dimension.
+      strides: tuple of 3 integers, or None. Strides values.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+          while `channels_first` corresponds to inputs with shape
+          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      - If `data_format='channels_last'`:
+          5D tensor with shape:
+          `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      - If `data_format='channels_first'`:
+          5D tensor with shape:
+          `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
+
+  Output shape:
+      - If `data_format='channels_last'`:
+          5D tensor with shape:
+          `(batch_size, pooled_dim1, pooled_dim2, pooled_dim3, channels)`
+      - If `data_format='channels_first'`:
+          5D tensor with shape:
+          `(batch_size, channels, pooled_dim1, pooled_dim2, pooled_dim3)`
+  """
+
+  def __init__(self,
+               pool_size=(2, 2, 2),
+               strides=None,
+               padding='valid',
+               data_format=None,
+               **kwargs):
+    super(AveragePooling3D, self).__init__(pool_size, strides, padding,
+                                           data_format, **kwargs)
+
+  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
+    output = K.pool3d(
+        inputs, pool_size, strides, padding, data_format, pool_mode='avg')
+    return output
+
+
+class _GlobalPooling1D(Layer):
+  """Abstract class for different global pooling 1D layers.
+  """
+
+  def __init__(self, **kwargs):
+    super(_GlobalPooling1D, self).__init__(**kwargs)
+    self.input_spec = InputSpec(ndim=3)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    return tensor_shape.TensorShape([input_shape[0], input_shape[2]])
+
+  def call(self, inputs):
+    raise NotImplementedError
+
+
+class GlobalAveragePooling1D(_GlobalPooling1D):
+  """Global average pooling operation for temporal data.
+
+  Input shape:
+      3D tensor with shape: `(batch_size, steps, features)`.
+
+  Output shape:
+      2D tensor with shape:
+      `(batch_size, channels)`
+  """
+
+  def call(self, inputs):
+    return K.mean(inputs, axis=1)
+
+
+class GlobalMaxPooling1D(_GlobalPooling1D):
+  """Global max pooling operation for temporal data.
+
+  Input shape:
+      3D tensor with shape: `(batch_size, steps, features)`.
+
+  Output shape:
+      2D tensor with shape:
+      `(batch_size, channels)`
+  """
+
+  def call(self, inputs):
+    return K.max(inputs, axis=1)
+
+
+class _GlobalPooling2D(Layer):
+  """Abstract class for different global pooling 2D layers.
+  """
+
+  def __init__(self, data_format=None, **kwargs):
+    super(_GlobalPooling2D, self).__init__(**kwargs)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=4)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_last':
+      return tensor_shape.TensorShape([input_shape[0], input_shape[3]])
+    else:
+      return tensor_shape.TensorShape([input_shape[0], input_shape[1]])
+
+  def call(self, inputs):
+    raise NotImplementedError
+
+  def get_config(self):
+    config = {'data_format': self.data_format}
+    base_config = super(_GlobalPooling2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class GlobalAveragePooling2D(_GlobalPooling2D):
+  """Global average pooling operation for spatial data.
+
+  Arguments:
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      - If `data_format='channels_last'`:
+          4D tensor with shape:
+          `(batch_size, rows, cols, channels)`
+      - If `data_format='channels_first'`:
+          4D tensor with shape:
+          `(batch_size, channels, rows, cols)`
+
+  Output shape:
+      2D tensor with shape:
+      `(batch_size, channels)`
+  """
+
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      return K.mean(inputs, axis=[1, 2])
+    else:
+      return K.mean(inputs, axis=[2, 3])
+
+
+class GlobalMaxPooling2D(_GlobalPooling2D):
+  """Global max pooling operation for spatial data.
+
+  Arguments:
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, width, height, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, width, height)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      - If `data_format='channels_last'`:
+          4D tensor with shape:
+          `(batch_size, rows, cols, channels)`
+      - If `data_format='channels_first'`:
+          4D tensor with shape:
+          `(batch_size, channels, rows, cols)`
+
+  Output shape:
+      2D tensor with shape:
+      `(batch_size, channels)`
+  """
+
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      return K.max(inputs, axis=[1, 2])
+    else:
+      return K.max(inputs, axis=[2, 3])
+
+
+class _GlobalPooling3D(Layer):
+  """Abstract class for different global pooling 3D layers.
+  """
+
+  def __init__(self, data_format=None, **kwargs):
+    super(_GlobalPooling3D, self).__init__(**kwargs)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=5)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_last':
+      return tensor_shape.TensorShape([input_shape[0], input_shape[4]])
+    else:
+      return tensor_shape.TensorShape([input_shape[0], input_shape[1]])
+
+  def call(self, inputs):
+    raise NotImplementedError
+
+  def get_config(self):
+    config = {'data_format': self.data_format}
+    base_config = super(_GlobalPooling3D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class GlobalAveragePooling3D(_GlobalPooling3D):
+  """Global Average pooling operation for 3D data.
+
+  Arguments:
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+          while `channels_first` corresponds to inputs with shape
+          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      - If `data_format='channels_last'`:
+          5D tensor with shape:
+          `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      - If `data_format='channels_first'`:
+          5D tensor with shape:
+          `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
+
+  Output shape:
+      2D tensor with shape:
+      `(batch_size, channels)`
+  """
+
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      return K.mean(inputs, axis=[1, 2, 3])
+    else:
+      return K.mean(inputs, axis=[2, 3, 4])
+
+
+class GlobalMaxPooling3D(_GlobalPooling3D):
+  """Global Max pooling operation for 3D data.
+
+  Arguments:
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+          while `channels_first` corresponds to inputs with shape
+          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
+  Input shape:
+      - If `data_format='channels_last'`:
+          5D tensor with shape:
+          `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      - If `data_format='channels_first'`:
+          5D tensor with shape:
+          `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
+
+  Output shape:
+      2D tensor with shape:
+      `(batch_size, channels)`
+  """
+
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      return K.max(inputs, axis=[1, 2, 3])
+    else:
+      return K.max(inputs, axis=[2, 3, 4])
+
+
+# Aliases
+
+AvgPool1D = AveragePooling1D
+MaxPool1D = MaxPooling1D
+AvgPool2D = AveragePooling2D
+MaxPool2D = MaxPooling2D
+AvgPool3D = AveragePooling3D
+MaxPool3D = MaxPooling3D
+GlobalMaxPool1D = GlobalMaxPooling1D
+GlobalMaxPool2D = GlobalMaxPooling2D
+GlobalMaxPool3D = GlobalMaxPooling3D
+GlobalAvgPool1D = GlobalAveragePooling1D
+GlobalAvgPool2D = GlobalAveragePooling2D
+GlobalAvgPool3D = GlobalAveragePooling3D
diff --git a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
new file mode 100644
index 0000000000..6eb6deff60
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
@@ -0,0 +1,179 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for pooling layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class GlobalPoolingTest(test.TestCase):
+
+  def test_globalpooling_1d(self):
+    with self.test_session():
+      testing_utils.layer_test(keras.layers.pooling.GlobalMaxPooling1D,
+                               input_shape=(3, 4, 5))
+      testing_utils.layer_test(
+          keras.layers.pooling.GlobalAveragePooling1D, input_shape=(3, 4, 5))
+
+  def test_globalpooling_2d(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.pooling.GlobalMaxPooling2D,
+          kwargs={'data_format': 'channels_first'},
+          input_shape=(3, 4, 5, 6))
+      testing_utils.layer_test(
+          keras.layers.pooling.GlobalMaxPooling2D,
+          kwargs={'data_format': 'channels_last'},
+          input_shape=(3, 5, 6, 4))
+      testing_utils.layer_test(
+          keras.layers.pooling.GlobalAveragePooling2D,
+          kwargs={'data_format': 'channels_first'},
+          input_shape=(3, 4, 5, 6))
+      testing_utils.layer_test(
+          keras.layers.pooling.GlobalAveragePooling2D,
+          kwargs={'data_format': 'channels_last'},
+          input_shape=(3, 5, 6, 4))
+
+  def test_globalpooling_3d(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.pooling.GlobalMaxPooling3D,
+          kwargs={'data_format': 'channels_first'},
+          input_shape=(3, 4, 3, 4, 3))
+      testing_utils.layer_test(
+          keras.layers.pooling.GlobalMaxPooling3D,
+          kwargs={'data_format': 'channels_last'},
+          input_shape=(3, 4, 3, 4, 3))
+      testing_utils.layer_test(
+          keras.layers.pooling.GlobalAveragePooling3D,
+          kwargs={'data_format': 'channels_first'},
+          input_shape=(3, 4, 3, 4, 3))
+      testing_utils.layer_test(
+          keras.layers.pooling.GlobalAveragePooling3D,
+          kwargs={'data_format': 'channels_last'},
+          input_shape=(3, 4, 3, 4, 3))
+
+
+class Pooling2DTest(test.TestCase):
+
+  def test_maxpooling_2d(self):
+    pool_size = (3, 3)
+    with self.test_session():
+      for strides in [(1, 1), (2, 2)]:
+        testing_utils.layer_test(
+            keras.layers.MaxPooling2D,
+            kwargs={
+                'strides': strides,
+                'padding': 'valid',
+                'pool_size': pool_size
+            },
+            input_shape=(3, 5, 6, 4))
+
+  def test_averagepooling_2d(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.AveragePooling2D,
+          kwargs={'strides': (2, 2),
+                  'padding': 'same',
+                  'pool_size': (2, 2)},
+          input_shape=(3, 5, 6, 4))
+      testing_utils.layer_test(
+          keras.layers.AveragePooling2D,
+          kwargs={'strides': (2, 2),
+                  'padding': 'valid',
+                  'pool_size': (3, 3)},
+          input_shape=(3, 5, 6, 4))
+      testing_utils.layer_test(
+          keras.layers.AveragePooling2D,
+          kwargs={
+              'strides': (1, 1),
+              'padding': 'valid',
+              'pool_size': (2, 2),
+              'data_format': 'channels_first'
+          },
+          input_shape=(3, 4, 5, 6))
+
+
+class Pooling3DTest(test.TestCase):
+
+  def test_maxpooling_3d(self):
+    pool_size = (3, 3, 3)
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.MaxPooling3D,
+          kwargs={'strides': 2,
+                  'padding': 'valid',
+                  'pool_size': pool_size},
+          input_shape=(3, 11, 12, 10, 4))
+      testing_utils.layer_test(
+          keras.layers.MaxPooling3D,
+          kwargs={
+              'strides': 3,
+              'padding': 'valid',
+              'data_format': 'channels_first',
+              'pool_size': pool_size
+          },
+          input_shape=(3, 4, 11, 12, 10))
+
+  def test_averagepooling_3d(self):
+    pool_size = (3, 3, 3)
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.AveragePooling3D,
+          kwargs={'strides': 2,
+                  'padding': 'valid',
+                  'pool_size': pool_size},
+          input_shape=(3, 11, 12, 10, 4))
+      testing_utils.layer_test(
+          keras.layers.AveragePooling3D,
+          kwargs={
+              'strides': 3,
+              'padding': 'valid',
+              'data_format': 'channels_first',
+              'pool_size': pool_size
+          },
+          input_shape=(3, 4, 11, 12, 10))
+
+
+class Pooling1DTest(test.TestCase):
+
+  def test_maxpooling_1d(self):
+    with self.test_session():
+      for padding in ['valid', 'same']:
+        for stride in [1, 2]:
+          testing_utils.layer_test(
+              keras.layers.MaxPooling1D,
+              kwargs={'strides': stride,
+                      'padding': padding},
+              input_shape=(3, 5, 4))
+
+  def test_averagepooling_1d(self):
+    with self.test_session():
+      for padding in ['valid', 'same']:
+        for stride in [1, 2]:
+          testing_utils.layer_test(
+              keras.layers.AveragePooling1D,
+              kwargs={'strides': stride,
+                      'padding': padding},
+              input_shape=(3, 5, 4))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/recurrent.py b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
new file mode 100644
index 0000000000..06986d3eaa
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
@@ -0,0 +1,1249 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Recurrent layers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras import activations
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.python.framework import tensor_shape
+
+
+# pylint: disable=access-member-before-definition
+
+
+def _time_distributed_dense(x,
+                            w,
+                            b=None,
+                            dropout=None,
+                            input_dim=None,
+                            output_dim=None,
+                            timesteps=None,
+                            training=None):
+  """Apply `y . w + b` for every temporal slice y of x.
+
+  Arguments:
+      x: input tensor.
+      w: weight matrix.
+      b: optional bias vector.
+      dropout: wether to apply dropout (same dropout mask
+          for every temporal slice of the input).
+      input_dim: integer; optional dimensionality of the input.
+      output_dim: integer; optional dimensionality of the output.
+      timesteps: integer; optional number of timesteps.
+      training: training phase tensor or boolean.
+
+  Returns:
+      Output tensor.
+  """
+  if not input_dim:
+    input_dim = K.shape(x)[2]
+  if not timesteps:
+    timesteps = K.shape(x)[1]
+  if not output_dim:
+    output_dim = K.shape(w)[1]
+
+  if dropout is not None and 0. < dropout < 1.:
+    # apply the same dropout pattern at every timestep
+    ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
+    dropout_matrix = K.dropout(ones, dropout)
+    expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
+    x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)
+
+  # collapse time dimension and batch dimension together
+  x = K.reshape(x, (-1, input_dim))
+  x = K.dot(x, w)
+  if b is not None:
+    x = K.bias_add(x, b)
+  # reshape to 3D tensor
+  if K.backend() == 'tensorflow':
+    x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
+    x.set_shape([None, None, output_dim])
+  else:
+    x = K.reshape(x, (-1, timesteps, output_dim))
+  return x
+
+
+class Recurrent(Layer):
+  """Abstract base class for recurrent layers.
+
+  Do not use in a model -- it's not a valid layer!
+  Use its children classes `LSTM`, `GRU` and `SimpleRNN` instead.
+
+  All recurrent layers (`LSTM`, `GRU`, `SimpleRNN`) also
+  follow the specifications of this class and accept
+  the keyword arguments listed below.
+
+  Example:
+
+  ```python
+      # as the first layer in a Sequential model
+      model = Sequential()
+      model.add(LSTM(32, input_shape=(10, 64)))
+      # now model.output_shape == (None, 32)
+      # note: `None` is the batch dimension.
+
+      # for subsequent layers, not need to specify the input size:
+      model.add(LSTM(16))
+  ```
+
+  Arguments:
+      weights: list of Numpy arrays to set as initial weights.
+          The list should have 3 elements, of shapes:
+          `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`.
+      return_sequences: Boolean. Whether to return the last output
+          in the output sequence, or the full sequence.
+      go_backwards: Boolean (default False).
+          If True, process the input sequence backwards.
+      stateful: Boolean (default False). If True, the last state
+          for each sample at index i in a batch will be used as initial
+          state for the sample of index i in the following batch.
+      unroll: Boolean (default False).
+          If True, the network will be unrolled,
+          else a symbolic loop will be used.
+          Unrolling can speed-up a RNN,
+          although it tends to be more memory-intensive.
+          Unrolling is only suitable for short sequences.
+      implementation: one of {0, 1, or 2}.
+          If set to 0, the RNN will use
+          an implementation that uses fewer, larger matrix products,
+          thus running faster on CPU but consuming more memory.
+          If set to 1, the RNN will use more matrix products,
+          but smaller ones, thus running slower
+          (may actually be faster on GPU) while consuming less memory.
+          If set to 2 (LSTM/GRU only),
+          the RNN will combine the input gate,
+          the forget gate and the output gate into a single matrix,
+          enabling more time-efficient parallelization on the GPU.
+          Note: RNN dropout must be shared for all gates,
+          resulting in a slightly reduced regularization.
+      input_dim: dimensionality of the input (integer).
+          This argument (or alternatively, the keyword argument `input_shape`)
+          is required when using this layer as the first layer in a model.
+      input_length: Length of input sequences, to be specified
+          when it is constant.
+          This argument is required if you are going to connect
+          `Flatten` then `Dense` layers upstream
+          (without it, the shape of the dense outputs cannot be computed).
+          Note that if the recurrent layer is not the first layer
+          in your model, you would need to specify the input length
+          at the level of the first layer
+          (e.g. via the `input_shape` argument)
+
+  Input shape:s
+      3D tensor with shape `(batch_size, timesteps, input_dim)`,
+      (Optional) 2D tensors with shape `(batch_size, output_dim)`.
+
+  Output shape:
+      - if `return_sequences`: 3D tensor with shape
+          `(batch_size, timesteps, units)`.
+      - else, 2D tensor with shape `(batch_size, units)`.
+
+  # Masking
+      This layer supports masking for input data with a variable number
+      of timesteps. To introduce masks to your data,
+      use an `Embedding` layer with the `mask_zero` parameter
+      set to `True`.
+
+  # Note on using statefulness in RNNs
+      You can set RNN layers to be 'stateful', which means that the states
+      computed for the samples in one batch will be reused as initial states
+      for the samples in the next batch. This assumes a one-to-one mapping
+      between samples in different successive batches.
+
+      To enable statefulness:
+          - specify `stateful=True` in the layer constructor.
+          - specify a fixed batch size for your model, by passing
+              if sequential model:
+                `batch_input_shape=(...)` to the first layer in your model.
+              else for functional model with 1 or more Input layers:
+                `batch_shape=(...)` to all the first layers in your model.
+              This is the expected shape of your inputs
+              *including the batch size*.
+              It should be a tuple of integers, e.g. `(32, 10, 100)`.
+          - specify `shuffle=False` when calling fit().
+
+      To reset the states of your model, call `.reset_states()` on either
+      a specific layer, or on your entire model.
+
+  # Note on specifying initial states in RNNs
+      You can specify the initial state of RNN layers by calling them with
+      the keyword argument `initial_state`. The value of `initial_state`
+      should be a tensor or list of tensors representing the initial state
+      of the RNN layer.
+  """
+
+  def __init__(self,
+               return_sequences=False,
+               go_backwards=False,
+               stateful=False,
+               unroll=False,
+               implementation=0,
+               **kwargs):
+    super(Recurrent, self).__init__(**kwargs)
+    self.return_sequences = return_sequences
+    self.go_backwards = go_backwards
+    self.stateful = stateful
+    self.unroll = unroll
+    self.implementation = implementation
+    self.supports_masking = True
+    self.input_spec = InputSpec(ndim=3)
+    self.state_spec = None
+    self.dropout = 0
+    self.recurrent_dropout = 0
+
+  def _compute_output_shape(self, input_shape):
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.return_sequences:
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], self.units])
+    else:
+      return tensor_shape.TensorShape([input_shape[0], self.units])
+
+  def compute_mask(self, inputs, mask):
+    if self.return_sequences:
+      return mask
+    else:
+      return None
+
+  def step(self, inputs, states):
+    raise NotImplementedError
+
+  def get_constants(self, inputs, training=None):
+    return []
+
+  def get_initial_states(self, inputs):
+    # build an all-zero tensor of shape (samples, output_dim)
+    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
+    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
+    initial_state = K.expand_dims(initial_state)  # (samples, 1)
+    initial_state = K.tile(initial_state, [1,
+                                           self.units])  # (samples, output_dim)
+    initial_states = [initial_state for _ in range(len(self.states))]
+    return initial_states
+
+  def preprocess_input(self, inputs, training=None):
+    return inputs
+
+  def __call__(self, inputs, initial_state=None, **kwargs):
+    # If `initial_state` is specified,
+    # and if it a Keras tensor,
+    # then add it to the inputs and temporarily
+    # modify the input spec to include the state.
+    if initial_state is not None:
+      if hasattr(initial_state, '_keras_history'):
+        # Compute the full input spec, including state
+        input_spec = self.input_spec
+        state_spec = self.state_spec
+        if not isinstance(state_spec, list):
+          state_spec = [state_spec]
+        self.input_spec = [input_spec] + state_spec
+
+        # Compute the full inputs, including state
+        if not isinstance(initial_state, (list, tuple)):
+          initial_state = [initial_state]
+        inputs = [inputs] + list(initial_state)
+
+        # Perform the call
+        output = super(Recurrent, self).__call__(inputs, **kwargs)
+
+        # Restore original input spec
+        self.input_spec = input_spec
+        return output
+      else:
+        kwargs['initial_state'] = initial_state
+    return super(Recurrent, self).__call__(inputs, **kwargs)
+
+  def call(self, inputs, mask=None, initial_state=None, training=None):
+    # input shape: `(samples, time (padded with zeros), input_dim)`
+    # note that the .build() method of subclasses MUST define
+    # self.input_spec and self.state_spec with complete input shapes.
+    if initial_state is not None:
+      if not isinstance(initial_state, (list, tuple)):
+        initial_states = [initial_state]
+      else:
+        initial_states = list(initial_state)
+    if isinstance(inputs, list):
+      initial_states = inputs[1:]
+      inputs = inputs[0]
+    elif self.stateful:
+      initial_states = self.states
+    else:
+      initial_states = self.get_initial_states(inputs)
+
+    if len(initial_states) != len(self.states):
+      raise ValueError('Layer has ' + str(
+          len(self.states)) + ' states but was passed ' + str(
+              len(initial_states)) + ' initial states.')
+    input_shape = K.int_shape(inputs)
+    if self.unroll and input_shape[1] is None:
+      raise ValueError('Cannot unroll a RNN if the '
+                       'time dimension is undefined. \n'
+                       '- If using a Sequential model, '
+                       'specify the time dimension by passing '
+                       'an `input_shape` or `batch_input_shape` '
+                       'argument to your first layer. If your '
+                       'first layer is an Embedding, you can '
+                       'also use the `input_length` argument.\n'
+                       '- If using the functional API, specify '
+                       'the time dimension by passing a `shape` '
+                       'or `batch_shape` argument to your Input layer.')
+    constants = self.get_constants(inputs, training=None)
+    preprocessed_input = self.preprocess_input(inputs, training=None)
+    last_output, outputs, states = K.rnn(
+        self.step,
+        preprocessed_input,
+        initial_states,
+        go_backwards=self.go_backwards,
+        mask=mask,
+        constants=constants,
+        unroll=self.unroll)
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append((self.states[i], states[i]))
+      self.add_update(updates, inputs)
+
+    # Properly set learning phase
+    if 0 < self.dropout + self.recurrent_dropout:
+      last_output._uses_learning_phase = True
+      outputs._uses_learning_phase = True
+
+    if self.return_sequences:
+      return outputs
+    else:
+      return last_output
+
+  def reset_states(self, states_value=None):
+    if not self.stateful:
+      raise AttributeError('Layer must be stateful.')
+    if not self.input_spec:
+      raise RuntimeError('Layer has never been called '
+                         'and thus has no states.')
+    batch_size = self.input_spec.shape[0]
+    if not batch_size:
+      raise ValueError('If a RNN is stateful, it needs to know '
+                       'its batch size. Specify the batch size '
+                       'of your input tensors: \n'
+                       '- If using a Sequential model, '
+                       'specify the batch size by passing '
+                       'a `batch_input_shape` '
+                       'argument to your first layer.\n'
+                       '- If using the functional API, specify '
+                       'the time dimension by passing a '
+                       '`batch_shape` argument to your Input layer.')
+    if states_value is not None:
+      if not isinstance(states_value, (list, tuple)):
+        states_value = [states_value]
+      if len(states_value) != len(self.states):
+        raise ValueError('The layer has ' + str(len(self.states)) +
+                         ' states, but the `states_value` '
+                         'argument passed '
+                         'only has ' + str(len(states_value)) + ' entries')
+    if self.states[0] is None:
+      self.states = [K.zeros((batch_size, self.units)) for _ in self.states]
+      if not states_value:
+        return
+    for i, state in enumerate(self.states):
+      if states_value:
+        value = states_value[i]
+        if value.shape != (batch_size, self.units):
+          raise ValueError('Expected state #' + str(
+              i) + ' to have shape ' + str((batch_size, self.units)) +
+                           ' but got array with shape ' + str(value.shape))
+      else:
+        value = np.zeros((batch_size, self.units))
+      K.set_value(state, value)
+
+  def get_config(self):
+    config = {
+        'return_sequences': self.return_sequences,
+        'go_backwards': self.go_backwards,
+        'stateful': self.stateful,
+        'unroll': self.unroll,
+        'implementation': self.implementation
+    }
+    base_config = super(Recurrent, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class SimpleRNN(Recurrent):
+  """Fully-connected RNN where the output is to be fed back to input.
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix,
+          used for the linear transformation of the inputs..
+      recurrent_initializer: Initializer for the `recurrent_kernel`
+          weights matrix,
+          used for the linear transformation of the recurrent state..
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      recurrent_regularizer: Regularizer function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to
+          the `kernel` weights matrix.
+      recurrent_constraint: Constraint function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+      dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the inputs.
+      recurrent_dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the recurrent state.
+
+  References:
+      - [A Theoretically Grounded Application of Dropout in Recurrent Neural
+        Networks](http://arxiv.org/abs/1512.05287)
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               **kwargs):
+    super(SimpleRNN, self).__init__(**kwargs)
+    self.units = units
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.recurrent_initializer = initializers.get(recurrent_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.recurrent_constraint = constraints.get(recurrent_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+    self.dropout = min(1., max(0., dropout))
+    self.recurrent_dropout = min(1., max(0., recurrent_dropout))
+
+  def build(self, input_shape):
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+
+    batch_size = input_shape[0] if self.stateful else None
+    self.input_dim = input_shape[2]
+    self.input_spec = InputSpec(shape=(batch_size, None, self.input_dim))
+    self.state_spec = InputSpec(shape=(batch_size, self.units))
+
+    self.states = [None]
+    if self.stateful:
+      self.reset_states()
+
+    self.kernel = self.add_weight(
+        (self.input_dim, self.units),
+        name='kernel',
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+    self.recurrent_kernel = self.add_weight(
+        (self.units, self.units),
+        name='recurrent_kernel',
+        initializer=self.recurrent_initializer,
+        regularizer=self.recurrent_regularizer,
+        constraint=self.recurrent_constraint)
+    if self.use_bias:
+      self.bias = self.add_weight(
+          (self.units,),
+          name='bias',
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    self.built = True
+
+  def preprocess_input(self, inputs, training=None):
+    if self.implementation > 0:
+      return inputs
+    else:
+      input_shape = inputs.get_shape().as_list()
+      input_dim = input_shape[2]
+      timesteps = input_shape[1]
+      return _time_distributed_dense(
+          inputs,
+          self.kernel,
+          self.bias,
+          self.dropout,
+          input_dim,
+          self.units,
+          timesteps,
+          training=training)
+
+  def step(self, inputs, states):
+    if self.implementation == 0:
+      h = inputs
+    else:
+      if 0 < self.dropout < 1:
+        h = K.dot(inputs * states[1], self.kernel)
+      else:
+        h = K.dot(inputs, self.kernel)
+      if self.bias is not None:
+        h = K.bias_add(h, self.bias)
+
+    prev_output = states[0]
+    if 0 < self.recurrent_dropout < 1:
+      prev_output *= states[2]
+    output = h + K.dot(prev_output, self.recurrent_kernel)
+    if self.activation is not None:
+      output = self.activation(output)
+
+    # Properly set learning phase on output tensor.
+    if 0 < self.dropout + self.recurrent_dropout:
+      output._uses_learning_phase = True
+    return output, [output]
+
+  def get_constants(self, inputs, training=None):
+    constants = []
+    if self.implementation == 0 and 0 < self.dropout < 1:
+      input_shape = K.int_shape(inputs)
+      input_dim = input_shape[-1]
+      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
+      ones = K.tile(ones, (1, int(input_dim)))
+
+      def dropped_inputs():
+        return K.dropout(ones, self.dropout)
+
+      dp_mask = K.in_train_phase(dropped_inputs, ones, training=training)
+      constants.append(dp_mask)
+    else:
+      constants.append(K.cast_to_floatx(1.))
+
+    if 0 < self.recurrent_dropout < 1:
+      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
+      ones = K.tile(ones, (1, self.units))
+
+      def dropped_inputs():  # pylint: disable=function-redefined
+        return K.dropout(ones, self.recurrent_dropout)
+
+      rec_dp_mask = K.in_train_phase(dropped_inputs, ones, training=training)
+      constants.append(rec_dp_mask)
+    else:
+      constants.append(K.cast_to_floatx(1.))
+    return constants
+
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout
+    }
+    base_config = super(SimpleRNN, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class GRU(Recurrent):
+  """Gated Recurrent Unit - Cho et al.
+
+  2014.
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      recurrent_activation: Activation function to use
+          for the recurrent step.
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix,
+          used for the linear transformation of the inputs..
+      recurrent_initializer: Initializer for the `recurrent_kernel`
+          weights matrix,
+          used for the linear transformation of the recurrent state..
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      recurrent_regularizer: Regularizer function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to
+          the `kernel` weights matrix.
+      recurrent_constraint: Constraint function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+      dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the inputs.
+      recurrent_dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the recurrent state.
+
+  References:
+      - [On the Properties of Neural Machine Translation: Encoder-Decoder
+        Approaches](https://arxiv.org/abs/1409.1259)
+      - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+        Modeling](http://arxiv.org/abs/1412.3555v1)
+      - [A Theoretically Grounded Application of Dropout in Recurrent Neural
+        Networks](http://arxiv.org/abs/1512.05287)
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               **kwargs):
+    super(GRU, self).__init__(**kwargs)
+    self.units = units
+    self.activation = activations.get(activation)
+    self.recurrent_activation = activations.get(recurrent_activation)
+    self.use_bias = use_bias
+
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.recurrent_initializer = initializers.get(recurrent_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.recurrent_constraint = constraints.get(recurrent_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+    self.dropout = min(1., max(0., dropout))
+    self.recurrent_dropout = min(1., max(0., recurrent_dropout))
+
+  def build(self, input_shape):
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    self.input_spec = InputSpec(shape=input_shape)
+    batch_size = input_shape[0] if self.stateful else None
+    self.input_dim = input_shape[2]
+    self.input_spec = InputSpec(shape=(batch_size, None, self.input_dim))
+    self.state_spec = InputSpec(shape=(batch_size, self.units))
+
+    self.states = [None]
+    if self.stateful:
+      self.reset_states()
+
+    self.kernel = self.add_weight(
+        (self.input_dim, self.units * 3),
+        name='kernel',
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+    self.recurrent_kernel = self.add_weight(
+        (self.units, self.units * 3),
+        name='recurrent_kernel',
+        initializer=self.recurrent_initializer,
+        regularizer=self.recurrent_regularizer,
+        constraint=self.recurrent_constraint)
+
+    if self.use_bias:
+      self.bias = self.add_weight(
+          (self.units * 3,),
+          name='bias',
+          initializer='zero',
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+    else:
+      self.bias = None
+
+    self.kernel_z = self.kernel[:, :self.units]
+    self.recurrent_kernel_z = self.recurrent_kernel[:, :self.units]
+    self.kernel_r = self.kernel[:, self.units:self.units * 2]
+    self.recurrent_kernel_r = self.recurrent_kernel[:, self.units:self.units *
+                                                    2]
+    self.kernel_h = self.kernel[:, self.units * 2:]
+    self.recurrent_kernel_h = self.recurrent_kernel[:, self.units * 2:]
+
+    if self.use_bias:
+      self.bias_z = self.bias[:self.units]
+      self.bias_r = self.bias[self.units:self.units * 2]
+      self.bias_h = self.bias[self.units * 2:]
+    else:
+      self.bias_z = None
+      self.bias_r = None
+      self.bias_h = None
+    self.built = True
+
+  def preprocess_input(self, inputs, training=None):
+    if self.implementation == 0:
+      input_shape = inputs.get_shape().as_list()
+      input_dim = input_shape[2]
+      timesteps = input_shape[1]
+
+      x_z = _time_distributed_dense(
+          inputs,
+          self.kernel_z,
+          self.bias_z,
+          self.dropout,
+          input_dim,
+          self.units,
+          timesteps,
+          training=training)
+      x_r = _time_distributed_dense(
+          inputs,
+          self.kernel_r,
+          self.bias_r,
+          self.dropout,
+          input_dim,
+          self.units,
+          timesteps,
+          training=training)
+      x_h = _time_distributed_dense(
+          inputs,
+          self.kernel_h,
+          self.bias_h,
+          self.dropout,
+          input_dim,
+          self.units,
+          timesteps,
+          training=training)
+      return K.concatenate([x_z, x_r, x_h], axis=2)
+    else:
+      return inputs
+
+  def get_constants(self, inputs, training=None):
+    constants = []
+    if self.implementation == 0 and 0 < self.dropout < 1:
+      input_shape = K.int_shape(inputs)
+      input_dim = input_shape[-1]
+      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
+      ones = K.tile(ones, (1, int(input_dim)))
+
+      def dropped_inputs():
+        return K.dropout(ones, self.dropout)
+
+      dp_mask = [
+          K.in_train_phase(dropped_inputs, ones, training=training)
+          for _ in range(3)
+      ]
+      constants.append(dp_mask)
+    else:
+      constants.append([K.cast_to_floatx(1.) for _ in range(3)])
+
+    if 0 < self.recurrent_dropout < 1:
+      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
+      ones = K.tile(ones, (1, self.units))
+
+      def dropped_inputs():  # pylint: disable=function-redefined
+        return K.dropout(ones, self.recurrent_dropout)
+
+      rec_dp_mask = [
+          K.in_train_phase(dropped_inputs, ones, training=training)
+          for _ in range(3)
+      ]
+      constants.append(rec_dp_mask)
+    else:
+      constants.append([K.cast_to_floatx(1.) for _ in range(3)])
+    return constants
+
+  def step(self, inputs, states):
+    h_tm1 = states[0]  # previous memory
+    dp_mask = states[1]  # dropout matrices for recurrent units
+    rec_dp_mask = states[2]
+
+    if self.implementation == 2:
+      matrix_x = K.dot(inputs * dp_mask[0], self.kernel)
+      if self.use_bias:
+        matrix_x = K.bias_add(matrix_x, self.bias)
+      matrix_inner = K.dot(h_tm1 * rec_dp_mask[0],
+                           self.recurrent_kernel[:, :2 * self.units])
+
+      x_z = matrix_x[:, :self.units]
+      x_r = matrix_x[:, self.units:2 * self.units]
+      recurrent_z = matrix_inner[:, :self.units]
+      recurrent_r = matrix_inner[:, self.units:2 * self.units]
+
+      z = self.recurrent_activation(x_z + recurrent_z)
+      r = self.recurrent_activation(x_r + recurrent_r)
+
+      x_h = matrix_x[:, 2 * self.units:]
+      recurrent_h = K.dot(r * h_tm1 * rec_dp_mask[0],
+                          self.recurrent_kernel[:, 2 * self.units:])
+      hh = self.activation(x_h + recurrent_h)
+    else:
+      if self.implementation == 0:
+        x_z = inputs[:, :self.units]
+        x_r = inputs[:, self.units:2 * self.units]
+        x_h = inputs[:, 2 * self.units:]
+      elif self.implementation == 1:
+        x_z = K.dot(inputs * dp_mask[0], self.kernel_z)
+        x_r = K.dot(inputs * dp_mask[1], self.kernel_r)
+        x_h = K.dot(inputs * dp_mask[2], self.kernel_h)
+        if self.use_bias:
+          x_z = K.bias_add(x_z, self.bias_z)
+          x_r = K.bias_add(x_r, self.bias_r)
+          x_h = K.bias_add(x_r, self.bias_h)
+      else:
+        raise ValueError('Unknown `implementation` mode.')
+      z = self.recurrent_activation(x_z + K.dot(h_tm1 * rec_dp_mask[0],
+                                                self.recurrent_kernel_z))
+      r = self.recurrent_activation(x_r + K.dot(h_tm1 * rec_dp_mask[1],
+                                                self.recurrent_kernel_r))
+
+      hh = self.activation(x_h + K.dot(r * h_tm1 * rec_dp_mask[2],
+                                       self.recurrent_kernel_h))
+    h = z * h_tm1 + (1 - z) * hh
+    if 0 < self.dropout + self.recurrent_dropout:
+      h._uses_learning_phase = True
+    return h, [h]
+
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'recurrent_activation':
+            activations.serialize(self.recurrent_activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout
+    }
+    base_config = super(GRU, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class LSTM(Recurrent):
+  """Long-Short Term Memory unit - Hochreiter 1997.
+
+  For a step-by-step description of the algorithm, see
+  [this tutorial](http://deeplearning.net/tutorial/lstm.html).
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      activation: Activation function to use.
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      recurrent_activation: Activation function to use
+          for the recurrent step.
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix,
+          used for the linear transformation of the inputs..
+      recurrent_initializer: Initializer for the `recurrent_kernel`
+          weights matrix,
+          used for the linear transformation of the recurrent state..
+      bias_initializer: Initializer for the bias vector.
+      unit_forget_bias: Boolean.
+          If True, add 1 to the bias of the forget gate at initialization.
+          Setting it to true will also force `bias_initializer="zeros"`.
+          This is recommended in [Jozefowicz et
+            al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      recurrent_regularizer: Regularizer function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to
+          the `kernel` weights matrix.
+      recurrent_constraint: Constraint function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+      dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the inputs.
+      recurrent_dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the recurrent state.
+
+  References:
+      - [Long short-term
+        memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf)
+        (original 1997 paper)
+      - [Supervised sequence labeling with recurrent neural
+        networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
+      - [A Theoretically Grounded Application of Dropout in Recurrent Neural
+        Networks](http://arxiv.org/abs/1512.05287)
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               **kwargs):
+    super(LSTM, self).__init__(**kwargs)
+    self.units = units
+    self.activation = activations.get(activation)
+    self.recurrent_activation = activations.get(recurrent_activation)
+    self.use_bias = use_bias
+
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.recurrent_initializer = initializers.get(recurrent_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.unit_forget_bias = unit_forget_bias
+
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.recurrent_constraint = constraints.get(recurrent_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+    self.dropout = min(1., max(0., dropout))
+    self.recurrent_dropout = min(1., max(0., recurrent_dropout))
+
+  def build(self, input_shape):
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    self.input_spec = InputSpec(shape=input_shape)
+    batch_size = input_shape[0] if self.stateful else None
+    self.input_dim = input_shape[2]
+    self.input_spec = InputSpec(shape=(batch_size, None, self.input_dim))
+    self.state_spec = [
+        InputSpec(shape=(batch_size, self.units)), InputSpec(
+            shape=(batch_size, self.units))
+    ]
+
+    self.states = [None, None]
+    if self.stateful:
+      self.reset_states()
+
+    self.kernel = self.add_weight(
+        (self.input_dim, self.units * 4),
+        name='kernel',
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint)
+    self.recurrent_kernel = self.add_weight(
+        (self.units, self.units * 4),
+        name='recurrent_kernel',
+        initializer=self.recurrent_initializer,
+        regularizer=self.recurrent_regularizer,
+        constraint=self.recurrent_constraint)
+
+    if self.use_bias:
+      self.bias = self.add_weight(
+          (self.units * 4,),
+          name='bias',
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+      if self.unit_forget_bias:
+        bias_value = np.zeros((self.units * 4,))
+        bias_value[self.units:self.units * 2] = 1.
+        K.set_value(self.bias, bias_value)
+    else:
+      self.bias = None
+
+    self.kernel_i = self.kernel[:, :self.units]
+    self.kernel_f = self.kernel[:, self.units:self.units * 2]
+    self.kernel_c = self.kernel[:, self.units * 2:self.units * 3]
+    self.kernel_o = self.kernel[:, self.units * 3:]
+
+    self.recurrent_kernel_i = self.recurrent_kernel[:, :self.units]
+    self.recurrent_kernel_f = self.recurrent_kernel[:, self.units:self.units *
+                                                    2]
+    self.recurrent_kernel_c = self.recurrent_kernel[:, self.units * 2:self.units
+                                                    * 3]
+    self.recurrent_kernel_o = self.recurrent_kernel[:, self.units * 3:]
+
+    if self.use_bias:
+      self.bias_i = self.bias[:self.units]
+      self.bias_f = self.bias[self.units:self.units * 2]
+      self.bias_c = self.bias[self.units * 2:self.units * 3]
+      self.bias_o = self.bias[self.units * 3:]
+    else:
+      self.bias_i = None
+      self.bias_f = None
+      self.bias_c = None
+      self.bias_o = None
+    self.built = True
+
+  def preprocess_input(self, inputs, training=None):
+    if self.implementation == 0:
+      input_shape = inputs.get_shape().as_list()
+      input_dim = input_shape[2]
+      timesteps = input_shape[1]
+
+      x_i = _time_distributed_dense(
+          inputs,
+          self.kernel_i,
+          self.bias_i,
+          self.dropout,
+          input_dim,
+          self.units,
+          timesteps,
+          training=training)
+      x_f = _time_distributed_dense(
+          inputs,
+          self.kernel_f,
+          self.bias_f,
+          self.dropout,
+          input_dim,
+          self.units,
+          timesteps,
+          training=training)
+      x_c = _time_distributed_dense(
+          inputs,
+          self.kernel_c,
+          self.bias_c,
+          self.dropout,
+          input_dim,
+          self.units,
+          timesteps,
+          training=training)
+      x_o = _time_distributed_dense(
+          inputs,
+          self.kernel_o,
+          self.bias_o,
+          self.dropout,
+          input_dim,
+          self.units,
+          timesteps,
+          training=training)
+      return K.concatenate([x_i, x_f, x_c, x_o], axis=2)
+    else:
+      return inputs
+
+  def get_constants(self, inputs, training=None):
+    constants = []
+    if self.implementation == 0 and 0 < self.dropout < 1:
+      input_shape = K.int_shape(inputs)
+      input_dim = input_shape[-1]
+      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
+      ones = K.tile(ones, (1, int(input_dim)))
+
+      def dropped_inputs():
+        return K.dropout(ones, self.dropout)
+
+      dp_mask = [
+          K.in_train_phase(dropped_inputs, ones, training=training)
+          for _ in range(4)
+      ]
+      constants.append(dp_mask)
+    else:
+      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
+
+    if 0 < self.recurrent_dropout < 1:
+      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
+      ones = K.tile(ones, (1, self.units))
+
+      def dropped_inputs():  # pylint: disable=function-redefined
+        return K.dropout(ones, self.recurrent_dropout)
+
+      rec_dp_mask = [
+          K.in_train_phase(dropped_inputs, ones, training=training)
+          for _ in range(4)
+      ]
+      constants.append(rec_dp_mask)
+    else:
+      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
+    return constants
+
+  def step(self, inputs, states):
+    h_tm1 = states[0]
+    c_tm1 = states[1]
+    dp_mask = states[2]
+    rec_dp_mask = states[3]
+
+    if self.implementation == 2:
+      z = K.dot(inputs * dp_mask[0], self.kernel)
+      z += K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel)
+      if self.use_bias:
+        z = K.bias_add(z, self.bias)
+
+      z0 = z[:, :self.units]
+      z1 = z[:, self.units:2 * self.units]
+      z2 = z[:, 2 * self.units:3 * self.units]
+      z3 = z[:, 3 * self.units:]
+
+      i = self.recurrent_activation(z0)
+      f = self.recurrent_activation(z1)
+      c = f * c_tm1 + i * self.activation(z2)
+      o = self.recurrent_activation(z3)
+    else:
+      if self.implementation == 0:
+        x_i = inputs[:, :self.units]
+        x_f = inputs[:, self.units:2 * self.units]
+        x_c = inputs[:, 2 * self.units:3 * self.units]
+        x_o = inputs[:, 3 * self.units:]
+      elif self.implementation == 1:
+        x_i = K.dot(inputs * dp_mask[0], self.kernel_i) + self.bias_i
+        x_f = K.dot(inputs * dp_mask[1], self.kernel_f) + self.bias_f
+        x_c = K.dot(inputs * dp_mask[2], self.kernel_c) + self.bias_c
+        x_o = K.dot(inputs * dp_mask[3], self.kernel_o) + self.bias_o
+      else:
+        raise ValueError('Unknown `implementation` mode.')
+
+      i = self.recurrent_activation(x_i + K.dot(h_tm1 * rec_dp_mask[0],
+                                                self.recurrent_kernel_i))
+      f = self.recurrent_activation(x_f + K.dot(h_tm1 * rec_dp_mask[1],
+                                                self.recurrent_kernel_f))
+      c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1 * rec_dp_mask[2],
+                                                      self.recurrent_kernel_c))
+      o = self.recurrent_activation(x_o + K.dot(h_tm1 * rec_dp_mask[3],
+                                                self.recurrent_kernel_o))
+    h = o * self.activation(c)
+    if 0 < self.dropout + self.recurrent_dropout:
+      h._uses_learning_phase = True
+    return h, [h, c]
+
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'recurrent_activation':
+            activations.serialize(self.recurrent_activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'unit_forget_bias':
+            self.unit_forget_bias,
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout
+    }
+    base_config = super(LSTM, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/serialization.py b/tensorflow/contrib/keras/python/keras/layers/serialization.py
new file mode 100644
index 0000000000..f9c21a3e67
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/serialization.py
@@ -0,0 +1,63 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Layer serialization/deserialization functions.
+"""
+# pylint: disable=wildcard-import
+# pylint: disable=unused-import
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.engine import Input
+from tensorflow.contrib.keras.python.keras.engine import InputLayer
+from tensorflow.contrib.keras.python.keras.layers.advanced_activations import *
+from tensorflow.contrib.keras.python.keras.layers.convolutional import *
+from tensorflow.contrib.keras.python.keras.layers.convolutional_recurrent import *
+from tensorflow.contrib.keras.python.keras.layers.core import *
+from tensorflow.contrib.keras.python.keras.layers.embeddings import *
+from tensorflow.contrib.keras.python.keras.layers.local import *
+from tensorflow.contrib.keras.python.keras.layers.merge import *
+from tensorflow.contrib.keras.python.keras.layers.noise import *
+from tensorflow.contrib.keras.python.keras.layers.normalization import *
+from tensorflow.contrib.keras.python.keras.layers.pooling import *
+from tensorflow.contrib.keras.python.keras.layers.recurrent import *
+from tensorflow.contrib.keras.python.keras.layers.wrappers import *
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+
+
+def serialize(layer):
+  return {'class_name': layer.__class__.__name__, 'config': layer.get_config()}
+
+
+def deserialize(config, custom_objects=None):
+  """Instantiates a layer from a config dictionary.
+
+  Arguments:
+      config: dict of the form {'class_name': str, 'config': dict}
+      custom_objects: dict mapping class names (or function names)
+          of custom (non-Keras) objects to class/functions
+
+  Returns:
+      Layer instance (may be Model, Sequential, Layer...)
+  """
+  from tensorflow.contrib.keras.python.keras import models  # pylint: disable=g-import-not-at-top
+  globs = globals()  # All layers.
+  globs['Model'] = models.Model
+  globs['Sequential'] = models.Sequential
+  return deserialize_keras_object(
+      config,
+      module_objects=globs,
+      custom_objects=custom_objects,
+      printable_module_name='layer')
diff --git a/tensorflow/contrib/keras/python/keras/layers/serialization_test.py b/tensorflow/contrib/keras/python/keras/layers/serialization_test.py
new file mode 100644
index 0000000000..fb2e506a4c
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/serialization_test.py
@@ -0,0 +1,41 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for layer serialization utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class LayerSerializationTest(test.TestCase):
+
+  def test_serialize_deserialize(self):
+    layer = keras.layers.Dense(
+        3, activation='relu', kernel_initializer='ones', bias_regularizer='l2')
+    config = keras.layers.serialize(layer)
+    new_layer = keras.layers.deserialize(config)
+    self.assertEqual(new_layer.activation, keras.activations.relu)
+    self.assertEqual(new_layer.bias_regularizer.__class__,
+                     keras.regularizers.L1L2)
+    self.assertEqual(new_layer.kernel_initializer.__class__,
+                     keras.initializers.Ones)
+    self.assertEqual(new_layer.units, 3)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/simplernn_test.py b/tensorflow/contrib/keras/python/keras/layers/simplernn_test.py
new file mode 100644
index 0000000000..21ba152da4
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/simplernn_test.py
@@ -0,0 +1,194 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SimpleRNN layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class SimpleRNNLayerTest(test.TestCase):
+
+  def test_return_sequences_SimpleRNN(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.SimpleRNN,
+          kwargs={'units': units,
+                  'return_sequences': True},
+          input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_dynamic_behavior_SimpleRNN(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      layer = keras.layers.SimpleRNN(units, input_shape=(None, embedding_dim))
+      model = keras.models.Sequential()
+      model.add(layer)
+      model.compile('sgd', 'mse')
+      x = np.random.random((num_samples, timesteps, embedding_dim))
+      y = np.random.random((num_samples, units))
+      model.train_on_batch(x, y)
+
+  def test_dropout_SimpleRNN(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.SimpleRNN,
+          kwargs={'units': units,
+                  'dropout': 0.1,
+                  'recurrent_dropout': 0.1},
+          input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_implementation_mode_SimpleRNN(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    with self.test_session():
+      for mode in [0, 1, 2]:
+        testing_utils.layer_test(
+            keras.layers.SimpleRNN,
+            kwargs={'units': units,
+                    'implementation': mode},
+            input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_statefulness_SimpleRNN(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = keras.layers.SimpleRNN
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Embedding(
+              4,
+              embedding_dim,
+              mask_zero=True,
+              input_length=timesteps,
+              batch_input_shape=(num_samples, timesteps)))
+      layer = layer_class(
+          units, return_sequences=False, stateful=True, weights=None)
+      model.add(layer)
+      model.compile(optimizer='sgd', loss='mse')
+      out1 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertEqual(out1.shape, (num_samples, units))
+
+      # train once so that the states change
+      model.train_on_batch(
+          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+      out2 = model.predict(np.ones((num_samples, timesteps)))
+
+      # if the state is not reset, output should be different
+      self.assertNotEqual(out1.max(), out2.max())
+
+      # check that output changes after states are reset
+      # (even though the model itself didn't change)
+      layer.reset_states()
+      out3 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out2.max(), out3.max())
+
+      # check that container-level reset_states() works
+      model.reset_states()
+      out4 = model.predict(np.ones((num_samples, timesteps)))
+      np.testing.assert_allclose(out3, out4, atol=1e-5)
+
+      # check that the call to `predict` updated the states
+      out5 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out4.max(), out5.max())
+
+      # Check masking
+      layer.reset_states()
+
+      left_padded_input = np.ones((num_samples, timesteps))
+      left_padded_input[0, :1] = 0
+      left_padded_input[1, :2] = 0
+      out6 = model.predict(left_padded_input)
+
+      layer.reset_states()
+
+      right_padded_input = np.ones((num_samples, timesteps))
+      right_padded_input[0, -1:] = 0
+      right_padded_input[1, -2:] = 0
+      out7 = model.predict(right_padded_input)
+
+      np.testing.assert_allclose(out7, out6, atol=1e-5)
+
+  def test_regularization_SimpleRNN(self):
+    embedding_dim = 4
+    layer_class = keras.layers.SimpleRNN
+    with self.test_session():
+      layer = layer_class(
+          5,
+          return_sequences=False,
+          weights=None,
+          input_shape=(None, embedding_dim),
+          kernel_regularizer=keras.regularizers.l1(0.01),
+          recurrent_regularizer=keras.regularizers.l1(0.01),
+          bias_regularizer='l2',
+          activity_regularizer='l1')
+      layer.build((None, None, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((2, 3, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+      layer = layer_class(
+          5,
+          return_sequences=False,
+          weights=None,
+          input_shape=(None, embedding_dim),
+          kernel_constraint=keras.constraints.max_norm(0.01),
+          recurrent_constraint=keras.constraints.max_norm(0.01),
+          bias_constraint='max_norm')
+      layer.build((None, None, embedding_dim))
+      self.assertEqual(len(layer.constraints), 3)
+
+  def test_with_masking_layer_SimpleRNN(self):
+    layer_class = keras.layers.SimpleRNN
+    with self.test_session():
+      inputs = np.random.random((2, 3, 4))
+      targets = np.abs(np.random.random((2, 3, 5)))
+      targets /= targets.sum(axis=-1, keepdims=True)
+      model = keras.models.Sequential()
+      model.add(keras.layers.Masking(input_shape=(3, 4)))
+      model.add(layer_class(units=5, return_sequences=True, unroll=False))
+      model.compile(loss='categorical_crossentropy', optimizer='adam')
+      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  def test_from_config_SimpleRNN(self):
+    layer_class = keras.layers.SimpleRNN
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/wrappers.py b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
new file mode 100644
index 0000000000..75b4810e40
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
@@ -0,0 +1,316 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Wrapper layers: layers that augment the functionality of another layer.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.python.framework import tensor_shape
+
+
+class Wrapper(Layer):
+  """Abstract wrapper base class.
+
+  Wrappers take another layer and augment it in various ways.
+  Do not use this class as a layer, it is only an abstract base class.
+  Two usable wrappers are the `TimeDistributed` and `Bidirectional` wrappers.
+
+  Arguments:
+      layer: The layer to be wrapped.
+  """
+
+  def __init__(self, layer, **kwargs):
+    self.layer = layer
+    super(Wrapper, self).__init__(**kwargs)
+
+  def build(self, input_shape=None):
+    # Assumes that self.layer is already set.
+    # Should be called at the end of .build() in the children classes.
+    self.trainable_weights = getattr(self.layer, 'trainable_weights', [])
+    self.non_trainable_weights = getattr(self.layer, 'non_trainable_weights',
+                                         [])
+    self.updates = getattr(self.layer, 'updates', [])
+    self.losses = getattr(self.layer, 'losses', [])
+    self.constraints = getattr(self.layer, 'constraints', {})
+    self.built = True
+
+  def get_weights(self):
+    weights = self.layer.get_weights()
+    return weights
+
+  def set_weights(self, weights):
+    self.layer.set_weights(weights)
+
+  def get_config(self):
+    config = {
+        'layer': {
+            'class_name': self.layer.__class__.__name__,
+            'config': self.layer.get_config()
+        }
+    }
+    base_config = super(Wrapper, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config):
+    from tensorflow.contrib.keras.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+    layer = deserialize_layer(config.pop('layer'))
+    return cls(layer, **config)
+
+
+class TimeDistributed(Wrapper):
+  """This wrapper allows to apply a layer to every temporal slice of an input.
+
+  The input should be at least 3D, and the dimension of index one
+  will be considered to be the temporal dimension.
+
+  Consider a batch of 32 samples,
+  where each sample is a sequence of 10 vectors of 16 dimensions.
+  The batch input shape of the layer is then `(32, 10, 16)`,
+  and the `input_shape`, not including the samples dimension, is `(10, 16)`.
+
+  You can then use `TimeDistributed` to apply a `Dense` layer
+  to each of the 10 timesteps, independently:
+
+  ```python
+      # as the first layer in a model
+      model = Sequential()
+      model.add(TimeDistributed(Dense(8), input_shape=(10, 16)))
+      # now model.output_shape == (None, 10, 8)
+
+      # subsequent layers: no need for input_shape
+      model.add(TimeDistributed(Dense(32)))
+      # now model.output_shape == (None, 10, 32)
+  ```
+
+  The output will then have shape `(32, 10, 8)`.
+
+  `TimeDistributed` can be used with arbitrary layers, not just `Dense`,
+  for instance with a `Conv2D` layer:
+
+  ```python
+      model = Sequential()
+      model.add(TimeDistributed(Conv2D(64, (3, 3)),
+                                input_shape=(10, 299, 299, 3)))
+  ```
+
+  Arguments:
+      layer: a layer instance.
+  """
+
+  def __init__(self, layer, **kwargs):
+    super(TimeDistributed, self).__init__(layer, **kwargs)
+    self.supports_masking = True
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    assert len(input_shape) >= 3
+    self.input_spec = InputSpec(shape=input_shape)
+    child_input_shape = [input_shape[0]] + input_shape[2:]
+    if not self.layer.built:
+      self.layer.build(child_input_shape)
+      self.layer.built = True
+    super(TimeDistributed, self).build()
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    child_input_shape = tensor_shape.TensorShape([input_shape[0]] + input_shape[
+        2:])
+    child_output_shape = self.layer._compute_output_shape(  # pylint: disable=protected-access
+        child_input_shape).as_list()
+    timesteps = input_shape[1]
+    return tensor_shape.TensorShape([child_output_shape[0], timesteps] +
+                                    child_output_shape[1:])
+
+  def call(self, inputs, mask=None):
+    input_shape = K.int_shape(inputs)
+    if input_shape[0]:
+      # batch size matters, use rnn-based implementation
+      def step(x, _):
+        output = self.layer.call(x)
+        return output, []
+
+      _, outputs, _ = K.rnn(
+          step,
+          inputs,
+          initial_states=[],
+          input_length=input_shape[1],
+          unroll=False)
+      y = outputs
+    else:
+      # No batch size specified, therefore the layer will be able
+      # to process batches of any size.
+      # We can go with reshape-based implementation for performance.
+      input_length = input_shape[1]
+      if not input_length:
+        input_length = K.shape(inputs)[1]
+      # Shape: (num_samples * timesteps, ...)
+      inputs = K.reshape(inputs, (-1,) + input_shape[2:])
+      y = self.layer.call(inputs)  # (num_samples * timesteps, ...)
+      # Shape: (num_samples, timesteps, ...)
+      output_shape = self._compute_output_shape(input_shape).as_list()  # pylint: disable=protected-access
+      y = K.reshape(y, [-1, input_length] + output_shape[2:])
+
+    # Apply activity regularizer if any:
+    if (hasattr(self.layer, 'activity_regularizer') and
+        self.layer.activity_regularizer is not None):
+      regularization_loss = self.layer.activity_regularizer(y)
+      self.add_loss(regularization_loss, inputs)
+    return y
+
+
+class Bidirectional(Wrapper):
+  """Bidirectional wrapper for RNNs.
+
+  Arguments:
+      layer: `Recurrent` instance.
+      merge_mode: Mode by which outputs of the
+          forward and backward RNNs will be combined.
+          One of {'sum', 'mul', 'concat', 'ave', None}.
+          If None, the outputs will not be combined,
+          they will be returned as a list.
+
+  Examples:
+
+  ```python
+      model = Sequential()
+      model.add(Bidirectional(LSTM(10, return_sequences=True), input_shape=(5,
+        10)))
+      model.add(Bidirectional(LSTM(10)))
+      model.add(Dense(5))
+      model.add(Activation('softmax'))
+      model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+  ```
+  """
+
+  def __init__(self, layer, merge_mode='concat', weights=None, **kwargs):
+    super(Bidirectional, self).__init__(layer, **kwargs)
+    if merge_mode not in ['sum', 'mul', 'ave', 'concat', None]:
+      raise ValueError('Invalid merge mode. '
+                       'Merge mode should be one of '
+                       '{"sum", "mul", "ave", "concat", None}')
+    self.forward_layer = copy.copy(layer)
+    config = layer.get_config()
+    config['go_backwards'] = not config['go_backwards']
+    self.backward_layer = layer.__class__.from_config(config)
+    self.forward_layer.name = 'forward_' + self.forward_layer.name
+    self.backward_layer.name = 'backward_' + self.backward_layer.name
+    self.merge_mode = merge_mode
+    if weights:
+      nw = len(weights)
+      self.forward_layer.initial_weights = weights[:nw // 2]
+      self.backward_layer.initial_weights = weights[nw // 2:]
+    self.stateful = layer.stateful
+    self.return_sequences = layer.return_sequences
+    self.supports_masking = True
+
+  def get_weights(self):
+    return self.forward_layer.get_weights() + self.backward_layer.get_weights()
+
+  def set_weights(self, weights):
+    nw = len(weights)
+    self.forward_layer.set_weights(weights[:nw // 2])
+    self.backward_layer.set_weights(weights[nw // 2:])
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.merge_mode in ['sum', 'ave', 'mul']:
+      return self.forward_layer._compute_output_shape(input_shape)  # pylint: disable=protected-access
+    elif self.merge_mode == 'concat':
+      shape = self.forward_layer._compute_output_shape(input_shape).as_list()  # pylint: disable=protected-access
+      shape[-1] *= 2
+      return tensor_shape.TensorShape(shape)
+    elif self.merge_mode is None:
+      shape = self.forward_layer._compute_output_shape(input_shape)  # pylint: disable=protected-access
+      return [shape, copy.copy(shape)]
+
+  def call(self, inputs, mask=None):
+    y = self.forward_layer.call(inputs, mask)
+    y_rev = self.backward_layer.call(inputs, mask)
+    if self.return_sequences:
+      y_rev = K.reverse(y_rev, 1)
+    if self.merge_mode == 'concat':
+      return K.concatenate([y, y_rev])
+    elif self.merge_mode == 'sum':
+      return y + y_rev
+    elif self.merge_mode == 'ave':
+      return (y + y_rev) / 2
+    elif self.merge_mode == 'mul':
+      return y * y_rev
+    elif self.merge_mode is None:
+      return [y, y_rev]
+
+  def reset_states(self):
+    self.forward_layer.reset_states()
+    self.backward_layer.reset_states()
+
+  def build(self, input_shape):
+    self.forward_layer.build(input_shape)
+    self.backward_layer.build(input_shape)
+    self.built = True
+
+  def compute_mask(self, inputs, mask):
+    if self.return_sequences:
+      if not self.merge_mode:
+        return [mask, mask]
+      else:
+        return mask
+    else:
+      return None
+
+  @property
+  def trainable_weights(self):
+    if hasattr(self.forward_layer, 'trainable_weights'):
+      return (self.forward_layer.trainable_weights +
+              self.backward_layer.trainable_weights)
+    return []
+
+  @property
+  def non_trainable_weights(self):
+    if hasattr(self.forward_layer, 'non_trainable_weights'):
+      return (self.forward_layer.non_trainable_weights +
+              self.backward_layer.non_trainable_weights)
+    return []
+
+  @property
+  def updates(self):
+    if hasattr(self.forward_layer, 'updates'):
+      return self.forward_layer.updates + self.backward_layer.updates
+    return []
+
+  @property
+  def losses(self):
+    if hasattr(self.forward_layer, 'losses'):
+      return self.forward_layer.losses + self.backward_layer.losses
+    return []
+
+  @property
+  def constraints(self):
+    constraints = {}
+    if hasattr(self.forward_layer, 'constraints'):
+      constraints.update(self.forward_layer.constraints)
+      constraints.update(self.backward_layer.constraints)
+    return constraints
+
+  def get_config(self):
+    config = {'merge_mode': self.merge_mode}
+    base_config = super(Bidirectional, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py b/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py
new file mode 100644
index 0000000000..b892681ada
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py
@@ -0,0 +1,199 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for layer wrappers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class TimeDistributedTest(test.TestCase):
+
+  def test_timedistributed_dense(self):
+    # first, test with Dense layer
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Dense(2), input_shape=(3, 4)))
+      model.add(keras.layers.Activation('relu'))
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.fit(
+          np.random.random((10, 3, 4)),
+          np.random.random((10, 3, 2)),
+          epochs=1,
+          batch_size=10)
+
+      # test config
+      model.get_config()
+
+  def test_timedistributed_conv2d(self):
+    # test with Conv2D
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Conv2D(5, (2, 2), padding='same'),
+              input_shape=(2, 4, 4, 3)))
+      model.add(keras.layers.Activation('relu'))
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          np.random.random((1, 2, 4, 4, 3)), np.random.random((1, 2, 4, 4, 5)))
+
+      model = keras.models.model_from_json(model.to_json())
+      model.summary()
+
+  def test_timedistributed_stacked(self):
+    # test stacked layers
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Dense(2), input_shape=(3, 4)))
+      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+      model.add(keras.layers.Activation('relu'))
+      model.compile(optimizer='rmsprop', loss='mse')
+
+      model.fit(
+          np.random.random((10, 3, 4)),
+          np.random.random((10, 3, 3)),
+          epochs=1,
+          batch_size=10)
+
+  def test_timedistributed_sequential(self):
+    # test wrapping Sequential model
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(3, input_dim=2))
+      outer_model = keras.models.Sequential()
+      outer_model.add(keras.layers.TimeDistributed(model, input_shape=(3, 2)))
+      outer_model.compile(optimizer='rmsprop', loss='mse')
+      outer_model.fit(
+          np.random.random((10, 3, 2)),
+          np.random.random((10, 3, 3)),
+          epochs=1,
+          batch_size=10)
+
+      # test with functional API
+      x = keras.layers.Input(shape=(3, 2))
+      y = keras.layers.TimeDistributed(model)(x)
+      outer_model = keras.models.Model(x, y)
+      outer_model.compile(optimizer='rmsprop', loss='mse')
+      outer_model.fit(
+          np.random.random((10, 3, 2)),
+          np.random.random((10, 3, 3)),
+          epochs=1,
+          batch_size=10)
+
+  def test_regularizers(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Dense(2, kernel_regularizer='l1'),
+              input_shape=(3, 4)))
+      model.add(keras.layers.Activation('relu'))
+      model.compile(optimizer='rmsprop', loss='mse')
+      self.assertEqual(len(model.losses), 1)
+
+
+class BidirectionalTest(test.TestCase):
+
+  def test_bidirectional(self):
+    rnn = keras.layers.SimpleRNN
+    samples = 2
+    dim = 2
+    timesteps = 2
+    output_dim = 2
+    with self.test_session():
+      for mode in ['sum', 'concat']:
+        x = np.random.random((samples, timesteps, dim))
+        target_dim = 2 * output_dim if mode == 'concat' else output_dim
+        y = np.random.random((samples, target_dim))
+
+        # test with Sequential model
+        model = keras.models.Sequential()
+        model.add(
+            keras.layers.Bidirectional(
+                rnn(output_dim), merge_mode=mode, input_shape=(timesteps, dim)))
+        model.compile(loss='mse', optimizer='sgd')
+        model.fit(x, y, epochs=1, batch_size=1)
+
+        # test config
+        model.get_config()
+        model = keras.models.model_from_json(model.to_json())
+        model.summary()
+
+  def test_bidirectional_stacked(self):
+    # test stacked bidirectional layers
+    rnn = keras.layers.SimpleRNN
+    samples = 2
+    dim = 2
+    timesteps = 2
+    output_dim = 2
+    mode = 'sum'
+
+    with self.test_session():
+      x = np.random.random((samples, timesteps, dim))
+      target_dim = 2 * output_dim if mode == 'concat' else output_dim
+      y = np.random.random((samples, target_dim))
+
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Bidirectional(
+              rnn(output_dim, return_sequences=True),
+              merge_mode=mode,
+              input_shape=(timesteps, dim)))
+      model.add(keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode))
+      model.compile(loss='mse', optimizer='sgd')
+      model.fit(x, y, epochs=1, batch_size=1)
+
+      # test with functional API
+      inputs = keras.layers.Input((timesteps, dim))
+      output = keras.layers.Bidirectional(
+          rnn(output_dim), merge_mode=mode)(inputs)
+      model = keras.models.Model(inputs, output)
+      model.compile(loss='mse', optimizer='sgd')
+      model.fit(x, y, epochs=1, batch_size=1)
+
+  def test_bidirectional_statefulness(self):
+    # Bidirectional and stateful
+    rnn = keras.layers.SimpleRNN
+    samples = 2
+    dim = 2
+    timesteps = 2
+    output_dim = 2
+    mode = 'sum'
+
+    with self.test_session():
+      x = np.random.random((samples, timesteps, dim))
+      target_dim = 2 * output_dim if mode == 'concat' else output_dim
+      y = np.random.random((samples, target_dim))
+
+      inputs = keras.layers.Input(batch_shape=(1, timesteps, dim))
+      output = keras.layers.Bidirectional(
+          rnn(output_dim, stateful=True), merge_mode=mode)(inputs)
+      model = keras.models.Model(inputs, output)
+      model.compile(loss='mse', optimizer='sgd')
+      model.fit(x, y, epochs=1, batch_size=1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/losses.py b/tensorflow/contrib/keras/python/keras/losses.py
new file mode 100644
index 0000000000..54b8fa429d
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/losses.py
@@ -0,0 +1,115 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Built-in Keras loss functions.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+
+
+def mean_squared_error(y_true, y_pred):
+  return K.mean(K.square(y_pred - y_true), axis=-1)
+
+
+def mean_absolute_error(y_true, y_pred):
+  return K.mean(K.abs(y_pred - y_true), axis=-1)
+
+
+def mean_absolute_percentage_error(y_true, y_pred):
+  # Equivalent to MAE, but sometimes easier to interpret.
+  diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None))
+  return 100. * K.mean(diff, axis=-1)
+
+
+def mean_squared_logarithmic_error(y_true, y_pred):
+  first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
+  second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
+  return K.mean(K.square(first_log - second_log), axis=-1)
+
+
+def squared_hinge(y_true, y_pred):
+  return K.mean(K.square(K.maximum(1. - y_true * y_pred, 0.)), axis=-1)
+
+
+def hinge(y_true, y_pred):
+  return K.mean(K.maximum(1. - y_true * y_pred, 0.), axis=-1)
+
+
+def categorical_crossentropy(y_true, y_pred):
+  return K.categorical_crossentropy(y_pred, y_true)
+
+
+def sparse_categorical_crossentropy(y_true, y_pred):
+  return K.sparse_categorical_crossentropy(y_pred, y_true)
+
+
+def binary_crossentropy(y_true, y_pred):
+  return K.mean(K.binary_crossentropy(y_pred, y_true), axis=-1)
+
+
+def kullback_leibler_divergence(y_true, y_pred):
+  y_true = K.clip(y_true, K.epsilon(), 1)
+  y_pred = K.clip(y_pred, K.epsilon(), 1)
+  return K.sum(y_true * K.log(y_true / y_pred), axis=-1)
+
+
+def poisson(y_true, y_pred):
+  return K.mean(y_pred - y_true * K.log(y_pred + K.epsilon()), axis=-1)
+
+
+def cosine_proximity(y_true, y_pred):
+  y_true = K.l2_normalize(y_true, axis=-1)
+  y_pred = K.l2_normalize(y_pred, axis=-1)
+  return -K.mean(y_true * y_pred, axis=-1)
+
+
+# Aliases.
+
+mse = MSE = mean_squared_error
+mae = MAE = mean_absolute_error
+mape = MAPE = mean_absolute_percentage_error
+msle = MSLE = mean_squared_logarithmic_error
+kld = KLD = kullback_leibler_divergence
+cosine = cosine_proximity
+
+
+def serialize(loss):
+  return loss.__name__
+
+
+def deserialize(name, custom_objects=None):
+  return deserialize_keras_object(
+      name,
+      module_objects=globals(),
+      custom_objects=custom_objects,
+      printable_module_name='loss function')
+
+
+def get(identifier):
+  if identifier is None:
+    return None
+  if isinstance(identifier, six.string_types):
+    identifier = str(identifier)
+    return deserialize(identifier)
+  elif callable(identifier):
+    return identifier
+  else:
+    raise ValueError('Could not interpret '
+                     'loss function identifier:', identifier)
diff --git a/tensorflow/contrib/keras/python/keras/losses_test.py b/tensorflow/contrib/keras/python/keras/losses_test.py
new file mode 100644
index 0000000000..fd4458cce2
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/losses_test.py
@@ -0,0 +1,78 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras loss functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+ALL_LOSSES = [keras.losses.mean_squared_error,
+              keras.losses.mean_absolute_error,
+              keras.losses.mean_absolute_percentage_error,
+              keras.losses.mean_squared_logarithmic_error,
+              keras.losses.squared_hinge,
+              keras.losses.hinge,
+              keras.losses.categorical_crossentropy,
+              keras.losses.binary_crossentropy,
+              keras.losses.kullback_leibler_divergence,
+              keras.losses.poisson,
+              keras.losses.cosine_proximity]
+
+
+class KerasLossesTest(test.TestCase):
+
+  def test_objective_shapes_3d(self):
+    with self.test_session():
+      y_a = keras.backend.variable(np.random.random((5, 6, 7)))
+      y_b = keras.backend.variable(np.random.random((5, 6, 7)))
+      for obj in ALL_LOSSES:
+        objective_output = obj(y_a, y_b)
+        self.assertListEqual(objective_output.get_shape().as_list(), [5, 6])
+
+  def test_objective_shapes_2d(self):
+    with self.test_session():
+      y_a = keras.backend.variable(np.random.random((6, 7)))
+      y_b = keras.backend.variable(np.random.random((6, 7)))
+      for obj in ALL_LOSSES:
+        objective_output = obj(y_a, y_b)
+        self.assertListEqual(objective_output.get_shape().as_list(), [6,])
+
+  def test_cce_one_hot(self):
+    with self.test_session():
+      y_a = keras.backend.variable(np.random.randint(0, 7, (5, 6)))
+      y_b = keras.backend.variable(np.random.random((5, 6, 7)))
+      objective_output = keras.losses.sparse_categorical_crossentropy(y_a, y_b)
+      assert keras.backend.eval(objective_output).shape == (5, 6)
+
+      y_a = keras.backend.variable(np.random.randint(0, 7, (6,)))
+      y_b = keras.backend.variable(np.random.random((6, 7)))
+      objective_output = keras.losses.sparse_categorical_crossentropy(y_a, y_b)
+      assert keras.backend.eval(objective_output).shape == (6,)
+
+  def test_serialization(self):
+    fn = keras.losses.get('mse')
+    config = keras.losses.serialize(fn)
+    new_fn = keras.losses.deserialize(config)
+    self.assertEqual(fn, new_fn)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/metrics.py b/tensorflow/contrib/keras/python/keras/metrics.py
new file mode 100644
index 0000000000..d7266c94cf
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/metrics.py
@@ -0,0 +1,87 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Built-in Keras metrics functions.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.keras.python.keras import backend as K
+# pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.losses import binary_crossentropy
+from tensorflow.contrib.keras.python.keras.losses import categorical_crossentropy
+from tensorflow.contrib.keras.python.keras.losses import cosine_proximity
+from tensorflow.contrib.keras.python.keras.losses import hinge
+from tensorflow.contrib.keras.python.keras.losses import kullback_leibler_divergence
+from tensorflow.contrib.keras.python.keras.losses import mean_absolute_error
+from tensorflow.contrib.keras.python.keras.losses import mean_absolute_percentage_error
+from tensorflow.contrib.keras.python.keras.losses import mean_squared_error
+from tensorflow.contrib.keras.python.keras.losses import mean_squared_logarithmic_error
+from tensorflow.contrib.keras.python.keras.losses import poisson
+from tensorflow.contrib.keras.python.keras.losses import sparse_categorical_crossentropy
+from tensorflow.contrib.keras.python.keras.losses import squared_hinge
+# pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+
+
+def binary_accuracy(y_true, y_pred):
+  return K.mean(K.equal(y_true, K.round(y_pred)), axis=-1)
+
+
+def categorical_accuracy(y_true, y_pred):
+  return K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1))
+
+
+def sparse_categorical_accuracy(y_true, y_pred):
+  return K.equal(
+      K.max(y_true, axis=-1), K.cast(K.argmax(y_pred, axis=-1), K.floatx()))
+
+
+def top_k_categorical_accuracy(y_true, y_pred, k=5):
+  return K.mean(K.in_top_k(y_pred, K.argmax(y_true, axis=-1), k), axis=-1)
+
+
+# Aliases
+
+mse = MSE = mean_squared_error
+mae = MAE = mean_absolute_error
+mape = MAPE = mean_absolute_percentage_error
+msle = MSLE = mean_squared_logarithmic_error
+cosine = cosine_proximity
+
+
+def serialize(metric):
+  return metric.__name__
+
+
+def deserialize(name, custom_objects=None):
+  return deserialize_keras_object(
+      name,
+      module_objects=globals(),
+      custom_objects=custom_objects,
+      printable_module_name='metric function')
+
+
+def get(identifier):
+  if isinstance(identifier, six.string_types):
+    identifier = str(identifier)
+    return deserialize(identifier)
+  elif callable(identifier):
+    return identifier
+  else:
+    raise ValueError('Could not interpret '
+                     'metric function identifier:', identifier)
diff --git a/tensorflow/contrib/keras/python/keras/metrics_test.py b/tensorflow/contrib/keras/python/keras/metrics_test.py
new file mode 100644
index 0000000000..ac0a1372c6
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/metrics_test.py
@@ -0,0 +1,62 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras metrics functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class KerasMetricsTest(test.TestCase):
+
+  def test_metrics(self):
+    with self.test_session():
+      y_a = keras.backend.variable(np.random.random((6, 7)))
+      y_b = keras.backend.variable(np.random.random((6, 7)))
+      for metric in [keras.metrics.binary_accuracy,
+                     keras.metrics.categorical_accuracy]:
+        output = metric(y_a, y_b)
+        self.assertEqual(keras.backend.eval(output).shape, (6,))
+
+  def test_sparse_categorical_accuracy(self):
+    with self.test_session():
+      metric = keras.metrics.sparse_categorical_accuracy
+      y_a = keras.backend.variable(np.random.randint(0, 7, (6,)))
+      y_b = keras.backend.variable(np.random.random((6, 7)))
+      self.assertEqual(keras.backend.eval(metric(y_a, y_b)).shape, (6,))
+
+  def test_top_k_categorical_accuracy(self):
+    with self.test_session():
+      y_pred = keras.backend.variable(np.array([[0.3, 0.2, 0.1],
+                                                [0.1, 0.2, 0.7]]))
+      y_true = keras.backend.variable(np.array([[0, 1, 0], [1, 0, 0]]))
+      result = keras.backend.eval(
+          keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=3))
+      self.assertEqual(result, 1)
+      result = keras.backend.eval(
+          keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=2))
+      self.assertEqual(result, 0.5)
+      result = keras.backend.eval(
+          keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=1))
+      self.assertEqual(result, 0.)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/models.py b/tensorflow/contrib/keras/python/keras/models.py
new file mode 100644
index 0000000000..2be4431d03
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/models.py
@@ -0,0 +1,1167 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Home of the Sequential model, and the `save_model`/`load_model` functions.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import json
+import os
+import warnings
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import layers as layer_module
+from tensorflow.contrib.keras.python.keras import optimizers
+from tensorflow.contrib.keras.python.keras.engine import topology
+from tensorflow.contrib.keras.python.keras.engine.topology import Input
+from tensorflow.contrib.keras.python.keras.engine.topology import Layer
+from tensorflow.contrib.keras.python.keras.engine.training import Model
+from tensorflow.contrib.keras.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
+
+
+# pylint: disable=g-import-not-at-top
+try:
+  import h5py
+except ImportError:
+  h5py = None
+
+try:
+  import yaml
+except ImportError:
+  yaml = None
+# pylint: enable=g-import-not-at-top
+
+
+def save_model(model, filepath, overwrite=True):
+  """Save a model to a HDF5 file.
+
+  The saved model contains:
+      - the model's configuration (topology)
+      - the model's weights
+      - the model's optimizer's state (if any)
+
+  Thus the saved model can be reinstantiated in
+  the exact same state, without any of the code
+  used for model definition or training.
+
+  Arguments:
+      model: Keras model instance to be saved.
+      filepath: String, path where to save the model.
+      overwrite: Whether we should overwrite any existing
+          model at the target location, or instead
+          ask the user with a manual prompt.
+
+  Raises:
+      ImportError: if h5py is not available.
+  """
+
+  if h5py is None:
+    raise ImportError('`save_model` requires h5py.')
+
+  def get_json_type(obj):
+    """Serialize any object to a JSON-serializable structure.
+
+    Arguments:
+        obj: the object to serialize
+
+    Returns:
+        JSON-serializable structure representing `obj`.
+
+    Raises:
+        TypeError: if `obj` cannot be serialized.
+    """
+    # if obj is a serializable Keras class instance
+    # e.g. optimizer, layer
+    if hasattr(obj, 'get_config'):
+      return {'class_name': obj.__class__.__name__, 'config': obj.get_config()}
+
+    # if obj is any numpy type
+    if type(obj).__module__ == np.__name__:
+      return obj.item()
+
+    # misc functions (e.g. loss function)
+    if callable(obj):
+      return obj.__name__
+
+    # if obj is a python 'type'
+    if type(obj).__name__ == type.__name__:
+      return obj.__name__
+
+    raise TypeError('Not JSON Serializable:', obj)
+
+  from tensorflow.contrib.keras.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+
+  # If file exists and should not be overwritten.
+  if not overwrite and os.path.isfile(filepath):
+    proceed = ask_to_proceed_with_overwrite(filepath)
+    if not proceed:
+      return
+
+  f = h5py.File(filepath, 'w')
+  f.attrs['keras_version'] = str(keras_version).encode('utf8')
+  f.attrs['backend'] = K.backend().encode('utf8')
+  f.attrs['model_config'] = json.dumps(
+      {
+          'class_name': model.__class__.__name__,
+          'config': model.get_config()
+      },
+      default=get_json_type).encode('utf8')
+
+  model_weights_group = f.create_group('model_weights')
+  model_layers = model.layers
+  topology.save_weights_to_hdf5_group(model_weights_group, model_layers)
+
+  if hasattr(model, 'optimizer'):
+    if isinstance(model.optimizer, optimizers.TFOptimizer):
+      warnings.warn(
+          'TensorFlow optimizers do not '
+          'make it possible to access '
+          'optimizer attributes or optimizer state '
+          'after instantiation. '
+          'As a result, we cannot save the optimizer '
+          'as part of the model save file.'
+          'You will have to compile your model again after loading it. '
+          'Prefer using a Keras optimizer instead '
+          '(see keras.io/optimizers).')
+    else:
+      f.attrs['training_config'] = json.dumps(
+          {
+              'optimizer_config': {
+                  'class_name': model.optimizer.__class__.__name__,
+                  'config': model.optimizer.get_config()
+              },
+              'loss': model.loss,
+              'metrics': model.metrics,
+              'sample_weight_mode': model.sample_weight_mode,
+              'loss_weights': model.loss_weights,
+          },
+          default=get_json_type).encode('utf8')
+
+      # Save optimizer weights.
+      symbolic_weights = getattr(model.optimizer, 'weights')
+      if symbolic_weights:
+        optimizer_weights_group = f.create_group('optimizer_weights')
+        weight_values = K.batch_get_value(symbolic_weights)
+        weight_names = []
+        for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)):
+          # Default values of symbolic_weights is /variable for theano
+          if K.backend() == 'theano':
+            if hasattr(w, 'name') and w.name != '/variable':
+              name = str(w.name)
+            else:
+              name = 'param_' + str(i)
+          else:
+            if hasattr(w, 'name') and w.name:
+              name = str(w.name)
+            else:
+              name = 'param_' + str(i)
+          weight_names.append(name.encode('utf8'))
+        optimizer_weights_group.attrs['weight_names'] = weight_names
+        for name, val in zip(weight_names, weight_values):
+          param_dset = optimizer_weights_group.create_dataset(
+              name, val.shape, dtype=val.dtype)
+          if not val.shape:
+            # scalar
+            param_dset[()] = val
+          else:
+            param_dset[:] = val
+  f.flush()
+  f.close()
+
+
+def load_model(filepath, custom_objects=None):
+  """Loads a model saved via `save_model`.
+
+  Arguments:
+      filepath: String, path to the saved model.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance. If an optimizer was found
+      as part of the saved model, the model is already
+      compiled. Otherwise, the model is uncompiled and
+      a warning will be displayed.
+
+  Raises:
+      ImportError: if h5py is not available.
+      ValueError: In case of an invalid savefile.
+  """
+  if h5py is None:
+    raise ImportError('`save_model` requires h5py.')
+
+  if not custom_objects:
+    custom_objects = {}
+
+  def convert_custom_objects(obj):
+    """Handles custom object lookup.
+
+    Arguments:
+        obj: object, dict, or list.
+
+    Returns:
+        The same structure, where occurences
+            of a custom object name have been replaced
+            with the custom object.
+    """
+    if isinstance(obj, list):
+      deserialized = []
+      for value in obj:
+        if value in custom_objects:
+          deserialized.append(custom_objects[value])
+        else:
+          deserialized.append(value)
+      return deserialized
+    if isinstance(obj, dict):
+      deserialized = {}
+      for key, value in obj.items():
+        if value in custom_objects:
+          deserialized[key] = custom_objects[value]
+        else:
+          deserialized[key] = value
+      return deserialized
+    if obj in custom_objects:
+      return custom_objects[obj]
+    return obj
+
+  f = h5py.File(filepath, mode='r')
+
+  # instantiate model
+  model_config = f.attrs.get('model_config')
+  if model_config is None:
+    raise ValueError('No model found in config file.')
+  model_config = json.loads(model_config.decode('utf-8'))
+  model = model_from_config(model_config, custom_objects=custom_objects)
+
+  # set weights
+  topology.load_weights_from_hdf5_group(f['model_weights'], model.layers)
+
+  # instantiate optimizer
+  training_config = f.attrs.get('training_config')
+  if training_config is None:
+    warnings.warn('No training configuration found in save file: '
+                  'the model was *not* compiled. Compile it manually.')
+    f.close()
+    return model
+  training_config = json.loads(training_config.decode('utf-8'))
+  optimizer_config = training_config['optimizer_config']
+  optimizer = optimizers.deserialize(
+      optimizer_config, custom_objects=custom_objects)
+
+  # Recover loss functions and metrics.
+  loss = convert_custom_objects(training_config['loss'])
+  metrics = convert_custom_objects(training_config['metrics'])
+  sample_weight_mode = training_config['sample_weight_mode']
+  loss_weights = training_config['loss_weights']
+
+  # Compile model.
+  model.compile(
+      optimizer=optimizer,
+      loss=loss,
+      metrics=metrics,
+      loss_weights=loss_weights,
+      sample_weight_mode=sample_weight_mode)
+
+  # Set optimizer weights.
+  if 'optimizer_weights' in f:
+    # Build train function (to get weight updates).
+    if isinstance(model, Sequential):
+      model.model._make_train_function()
+    else:
+      model._make_train_function()
+    optimizer_weights_group = f['optimizer_weights']
+    optimizer_weight_names = [
+        n.decode('utf8') for n in optimizer_weights_group.attrs['weight_names']
+    ]
+    optimizer_weight_values = [
+        optimizer_weights_group[n] for n in optimizer_weight_names
+    ]
+    model.optimizer.set_weights(optimizer_weight_values)
+  f.close()
+  return model
+
+
+def model_from_config(config, custom_objects=None):
+  """Instantiates a Keras model from its config.
+
+  Arguments:
+      config: Configuration dictionary.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+  """
+  if isinstance(config, list):
+    raise TypeError('`model_fom_config` expects a dictionary, not a list. '
+                    'Maybe you meant to use '
+                    '`Sequential.from_config(config)`?')
+  return layer_module.deserialize(config, custom_objects=custom_objects)
+
+
+def model_from_yaml(yaml_string, custom_objects=None):
+  """Parses a yaml model configuration file and returns a model instance.
+
+  Arguments:
+      yaml_string: YAML string encoding a model configuration.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+
+  Raises:
+      ImportError: if yaml module is not found.
+  """
+  if yaml is None:
+    raise ImportError('Requires yaml module installed.')
+  config = yaml.load(yaml_string)
+  return layer_module.deserialize(config, custom_objects=custom_objects)
+
+
+def model_from_json(json_string, custom_objects=None):
+  """Parses a JSON model configuration file and returns a model instance.
+
+  Arguments:
+      json_string: JSON string encoding a model configuration.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+  """
+  config = json.loads(json_string)
+  return layer_module.deserialize(config, custom_objects=custom_objects)
+
+
+class Sequential(Model):
+  """Linear stack of layers.
+
+  Arguments:
+      layers: list of layers to add to the model.
+
+  # Note
+      The first layer passed to a Sequential model
+      should have a defined input shape. What that
+      means is that it should have received an `input_shape`
+      or `batch_input_shape` argument,
+      or for some type of layers (recurrent, Dense...)
+      an `input_dim` argument.
+
+  Example:
+
+      ```python
+          model = Sequential()
+          # first layer must have a defined input shape
+          model.add(Dense(32, input_dim=500))
+          # afterwards, Keras does automatic shape inference
+          model.add(Dense(32))
+
+          # also possible (equivalent to the above):
+          model = Sequential()
+          model.add(Dense(32, input_shape=(500,)))
+          model.add(Dense(32))
+
+          # also possible (equivalent to the above):
+          model = Sequential()
+          # here the batch dimension is None,
+          # which means any batch size will be accepted by the model.
+          model.add(Dense(32, batch_input_shape=(None, 500)))
+          model.add(Dense(32))
+      ```
+  """
+
+  def __init__(self, layers=None, name=None):
+    self.layers = []  # Stack of layers.
+    self.model = None  # Internal Model instance.
+    self.inputs = []  # List of input tensors
+    self.outputs = []  # List of length 1: the output tensor (unique).
+    self._trainable = True
+    self._initial_weights = None
+
+    # Model attributes.
+    self.inbound_nodes = []
+    self.outbound_nodes = []
+    self.built = False
+
+    # Set model name.
+    if not name:
+      prefix = 'sequential_'
+      name = prefix + str(K.get_uid(prefix))
+    self.name = name
+
+    # Add to the model any layers passed to the constructor.
+    if layers:
+      for layer in layers:
+        self.add(layer)
+
+  def add(self, layer):
+    """Adds a layer instance on top of the layer stack.
+
+    Arguments:
+        layer: layer instance.
+
+    Raises:
+        TypeError: If `layer` is not a layer instance.
+        ValueError: In case the `layer` argument does not
+            know its input shape.
+        ValueError: In case the `layer` argument has
+            multiple output tensors, or is already connected
+            somewhere else (forbidden in `Sequential` models).
+    """
+    if not isinstance(layer, Layer):
+      raise TypeError('The added layer must be '
+                      'an instance of class Layer. '
+                      'Found: ' + str(layer))
+    if not self.outputs:
+      # first layer in model: check that it is an input layer
+      if not layer.inbound_nodes:
+        # create an input layer
+        if not hasattr(layer, 'batch_input_shape'):
+          raise ValueError('The first layer in a '
+                           'Sequential model must '
+                           'get an `input_shape` or '
+                           '`batch_input_shape` argument.')
+        # Instantiate the input layer.
+        x = Input(
+            batch_shape=layer.batch_input_shape,
+            dtype=layer.dtype,
+            name=layer.name + '_input')
+        # This will build the current layer
+        # and create the node connecting the current layer
+        # to the input layer we just created.
+        layer(x)
+
+      if len(layer.inbound_nodes) != 1:
+        raise ValueError('A layer added to a Sequential model must '
+                         'not already be connected somewhere else. '
+                         'Model received layer ' + layer.name + ' which has ' +
+                         str(len(layer.inbound_nodes)) +
+                         ' pre-existing inbound connections.')
+
+      if len(layer.inbound_nodes[0].output_tensors) != 1:
+        raise ValueError('All layers in a Sequential model '
+                         'should have a single output tensor. '
+                         'For multi-output layers, '
+                         'use the functional API.')
+
+      self.outputs = [layer.inbound_nodes[0].output_tensors[0]]
+      self.inputs = topology.get_source_inputs(self.outputs[0])
+
+      # We create an input node, which we will keep updated
+      # as we add more layers
+      topology.Node(
+          outbound_layer=self,
+          inbound_layers=[],
+          node_indices=[],
+          tensor_indices=[],
+          input_tensors=self.inputs,
+          output_tensors=self.outputs,
+          # no model-level masking for now
+          input_masks=[None for _ in self.inputs],
+          output_masks=[None])
+    else:
+      output_tensor = layer(self.outputs[0])
+      if isinstance(output_tensor, list):
+        raise TypeError('All layers in a Sequential model '
+                        'should have a single output tensor. '
+                        'For multi-output layers, '
+                        'use the functional API.')
+      self.outputs = [output_tensor]
+      # update self.inbound_nodes
+      self.inbound_nodes[0].output_tensors = self.outputs
+      self.inbound_nodes[0].output_shapes = [K.int_shape(self.outputs[0])]
+
+    self.layers.append(layer)
+    self.built = False
+
+  def pop(self):
+    """Removes the last layer in the model.
+
+    Raises:
+        TypeError: if there are no layers in the model.
+    """
+    if not self.layers:
+      raise TypeError('There are no layers in the model.')
+
+    self.layers.pop()
+    if not self.layers:
+      self.outputs = []
+      self.inbound_nodes = []
+      self.outbound_nodes = []
+    else:
+      self.layers[-1].outbound_nodes = []
+      self.outputs = [self.layers[-1].output]
+      # update self.inbound_nodes
+      self.inbound_nodes[0].output_tensors = self.outputs
+      self.inbound_nodes[0].output_shapes = [K.int_shape(self.outputs[0])]
+    self.built = False
+
+  def get_layer(self, name=None, index=None):
+    """Retrieve a layer that is part of the model.
+
+    Returns a layer based on either its name (unique)
+    or its index in the graph. Indices are based on
+    order of horizontal graph traversal (bottom-up).
+
+    Arguments:
+        name: string, name of layer.
+        index: integer, index of layer.
+
+    Returns:
+        A layer instance.
+    """
+    if self.model is None:
+      self.build()
+    return self.model.get_layer(name, index)
+
+  def call(self, inputs, mask=None):
+    if self.model is None:
+      self.build()
+    return self.model.call(inputs, mask)
+
+  def build(self, input_shape=None):
+    if not self.inputs or not self.outputs:
+      raise TypeError('Sequential model cannot be built: model is empty.'
+                      ' Add some layers first.')
+    # actually create the model
+    self.model = Model(self.inputs, self.outputs[0], name=self.name + '_model')
+    self.model.trainable = self.trainable
+
+    # mirror model attributes
+    self.supports_masking = self.model.supports_masking
+    self._output_mask_cache = self.model._output_mask_cache
+    self._output_tensor_cache = self.model._output_tensor_cache
+    self._output_shape_cache = self.model._output_shape_cache
+    self.input_layers = self.model.input_layers
+    self.input_layers_node_indices = self.model.input_layers_node_indices
+    self.input_layers_tensor_indices = self.model.input_layers_tensor_indices
+    self.output_layers = self.model.output_layers
+    self.output_layers_node_indices = self.model.output_layers_node_indices
+    self.output_layers_tensor_indices = self.model.output_layers_tensor_indices
+    self.nodes_by_depth = self.model.nodes_by_depth
+    self.container_nodes = self.model.container_nodes
+    self.output_names = self.model.output_names
+    self.input_names = self.model.input_names
+    self._feed_input_names = self.model._feed_input_names
+    self._feed_inputs = self.model._feed_inputs
+
+    # Make sure child model callbacks
+    # will call the parent Sequential model.
+    self.model.callback_model = self
+
+    self.built = True
+
+  @property
+  def uses_learning_phase(self):
+    if self.model is None:
+      self.build()
+    return self.model.uses_learning_phase
+
+  def _gather_list_attr(self, attr):
+    all_attrs = []
+    for layer in self.layers:
+      all_attrs += getattr(layer, attr, [])
+    return all_attrs
+
+  @property
+  def trainable(self):
+    return self._trainable
+
+  @trainable.setter
+  def trainable(self, value):
+    if self.model:
+      self.model.trainable = value
+    self._trainable = value
+
+  @property
+  def trainable_weights(self):
+    if not self.trainable:
+      return []
+    return self._gather_list_attr('trainable_weights')
+
+  @property
+  def non_trainable_weights(self):
+    weights = self._gather_list_attr('non_trainable_weights')
+    if not self.trainable:
+      trainable_weights = self._gather_list_attr('trainable_weights')
+      return trainable_weights + weights
+    return weights
+
+  @property
+  def updates(self):
+    if self.model is None:
+      self.build()
+    return self.model.updates
+
+  @property
+  def state_updates(self):
+    if self.model is None:
+      self.build()
+    return self.model.state_updates
+
+  def get_updates_for(self, inputs):
+    if self.model is None:
+      self.build()
+    return self.model.get_updates_for(inputs)
+
+  @property
+  def losses(self):
+    if self.model is None:
+      self.build()
+    return self.model.losses
+
+  def get_losses_for(self, inputs):
+    if self.model is None:
+      self.build()
+    return self.model.get_losses_for(inputs)
+
+  @property
+  def regularizers(self):
+    if self.model is None:
+      self.build()
+    return self.model.regularizers
+
+  @property
+  def constraints(self):
+    if self.model is None:
+      self.build()
+    return self.model.constraints
+
+  def get_weights(self):
+    """Retrieves the weights of the model.
+
+    Returns:
+        A flat list of Numpy arrays
+        (one array per model weight).
+    """
+    if self.model is None:
+      self.build()
+    return self.model.get_weights()
+
+  def set_weights(self, weights):
+    """Sets the weights of the model.
+
+    Arguments:
+        weights: Should be a list
+            of Numpy arrays with shapes and types matching
+            the output of `model.get_weights()`.
+    """
+    if self.model is None:
+      self.build()
+    self.model.set_weights(weights)
+
+  def load_weights(self, filepath, by_name=False):
+    if h5py is None:
+      raise ImportError('`load_weights` requires h5py.')
+    f = h5py.File(filepath, mode='r')
+    if 'layer_names' not in f.attrs and 'model_weights' in f:
+      f = f['model_weights']
+    layers = self.layers
+    if by_name:
+      topology.load_weights_from_hdf5_group_by_name(f, layers)
+    else:
+      topology.load_weights_from_hdf5_group(f, layers)
+    if hasattr(f, 'close'):
+      f.close()
+
+  def save_weights(self, filepath, overwrite=True):
+    if h5py is None:
+      raise ImportError('`save_weights` requires h5py.')
+    # If file exists and should not be overwritten:
+    if not overwrite and os.path.isfile(filepath):
+      proceed = ask_to_proceed_with_overwrite(filepath)
+      if not proceed:
+        return
+    layers = self.layers
+    f = h5py.File(filepath, 'w')
+    topology.save_weights_to_hdf5_group(f, layers)
+    f.flush()
+    f.close()
+
+  def compile(self,
+              optimizer,
+              loss,
+              metrics=None,
+              sample_weight_mode=None,
+              **kwargs):
+    """Configures the learning process.
+
+    Arguments:
+        optimizer: str (name of optimizer) or optimizer object.
+            See [optimizers](/optimizers).
+        loss: str (name of objective function) or objective function.
+            See [objectives](/objectives).
+        metrics: list of metrics to be evaluated by the model
+            during training and testing.
+            Typically you will use `metrics=['accuracy']`.
+            See [metrics](/metrics).
+        sample_weight_mode: if you need to do timestep-wise
+            sample weighting (2D weights), set this to "temporal".
+            "None" defaults to sample-wise weights (1D).
+        **kwargs: for Theano backend, these are passed into K.function.
+            Ignored for Tensorflow backend.
+
+    Example:
+        ```python
+            model = Sequential()
+            model.add(Dense(32, input_shape=(500,)))
+            model.add(Dense(10, activation='softmax'))
+            model.compile(optimizer='rmsprop',
+                          loss='categorical_crossentropy',
+                          metrics=['accuracy'])
+        ```
+    """
+    # create the underlying model
+    self.build()
+    # call compile method of Model class
+    self.model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        sample_weight_mode=sample_weight_mode,
+        **kwargs)
+    self.optimizer = self.model.optimizer
+    self.loss = self.model.loss
+    self.loss_weights = self.model.loss_weights
+    self.metrics = self.model.metrics
+    self.metrics_tensors = self.model.metrics_tensors
+    self.metrics_names = self.model.metrics_names
+    self.sample_weight_mode = self.model.sample_weight_mode
+
+  def fit(self,
+          x,
+          y,
+          batch_size=32,
+          epochs=10,
+          verbose=1,
+          callbacks=None,
+          validation_split=0.,
+          validation_data=None,
+          shuffle=True,
+          class_weight=None,
+          sample_weight=None,
+          initial_epoch=0):
+    """Trains the model for a fixed number of epochs.
+
+    Arguments:
+        x: input data, as a Numpy array or list of Numpy arrays
+            (if the model has multiple inputs).
+        y: labels, as a Numpy array.
+        batch_size: integer. Number of samples per gradient update.
+        epochs: integer, the number of epochs to train the model.
+        verbose: 0 for no logging to stdout,
+            1 for progress bar logging, 2 for one log line per epoch.
+        callbacks: list of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during training.
+            See [callbacks](/callbacks).
+        validation_split: float (0. < x < 1).
+            Fraction of the data to use as held-out validation data.
+        validation_data: tuple (x_val, y_val) or tuple
+            (x_val, y_val, val_sample_weights) to be used as held-out
+            validation data. Will override validation_split.
+        shuffle: boolean or str (for 'batch').
+            Whether to shuffle the samples at each epoch.
+            'batch' is a special option for dealing with the
+            limitations of HDF5 data; it shuffles in batch-sized chunks.
+        class_weight: dictionary mapping classes to a weight value,
+            used for scaling the loss function (during training only).
+        sample_weight: Numpy array of weights for
+            the training samples, used for scaling the loss function
+            (during training only). You can either pass a flat (1D)
+            Numpy array with the same length as the input samples
+            (1:1 mapping between weights and samples),
+            or in the case of temporal data,
+            you can pass a 2D array with shape (samples, sequence_length),
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            sample_weight_mode="temporal" in compile().
+        initial_epoch: epoch at which to start training
+            (useful for resuming a previous training run)
+
+    Returns:
+        A `History` object. Its `History.history` attribute is
+        a record of training loss values and metrics values
+        at successive epochs, as well as validation loss values
+        and validation metrics values (if applicable).
+
+    Raises:
+        RuntimeError: if the model was never compiled.
+    """
+    if self.model is None:
+      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+    return self.model.fit(
+        x,
+        y,
+        batch_size=batch_size,
+        epochs=epochs,
+        verbose=verbose,
+        callbacks=callbacks,
+        validation_split=validation_split,
+        validation_data=validation_data,
+        shuffle=shuffle,
+        class_weight=class_weight,
+        sample_weight=sample_weight,
+        initial_epoch=initial_epoch)
+
+  def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):
+    """Computes the loss on some input data, batch by batch.
+
+    Arguments:
+        x: input data, as a Numpy array or list of Numpy arrays
+            (if the model has multiple inputs).
+        y: labels, as a Numpy array.
+        batch_size: integer. Number of samples per gradient update.
+        verbose: verbosity mode, 0 or 1.
+        sample_weight: sample weights, as a Numpy array.
+
+    Returns:
+        Scalar test loss (if the model has no metrics)
+        or list of scalars (if the model computes other metrics).
+        The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+
+    Raises:
+        RuntimeError: if the model was never compiled.
+    """
+    if self.model is None:
+      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+    return self.model.evaluate(
+        x,
+        y,
+        batch_size=batch_size,
+        verbose=verbose,
+        sample_weight=sample_weight)
+
+  def predict(self, x, batch_size=32, verbose=0):
+    """Generates output predictions for the input samples.
+
+    The input samples are processed batch by batch.
+
+    Arguments:
+        x: the input data, as a Numpy array.
+        batch_size: integer.
+        verbose: verbosity mode, 0 or 1.
+
+    Returns:
+        A Numpy array of predictions.
+    """
+    if self.model is None:
+      self.build()
+    return self.model.predict(x, batch_size=batch_size, verbose=verbose)
+
+  def predict_on_batch(self, x):
+    """Returns predictions for a single batch of samples.
+
+    Arguments:
+        x: input data, as a Numpy array or list of Numpy arrays
+            (if the model has multiple inputs).
+
+    Returns:
+        A Numpy array of predictions.
+    """
+    if self.model is None:
+      self.build()
+    return self.model.predict_on_batch(x)
+
+  def train_on_batch(self, x, y, class_weight=None, sample_weight=None):
+    """Single gradient update over one batch of samples.
+
+    Arguments:
+        x: input data, as a Numpy array or list of Numpy arrays
+            (if the model has multiple inputs).
+        y: labels, as a Numpy array.
+        class_weight: dictionary mapping classes to a weight value,
+            used for scaling the loss function (during training only).
+        sample_weight: sample weights, as a Numpy array.
+
+    Returns:
+        Scalar training loss (if the model has no metrics)
+        or list of scalars (if the model computes other metrics).
+        The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+
+    Raises:
+        RuntimeError: if the model was never compiled.
+    """
+    if self.model is None:
+      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+    return self.model.train_on_batch(
+        x, y, sample_weight=sample_weight, class_weight=class_weight)
+
+  def test_on_batch(self, x, y, sample_weight=None):
+    """Evaluates the model over a single batch of samples.
+
+    Arguments:
+        x: input data, as a Numpy array or list of Numpy arrays
+            (if the model has multiple inputs).
+        y: labels, as a Numpy array.
+        sample_weight: sample weights, as a Numpy array.
+
+    Returns:
+        Scalar test loss (if the model has no metrics)
+        or list of scalars (if the model computes other metrics).
+        The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+
+    Raises:
+        RuntimeError: if the model was never compiled.
+    """
+    if self.model is None:
+      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+    return self.model.test_on_batch(x, y, sample_weight=sample_weight)
+
+  def predict_proba(self, x, batch_size=32, verbose=1):
+    """Generates class probability predictions for the input samples.
+
+    The input samples are processed batch by batch.
+
+    Arguments:
+        x: input data, as a Numpy array or list of Numpy arrays
+            (if the model has multiple inputs).
+        batch_size: integer.
+        verbose: verbosity mode, 0 or 1.
+
+    Returns:
+        A Numpy array of probability predictions.
+    """
+    preds = self.predict(x, batch_size, verbose)
+    if preds.min() < 0. or preds.max() > 1.:
+      warnings.warn('Network returning invalid probability values. '
+                    'The last layer might not normalize predictions '
+                    'into probabilities '
+                    '(like softmax or sigmoid would).')
+    return preds
+
+  def predict_classes(self, x, batch_size=32, verbose=1):
+    """Generate class predictions for the input samples.
+
+    The input samples are processed batch by batch.
+
+    Arguments:
+        x: input data, as a Numpy array or list of Numpy arrays
+            (if the model has multiple inputs).
+        batch_size: integer.
+        verbose: verbosity mode, 0 or 1.
+
+    Returns:
+        A numpy array of class predictions.
+    """
+    proba = self.predict(x, batch_size=batch_size, verbose=verbose)
+    if proba.shape[-1] > 1:
+      return proba.argmax(axis=-1)
+    else:
+      return (proba > 0.5).astype('int32')
+
+  def fit_generator(self,
+                    generator,
+                    steps_per_epoch,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    validation_data=None,
+                    validation_steps=None,
+                    class_weight=None,
+                    max_q_size=10,
+                    workers=1,
+                    pickle_safe=False,
+                    initial_epoch=0):
+    """Fits the model on data generated batch-by-batch by a Python generator.
+
+    The generator is run in parallel to the model, for efficiency.
+    For instance, this allows you to do real-time data augmentation
+    on images on CPU in parallel to training your model on GPU.
+
+    Arguments:
+        generator: A generator.
+            The output of the generator must be either
+            - a tuple (inputs, targets)
+            - a tuple (inputs, targets, sample_weights).
+            All arrays should contain the same number of samples.
+            The generator is expected to loop over its data
+            indefinitely. An epoch finishes when `samples_per_epoch`
+            samples have been seen by the model.
+        steps_per_epoch: Total number of steps (batches of samples)
+            to yield from `generator` before declaring one epoch
+            finished and starting the next epoch. It should typically
+            be equal to the number of unique samples if your dataset
+            divided by the batch size.
+        epochs: Integer, total number of iterations on the data.
+        verbose: Verbosity mode, 0, 1, or 2.
+        callbacks: List of callbacks to be called during training.
+        validation_data: This can be either
+            - A generator for the validation data
+            - A tuple (inputs, targets)
+            - A tuple (inputs, targets, sample_weights).
+        validation_steps: Only relevant if `validation_data`
+            is a generator.
+            Number of samples to use from validation generator
+            at the end of every epoch.
+        class_weight: Dictionary mapping class indices to a weight
+            for the class.
+        max_q_size: Maximum size for the generator queue
+        workers: Maximum number of processes to spin up
+        pickle_safe: Ff True, use process based threading.
+            Note that because
+            this implementation relies on multiprocessing,
+            you should not pass
+            non picklable arguments to the generator
+            as they can't be passed
+            easily to children processes.
+        initial_epoch: Epoch at which to start training
+            (useful for resuming a previous training run)
+
+    Returns:
+        A `History` object.
+
+    Raises:
+        RuntimeError: if the model was never compiled.
+
+    Example:
+
+    ```python
+        def generate_arrays_from_file(path):
+            while 1:
+                f = open(path)
+                for line in f:
+                    # create Numpy arrays of input data
+                    # and labels, from each line in the file
+                    x, y = process_line(line)
+                    yield (x, y)
+                f.close()
+
+        model.fit_generator(generate_arrays_from_file('/my_file.txt'),
+                            samples_per_epoch=10000, epochs=10)
+    ```
+    """
+    if self.model is None:
+      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+    return self.model.fit_generator(
+        generator,
+        steps_per_epoch,
+        epochs,
+        verbose=verbose,
+        callbacks=callbacks,
+        validation_data=validation_data,
+        validation_steps=validation_steps,
+        class_weight=class_weight,
+        max_q_size=max_q_size,
+        workers=workers,
+        pickle_safe=pickle_safe,
+        initial_epoch=initial_epoch)
+
+  def evaluate_generator(self,
+                         generator,
+                         steps,
+                         max_q_size=10,
+                         workers=1,
+                         pickle_safe=False):
+    """Evaluates the model on a data generator.
+
+    The generator should return the same kind of data
+    as accepted by `test_on_batch`.
+
+    Arguments:
+        generator: Generator yielding tuples (inputs, targets)
+            or (inputs, targets, sample_weights)
+        steps: Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+        max_q_size: maximum size for the generator queue
+        workers: maximum number of processes to spin up
+        pickle_safe: if True, use process based threading.
+            Note that because this implementation
+            relies on multiprocessing, you should not pass
+            non picklable arguments to the generator
+            as they can't be passed easily to children processes.
+
+    Returns:
+        Scalar test loss (if the model has no metrics)
+        or list of scalars (if the model computes other metrics).
+        The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+
+    Raises:
+        RuntimeError: if the model was never compiled.
+    """
+    if self.model is None:
+      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+    return self.model.evaluate_generator(
+        generator,
+        steps,
+        max_q_size=max_q_size,
+        workers=workers,
+        pickle_safe=pickle_safe)
+
+  def predict_generator(self,
+                        generator,
+                        steps,
+                        max_q_size=10,
+                        workers=1,
+                        pickle_safe=False):
+    """Generates predictions for the input samples from a data generator.
+
+    The generator should return the same kind of data as accepted by
+    `predict_on_batch`.
+
+    Arguments:
+        generator: generator yielding batches of input samples.
+        steps: Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+        max_q_size: maximum size for the generator queue
+        workers: maximum number of processes to spin up
+        pickle_safe: if True, use process based threading.
+            Note that because this implementation
+            relies on multiprocessing, you should not pass
+            non picklable arguments to the generator
+            as they can't be passed easily to children processes.
+
+    Returns:
+        A Numpy array of predictions.
+    """
+    if self.model is None:
+      self.build()
+    return self.model.predict_generator(
+        generator,
+        steps,
+        max_q_size=max_q_size,
+        workers=workers,
+        pickle_safe=pickle_safe)
+
+  def get_config(self):
+    config = []
+    for layer in self.layers:
+      config.append({
+          'class_name': layer.__class__.__name__,
+          'config': layer.get_config()
+      })
+    return copy.deepcopy(config)
+
+  @classmethod
+  def from_config(cls, config):
+    model = cls()
+    for conf in config:
+      layer = layer_module.deserialize(conf)
+      model.add(layer)
+    return model
diff --git a/tensorflow/contrib/keras/python/keras/models_test.py b/tensorflow/contrib/keras/python/keras/models_test.py
new file mode 100644
index 0000000000..50aba43c24
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/models_test.py
@@ -0,0 +1,193 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training routines."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+try:
+  import h5py  # pylint:disable=g-import-not-at-top
+except ImportError:
+  h5py = None
+
+
+class TestModelSaving(test.TestCase):
+
+  def test_sequential_model_saving(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.RepeatVector(3))
+      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+      model.compile(loss=keras.losses.MSE,
+                    optimizer=keras.optimizers.RMSprop(lr=0.0001),
+                    metrics=[keras.metrics.categorical_accuracy],
+                    sample_weight_mode='temporal')
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3, 3))
+      model.train_on_batch(x, y)
+
+      out = model.predict(x)
+      _, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+
+      new_model = keras.models.load_model(fname)
+      os.remove(fname)
+
+      out2 = new_model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+      # test that new updates are the same with both models
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3, 3))
+      model.train_on_batch(x, y)
+      new_model.train_on_batch(x, y)
+      out = model.predict(x)
+      out2 = new_model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+  def test_sequential_model_saving_2(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      # test with custom optimizer, loss
+
+      class CustomOp(keras.optimizers.RMSprop):
+        pass
+
+      def custom_loss(y_true, y_pred):
+        return keras.losses.mse(y_true, y_pred)
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.Dense(3))
+      model.compile(loss=custom_loss, optimizer=CustomOp(), metrics=['acc'])
+
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+      model.train_on_batch(x, y)
+
+      out = model.predict(x)
+      _, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+
+      model = keras.models.load_model(
+          fname,
+          custom_objects={'CustomOp': CustomOp,
+                          'custom_loss': custom_loss})
+      os.remove(fname)
+
+      out2 = model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+  def test_fuctional_model_saving(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      inputs = keras.layers.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      output = keras.layers.Dense(3)(x)
+
+      model = keras.models.Model(inputs, output)
+      model.compile(loss=keras.losses.MSE,
+                    optimizer=keras.optimizers.RMSprop(lr=0.0001),
+                    metrics=[keras.metrics.categorical_accuracy])
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+      model.train_on_batch(x, y)
+
+      out = model.predict(x)
+      _, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+
+      model = keras.models.load_model(fname)
+      os.remove(fname)
+
+      out2 = model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+  def test_saving_without_compilation(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.Dense(3))
+      model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+
+      _, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+      model = keras.models.load_model(fname)
+      os.remove(fname)
+
+  def test_saving_right_after_compilation(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.Dense(3))
+      model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+      model.model._make_train_function()
+
+      _, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+      model = keras.models.load_model(fname)
+      os.remove(fname)
+
+
+class TestSequential(test.TestCase):
+  """Most Sequential model API tests are covered in `training_test.py`.
+  """
+
+  def test_sequential_pop(self):
+    num_hidden = 5
+    input_dim = 3
+    batch_size = 5
+    num_classes = 2
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
+      model.add(keras.layers.Dense(num_classes))
+      model.compile(loss='mse', optimizer='sgd')
+      x = np.random.random((batch_size, input_dim))
+      y = np.random.random((batch_size, num_classes))
+      model.fit(x, y, epochs=1)
+      model.pop()
+      self.assertEqual(len(model.layers), 1)
+      self.assertEqual(model.output_shape, (None, num_hidden))
+      model.compile(loss='mse', optimizer='sgd')
+      y = np.random.random((batch_size, num_hidden))
+      model.fit(x, y, epochs=1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/optimizers.py b/tensorflow/contrib/keras/python/keras/optimizers.py
new file mode 100644
index 0000000000..b50c18b0e1
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/optimizers.py
@@ -0,0 +1,757 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras optimizer classes (will eventually be replaced with core optimizers).
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.training import optimizer as tf_optimizer_module
+
+
+def clip_norm(g, c, n):
+  if c > 0:
+    g = K.switch(n >= c, g * c / n, g)
+  return g
+
+
+class Optimizer(object):
+  """Abstract optimizer base class.
+
+  Note: this is the parent class of all optimizers, not an actual optimizer
+  that can be used for training models.
+
+  All Keras optimizers support the following keyword arguments:
+
+      clipnorm: float >= 0. Gradients will be clipped
+          when their L2 norm exceeds this value.
+      clipvalue: float >= 0. Gradients will be clipped
+          when their absolute value exceeds this value.
+  """
+
+  def __init__(self, **kwargs):
+    allowed_kwargs = {'clipnorm', 'clipvalue'}
+    for k in kwargs:
+      if k not in allowed_kwargs:
+        raise TypeError('Unexpected keyword argument '
+                        'passed to optimizer: ' + str(k))
+    self.__dict__.update(kwargs)
+    self.updates = []
+    self.weights = []
+
+  def get_updates(self, params, constraints, loss):
+    raise NotImplementedError
+
+  def get_gradients(self, loss, params):
+    grads = K.gradients(loss, params)
+    if hasattr(self, 'clipnorm') and self.clipnorm > 0:
+      norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
+      grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
+    if hasattr(self, 'clipvalue') and self.clipvalue > 0:
+      grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
+    return grads
+
+  def set_weights(self, weights):
+    """Sets the weights of the optimizer, from Numpy arrays.
+
+    Should only be called after computing the gradients
+    (otherwise the optimizer has no weights).
+
+    Arguments:
+        weights: a list of Numpy arrays. The number
+            of arrays and their shape must match
+            number of the dimensions of the weights
+            of the optimizer (i.e. it should match the
+            output of `get_weights`).
+
+    Raises:
+        ValueError: in case of incompatible weight shapes.
+    """
+    params = self.weights
+    weight_value_tuples = []
+    param_values = K.batch_get_value(params)
+    for pv, p, w in zip(param_values, params, weights):
+      if pv.shape != w.shape:
+        raise ValueError('Optimizer weight shape ' + str(pv.shape) +
+                         ' not compatible with '
+                         'provided weight shape ' + str(w.shape))
+      weight_value_tuples.append((p, w))
+    K.batch_set_value(weight_value_tuples)
+
+  def get_weights(self):
+    """Returns the current value of the weights of the optimizer.
+
+    Returns:
+        A list of numpy arrays.
+    """
+    return K.batch_get_value(self.weights)
+
+  def get_config(self):
+    config = {}
+    if hasattr(self, 'clipnorm'):
+      config['clipnorm'] = self.clipnorm
+    if hasattr(self, 'clipvalue'):
+      config['clipvalue'] = self.clipvalue
+    return config
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
+
+
+class SGD(Optimizer):
+  """Stochastic gradient descent optimizer.
+
+  Includes support for momentum,
+  learning rate decay, and Nesterov momentum.
+
+  Arguments:
+      lr: float >= 0. Learning rate.
+      momentum: float >= 0. Parameter updates momentum.
+      decay: float >= 0. Learning rate decay over each update.
+      nesterov: boolean. Whether to apply Nesterov momentum.
+  """
+
+  def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs):
+    super(SGD, self).__init__(**kwargs)
+    self.iterations = K.variable(0., name='iterations')
+    self.lr = K.variable(lr, name='lr')
+    self.momentum = K.variable(momentum, name='momentum')
+    self.decay = K.variable(decay, name='decay')
+    self.initial_decay = decay
+    self.nesterov = nesterov
+
+  def get_updates(self, params, constraints, loss):
+    grads = self.get_gradients(loss, params)
+    self.updates = []
+
+    lr = self.lr
+    if self.initial_decay > 0:
+      lr *= (1. / (1. + self.decay * self.iterations))
+      self.updates.append(K.update_add(self.iterations, 1))
+
+    # momentum
+    shapes = [K.int_shape(p) for p in params]
+    moments = [K.zeros(shape) for shape in shapes]
+    self.weights = [self.iterations] + moments
+    for p, g, m in zip(params, grads, moments):
+      v = self.momentum * m - lr * g  # velocity
+      self.updates.append(K.update(m, v))
+
+      if self.nesterov:
+        new_p = p + self.momentum * v - lr * g
+      else:
+        new_p = p + v
+
+      # apply constraints
+      if p in constraints:
+        c = constraints[p]
+        new_p = c(new_p)
+
+      self.updates.append(K.update(p, new_p))
+    return self.updates
+
+  def get_config(self):
+    config = {
+        'lr': float(K.get_value(self.lr)),
+        'momentum': float(K.get_value(self.momentum)),
+        'decay': float(K.get_value(self.decay)),
+        'nesterov': self.nesterov
+    }
+    base_config = super(SGD, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class RMSprop(Optimizer):
+  # pylint: disable=line-too-long
+  """RMSProp optimizer.
+
+  It is recommended to leave the parameters of this optimizer
+  at their default values
+  (except the learning rate, which can be freely tuned).
+
+  This optimizer is usually a good choice for recurrent
+  neural networks.
+
+  Arguments:
+      lr: float >= 0. Learning rate.
+      rho: float >= 0.
+      epsilon: float >= 0. Fuzz factor.
+      decay: float >= 0. Learning rate decay over each update.
+
+  References:
+      - [rmsprop: Divide the gradient by a running average of its recent
+        magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
+  """
+
+  # pylint: enable=line-too-long
+
+  def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0., **kwargs):
+    super(RMSprop, self).__init__(**kwargs)
+    self.lr = K.variable(lr, name='lr')
+    self.rho = K.variable(rho, name='rho')
+    self.epsilon = epsilon
+    self.decay = K.variable(decay, name='decay')
+    self.initial_decay = decay
+    self.iterations = K.variable(0., name='iterations')
+
+  def get_updates(self, params, constraints, loss):
+    grads = self.get_gradients(loss, params)
+    shapes = [K.int_shape(p) for p in params]
+    accumulators = [K.zeros(shape) for shape in shapes]
+    self.weights = accumulators
+    self.updates = []
+
+    lr = self.lr
+    if self.initial_decay > 0:
+      lr *= (1. / (1. + self.decay * self.iterations))
+      self.updates.append(K.update_add(self.iterations, 1))
+
+    for p, g, a in zip(params, grads, accumulators):
+      # update accumulator
+      new_a = self.rho * a + (1. - self.rho) * K.square(g)
+      self.updates.append(K.update(a, new_a))
+      new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
+
+      # apply constraints
+      if p in constraints:
+        c = constraints[p]
+        new_p = c(new_p)
+      self.updates.append(K.update(p, new_p))
+    return self.updates
+
+  def get_config(self):
+    config = {
+        'lr': float(K.get_value(self.lr)),
+        'rho': float(K.get_value(self.rho)),
+        'decay': float(K.get_value(self.decay)),
+        'epsilon': self.epsilon
+    }
+    base_config = super(RMSprop, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Adagrad(Optimizer):
+  # pylint: disable=line-too-long
+  """Adagrad optimizer.
+
+  It is recommended to leave the parameters of this optimizer
+  at their default values.
+
+  Arguments:
+      lr: float >= 0. Learning rate.
+      epsilon: float >= 0.
+      decay: float >= 0. Learning rate decay over each update.
+
+  References:
+      - [Adaptive Subgradient Methods for Online Learning and Stochastic
+        Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+  """
+
+  # pylint: enable=line-too-long
+
+  def __init__(self, lr=0.01, epsilon=1e-8, decay=0., **kwargs):
+    super(Adagrad, self).__init__(**kwargs)
+    self.lr = K.variable(lr, name='lr')
+    self.epsilon = epsilon
+    self.decay = K.variable(decay, name='decay')
+    self.initial_decay = decay
+    self.iterations = K.variable(0., name='iterations')
+
+  def get_updates(self, params, constraints, loss):
+    grads = self.get_gradients(loss, params)
+    shapes = [K.int_shape(p) for p in params]
+    accumulators = [K.zeros(shape) for shape in shapes]
+    self.weights = accumulators
+    self.updates = []
+
+    lr = self.lr
+    if self.initial_decay > 0:
+      lr *= (1. / (1. + self.decay * self.iterations))
+      self.updates.append(K.update_add(self.iterations, 1))
+
+    for p, g, a in zip(params, grads, accumulators):
+      new_a = a + K.square(g)  # update accumulator
+      self.updates.append(K.update(a, new_a))
+      new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
+      # apply constraints
+      if p in constraints:
+        c = constraints[p]
+        new_p = c(new_p)
+      self.updates.append(K.update(p, new_p))
+    return self.updates
+
+  def get_config(self):
+    config = {
+        'lr': float(K.get_value(self.lr)),
+        'decay': float(K.get_value(self.decay)),
+        'epsilon': self.epsilon
+    }
+    base_config = super(Adagrad, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Adadelta(Optimizer):
+  # pylint: disable=line-too-long
+  """Adadelta optimizer.
+
+  It is recommended to leave the parameters of this optimizer
+  at their default values.
+
+  Arguments:
+      lr: float >= 0. Learning rate.
+          It is recommended to leave it at the default value.
+      rho: float >= 0.
+      epsilon: float >= 0. Fuzz factor.
+      decay: float >= 0. Learning rate decay over each update.
+
+  References:
+      - [Adadelta - an adaptive learning rate
+        method](http://arxiv.org/abs/1212.5701)
+  """
+
+  # pylint: enable=line-too-long
+
+  def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0., **kwargs):
+    super(Adadelta, self).__init__(**kwargs)
+    self.lr = K.variable(lr, name='lr')
+    self.rho = rho
+    self.epsilon = epsilon
+    self.decay = K.variable(decay, name='decay')
+    self.initial_decay = decay
+    self.iterations = K.variable(0., name='iterations')
+
+  def get_updates(self, params, constraints, loss):
+    grads = self.get_gradients(loss, params)
+    shapes = [K.int_shape(p) for p in params]
+    accumulators = [K.zeros(shape) for shape in shapes]
+    delta_accumulators = [K.zeros(shape) for shape in shapes]
+    self.weights = accumulators + delta_accumulators
+    self.updates = []
+
+    lr = self.lr
+    if self.initial_decay > 0:
+      lr *= (1. / (1. + self.decay * self.iterations))
+      self.updates.append(K.update_add(self.iterations, 1))
+
+    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
+      # update accumulator
+      new_a = self.rho * a + (1. - self.rho) * K.square(g)
+      self.updates.append(K.update(a, new_a))
+
+      # use the new accumulator and the *old* delta_accumulator
+      update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
+
+      new_p = p - lr * update
+      # apply constraints
+      if p in constraints:
+        c = constraints[p]
+        new_p = c(new_p)
+      self.updates.append(K.update(p, new_p))
+
+      # update delta_accumulator
+      new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
+      self.updates.append(K.update(d_a, new_d_a))
+    return self.updates
+
+  def get_config(self):
+    config = {
+        'lr': float(K.get_value(self.lr)),
+        'rho': self.rho,
+        'decay': float(K.get_value(self.decay)),
+        'epsilon': self.epsilon
+    }
+    base_config = super(Adadelta, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Adam(Optimizer):
+  # pylint: disable=line-too-long
+  """Adam optimizer.
+
+  Default parameters follow those provided in the original paper.
+
+  Arguments:
+      lr: float >= 0. Learning rate.
+      beta_1: float, 0 < beta < 1. Generally close to 1.
+      beta_2: float, 0 < beta < 1. Generally close to 1.
+      epsilon: float >= 0. Fuzz factor.
+      decay: float >= 0. Learning rate decay over each update.
+
+  References:
+      - [Adam - A Method for Stochastic
+        Optimization](http://arxiv.org/abs/1412.6980v8)
+  """
+
+  # pylint: enable=line-too-long
+
+  def __init__(self,
+               lr=0.001,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-8,
+               decay=0.,
+               **kwargs):
+    super(Adam, self).__init__(**kwargs)
+    self.iterations = K.variable(0, name='iterations')
+    self.lr = K.variable(lr, name='lr')
+    self.beta_1 = K.variable(beta_1, name='beta_1')
+    self.beta_2 = K.variable(beta_2, name='beta_2')
+    self.epsilon = epsilon
+    self.decay = K.variable(decay, name='decay')
+    self.initial_decay = decay
+
+  def get_updates(self, params, constraints, loss):
+    grads = self.get_gradients(loss, params)
+    self.updates = [K.update_add(self.iterations, 1)]
+
+    lr = self.lr
+    if self.initial_decay > 0:
+      lr *= (1. / (1. + self.decay * self.iterations))
+
+    t = self.iterations + 1
+    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
+                 (1. - K.pow(self.beta_1, t)))
+
+    shapes = [K.int_shape(p) for p in params]
+    ms = [K.zeros(shape) for shape in shapes]
+    vs = [K.zeros(shape) for shape in shapes]
+    self.weights = [self.iterations] + ms + vs
+
+    for p, g, m, v in zip(params, grads, ms, vs):
+      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
+      v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
+      p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
+
+      self.updates.append(K.update(m, m_t))
+      self.updates.append(K.update(v, v_t))
+
+      new_p = p_t
+      # apply constraints
+      if p in constraints:
+        c = constraints[p]
+        new_p = c(new_p)
+      self.updates.append(K.update(p, new_p))
+    return self.updates
+
+  def get_config(self):
+    config = {
+        'lr': float(K.get_value(self.lr)),
+        'beta_1': float(K.get_value(self.beta_1)),
+        'beta_2': float(K.get_value(self.beta_2)),
+        'decay': float(K.get_value(self.decay)),
+        'epsilon': self.epsilon
+    }
+    base_config = super(Adam, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Adamax(Optimizer):
+  # pylint: disable=line-too-long
+  """Adamax optimizer from Adam paper's Section 7.
+
+  It is a variant of Adam based on the infinity norm.
+  Default parameters follow those provided in the paper.
+
+  Arguments:
+      lr: float >= 0. Learning rate.
+      beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
+      epsilon: float >= 0. Fuzz factor.
+      decay: float >= 0. Learning rate decay over each update.
+
+  References:
+      - [Adam - A Method for Stochastic
+        Optimization](http://arxiv.org/abs/1412.6980v8)
+  """
+
+  # pylint: enable=line-too-long
+
+  def __init__(self,
+               lr=0.002,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-8,
+               decay=0.,
+               **kwargs):
+    super(Adamax, self).__init__(**kwargs)
+    self.iterations = K.variable(0., name='iterations')
+    self.lr = K.variable(lr, name='lr')
+    self.beta_1 = K.variable(beta_1, name='beta_1')
+    self.beta_2 = K.variable(beta_2, name='beta_2')
+    self.epsilon = epsilon
+    self.decay = K.variable(decay, name='decay')
+    self.initial_decay = decay
+
+  def get_updates(self, params, constraints, loss):
+    grads = self.get_gradients(loss, params)
+    self.updates = [K.update_add(self.iterations, 1)]
+
+    lr = self.lr
+    if self.initial_decay > 0:
+      lr *= (1. / (1. + self.decay * self.iterations))
+
+    t = self.iterations + 1
+    lr_t = lr / (1. - K.pow(self.beta_1, t))
+
+    shapes = [K.int_shape(p) for p in params]
+    # zero init of 1st moment
+    ms = [K.zeros(shape) for shape in shapes]
+    # zero init of exponentially weighted infinity norm
+    us = [K.zeros(shape) for shape in shapes]
+    self.weights = [self.iterations] + ms + us
+
+    for p, g, m, u in zip(params, grads, ms, us):
+
+      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
+      u_t = K.maximum(self.beta_2 * u, K.abs(g))
+      p_t = p - lr_t * m_t / (u_t + self.epsilon)
+
+      self.updates.append(K.update(m, m_t))
+      self.updates.append(K.update(u, u_t))
+
+      new_p = p_t
+      # apply constraints
+      if p in constraints:
+        c = constraints[p]
+        new_p = c(new_p)
+      self.updates.append(K.update(p, new_p))
+    return self.updates
+
+  def get_config(self):
+    config = {
+        'lr': float(K.get_value(self.lr)),
+        'beta_1': float(K.get_value(self.beta_1)),
+        'beta_2': float(K.get_value(self.beta_2)),
+        'decay': float(K.get_value(self.decay)),
+        'epsilon': self.epsilon
+    }
+    base_config = super(Adamax, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class Nadam(Optimizer):
+  # pylint: disable=line-too-long
+  """Nesterov Adam optimizer.
+
+  Much like Adam is essentially RMSprop with momentum,
+  Nadam is Adam RMSprop with Nesterov momentum.
+
+  Default parameters follow those provided in the paper.
+  It is recommended to leave the parameters of this optimizer
+  at their default values.
+
+  Arguments:
+      lr: float >= 0. Learning rate.
+      beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
+      epsilon: float >= 0. Fuzz factor.
+
+  References:
+      - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
+      - [On the importance of initialization and momentum in deep
+        learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf)
+  """
+
+  # pylint: enable=line-too-long
+
+  def __init__(self,
+               lr=0.002,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-8,
+               schedule_decay=0.004,
+               **kwargs):
+    super(Nadam, self).__init__(**kwargs)
+    self.iterations = K.variable(0., name='iterations')
+    self.m_schedule = K.variable(1., name='m_schedule')
+    self.lr = K.variable(lr, name='lr')
+    self.beta_1 = K.variable(beta_1, name='beta_1')
+    self.beta_2 = K.variable(beta_2, name='beta_2')
+    self.epsilon = epsilon
+    self.schedule_decay = schedule_decay
+
+  def get_updates(self, params, constraints, loss):
+    grads = self.get_gradients(loss, params)
+    self.updates = [K.update_add(self.iterations, 1)]
+
+    t = self.iterations + 1
+
+    # Due to the recommendations in [2], i.e. warming momentum schedule
+    momentum_cache_t = self.beta_1 * (1. - 0.5 *
+                                      (K.pow(0.96, t * self.schedule_decay)))
+    momentum_cache_t_1 = self.beta_1 * (
+        1. - 0.5 * (K.pow(0.96, (t + 1) * self.schedule_decay)))
+    m_schedule_new = self.m_schedule * momentum_cache_t
+    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
+    self.updates.append((self.m_schedule, m_schedule_new))
+
+    shapes = [K.int_shape(p) for p in params]
+    ms = [K.zeros(shape) for shape in shapes]
+    vs = [K.zeros(shape) for shape in shapes]
+
+    self.weights = [self.iterations] + ms + vs
+
+    for p, g, m, v in zip(params, grads, ms, vs):
+      # the following equations given in [1]
+      g_prime = g / (1. - m_schedule_new)
+      m_t = self.beta_1 * m + (1. - self.beta_1) * g
+      m_t_prime = m_t / (1. - m_schedule_next)
+      v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
+      v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
+      m_t_bar = (1. - momentum_cache_t
+                ) * g_prime + momentum_cache_t_1 * m_t_prime
+
+      self.updates.append(K.update(m, m_t))
+      self.updates.append(K.update(v, v_t))
+
+      p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
+      new_p = p_t
+
+      # apply constraints
+      if p in constraints:
+        c = constraints[p]
+        new_p = c(new_p)
+      self.updates.append(K.update(p, new_p))
+    return self.updates
+
+  def get_config(self):
+    config = {
+        'lr': float(K.get_value(self.lr)),
+        'beta_1': float(K.get_value(self.beta_1)),
+        'beta_2': float(K.get_value(self.beta_2)),
+        'epsilon': self.epsilon,
+        'schedule_decay': self.schedule_decay
+    }
+    base_config = super(Nadam, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class TFOptimizer(Optimizer):
+  """Wrapper class for native TensorFlow optimizers.
+  """
+
+  def __init__(self, optimizer):  # pylint: disable=super-init-not-called
+    self.optimizer = optimizer
+    self.iterations = K.variable(0., name='iterations')
+    self.updates = []
+
+  def get_updates(self, params, constraints, loss):
+    if constraints:
+      raise ValueError('TF optimizers do not support '
+                       'weights constraints. Either remove '
+                       'all weights constraints in your model, '
+                       'or use a Keras optimizer.')
+    grads = self.optimizer.compute_gradients(loss, params)
+    opt_update = self.optimizer.apply_gradients(
+        grads, global_step=self.iterations)
+    self.updates.append(opt_update)
+    return self.updates
+
+  @property
+  def weights(self):
+    raise NotImplementedError
+
+  def get_config(self):
+    raise NotImplementedError
+
+  def from_config(self, config):
+    raise NotImplementedError
+
+
+# Aliases.
+
+# pylint: disable=invalid-name
+sgd = SGD
+rmsprop = RMSprop
+adagrad = Adagrad
+adadelta = Adadelta
+adam = Adam
+adamax = Adamax
+nadam = Nadam
+
+# pylint: enable=invalid-name
+
+
+def serialize(optimizer):
+  return serialize_keras_object(optimizer)
+
+
+def deserialize(config, custom_objects=None):
+  """Inverse of the `serialize` function.
+
+  Arguments:
+      config: Optimizer configuration dictionary.
+      custom_objects: Optional dictionary mapping
+          names (strings) to custom objects
+          (classes and functions)
+          to be considered during deserialization.
+
+  Returns:
+      A Keras Optimizer instance.
+  """
+  all_classes = {
+      'sgd': SGD,
+      'rmsprop': RMSprop,
+      'adagrad': Adagrad,
+      'adadelta': Adadelta,
+      'adam': Adam,
+      'adamax': Adamax,
+      'nadam': Nadam,
+      'tfoptimizer': TFOptimizer,
+  }
+  # Make deserialization case-insensitive for built-in optimizers.
+  if config['class_name'].lower() in all_classes:
+    config['class_name'] = config['class_name'].lower()
+  return deserialize_keras_object(
+      config,
+      module_objects=all_classes,
+      custom_objects=custom_objects,
+      printable_module_name='optimizer')
+
+
+def get(identifier):
+  """Retrieves a Keras Optimizer instance.
+
+  Arguments:
+      identifier: Optimizer identifier, one of
+          - String: name of an optimizer
+          - Dictionary: configuration dictionary.
+          - Keras Optimizer instance (it will be returned unchanged).
+          - TensorFlow Optimizer instance
+              (it will be wrapped as a Keras Optimizer).
+
+  Returns:
+      A Keras Optimizer instance.
+
+  Raises:
+      ValueError: If `identifier` cannot be interpreted.
+  """
+  # Wrap TF optimizer instances
+  if isinstance(identifier, tf_optimizer_module.Optimizer):
+    return TFOptimizer(identifier)
+  if isinstance(identifier, dict):
+    return deserialize(identifier)
+  elif isinstance(identifier, six.string_types):
+    config = {'class_name': str(identifier), 'config': {}}
+    return deserialize(config)
+  if isinstance(identifier, Optimizer):
+    return identifier
+  else:
+    raise ValueError('Could not interpret optimizer identifier:', identifier)
diff --git a/tensorflow/contrib/keras/python/keras/optimizers_test.py b/tensorflow/contrib/keras/python/keras/optimizers_test.py
new file mode 100644
index 0000000000..b3aaddb7c0
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/optimizers_test.py
@@ -0,0 +1,107 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras optimizers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+def _get_model(input_dim, num_hidden, output_dim):
+  model = keras.models.Sequential()
+  model.add(keras.layers.Dense(num_hidden,
+                               activation='relu',
+                               input_shape=(input_dim,)))
+  model.add(keras.layers.Dense(output_dim, activation='softmax'))
+  return model
+
+
+def _test_optimizer(optimizer, target=0.75):
+  np.random.seed(1337)
+  (x_train, y_train), _ = testing_utils.get_test_data(train_samples=1000,
+                                                      test_samples=200,
+                                                      input_shape=(10,),
+                                                      num_classes=2)
+  y_train = keras.utils.to_categorical(y_train)
+  model = _get_model(x_train.shape[1], 10, y_train.shape[1])
+  model.compile(loss='categorical_crossentropy',
+                optimizer=optimizer,
+                metrics=['accuracy'])
+  history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
+  assert history.history['acc'][-1] >= target
+  config = keras.optimizers.serialize(optimizer)
+  optim = keras.optimizers.deserialize(config)
+  new_config = keras.optimizers.serialize(optim)
+  new_config['class_name'] = new_config['class_name'].lower()
+  assert config == new_config
+
+
+class KerasOptimizersTest(test.TestCase):
+
+  def test_sgd(self):
+    with self.test_session():
+      _test_optimizer(keras.optimizers.SGD(lr=0.01,
+                                           momentum=0.9,
+                                           nesterov=True))
+
+  def test_rmsprop(self):
+    with self.test_session():
+      _test_optimizer(keras.optimizers.RMSprop())
+      _test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
+
+  def test_adagrad(self):
+    with self.test_session():
+      _test_optimizer(keras.optimizers.Adagrad())
+      _test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
+
+  def test_adadelta(self):
+    with self.test_session():
+      _test_optimizer(keras.optimizers.Adadelta(), target=0.6)
+      _test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.6)
+
+  def test_adam(self):
+    with self.test_session():
+      _test_optimizer(keras.optimizers.Adam())
+      _test_optimizer(keras.optimizers.Adam(decay=1e-3))
+
+  def test_adamax(self):
+    with self.test_session():
+      _test_optimizer(keras.optimizers.Adamax())
+      _test_optimizer(keras.optimizers.Adamax(decay=1e-3))
+
+  def test_nadam(self):
+    with self.test_session():
+      _test_optimizer(keras.optimizers.Nadam())
+
+  def test_clipnorm(self):
+    with self.test_session():
+      _test_optimizer(keras.optimizers.SGD(lr=0.01,
+                                           momentum=0.9,
+                                           clipnorm=0.5))
+
+  def test_clipvalue(self):
+    with self.test_session():
+      _test_optimizer(keras.optimizers.SGD(lr=0.01,
+                                           momentum=0.9,
+                                           clipvalue=0.5))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/__init__.py b/tensorflow/contrib/keras/python/keras/preprocessing/__init__.py
new file mode 100644
index 0000000000..9ae14c9674
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Data preprocessing module.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.preprocessing import image
+from tensorflow.contrib.keras.python.keras.preprocessing import sequence
+from tensorflow.contrib.keras.python.keras.preprocessing import text
+
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/image.py b/tensorflow/contrib/keras/python/keras/preprocessing/image.py
new file mode 100644
index 0000000000..86c7650a07
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/image.py
@@ -0,0 +1,1089 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Fairly basic set of tools for real-time data augmentation on image data.
+
+Can easily be extended to include new transformations,
+new preprocessing methods, etc...
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import re
+import threading
+import warnings
+
+import numpy as np
+from six.moves import range  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.keras.python.keras import backend as K
+
+
+# pylint: disable=g-import-not-at-top
+try:
+  from PIL import Image as pil_image
+except ImportError:
+  pil_image = None
+try:
+  from scipy import linalg
+  import scipy.ndimage as ndi
+except ImportError:
+  linalg = None
+  ndi = None
+# pylint: enable=g-import-not-at-top
+
+
+def random_rotation(x,
+                    rg,
+                    row_axis=1,
+                    col_axis=2,
+                    channel_axis=0,
+                    fill_mode='nearest',
+                    cval=0.):
+  """Performs a random rotation of a Numpy image tensor.
+
+  Arguments:
+      x: Input tensor. Must be 3D.
+      rg: Rotation range, in degrees.
+      row_axis: Index of axis for rows in the input tensor.
+      col_axis: Index of axis for columns in the input tensor.
+      channel_axis: Index of axis for channels in the input tensor.
+      fill_mode: Points outside the boundaries of the input
+          are filled according to the given mode
+          (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
+      cval: Value used for points outside the boundaries
+          of the input if `mode='constant'`.
+
+  Returns:
+      Rotated Numpy image tensor.
+  """
+  theta = np.pi / 180 * np.random.uniform(-rg, rg)
+  rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
+                              [np.sin(theta), np.cos(theta), 0], [0, 0, 1]])
+
+  h, w = x.shape[row_axis], x.shape[col_axis]
+  transform_matrix = transform_matrix_offset_center(rotation_matrix, h, w)
+  x = apply_transform(x, transform_matrix, channel_axis, fill_mode, cval)
+  return x
+
+
+def random_shift(x,
+                 wrg,
+                 hrg,
+                 row_axis=1,
+                 col_axis=2,
+                 channel_axis=0,
+                 fill_mode='nearest',
+                 cval=0.):
+  """Performs a random spatial shift of a Numpy image tensor.
+
+  Arguments:
+      x: Input tensor. Must be 3D.
+      wrg: Width shift range, as a float fraction of the width.
+      hrg: Height shift range, as a float fraction of the height.
+      row_axis: Index of axis for rows in the input tensor.
+      col_axis: Index of axis for columns in the input tensor.
+      channel_axis: Index of axis for channels in the input tensor.
+      fill_mode: Points outside the boundaries of the input
+          are filled according to the given mode
+          (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
+      cval: Value used for points outside the boundaries
+          of the input if `mode='constant'`.
+
+  Returns:
+      Shifted Numpy image tensor.
+  """
+  h, w = x.shape[row_axis], x.shape[col_axis]
+  tx = np.random.uniform(-hrg, hrg) * h
+  ty = np.random.uniform(-wrg, wrg) * w
+  translation_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]])
+
+  transform_matrix = translation_matrix  # no need to do offset
+  x = apply_transform(x, transform_matrix, channel_axis, fill_mode, cval)
+  return x
+
+
+def random_shear(x,
+                 intensity,
+                 row_axis=1,
+                 col_axis=2,
+                 channel_axis=0,
+                 fill_mode='nearest',
+                 cval=0.):
+  """Performs a random spatial shear of a Numpy image tensor.
+
+  Arguments:
+      x: Input tensor. Must be 3D.
+      intensity: Transformation intensity.
+      row_axis: Index of axis for rows in the input tensor.
+      col_axis: Index of axis for columns in the input tensor.
+      channel_axis: Index of axis for channels in the input tensor.
+      fill_mode: Points outside the boundaries of the input
+          are filled according to the given mode
+          (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
+      cval: Value used for points outside the boundaries
+          of the input if `mode='constant'`.
+
+  Returns:
+      Sheared Numpy image tensor.
+  """
+  shear = np.random.uniform(-intensity, intensity)
+  shear_matrix = np.array([[1, -np.sin(shear), 0], [0, np.cos(shear), 0],
+                           [0, 0, 1]])
+
+  h, w = x.shape[row_axis], x.shape[col_axis]
+  transform_matrix = transform_matrix_offset_center(shear_matrix, h, w)
+  x = apply_transform(x, transform_matrix, channel_axis, fill_mode, cval)
+  return x
+
+
+def random_zoom(x,
+                zoom_range,
+                row_axis=1,
+                col_axis=2,
+                channel_axis=0,
+                fill_mode='nearest',
+                cval=0.):
+  """Performs a random spatial zoom of a Numpy image tensor.
+
+  Arguments:
+      x: Input tensor. Must be 3D.
+      zoom_range: Tuple of floats; zoom range for width and height.
+      row_axis: Index of axis for rows in the input tensor.
+      col_axis: Index of axis for columns in the input tensor.
+      channel_axis: Index of axis for channels in the input tensor.
+      fill_mode: Points outside the boundaries of the input
+          are filled according to the given mode
+          (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
+      cval: Value used for points outside the boundaries
+          of the input if `mode='constant'`.
+
+  Returns:
+      Zoomed Numpy image tensor.
+
+  Raises:
+      ValueError: if `zoom_range` isn't a tuple.
+  """
+  if len(zoom_range) != 2:
+    raise ValueError('zoom_range should be a tuple or list of two floats. '
+                     'Received arg: ', zoom_range)
+
+  if zoom_range[0] == 1 and zoom_range[1] == 1:
+    zx, zy = 1, 1
+  else:
+    zx, zy = np.random.uniform(zoom_range[0], zoom_range[1], 2)
+  zoom_matrix = np.array([[zx, 0, 0], [0, zy, 0], [0, 0, 1]])
+
+  h, w = x.shape[row_axis], x.shape[col_axis]
+  transform_matrix = transform_matrix_offset_center(zoom_matrix, h, w)
+  x = apply_transform(x, transform_matrix, channel_axis, fill_mode, cval)
+  return x
+
+
+def random_channel_shift(x, intensity, channel_axis=0):
+  x = np.rollaxis(x, channel_axis, 0)
+  min_x, max_x = np.min(x), np.max(x)
+  channel_images = [
+      np.clip(x_channel + np.random.uniform(-intensity, intensity), min_x,
+              max_x) for x_channel in x
+  ]
+  x = np.stack(channel_images, axis=0)
+  x = np.rollaxis(x, 0, channel_axis + 1)
+  return x
+
+
+def transform_matrix_offset_center(matrix, x, y):
+  o_x = float(x) / 2 + 0.5
+  o_y = float(y) / 2 + 0.5
+  offset_matrix = np.array([[1, 0, o_x], [0, 1, o_y], [0, 0, 1]])
+  reset_matrix = np.array([[1, 0, -o_x], [0, 1, -o_y], [0, 0, 1]])
+  transform_matrix = np.dot(np.dot(offset_matrix, matrix), reset_matrix)
+  return transform_matrix
+
+
+def apply_transform(x,
+                    transform_matrix,
+                    channel_axis=0,
+                    fill_mode='nearest',
+                    cval=0.):
+  """Apply the image transformation specified by a matrix.
+
+  Arguments:
+      x: 2D numpy array, single image.
+      transform_matrix: Numpy array specifying the geometric transformation.
+      channel_axis: Index of axis for channels in the input tensor.
+      fill_mode: Points outside the boundaries of the input
+          are filled according to the given mode
+          (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
+      cval: Value used for points outside the boundaries
+          of the input if `mode='constant'`.
+
+  Returns:
+      The transformed version of the input.
+  """
+  x = np.rollaxis(x, channel_axis, 0)
+  final_affine_matrix = transform_matrix[:2, :2]
+  final_offset = transform_matrix[:2, 2]
+  channel_images = [
+      ndi.interpolation.affine_transform(
+          x_channel,
+          final_affine_matrix,
+          final_offset,
+          order=0,
+          mode=fill_mode,
+          cval=cval) for x_channel in x
+  ]
+  x = np.stack(channel_images, axis=0)
+  x = np.rollaxis(x, 0, channel_axis + 1)
+  return x
+
+
+def flip_axis(x, axis):
+  x = np.asarray(x).swapaxes(axis, 0)
+  x = x[::-1, ...]
+  x = x.swapaxes(0, axis)
+  return x
+
+
+def array_to_img(x, data_format=None, scale=True):
+  """Converts a 3D Numpy array to a PIL Image instance.
+
+  Arguments:
+      x: Input Numpy array.
+      data_format: Image data format.
+      scale: Whether to rescale image values
+          to be within [0, 255].
+
+  Returns:
+      A PIL Image instance.
+
+  Raises:
+      ImportError: if PIL is not available.
+      ValueError: if invalid `x` or `data_format` is passed.
+  """
+  if pil_image is None:
+    raise ImportError('Could not import PIL.Image. '
+                      'The use of `array_to_img` requires PIL.')
+  x = np.asarray(x, dtype=K.floatx())
+  if x.ndim != 3:
+    raise ValueError('Expected image array to have rank 3 (single image). '
+                     'Got array with shape:', x.shape)
+
+  if data_format is None:
+    data_format = K.image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Invalid data_format:', data_format)
+
+  # Original Numpy array x has format (height, width, channel)
+  # or (channel, height, width)
+  # but target PIL image has format (width, height, channel)
+  if data_format == 'channels_first':
+    x = x.transpose(1, 2, 0)
+  if scale:
+    x = x + max(-np.min(x), 0)  # pylint: disable=g-no-augmented-assignment
+    x_max = np.max(x)
+    if x_max != 0:
+      x /= x_max
+    x *= 255
+  if x.shape[2] == 3:
+    # RGB
+    return pil_image.fromarray(x.astype('uint8'), 'RGB')
+  elif x.shape[2] == 1:
+    # grayscale
+    return pil_image.fromarray(x[:, :, 0].astype('uint8'), 'L')
+  else:
+    raise ValueError('Unsupported channel number: ', x.shape[2])
+
+
+def img_to_array(img, data_format=None):
+  """Converts a PIL Image instance to a Numpy array.
+
+  Arguments:
+      img: PIL Image instance.
+      data_format: Image data format.
+
+  Returns:
+      A 3D Numpy array.
+
+  Raises:
+      ValueError: if invalid `img` or `data_format` is passed.
+  """
+  if data_format is None:
+    data_format = K.image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format: ', data_format)
+  # Numpy array x has format (height, width, channel)
+  # or (channel, height, width)
+  # but original PIL image has format (width, height, channel)
+  x = np.asarray(img, dtype=K.floatx())
+  if len(x.shape) == 3:
+    if data_format == 'channels_first':
+      x = x.transpose(2, 0, 1)
+  elif len(x.shape) == 2:
+    if data_format == 'channels_first':
+      x = x.reshape((1, x.shape[0], x.shape[1]))
+    else:
+      x = x.reshape((x.shape[0], x.shape[1], 1))
+  else:
+    raise ValueError('Unsupported image shape: ', x.shape)
+  return x
+
+
+def load_img(path, grayscale=False, target_size=None):
+  """Loads an image into PIL format.
+
+  Arguments:
+      path: Path to image file
+      grayscale: Boolean, whether to load the image as grayscale.
+      target_size: Either `None` (default to original size)
+          or tuple of ints `(img_height, img_width)`.
+
+  Returns:
+      A PIL Image instance.
+
+  Raises:
+      ImportError: if PIL is not available.
+  """
+  if pil_image is None:
+    raise ImportError('Could not import PIL.Image. '
+                      'The use of `array_to_img` requires PIL.')
+  img = pil_image.open(path)
+  if grayscale:
+    if img.mode != 'L':
+      img = img.convert('L')
+  else:
+    if img.mode != 'RGB':
+      img = img.convert('RGB')
+  if target_size:
+    wh_tuple = (target_size[1], target_size[0])
+    if img.size != wh_tuple:
+      img = img.resize(wh_tuple)
+  return img
+
+
+def list_pictures(directory, ext='jpg|jpeg|bmp|png'):
+  return [
+      os.path.join(root, f)
+      for root, _, files in os.walk(directory) for f in files
+      if re.match(r'([\w]+\.(?:' + ext + '))', f)
+  ]
+
+
+class ImageDataGenerator(object):
+  """Generate minibatches of image data with real-time data augmentation.
+
+  Arguments:
+      featurewise_center: set input mean to 0 over the dataset.
+      samplewise_center: set each sample mean to 0.
+      featurewise_std_normalization: divide inputs by std of the dataset.
+      samplewise_std_normalization: divide each input by its std.
+      zca_whitening: apply ZCA whitening.
+      rotation_range: degrees (0 to 180).
+      width_shift_range: fraction of total width.
+      height_shift_range: fraction of total height.
+      shear_range: shear intensity (shear angle in radians).
+      zoom_range: amount of zoom. if scalar z, zoom will be randomly picked
+          in the range [1-z, 1+z]. A sequence of two can be passed instead
+          to select this range.
+      channel_shift_range: shift range for each channels.
+      fill_mode: points outside the boundaries are filled according to the
+          given mode ('constant', 'nearest', 'reflect' or 'wrap'). Default
+          is 'nearest'.
+      cval: value used for points outside the boundaries when fill_mode is
+          'constant'. Default is 0.
+      horizontal_flip: whether to randomly flip images horizontally.
+      vertical_flip: whether to randomly flip images vertically.
+      rescale: rescaling factor. If None or 0, no rescaling is applied,
+          otherwise we multiply the data by the value provided
+          (before applying any other transformation).
+      preprocessing_function: function that will be implied on each input.
+          The function will run before any other modification on it.
+          The function should take one argument:
+          one image (Numpy tensor with rank 3),
+          and should output a Numpy tensor with the same shape.
+      data_format: 'channels_first' or 'channels_last'. In 'channels_first'
+        mode, the channels dimension
+          (the depth) is at index 1, in 'channels_last' mode it is at index 3.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+  """
+
+  def __init__(self,
+               featurewise_center=False,
+               samplewise_center=False,
+               featurewise_std_normalization=False,
+               samplewise_std_normalization=False,
+               zca_whitening=False,
+               rotation_range=0.,
+               width_shift_range=0.,
+               height_shift_range=0.,
+               shear_range=0.,
+               zoom_range=0.,
+               channel_shift_range=0.,
+               fill_mode='nearest',
+               cval=0.,
+               horizontal_flip=False,
+               vertical_flip=False,
+               rescale=None,
+               preprocessing_function=None,
+               data_format=None):
+    if data_format is None:
+      data_format = K.image_data_format()
+    self.featurewise_center = featurewise_center
+    self.samplewise_center = samplewise_center
+    self.featurewise_std_normalization = featurewise_std_normalization
+    self.samplewise_std_normalization = samplewise_std_normalization
+    self.zca_whitening = zca_whitening
+    self.rotation_range = rotation_range
+    self.width_shift_range = width_shift_range
+    self.height_shift_range = height_shift_range
+    self.shear_range = shear_range
+    self.zoom_range = zoom_range
+    self.channel_shift_range = channel_shift_range
+    self.fill_mode = fill_mode
+    self.cval = cval
+    self.horizontal_flip = horizontal_flip
+    self.vertical_flip = vertical_flip
+    self.rescale = rescale
+    self.preprocessing_function = preprocessing_function
+
+    if data_format not in {'channels_last', 'channels_first'}:
+      raise ValueError(
+          'data_format should be "channels_last" (channel after row and '
+          'column) or "channels_first" (channel before row and column). '
+          'Received arg: ', data_format)
+    self.data_format = data_format
+    if data_format == 'channels_first':
+      self.channel_axis = 1
+      self.row_axis = 2
+      self.col_axis = 3
+    if data_format == 'channels_last':
+      self.channel_axis = 3
+      self.row_axis = 1
+      self.col_axis = 2
+
+    self.mean = None
+    self.std = None
+    self.principal_components = None
+
+    if np.isscalar(zoom_range):
+      self.zoom_range = [1 - zoom_range, 1 + zoom_range]
+    elif len(zoom_range) == 2:
+      self.zoom_range = [zoom_range[0], zoom_range[1]]
+    else:
+      raise ValueError('zoom_range should be a float or '
+                       'a tuple or list of two floats. '
+                       'Received arg: ', zoom_range)
+
+  def flow(self,
+           x,
+           y=None,
+           batch_size=32,
+           shuffle=True,
+           seed=None,
+           save_to_dir=None,
+           save_prefix='',
+           save_format='jpeg'):
+    return NumpyArrayIterator(
+        x,
+        y,
+        self,
+        batch_size=batch_size,
+        shuffle=shuffle,
+        seed=seed,
+        data_format=self.data_format,
+        save_to_dir=save_to_dir,
+        save_prefix=save_prefix,
+        save_format=save_format)
+
+  def flow_from_directory(self,
+                          directory,
+                          target_size=(256, 256),
+                          color_mode='rgb',
+                          classes=None,
+                          class_mode='categorical',
+                          batch_size=32,
+                          shuffle=True,
+                          seed=None,
+                          save_to_dir=None,
+                          save_prefix='',
+                          save_format='jpeg',
+                          follow_links=False):
+    return DirectoryIterator(
+        directory,
+        self,
+        target_size=target_size,
+        color_mode=color_mode,
+        classes=classes,
+        class_mode=class_mode,
+        data_format=self.data_format,
+        batch_size=batch_size,
+        shuffle=shuffle,
+        seed=seed,
+        save_to_dir=save_to_dir,
+        save_prefix=save_prefix,
+        save_format=save_format,
+        follow_links=follow_links)
+
+  def standardize(self, x):
+    """Apply the normalization configuration to a batch of inputs.
+
+    Arguments:
+        x: batch of inputs to be normalized.
+
+    Returns:
+        The inputs, normalized.
+    """
+    if self.preprocessing_function:
+      x = self.preprocessing_function(x)
+    if self.rescale:
+      x *= self.rescale
+    # x is a single image, so it doesn't have image number at index 0
+    img_channel_axis = self.channel_axis - 1
+    if self.samplewise_center:
+      x -= np.mean(x, axis=img_channel_axis, keepdims=True)
+    if self.samplewise_std_normalization:
+      x /= (np.std(x, axis=img_channel_axis, keepdims=True) + 1e-7)
+
+    if self.featurewise_center:
+      if self.mean is not None:
+        x -= self.mean
+      else:
+        warnings.warn('This ImageDataGenerator specifies '
+                      '`featurewise_center`, but it hasn\'t'
+                      'been fit on any training data. Fit it '
+                      'first by calling `.fit(numpy_data)`.')
+    if self.featurewise_std_normalization:
+      if self.std is not None:
+        x /= (self.std + 1e-7)
+      else:
+        warnings.warn('This ImageDataGenerator specifies '
+                      '`featurewise_std_normalization`, but it hasn\'t'
+                      'been fit on any training data. Fit it '
+                      'first by calling `.fit(numpy_data)`.')
+    if self.zca_whitening:
+      if self.principal_components is not None:
+        flatx = np.reshape(x, (x.size))
+        whitex = np.dot(flatx, self.principal_components)
+        x = np.reshape(whitex, (x.shape[0], x.shape[1], x.shape[2]))
+      else:
+        warnings.warn('This ImageDataGenerator specifies '
+                      '`zca_whitening`, but it hasn\'t'
+                      'been fit on any training data. Fit it '
+                      'first by calling `.fit(numpy_data)`.')
+    return x
+
+  def random_transform(self, x):
+    """Randomly augment a single image tensor.
+
+    Arguments:
+        x: 3D tensor, single image.
+
+    Returns:
+        A randomly transformed version of the input (same shape).
+
+    Raises:
+        ImportError: if Scipy is not available.
+    """
+    if ndi is None:
+      raise ImportError('Scipy is required for image transformations.')
+
+    # x is a single image, so it doesn't have image number at index 0
+    img_row_axis = self.row_axis - 1
+    img_col_axis = self.col_axis - 1
+    img_channel_axis = self.channel_axis - 1
+
+    # use composition of homographies
+    # to generate final transform that needs to be applied
+    if self.rotation_range:
+      theta = np.pi / 180 * np.random.uniform(-self.rotation_range,
+                                              self.rotation_range)
+    else:
+      theta = 0
+
+    if self.height_shift_range:
+      tx = np.random.uniform(-self.height_shift_range,
+                             self.height_shift_range) * x.shape[img_row_axis]
+    else:
+      tx = 0
+
+    if self.width_shift_range:
+      ty = np.random.uniform(-self.width_shift_range,
+                             self.width_shift_range) * x.shape[img_col_axis]
+    else:
+      ty = 0
+
+    if self.shear_range:
+      shear = np.random.uniform(-self.shear_range, self.shear_range)
+    else:
+      shear = 0
+
+    if self.zoom_range[0] == 1 and self.zoom_range[1] == 1:
+      zx, zy = 1, 1
+    else:
+      zx, zy = np.random.uniform(self.zoom_range[0], self.zoom_range[1], 2)
+
+    transform_matrix = None
+    if theta != 0:
+      rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
+                                  [np.sin(theta), np.cos(theta), 0], [0, 0, 1]])
+      transform_matrix = rotation_matrix
+
+    if tx != 0 or ty != 0:
+      shift_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]])
+      transform_matrix = shift_matrix if transform_matrix is None else np.dot(
+          transform_matrix, shift_matrix)
+
+    if shear != 0:
+      shear_matrix = np.array([[1, -np.sin(shear), 0], [0, np.cos(shear), 0],
+                               [0, 0, 1]])
+      transform_matrix = shear_matrix if transform_matrix is None else np.dot(
+          transform_matrix, shear_matrix)
+
+    if zx != 1 or zy != 1:
+      zoom_matrix = np.array([[zx, 0, 0], [0, zy, 0], [0, 0, 1]])
+      transform_matrix = zoom_matrix if transform_matrix is None else np.dot(
+          transform_matrix, zoom_matrix)
+
+    if transform_matrix is not None:
+      h, w = x.shape[img_row_axis], x.shape[img_col_axis]
+      transform_matrix = transform_matrix_offset_center(transform_matrix, h, w)
+      x = apply_transform(
+          x,
+          transform_matrix,
+          img_channel_axis,
+          fill_mode=self.fill_mode,
+          cval=self.cval)
+
+    if self.channel_shift_range != 0:
+      x = random_channel_shift(x, self.channel_shift_range, img_channel_axis)
+    if self.horizontal_flip:
+      if np.random.random() < 0.5:
+        x = flip_axis(x, img_col_axis)
+
+    if self.vertical_flip:
+      if np.random.random() < 0.5:
+        x = flip_axis(x, img_row_axis)
+
+    return x
+
+  def fit(self, x, augment=False, rounds=1, seed=None):
+    """Fits internal statistics to some sample data.
+
+    Required for featurewise_center, featurewise_std_normalization
+    and zca_whitening.
+
+    Arguments:
+        x: Numpy array, the data to fit on. Should have rank 4.
+            In case of grayscale data,
+            the channels axis should have value 1, and in case
+            of RGB data, it should have value 3.
+        augment: Whether to fit on randomly augmented samples
+        rounds: If `augment`,
+            how many augmentation passes to do over the data
+        seed: random seed.
+
+    Raises:
+        ValueError: in case of invalid input `x`.
+        ImportError: if Scipy is not available.
+    """
+    x = np.asarray(x, dtype=K.floatx())
+    if x.ndim != 4:
+      raise ValueError('Input to `.fit()` should have rank 4. '
+                       'Got array with shape: ' + str(x.shape))
+    if x.shape[self.channel_axis] not in {1, 3, 4}:
+      raise ValueError(
+          'Expected input to be images (as Numpy array) '
+          'following the data format convention "' + self.data_format + '" '
+          '(channels on axis ' + str(self.channel_axis) + '), i.e. expected '
+          'either 1, 3 or 4 channels on axis ' + str(self.channel_axis) + '. '
+          'However, it was passed an array with shape ' + str(
+              x.shape) + ' (' + str(x.shape[self.channel_axis]) + ' channels).')
+
+    if seed is not None:
+      np.random.seed(seed)
+
+    x = np.copy(x)
+    if augment:
+      ax = np.zeros(
+          tuple([rounds * x.shape[0]] + list(x.shape)[1:]), dtype=K.floatx())
+      for r in range(rounds):
+        for i in range(x.shape[0]):
+          ax[i + r * x.shape[0]] = self.random_transform(x[i])
+      x = ax
+
+    if self.featurewise_center:
+      self.mean = np.mean(x, axis=(0, self.row_axis, self.col_axis))
+      broadcast_shape = [1, 1, 1]
+      broadcast_shape[self.channel_axis - 1] = x.shape[self.channel_axis]
+      self.mean = np.reshape(self.mean, broadcast_shape)
+      x -= self.mean
+
+    if self.featurewise_std_normalization:
+      self.std = np.std(x, axis=(0, self.row_axis, self.col_axis))
+      broadcast_shape = [1, 1, 1]
+      broadcast_shape[self.channel_axis - 1] = x.shape[self.channel_axis]
+      self.std = np.reshape(self.std, broadcast_shape)
+      x /= (self.std + K.epsilon())
+
+    if self.zca_whitening:
+      if linalg is None:
+        raise ImportError('Scipy is required for zca_whitening.')
+
+      flat_x = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]))
+      sigma = np.dot(flat_x.T, flat_x) / flat_x.shape[0]
+      u, s, _ = linalg.svd(sigma)
+      self.principal_components = np.dot(
+          np.dot(u, np.diag(1. / np.sqrt(s + 10e-7))), u.T)
+
+
+class Iterator(object):
+  """Abstract base class for image data iterators.
+
+  Arguments:
+      n: Integer, total number of samples in the dataset to loop over.
+      batch_size: Integer, size of a batch.
+      shuffle: Boolean, whether to shuffle the data between epochs.
+      seed: Random seeding for data shuffling.
+  """
+
+  def __init__(self, n, batch_size, shuffle, seed):
+    self.n = n
+    self.batch_size = batch_size
+    self.shuffle = shuffle
+    self.batch_index = 0
+    self.total_batches_seen = 0
+    self.lock = threading.Lock()
+    self.index_generator = self._flow_index(n, batch_size, shuffle, seed)
+
+  def reset(self):
+    self.batch_index = 0
+
+  def _flow_index(self, n, batch_size=32, shuffle=False, seed=None):
+    # Ensure self.batch_index is 0.
+    self.reset()
+    while 1:
+      if seed is not None:
+        np.random.seed(seed + self.total_batches_seen)
+      if self.batch_index == 0:
+        index_array = np.arange(n)
+        if shuffle:
+          index_array = np.random.permutation(n)
+
+      current_index = (self.batch_index * batch_size) % n
+      if n >= current_index + batch_size:
+        current_batch_size = batch_size
+        self.batch_index += 1
+      else:
+        current_batch_size = n - current_index
+        self.batch_index = 0
+      self.total_batches_seen += 1
+      yield (index_array[current_index:current_index + current_batch_size],
+             current_index, current_batch_size)
+
+  def __iter__(self):  # pylint: disable=non-iterator-returned
+    # Needed if we want to do something like:
+    # for x, y in data_gen.flow(...):
+    return self
+
+  def __next__(self, *args, **kwargs):
+    return self.next(*args, **kwargs)
+
+
+class NumpyArrayIterator(Iterator):
+  """Iterator yielding data from a Numpy array.
+
+  Arguments:
+      x: Numpy array of input data.
+      y: Numpy array of targets data.
+      image_data_generator: Instance of `ImageDataGenerator`
+          to use for random transformations and normalization.
+      batch_size: Integer, size of a batch.
+      shuffle: Boolean, whether to shuffle the data between epochs.
+      seed: Random seed for data shuffling.
+      data_format: String, one of `channels_first`, `channels_last`.
+      save_to_dir: Optional directory where to save the pictures
+          being yielded, in a viewable format. This is useful
+          for visualizing the random transformations being
+          applied, for debugging purposes.
+      save_prefix: String prefix to use for saving sample
+          images (if `save_to_dir` is set).
+      save_format: Format to use for saving sample images
+          (if `save_to_dir` is set).
+  """
+
+  def __init__(self,
+               x,
+               y,
+               image_data_generator,
+               batch_size=32,
+               shuffle=False,
+               seed=None,
+               data_format=None,
+               save_to_dir=None,
+               save_prefix='',
+               save_format='jpeg'):
+    if y is not None and len(x) != len(y):
+      raise ValueError('X (images tensor) and y (labels) '
+                       'should have the same length. '
+                       'Found: X.shape = %s, y.shape = %s' %
+                       (np.asarray(x).shape, np.asarray(y).shape))
+
+    if data_format is None:
+      data_format = K.image_data_format()
+    self.x = np.asarray(x, dtype=K.floatx())
+
+    if self.x.ndim != 4:
+      raise ValueError('Input data in `NumpyArrayIterator` '
+                       'should have rank 4. You passed an array '
+                       'with shape', self.x.shape)
+    channels_axis = 3 if data_format == 'channels_last' else 1
+    if self.x.shape[channels_axis] not in {1, 3, 4}:
+      raise ValueError(
+          'NumpyArrayIterator is set to use the '
+          'data format convention "' + data_format + '" '
+          '(channels on axis ' + str(channels_axis) + '), i.e. expected '
+          'either 1, 3 or 4 channels on axis ' + str(channels_axis) + '. '
+          'However, it was passed an array with shape ' + str(self.x.shape) +
+          ' (' + str(self.x.shape[channels_axis]) + ' channels).')
+    if y is not None:
+      self.y = np.asarray(y)
+    else:
+      self.y = None
+    self.image_data_generator = image_data_generator
+    self.data_format = data_format
+    self.save_to_dir = save_to_dir
+    self.save_prefix = save_prefix
+    self.save_format = save_format
+    super(NumpyArrayIterator, self).__init__(x.shape[0], batch_size, shuffle,
+                                             seed)
+
+  def next(self):
+    """For python 2.x.
+
+    Returns:
+        The next batch.
+    """
+    # Keeps under lock only the mechanism which advances
+    # the indexing of each batch.
+    with self.lock:
+      index_array, current_index, current_batch_size = next(
+          self.index_generator)
+    # The transformation of images is not under thread lock
+    # so it can be done in parallel
+    batch_x = np.zeros(
+        tuple([current_batch_size] + list(self.x.shape)[1:]), dtype=K.floatx())
+    for i, j in enumerate(index_array):
+      x = self.x[j]
+      x = self.image_data_generator.random_transform(x.astype(K.floatx()))
+      x = self.image_data_generator.standardize(x)
+      batch_x[i] = x
+    if self.save_to_dir:
+      for i in range(current_batch_size):
+        img = array_to_img(batch_x[i], self.data_format, scale=True)
+        fname = '{prefix}_{index}_{hash}.{format}'.format(
+            prefix=self.save_prefix,
+            index=current_index + i,
+            hash=np.random.randint(1e4),
+            format=self.save_format)
+        img.save(os.path.join(self.save_to_dir, fname))
+    if self.y is None:
+      return batch_x
+    batch_y = self.y[index_array]
+    return batch_x, batch_y
+
+
+class DirectoryIterator(Iterator):
+  """Iterator capable of reading images from a directory on disk.
+
+  Arguments:
+      directory: Path to the directory to read images from.
+          Each subdirectory in this directory will be
+          considered to contain images from one class,
+          or alternatively you could specify class subdirectories
+          via the `classes` argument.
+      image_data_generator: Instance of `ImageDataGenerator`
+          to use for random transformations and normalization.
+      target_size: tuple of integers, dimensions to resize input images to.
+      color_mode: One of `"rgb"`, `"grayscale"`. Color mode to read images.
+      classes: Optional list of strings, names of sudirectories
+          containing images from each class (e.g. `["dogs", "cats"]`).
+          It will be computed automatically if not set.
+      class_mode: Mode for yielding the targets:
+          `"binary"`: binary targets (if there are only two classes),
+          `"categorical"`: categorical targets,
+          `"sparse"`: integer targets,
+          `None`: no targets get yielded (only input images are yielded).
+      batch_size: Integer, size of a batch.
+      shuffle: Boolean, whether to shuffle the data between epochs.
+      seed: Random seed for data shuffling.
+      data_format: String, one of `channels_first`, `channels_last`.
+      save_to_dir: Optional directory where to save the pictures
+          being yielded, in a viewable format. This is useful
+          for visualizing the random transformations being
+          applied, for debugging purposes.
+      save_prefix: String prefix to use for saving sample
+          images (if `save_to_dir` is set).
+      save_format: Format to use for saving sample images
+          (if `save_to_dir` is set).
+  """
+
+  def __init__(self,
+               directory,
+               image_data_generator,
+               target_size=(256, 256),
+               color_mode='rgb',
+               classes=None,
+               class_mode='categorical',
+               batch_size=32,
+               shuffle=True,
+               seed=None,
+               data_format=None,
+               save_to_dir=None,
+               save_prefix='',
+               save_format='jpeg',
+               follow_links=False):
+    if data_format is None:
+      data_format = K.image_data_format()
+    self.directory = directory
+    self.image_data_generator = image_data_generator
+    self.target_size = tuple(target_size)
+    if color_mode not in {'rgb', 'grayscale'}:
+      raise ValueError('Invalid color mode:', color_mode,
+                       '; expected "rgb" or "grayscale".')
+    self.color_mode = color_mode
+    self.data_format = data_format
+    if self.color_mode == 'rgb':
+      if self.data_format == 'channels_last':
+        self.image_shape = self.target_size + (3,)
+      else:
+        self.image_shape = (3,) + self.target_size
+    else:
+      if self.data_format == 'channels_last':
+        self.image_shape = self.target_size + (1,)
+      else:
+        self.image_shape = (1,) + self.target_size
+    self.classes = classes
+    if class_mode not in {'categorical', 'binary', 'sparse', None}:
+      raise ValueError('Invalid class_mode:', class_mode,
+                       '; expected one of "categorical", '
+                       '"binary", "sparse", or None.')
+    self.class_mode = class_mode
+    self.save_to_dir = save_to_dir
+    self.save_prefix = save_prefix
+    self.save_format = save_format
+
+    white_list_formats = {'png', 'jpg', 'jpeg', 'bmp'}
+
+    # first, count the number of samples and classes
+    self.samples = 0
+
+    if not classes:
+      classes = []
+      for subdir in sorted(os.listdir(directory)):
+        if os.path.isdir(os.path.join(directory, subdir)):
+          classes.append(subdir)
+    self.num_class = len(classes)
+    self.class_indices = dict(zip(classes, range(len(classes))))
+
+    def _recursive_list(subpath):
+      return sorted(
+          os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
+
+    for subdir in classes:
+      subpath = os.path.join(directory, subdir)
+      for root, _, files in _recursive_list(subpath):
+        for fname in files:
+          is_valid = False
+          for extension in white_list_formats:
+            if fname.lower().endswith('.' + extension):
+              is_valid = True
+              break
+          if is_valid:
+            self.samples += 1
+    print('Found %d images belonging to %d classes.' % (self.samples,
+                                                        self.num_class))
+
+    # second, build an index of the images in the different class subfolders
+    self.filenames = []
+    self.classes = np.zeros((self.samples,), dtype='int32')
+    i = 0
+    for subdir in classes:
+      subpath = os.path.join(directory, subdir)
+      for root, _, files in _recursive_list(subpath):
+        for fname in files:
+          is_valid = False
+          for extension in white_list_formats:
+            if fname.lower().endswith('.' + extension):
+              is_valid = True
+              break
+          if is_valid:
+            self.classes[i] = self.class_indices[subdir]
+            i += 1
+            # add filename relative to directory
+            absolute_path = os.path.join(root, fname)
+            self.filenames.append(os.path.relpath(absolute_path, directory))
+    super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle,
+                                            seed)
+
+  def next(self):
+    """For python 2.x.
+
+    Returns:
+        The next batch.
+    """
+    with self.lock:
+      index_array, current_index, current_batch_size = next(
+          self.index_generator)
+    # The transformation of images is not under thread lock
+    # so it can be done in parallel
+    batch_x = np.zeros(
+        (current_batch_size,) + self.image_shape, dtype=K.floatx())
+    grayscale = self.color_mode == 'grayscale'
+    # build batch of image data
+    for i, j in enumerate(index_array):
+      fname = self.filenames[j]
+      img = load_img(
+          os.path.join(self.directory, fname),
+          grayscale=grayscale,
+          target_size=self.target_size)
+      x = img_to_array(img, data_format=self.data_format)
+      x = self.image_data_generator.random_transform(x)
+      x = self.image_data_generator.standardize(x)
+      batch_x[i] = x
+    # optionally save augmented images to disk for debugging purposes
+    if self.save_to_dir:
+      for i in range(current_batch_size):
+        img = array_to_img(batch_x[i], self.data_format, scale=True)
+        fname = '{prefix}_{index}_{hash}.{format}'.format(
+            prefix=self.save_prefix,
+            index=current_index + i,
+            hash=np.random.randint(1e4),
+            format=self.save_format)
+        img.save(os.path.join(self.save_to_dir, fname))
+    # build batch of labels
+    if self.class_mode == 'sparse':
+      batch_y = self.classes[index_array]
+    elif self.class_mode == 'binary':
+      batch_y = self.classes[index_array].astype(K.floatx())
+    elif self.class_mode == 'categorical':
+      batch_y = np.zeros((len(batch_x), self.num_class), dtype=K.floatx())
+      for i, label in enumerate(self.classes[index_array]):
+        batch_y[i, label] = 1.
+    else:
+      return batch_x
+    return batch_x, batch_y
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py b/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py
new file mode 100644
index 0000000000..0dedf8f850
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py
@@ -0,0 +1,231 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for image preprocessing utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+try:
+  import PIL  # pylint:disable=g-import-not-at-top
+except ImportError:
+  PIL = None
+
+
+def _generate_test_images():
+  img_w = img_h = 20
+  rgb_images = []
+  gray_images = []
+  for _ in range(8):
+    bias = np.random.rand(img_w, img_h, 1) * 64
+    variance = np.random.rand(img_w, img_h, 1) * (255 - 64)
+    imarray = np.random.rand(img_w, img_h, 3) * variance + bias
+    im = keras.preprocessing.image.array_to_img(imarray, scale=False)
+    rgb_images.append(im)
+
+    imarray = np.random.rand(img_w, img_h, 1) * variance + bias
+    im = keras.preprocessing.image.array_to_img(imarray, scale=False)
+    gray_images.append(im)
+
+  return [rgb_images, gray_images]
+
+
+class TestImage(test.TestCase):
+
+  def test_image_data_generator(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    for test_images in _generate_test_images():
+      img_list = []
+      for im in test_images:
+        img_list.append(keras.preprocessing.image.img_to_array(im)[None, ...])
+
+      images = np.vstack(img_list)
+      generator = keras.preprocessing.image.ImageDataGenerator(
+          featurewise_center=True,
+          samplewise_center=True,
+          featurewise_std_normalization=True,
+          samplewise_std_normalization=True,
+          zca_whitening=True,
+          rotation_range=90.,
+          width_shift_range=0.1,
+          height_shift_range=0.1,
+          shear_range=0.5,
+          zoom_range=0.2,
+          channel_shift_range=0.,
+          fill_mode='nearest',
+          cval=0.5,
+          horizontal_flip=True,
+          vertical_flip=True)
+      generator.fit(images, augment=True)
+
+      for x, _ in generator.flow(
+          images,
+          np.arange(images.shape[0]),
+          shuffle=True):
+        self.assertEqual(x.shape[1:], images.shape[1:])
+        break
+
+  def test_image_data_generator_invalid_data(self):
+    generator = keras.preprocessing.image.ImageDataGenerator(
+        featurewise_center=True,
+        samplewise_center=True,
+        featurewise_std_normalization=True,
+        samplewise_std_normalization=True,
+        zca_whitening=True,
+        data_format='channels_last')
+    # Test fit with invalid data
+    with self.assertRaises(ValueError):
+      x = np.random.random((3, 10, 10))
+      generator.fit(x)
+    with self.assertRaises(ValueError):
+      x = np.random.random((32, 3, 10, 10))
+      generator.fit(x)
+    with self.assertRaises(ValueError):
+      x = np.random.random((32, 10, 10, 5))
+      generator.fit(x)
+    # Test flow with invalid data
+    with self.assertRaises(ValueError):
+      x = np.random.random((32, 10, 10, 5))
+      generator.flow(np.arange(x.shape[0]))
+    with self.assertRaises(ValueError):
+      x = np.random.random((32, 10, 10))
+      generator.flow(np.arange(x.shape[0]))
+    with self.assertRaises(ValueError):
+      x = np.random.random((32, 3, 10, 10))
+      generator.flow(np.arange(x.shape[0]))
+
+  def test_image_data_generator_fit(self):
+    generator = keras.preprocessing.image.ImageDataGenerator(
+        featurewise_center=True,
+        samplewise_center=True,
+        featurewise_std_normalization=True,
+        samplewise_std_normalization=True,
+        zca_whitening=True,
+        data_format='channels_last')
+    # Test grayscale
+    x = np.random.random((32, 10, 10, 1))
+    generator.fit(x)
+    # Test RBG
+    x = np.random.random((32, 10, 10, 3))
+    generator.fit(x)
+    generator = keras.preprocessing.image.ImageDataGenerator(
+        featurewise_center=True,
+        samplewise_center=True,
+        featurewise_std_normalization=True,
+        samplewise_std_normalization=True,
+        zca_whitening=True,
+        data_format='channels_first')
+    # Test grayscale
+    x = np.random.random((32, 1, 10, 10))
+    generator.fit(x)
+    # Test RBG
+    x = np.random.random((32, 3, 10, 10))
+    generator.fit(x)
+
+  def test_directory_iterator(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    num_classes = 2
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+
+    # create folders and subfolders
+    paths = []
+    for cl in range(num_classes):
+      class_directory = 'class-{}'.format(cl)
+      classpaths = [
+          class_directory, os.path.join(class_directory, 'subfolder-1'),
+          os.path.join(class_directory, 'subfolder-2'), os.path.join(
+              class_directory, 'subfolder-1', 'sub-subfolder')
+      ]
+      for path in classpaths:
+        os.mkdir(os.path.join(temp_dir, path))
+      paths.append(classpaths)
+
+    # save the images in the paths
+    count = 0
+    filenames = []
+    for test_images in _generate_test_images():
+      for im in test_images:
+        # rotate image class
+        im_class = count % num_classes
+        # rotate subfolders
+        classpaths = paths[im_class]
+        filename = os.path.join(classpaths[count % len(classpaths)],
+                                'image-{}.jpg'.format(count))
+        filenames.append(filename)
+        im.save(os.path.join(temp_dir, filename))
+        count += 1
+
+    # create iterator
+    generator = keras.preprocessing.image.ImageDataGenerator()
+    dir_iterator = generator.flow_from_directory(temp_dir)
+
+    # check number of classes and images
+    self.assertEqual(len(dir_iterator.class_indices), num_classes)
+    self.assertEqual(len(dir_iterator.classes), count)
+    self.assertEqual(sorted(dir_iterator.filenames), sorted(filenames))
+
+  def test_img_utils(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    height, width = 10, 8
+
+    # Test channels_first data format
+    x = np.random.random((3, height, width))
+    img = keras.preprocessing.image.array_to_img(
+        x, data_format='channels_first')
+    self.assertEqual(img.size, (width, height))
+    x = keras.preprocessing.image.img_to_array(
+        img, data_format='channels_first')
+    self.assertEqual(x.shape, (3, height, width))
+    # Test 2D
+    x = np.random.random((1, height, width))
+    img = keras.preprocessing.image.array_to_img(
+        x, data_format='channels_first')
+    self.assertEqual(img.size, (width, height))
+    x = keras.preprocessing.image.img_to_array(
+        img, data_format='channels_first')
+    self.assertEqual(x.shape, (1, height, width))
+
+    # Test channels_last data format
+    x = np.random.random((height, width, 3))
+    img = keras.preprocessing.image.array_to_img(x, data_format='channels_last')
+    self.assertEqual(img.size, (width, height))
+    x = keras.preprocessing.image.img_to_array(img, data_format='channels_last')
+    self.assertEqual(x.shape, (height, width, 3))
+    # Test 2D
+    x = np.random.random((height, width, 1))
+    img = keras.preprocessing.image.array_to_img(x, data_format='channels_last')
+    self.assertEqual(img.size, (width, height))
+    x = keras.preprocessing.image.img_to_array(img, data_format='channels_last')
+    self.assertEqual(x.shape, (height, width, 1))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py b/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
new file mode 100644
index 0000000000..5a24a63b01
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
@@ -0,0 +1,222 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Preprocessing utilities for sequence data.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+
+import numpy as np
+from six.moves import range  # pylint: disable=redefined-builtin
+
+
+def pad_sequences(sequences,
+                  maxlen=None,
+                  dtype='int32',
+                  padding='pre',
+                  truncating='pre',
+                  value=0.):
+  """Pads each sequence to the same length (length of the longest sequence).
+
+  If maxlen is provided, any sequence longer
+  than maxlen is truncated to maxlen.
+  Truncation happens off either the beginning (default) or
+  the end of the sequence.
+
+  Supports post-padding and pre-padding (default).
+
+  Arguments:
+      sequences: list of lists where each element is a sequence
+      maxlen: int, maximum length
+      dtype: type to cast the resulting sequence.
+      padding: 'pre' or 'post', pad either before or after each sequence.
+      truncating: 'pre' or 'post', remove values from sequences larger than
+          maxlen either in the beginning or in the end of the sequence
+      value: float, value to pad the sequences to the desired value.
+
+  Returns:
+      x: numpy array with dimensions (number_of_sequences, maxlen)
+
+  Raises:
+      ValueError: in case of invalid values for `truncating` or `padding`,
+          or in case of invalid shape for a `sequences` entry.
+  """
+  if not hasattr(sequences, '__len__'):
+    raise ValueError('`sequences` must be iterable.')
+  lengths = []
+  for x in sequences:
+    if not hasattr(x, '__len__'):
+      raise ValueError('`sequences` must be a list of iterables. '
+                       'Found non-iterable: ' + str(x))
+    lengths.append(len(x))
+
+  num_samples = len(sequences)
+  if maxlen is None:
+    maxlen = np.max(lengths)
+
+  # take the sample shape from the first non empty sequence
+  # checking for consistency in the main loop below.
+  sample_shape = tuple()
+  for s in sequences:
+    if len(s) > 0:  # pylint: disable=g-explicit-length-test
+      sample_shape = np.asarray(s).shape[1:]
+      break
+
+  x = (np.ones((num_samples, maxlen) + sample_shape) * value).astype(dtype)
+  for idx, s in enumerate(sequences):
+    if not len(s):  # pylint: disable=g-explicit-length-test
+      continue  # empty list/array was found
+    if truncating == 'pre':
+      trunc = s[-maxlen:]  # pylint: disable=invalid-unary-operand-type
+    elif truncating == 'post':
+      trunc = s[:maxlen]
+    else:
+      raise ValueError('Truncating type "%s" not understood' % truncating)
+
+    # check `trunc` has expected shape
+    trunc = np.asarray(trunc, dtype=dtype)
+    if trunc.shape[1:] != sample_shape:
+      raise ValueError(
+          'Shape of sample %s of sequence at position %s is different from '
+          'expected shape %s'
+          % (trunc.shape[1:], idx, sample_shape))
+
+    if padding == 'post':
+      x[idx, :len(trunc)] = trunc
+    elif padding == 'pre':
+      x[idx, -len(trunc):] = trunc
+    else:
+      raise ValueError('Padding type "%s" not understood' % padding)
+  return x
+
+
+def make_sampling_table(size, sampling_factor=1e-5):
+  """Generates a word rank-based probabilistic sampling table.
+
+  This generates an array where the ith element
+  is the probability that a word of rank i would be sampled,
+  according to the sampling distribution used in word2vec.
+
+  The word2vec formula is:
+      p(word) = min(1, sqrt(word.frequency/sampling_factor) /
+      (word.frequency/sampling_factor))
+
+  We assume that the word frequencies follow Zipf's law (s=1) to derive
+  a numerical approximation of frequency(rank):
+     frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))
+      where gamma is the Euler-Mascheroni constant.
+
+  Arguments:
+      size: int, number of possible words to sample.
+      sampling_factor: the sampling factor in the word2vec formula.
+
+  Returns:
+      A 1D Numpy array of length `size` where the ith entry
+      is the probability that a word of rank i should be sampled.
+  """
+  gamma = 0.577
+  rank = np.array(list(range(size)))
+  rank[0] = 1
+  inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1. / (12. * rank)
+  f = sampling_factor * inv_fq
+
+  return np.minimum(1., f / np.sqrt(f))
+
+
+def skipgrams(sequence,
+              vocabulary_size,
+              window_size=4,
+              negative_samples=1.,
+              shuffle=True,
+              categorical=False,
+              sampling_table=None):
+  """Generates skipgram word pairs.
+
+  Takes a sequence (list of indexes of words),
+  returns couples of [word_index, other_word index] and labels (1s or 0s),
+  where label = 1 if 'other_word' belongs to the context of 'word',
+  and label=0 if 'other_word' is randomly sampled
+
+  Arguments:
+      sequence: a word sequence (sentence), encoded as a list
+          of word indices (integers). If using a `sampling_table`,
+          word indices are expected to match the rank
+          of the words in a reference dataset (e.g. 10 would encode
+          the 10-th most frequently occuring token).
+          Note that index 0 is expected to be a non-word and will be skipped.
+      vocabulary_size: int. maximum possible word index + 1
+      window_size: int. actually half-window.
+          The window of a word wi will be [i-window_size, i+window_size+1]
+      negative_samples: float >= 0. 0 for no negative (=random) samples.
+          1 for same number as positive samples. etc.
+      shuffle: whether to shuffle the word couples before returning them.
+      categorical: bool. if False, labels will be
+          integers (eg. [0, 1, 1 .. ]),
+          if True labels will be categorical eg. [[1,0],[0,1],[0,1] .. ]
+      sampling_table: 1D array of size `vocabulary_size` where the entry i
+          encodes the probabibily to sample a word of rank i.
+
+  Returns:
+      couples, labels: where `couples` are int pairs and
+          `labels` are either 0 or 1.
+
+  # Note
+      By convention, index 0 in the vocabulary is
+      a non-word and will be skipped.
+  """
+  couples = []
+  labels = []
+  for i, wi in enumerate(sequence):
+    if not wi:
+      continue
+    if sampling_table is not None:
+      if sampling_table[wi] < random.random():
+        continue
+
+    window_start = max(0, i - window_size)
+    window_end = min(len(sequence), i + window_size + 1)
+    for j in range(window_start, window_end):
+      if j != i:
+        wj = sequence[j]
+        if not wj:
+          continue
+        couples.append([wi, wj])
+        if categorical:
+          labels.append([0, 1])
+        else:
+          labels.append(1)
+
+  if negative_samples > 0:
+    num_negative_samples = int(len(labels) * negative_samples)
+    words = [c[0] for c in couples]
+    random.shuffle(words)
+
+    couples += [[words[i % len(words)], random.randint(1, vocabulary_size - 1)]
+                for i in range(num_negative_samples)]
+    if categorical:
+      labels += [[1, 0]] * num_negative_samples
+    else:
+      labels += [0] * num_negative_samples
+
+  if shuffle:
+    seed = random.randint(0, 10e6)
+    random.seed(seed)
+    random.shuffle(couples)
+    random.seed(seed)
+    random.shuffle(labels)
+
+  return couples, labels
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/sequence_test.py b/tensorflow/contrib/keras/python/keras/preprocessing/sequence_test.py
new file mode 100644
index 0000000000..4e54b95c8b
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/sequence_test.py
@@ -0,0 +1,99 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sequence data preprocessing utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class TestSequence(test.TestCase):
+
+  def test_pad_sequences(self):
+    a = [[1], [1, 2], [1, 2, 3]]
+
+    # test padding
+    b = keras.preprocessing.sequence.pad_sequences(a, maxlen=3, padding='pre')
+    self.assertAllClose(b, [[0, 0, 1], [0, 1, 2], [1, 2, 3]])
+    b = keras.preprocessing.sequence.pad_sequences(a, maxlen=3, padding='post')
+    self.assertAllClose(b, [[1, 0, 0], [1, 2, 0], [1, 2, 3]])
+
+    # test truncating
+    b = keras.preprocessing.sequence.pad_sequences(
+        a, maxlen=2, truncating='pre')
+    self.assertAllClose(b, [[0, 1], [1, 2], [2, 3]])
+    b = keras.preprocessing.sequence.pad_sequences(
+        a, maxlen=2, truncating='post')
+    self.assertAllClose(b, [[0, 1], [1, 2], [1, 2]])
+
+    # test value
+    b = keras.preprocessing.sequence.pad_sequences(a, maxlen=3, value=1)
+    self.assertAllClose(b, [[1, 1, 1], [1, 1, 2], [1, 2, 3]])
+
+  def test_pad_sequences_vector(self):
+    a = [[[1, 1]], [[2, 1], [2, 2]], [[3, 1], [3, 2], [3, 3]]]
+
+    # test padding
+    b = keras.preprocessing.sequence.pad_sequences(a, maxlen=3, padding='pre')
+    self.assertAllClose(b, [[[0, 0], [0, 0], [1, 1]], [[0, 0], [2, 1], [2, 2]],
+                            [[3, 1], [3, 2], [3, 3]]])
+    b = keras.preprocessing.sequence.pad_sequences(a, maxlen=3, padding='post')
+    self.assertAllClose(b, [[[1, 1], [0, 0], [0, 0]], [[2, 1], [2, 2], [0, 0]],
+                            [[3, 1], [3, 2], [3, 3]]])
+
+    # test truncating
+    b = keras.preprocessing.sequence.pad_sequences(
+        a, maxlen=2, truncating='pre')
+    self.assertAllClose(b, [[[0, 0], [1, 1]], [[2, 1], [2, 2]], [[3, 2], [3,
+                                                                          3]]])
+
+    b = keras.preprocessing.sequence.pad_sequences(
+        a, maxlen=2, truncating='post')
+    self.assertAllClose(b, [[[0, 0], [1, 1]], [[2, 1], [2, 2]], [[3, 1], [3,
+                                                                          2]]])
+
+    # test value
+    b = keras.preprocessing.sequence.pad_sequences(a, maxlen=3, value=1)
+    self.assertAllClose(b, [[[1, 1], [1, 1], [1, 1]], [[1, 1], [2, 1], [2, 2]],
+                            [[3, 1], [3, 2], [3, 3]]])
+
+  def test_make_sampling_table(self):
+    a = keras.preprocessing.sequence.make_sampling_table(3)
+    self.assertAllClose(
+        a, np.asarray([0.00315225, 0.00315225, 0.00547597]), rtol=.1)
+
+  def test_skipgrams(self):
+    # test with no window size and binary labels
+    couples, labels = keras.preprocessing.sequence.skipgrams(
+        np.arange(3), vocabulary_size=3)
+    for couple in couples:
+      assert couple[0] in [0, 1, 2] and couple[1] in [0, 1, 2]
+
+    # test window size and categorical labels
+    couples, labels = keras.preprocessing.sequence.skipgrams(
+        np.arange(5), vocabulary_size=5, window_size=1, categorical=True)
+    for couple in couples:
+      assert couple[0] - couple[1] <= 3
+    for l in labels:
+      assert len(l) == 2
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/text.py b/tensorflow/contrib/keras/python/keras/preprocessing/text.py
new file mode 100644
index 0000000000..b164b613d2
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/text.py
@@ -0,0 +1,288 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for text input preprocessing.
+
+May benefit from a fast Cython rewrite.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import string
+import sys
+import warnings
+
+import numpy as np
+from six.moves import range  # pylint: disable=redefined-builtin
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+
+if sys.version_info < (3,):
+  maketrans = string.maketrans
+else:
+  maketrans = str.maketrans
+
+
+def text_to_word_sequence(text,
+                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+                          lower=True,
+                          split=' '):
+  """Converts a text to a sequence of word indices.
+
+  Arguments:
+      text: Input text (string).
+      filters: Sequence of characters to filter out.
+      lower: Whether to convert the input to lowercase.
+      split: Sentence split marker (string).
+
+  Returns:
+      A list of integer word indices.
+  """
+  if lower:
+    text = text.lower()
+  text = text.translate(maketrans(filters, split * len(filters)))
+  seq = text.split(split)
+  return [i for i in seq if i]
+
+
+def one_hot(text,
+            n,
+            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+            lower=True,
+            split=' '):
+  seq = text_to_word_sequence(text, filters=filters, lower=lower, split=split)
+  return [(abs(hash(w)) % (n - 1) + 1) for w in seq]
+
+
+class Tokenizer(object):
+  """Text tokenization utility class.
+
+  This class allows to vectorize a text corpus, by turning each
+  text into either a sequence of integers (each integer being the index
+  of a token in a dictionary) or into a vector where the coefficient
+  for each token could be binary, based on word count, based on tf-idf...
+
+  Arguments:
+      num_words: the maximum number of words to keep, based
+          on word frequency. Only the most common `num_words` words will
+          be kept.
+      filters: a string where each element is a character that will be
+          filtered from the texts. The default is all punctuation, plus
+          tabs and line breaks, minus the `'` character.
+      lower: boolean. Whether to convert the texts to lowercase.
+      split: character or string to use for token splitting.
+      char_level: if True, every character will be treated as a word.
+
+  By default, all punctuation is removed, turning the texts into
+  space-separated sequences of words
+  (words maybe include the `'` character). These sequences are then
+  split into lists of tokens. They will then be indexed or vectorized.
+
+  `0` is a reserved index that won't be assigned to any word.
+  """
+
+  def __init__(self,
+               num_words=None,
+               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+               lower=True,
+               split=' ',
+               char_level=False,
+               **kwargs):
+    # Legacy support
+    if 'nb_words' in kwargs:
+      warnings.warn('The `nb_words` argument in `Tokenizer` '
+                    'has been renamed `num_words`.')
+      num_words = kwargs.pop('nb_words')
+    if kwargs:
+      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
+    self.word_counts = {}
+    self.word_docs = {}
+    self.filters = filters
+    self.split = split
+    self.lower = lower
+    self.num_words = num_words
+    self.document_count = 0
+    self.char_level = char_level
+
+  def fit_on_texts(self, texts):
+    """Updates internal vocabulary based on a list of texts.
+
+    Required before using `texts_to_sequences` or `texts_to_matrix`.
+
+    Arguments:
+        texts: can be a list of strings,
+            or a generator of strings (for memory-efficiency)
+    """
+    self.document_count = 0
+    for text in texts:
+      self.document_count += 1
+      seq = text if self.char_level else text_to_word_sequence(
+          text, self.filters, self.lower, self.split)
+      for w in seq:
+        if w in self.word_counts:
+          self.word_counts[w] += 1
+        else:
+          self.word_counts[w] = 1
+      for w in set(seq):
+        if w in self.word_docs:
+          self.word_docs[w] += 1
+        else:
+          self.word_docs[w] = 1
+
+    wcounts = list(self.word_counts.items())
+    wcounts.sort(key=lambda x: x[1], reverse=True)
+    sorted_voc = [wc[0] for wc in wcounts]
+    # note that index 0 is reserved, never assigned to an existing word
+    self.word_index = dict(
+        list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))
+
+    self.index_docs = {}
+    for w, c in list(self.word_docs.items()):
+      self.index_docs[self.word_index[w]] = c
+
+  def fit_on_sequences(self, sequences):
+    """Updates internal vocabulary based on a list of sequences.
+
+    Required before using `sequences_to_matrix`
+    (if `fit_on_texts` was never called).
+
+    Arguments:
+        sequences: A list of sequence.
+            A "sequence" is a list of integer word indices.
+    """
+    self.document_count = len(sequences)
+    self.index_docs = {}
+    for seq in sequences:
+      seq = set(seq)
+      for i in seq:
+        if i not in self.index_docs:
+          self.index_docs[i] = 1
+        else:
+          self.index_docs[i] += 1
+
+  def texts_to_sequences(self, texts):
+    """Transforms each text in texts in a sequence of integers.
+
+    Only top "num_words" most frequent words will be taken into account.
+    Only words known by the tokenizer will be taken into account.
+
+    Arguments:
+        texts: A list of texts (strings).
+
+    Returns:
+        A list of sequences.
+    """
+    res = []
+    for vect in self.texts_to_sequences_generator(texts):
+      res.append(vect)
+    return res
+
+  def texts_to_sequences_generator(self, texts):
+    """Transforms each text in texts in a sequence of integers.
+
+    Only top "num_words" most frequent words will be taken into account.
+    Only words known by the tokenizer will be taken into account.
+
+    Arguments:
+        texts: A list of texts (strings).
+
+    Yields:
+        Yields individual sequences.
+    """
+    num_words = self.num_words
+    for text in texts:
+      seq = text if self.char_level else text_to_word_sequence(
+          text, self.filters, self.lower, self.split)
+      vect = []
+      for w in seq:
+        i = self.word_index.get(w)
+        if i is not None:
+          if num_words and i >= num_words:
+            continue
+          else:
+            vect.append(i)
+      yield vect
+
+  def texts_to_matrix(self, texts, mode='binary'):
+    """Convert a list of texts to a Numpy matrix.
+
+    Arguments:
+        texts: list of strings.
+        mode: one of "binary", "count", "tfidf", "freq".
+
+    Returns:
+        A Numpy matrix.
+    """
+    sequences = self.texts_to_sequences(texts)
+    return self.sequences_to_matrix(sequences, mode=mode)
+
+  def sequences_to_matrix(self, sequences, mode='binary'):
+    """Converts a list of sequences into a Numpy matrix.
+
+    Arguments:
+        sequences: list of sequences
+            (a sequence is a list of integer word indices).
+        mode: one of "binary", "count", "tfidf", "freq"
+
+    Returns:
+        A Numpy matrix.
+
+    Raises:
+        ValueError: In case of invalid `mode` argument,
+            or if the Tokenizer requires to be fit to sample data.
+    """
+    if not self.num_words:
+      if self.word_index:
+        num_words = len(self.word_index) + 1
+      else:
+        raise ValueError('Specify a dimension (num_words argument), '
+                         'or fit on some text data first.')
+    else:
+      num_words = self.num_words
+
+    if mode == 'tfidf' and not self.document_count:
+      raise ValueError('Fit the Tokenizer on some data '
+                       'before using tfidf mode.')
+
+    x = np.zeros((len(sequences), num_words))
+    for i, seq in enumerate(sequences):
+      if not seq:
+        continue
+      counts = {}
+      for j in seq:
+        if j >= num_words:
+          continue
+        if j not in counts:
+          counts[j] = 1.
+        else:
+          counts[j] += 1
+      for j, c in list(counts.items()):
+        if mode == 'count':
+          x[i][j] = c
+        elif mode == 'freq':
+          x[i][j] = c / len(seq)
+        elif mode == 'binary':
+          x[i][j] = 1
+        elif mode == 'tfidf':
+          # Use weighting scheme 2 in
+          # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
+          tf = 1 + np.log(c)
+          idf = np.log(1 + self.document_count / (1 + self.index_docs.get(j, 0)
+                                                 ))
+          x[i][j] = tf * idf
+        else:
+          raise ValueError('Unknown vectorization mode:', mode)
+    return x
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py b/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py
new file mode 100644
index 0000000000..e94b9019b2
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py
@@ -0,0 +1,59 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for text data preprocessing utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class TestText(test.TestCase):
+
+  def test_one_hot(self):
+    text = 'The cat sat on the mat.'
+    encoded = keras.preprocessing.text.one_hot(text, 5)
+    self.assertEqual(len(encoded), 6)
+    assert np.max(encoded) <= 4
+    assert np.min(encoded) >= 0
+
+  def test_tokenizer(self):
+    texts = [
+        'The cat sat on the mat.',
+        'The dog sat on the log.',
+        'Dogs and cats living together.'
+    ]
+    tokenizer = keras.preprocessing.text.Tokenizer(num_words=10)
+    tokenizer.fit_on_texts(texts)
+
+    sequences = []
+    for seq in tokenizer.texts_to_sequences_generator(texts):
+      sequences.append(seq)
+    assert np.max(np.max(sequences)) < 10
+    self.assertEqual(np.min(np.min(sequences)), 1)
+
+    tokenizer.fit_on_sequences(sequences)
+
+    for mode in ['binary', 'count', 'tfidf', 'freq']:
+      matrix = tokenizer.texts_to_matrix(texts, mode)
+      self.assertEqual(matrix.shape, (3, 10))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/regularizers.py b/tensorflow/contrib/keras/python/keras/regularizers.py
new file mode 100644
index 0000000000..36cc5c47e4
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/regularizers.py
@@ -0,0 +1,102 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras built-in regularizers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import serialize_keras_object
+
+
+class Regularizer(object):
+  """Regularizer base class.
+  """
+
+  def __call__(self, x):
+    return 0.
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
+
+
+class L1L2(Regularizer):
+  """Regularizer for L1 and L2 regularization.
+
+  Arguments:
+      l1: Float; L1 regularization factor.
+      l2: Float; L2 regularization factor.
+  """
+
+  def __init__(self, l1=0., l2=0.):  # pylint: disable=redefined-outer-name
+    self.l1 = K.cast_to_floatx(l1)
+    self.l2 = K.cast_to_floatx(l2)
+
+  def __call__(self, x):
+    regularization = 0.
+    if self.l1:
+      regularization += K.sum(self.l1 * K.abs(x))
+    if self.l2:
+      regularization += K.sum(self.l2 * K.square(x))
+    return regularization
+
+  def get_config(self):
+    return {'l1': float(self.l1), 'l2': float(self.l2)}
+
+
+# Aliases.
+
+
+def l1(l=0.01):
+  return L1L2(l1=l)
+
+
+def l2(l=0.01):
+  return L1L2(l2=l)
+
+
+def l1_l2(l1=0.01, l2=0.01):  # pylint: disable=redefined-outer-name
+  return L1L2(l1=l1, l2=l2)
+
+
+def serialize(regularizer):
+  return serialize_keras_object(regularizer)
+
+
+def deserialize(config, custom_objects=None):
+  return deserialize_keras_object(
+      config,
+      module_objects=globals(),
+      custom_objects=custom_objects,
+      printable_module_name='regularizer')
+
+
+def get(identifier):
+  if identifier is None:
+    return None
+  if isinstance(identifier, dict):
+    return deserialize(identifier)
+  elif isinstance(identifier, six.string_types):
+    config = {'class_name': str(identifier), 'config': {}}
+    return deserialize(config)
+  elif callable(identifier):
+    return identifier
+  else:
+    raise ValueError('Could not interpret regularizer identifier:', identifier)
diff --git a/tensorflow/contrib/keras/python/keras/regularizers_test.py b/tensorflow/contrib/keras/python/keras/regularizers_test.py
new file mode 100644
index 0000000000..528024994f
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/regularizers_test.py
@@ -0,0 +1,76 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras regularizers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+DATA_DIM = 5
+NUM_CLASSES = 2
+
+
+def get_data():
+  (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+      train_samples=10,
+      test_samples=10,
+      input_shape=(DATA_DIM,),
+      num_classes=NUM_CLASSES)
+  y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)
+  y_test = keras.utils.to_categorical(y_test, NUM_CLASSES)
+  return (x_train, y_train), (x_test, y_test)
+
+
+def create_model(kernel_regularizer=None, activity_regularizer=None):
+  model = keras.models.Sequential()
+  model.add(keras.layers.Dense(NUM_CLASSES,
+                               kernel_regularizer=kernel_regularizer,
+                               activity_regularizer=activity_regularizer,
+                               input_shape=(DATA_DIM,)))
+  return model
+
+
+class KerasRegularizersTest(test.TestCase):
+
+  def test_kernel_regularization(self):
+    with self.test_session():
+      (x_train, y_train), _ = get_data()
+      for reg in [keras.regularizers.l1(),
+                  keras.regularizers.l2(),
+                  keras.regularizers.l1_l2()]:
+        model = create_model(kernel_regularizer=reg)
+        model.compile(loss='categorical_crossentropy', optimizer='sgd')
+        assert len(model.losses) == 1
+        model.fit(x_train, y_train, batch_size=10,
+                  epochs=1, verbose=0)
+
+  def test_activity_regularization(self):
+    with self.test_session():
+      (x_train, y_train), _ = get_data()
+      for reg in [keras.regularizers.l1(), keras.regularizers.l2()]:
+        model = create_model(activity_regularizer=reg)
+        model.compile(loss='categorical_crossentropy', optimizer='sgd')
+        assert len(model.losses) == 1
+        model.fit(x_train, y_train, batch_size=10,
+                  epochs=1, verbose=0)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/testing_utils.py b/tensorflow/contrib/keras/python/keras/testing_utils.py
new file mode 100644
index 0000000000..baba5447d9
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/testing_utils.py
@@ -0,0 +1,166 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for unit-testing Keras."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import inspect
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+
+
+def get_test_data(train_samples,
+                  test_samples,
+                  input_shape,
+                  num_classes):
+  """Generates test data to train a model on.
+
+  Arguments:
+    train_samples: Integer, how many training samples to generate.
+    test_samples: Integer, how many test samples to generate.
+    input_shape: Tuple of integers, shape of the inputs.
+    num_classes: Integer, number of classes for the data and targets.
+      Only relevant if `classification=True`.
+
+  Returns:
+    A tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+  """
+  num_sample = train_samples + test_samples
+  templates = 2 * num_classes * np.random.random((num_classes,) + input_shape)
+  y = np.random.randint(0, num_classes, size=(num_sample,))
+  x = np.zeros((num_sample,) + input_shape)
+  for i in range(num_sample):
+    x[i] = templates[y[i]] + np.random.normal(loc=0, scale=1., size=input_shape)
+  return ((x[:train_samples], y[:train_samples]),
+          (x[train_samples:], y[train_samples:]))
+
+
+def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
+               input_data=None, expected_output=None,
+               expected_output_dtype=None):
+  """Test routine for a layer with a single input and single output.
+
+  Arguments:
+    layer_cls: Layer class object.
+    kwargs: Optional dictionary of keyword arguments for instantiating the
+      layer.
+    input_shape: Input shape tuple.
+    input_dtype: Data type of the input data.
+    input_data: Numpy array of input data.
+    expected_output: Shape tuple for the expected shape of the output.
+    expected_output_dtype: Data type expected for the output.
+
+  Returns:
+    The output data (Numpy array) returned by the layer, for additional
+    checks to be done by the calling code.
+  """
+  if input_data is None:
+    assert input_shape
+    if not input_dtype:
+      input_dtype = 'float32'
+    input_data_shape = list(input_shape)
+    for i, e in enumerate(input_data_shape):
+      if e is None:
+        input_data_shape[i] = np.random.randint(1, 4)
+    input_data = 10 * np.random.random(input_data_shape)
+    if input_dtype[:4] == 'float':
+      input_data -= 0.5
+    input_data = input_data.astype(input_dtype)
+  elif input_shape is None:
+    input_shape = input_data.shape
+  if input_dtype is None:
+    input_dtype = input_data.dtype
+  if expected_output_dtype is None:
+    expected_output_dtype = input_dtype
+
+  # instantiation
+  kwargs = kwargs or {}
+  layer = layer_cls(**kwargs)
+
+  # test get_weights , set_weights at layer level
+  weights = layer.get_weights()
+  layer.set_weights(weights)
+
+  # test and instantiation from weights
+  if 'weights' in inspect.getargspec(layer_cls.__init__):
+    kwargs['weights'] = weights
+    layer = layer_cls(**kwargs)
+
+  # test in functional API
+  x = keras.layers.Input(shape=input_shape[1:], dtype=input_dtype)
+  y = layer(x)
+  assert keras.backend.dtype(y) == expected_output_dtype
+
+  # check shape inference
+  model = keras.models.Model(x, y)
+  expected_output_shape = tuple(
+      layer._compute_output_shape(input_shape).as_list())  # pylint: disable=protected-access
+  actual_output = model.predict(input_data)
+  actual_output_shape = actual_output.shape
+  for expected_dim, actual_dim in zip(expected_output_shape,
+                                      actual_output_shape):
+    if expected_dim is not None:
+      assert expected_dim == actual_dim
+  if expected_output is not None:
+    np.testing.assert_allclose(actual_output, expected_output, rtol=1e-3)
+
+  # test serialization, weight setting at model level
+  model_config = model.get_config()
+  recovered_model = keras.models.Model.from_config(model_config)
+  if model.weights:
+    weights = model.get_weights()
+    recovered_model.set_weights(weights)
+    output = recovered_model.predict(input_data)
+    np.testing.assert_allclose(output, actual_output, rtol=1e-3)
+
+  # test training mode (e.g. useful for dropout tests)
+  model.compile('rmsprop', 'mse')
+  model.train_on_batch(input_data, actual_output)
+
+  # test as first layer in Sequential API
+  layer_config = layer.get_config()
+  layer_config['batch_input_shape'] = input_shape
+  layer = layer.__class__.from_config(layer_config)
+
+  model = keras.models.Sequential()
+  model.add(layer)
+  actual_output = model.predict(input_data)
+  actual_output_shape = actual_output.shape
+  for expected_dim, actual_dim in zip(expected_output_shape,
+                                      actual_output_shape):
+    if expected_dim is not None:
+      assert expected_dim == actual_dim
+  if expected_output is not None:
+    np.testing.assert_allclose(actual_output, expected_output, rtol=1e-3)
+
+  # test serialization, weight setting at model level
+  model_config = model.get_config()
+  recovered_model = keras.models.Sequential.from_config(model_config)
+  if model.weights:
+    weights = model.get_weights()
+    recovered_model.set_weights(weights)
+    output = recovered_model.predict(input_data)
+    np.testing.assert_allclose(output, actual_output, rtol=1e-3)
+
+  # test training mode (e.g. useful for dropout tests)
+  model.compile('rmsprop', 'mse')
+  model.train_on_batch(input_data, actual_output)
+
+  # for further checks in the caller function
+  return actual_output
diff --git a/tensorflow/contrib/keras/python/keras/utils/__init__.py b/tensorflow/contrib/keras/python/keras/utils/__init__.py
new file mode 100644
index 0000000000..68c28ab585
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/__init__.py
@@ -0,0 +1,40 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras utilities.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.utils import conv_utils
+from tensorflow.contrib.keras.python.keras.utils import data_utils
+from tensorflow.contrib.keras.python.keras.utils import generic_utils
+from tensorflow.contrib.keras.python.keras.utils import io_utils
+from tensorflow.contrib.keras.python.keras.utils import np_utils
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import custom_object_scope
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import get_custom_objects
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.contrib.keras.python.keras.utils.io_utils import HDF5Matrix
+from tensorflow.contrib.keras.python.keras.utils.layer_utils import convert_all_kernels_in_model
+from tensorflow.contrib.keras.python.keras.utils.np_utils import normalize
+from tensorflow.contrib.keras.python.keras.utils.np_utils import to_categorical
+from tensorflow.contrib.keras.python.keras.utils.vis_utils import plot_model
+
+
+# Globally-importable utils.
diff --git a/tensorflow/contrib/keras/python/keras/utils/conv_utils.py b/tensorflow/contrib/keras/python/keras/utils/conv_utils.py
new file mode 100644
index 0000000000..ffc131ec4f
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/conv_utils.py
@@ -0,0 +1,168 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities used by convolution layers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import range  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.keras.python.keras import backend as K
+
+
+def normalize_tuple(value, n, name):
+  """Transforms a single int or iterable of ints into an int tuple.
+
+  Arguments:
+      value: The value to validate and convert. Could an int, or any iterable
+        of ints.
+      n: The size of the tuple to be returned.
+      name: The name of the argument being validated, e.g. "strides" or
+        "kernel_size". This is only used to format error messages.
+
+  Returns:
+      A tuple of n integers.
+
+  Raises:
+      ValueError: If something else than an int/long or iterable thereof was
+      passed.
+  """
+  if isinstance(value, int):
+    return (value,) * n
+  else:
+    try:
+      value_tuple = tuple(value)
+    except TypeError:
+      raise ValueError('The `' + name + '` argument must be a tuple of ' + str(
+          n) + ' integers. Received: ' + str(value))
+    if len(value_tuple) != n:
+      raise ValueError('The `' + name + '` argument must be a tuple of ' + str(
+          n) + ' integers. Received: ' + str(value))
+    for single_value in value_tuple:
+      try:
+        int(single_value)
+      except ValueError:
+        raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                         str(n) + ' integers. Received: ' + str(value) + ' '
+                         'including element ' + str(single_value) + ' of type' +
+                         ' ' + str(type(single_value)))
+  return value_tuple
+
+
+def normalize_data_format(value):
+  if value is None:
+    value = K.image_data_format()
+  data_format = value.lower()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('The `data_format` argument must be one of '
+                     '"channels_first", "channels_last". Received: ' + str(
+                         value))
+  return data_format
+
+
+def normalize_padding(value):
+  padding = value.lower()
+  if padding not in {'valid', 'same', 'causal'}:
+    raise ValueError('The `padding` argument must be one of '
+                     '"valid", "same" (or "causal", only for `Conv1D). '
+                     'Received: ' + str(padding))
+  return padding
+
+
+def convert_kernel(kernel):
+  """Converts a Numpy kernel matrix from Theano format to TensorFlow format.
+
+  Also works reciprocally, since the transformation is its own inverse.
+
+  Arguments:
+      kernel: Numpy array (4D or 5D).
+
+  Returns:
+      The converted kernel.
+
+  Raises:
+      ValueError: in case of invalid kernel shape or invalid data_format.
+  """
+  if not 4 <= kernel.ndim <= 5:
+    raise ValueError('Invalid kernel shape:', kernel.shape)
+  slices = [slice(None, None, -1) for _ in range(kernel.ndim)]
+  no_flip = (slice(None, None), slice(None, None))
+  slices[-2:] = no_flip
+  return np.copy(kernel[slices])
+
+
+def conv_output_length(input_length, filter_size, padding, stride, dilation=1):
+  """Determines output length of a convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+      dilation: dilation rate, integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full', 'causal'}
+  dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
+  if padding == 'same':
+    output_length = input_length
+  elif padding == 'valid':
+    output_length = input_length - dilated_filter_size + 1
+  elif padding == 'full':
+    output_length = input_length + dilated_filter_size - 1
+  elif padding == 'causal':
+    output_length = input_length
+  return (output_length + stride - 1) // stride
+
+
+def conv_input_length(output_length, filter_size, padding, stride):
+  """Determines input length of a convolution given output length.
+
+  Arguments:
+      output_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The input length (integer).
+  """
+  if output_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  if padding == 'same':
+    pad = filter_size // 2
+  elif padding == 'valid':
+    pad = 0
+  elif padding == 'full':
+    pad = filter_size - 1
+  return (output_length - 1) * stride - 2 * pad + filter_size
+
+
+def deconv_length(dim_size, stride_size, kernel_size, padding):
+  if dim_size is None:
+    return None
+  dim_size *= stride_size
+  if padding == 'valid':
+    dim_size += max(kernel_size - stride_size, 0)
+  elif padding == 'full':
+    dim_size -= (stride_size + kernel_size - 2)
+  return dim_size
diff --git a/tensorflow/contrib/keras/python/keras/utils/data_utils.py b/tensorflow/contrib/keras/python/keras/utils/data_utils.py
new file mode 100644
index 0000000000..b2d5427c97
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/data_utils.py
@@ -0,0 +1,178 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for file download and caching."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import hashlib
+import os
+import shutil
+import sys
+import tarfile
+
+from six.moves.urllib.error import HTTPError
+from six.moves.urllib.error import URLError
+from six.moves.urllib.request import urlopen
+
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
+
+
+if sys.version_info[0] == 2:
+
+  def urlretrieve(url, filename, reporthook=None, data=None):
+    """Replacement for `urlretrive` for Python 2.
+
+    Under Python 2, `urlretrieve` relies on `FancyURLopener` from legacy
+    `urllib` module, known to have issues with proxy management.
+
+    Arguments:
+        url: url to retrieve.
+        filename: where to store the retrieved data locally.
+        reporthook: a hook function that will be called once
+            on establishment of the network connection and once
+            after each block read thereafter.
+            The hook will be passed three arguments;
+            a count of blocks transferred so far,
+            a block size in bytes, and the total size of the file.
+        data: `data` argument passed to `urlopen`.
+    """
+
+    def chunk_read(response, chunk_size=8192, reporthook=None):
+      total_size = response.info().get('Content-Length').strip()
+      total_size = int(total_size)
+      count = 0
+      while 1:
+        chunk = response.read(chunk_size)
+        count += 1
+        if not chunk:
+          reporthook(count, total_size, total_size)
+          break
+        if reporthook:
+          reporthook(count, chunk_size, total_size)
+        yield chunk
+
+    response = urlopen(url, data)
+    with open(filename, 'wb') as fd:
+      for chunk in chunk_read(response, reporthook=reporthook):
+        fd.write(chunk)
+else:
+  from six.moves.urllib.request import urlretrieve  # pylint: disable=g-import-not-at-top
+
+
+def get_file(fname, origin, untar=False, md5_hash=None,
+             cache_subdir='datasets'):
+  """Downloads a file from a URL if it not already in the cache.
+
+  Passing the MD5 hash will verify the file after download
+  as well as if it is already present in the cache.
+
+  Arguments:
+      fname: name of the file
+      origin: original URL of the file
+      untar: boolean, whether the file should be decompressed
+      md5_hash: MD5 hash of the file for verification
+      cache_subdir: directory being used as the cache
+
+  Returns:
+      Path to the downloaded file
+  """
+  datadir_base = os.path.expanduser(os.path.join('~', '.keras'))
+  if not os.access(datadir_base, os.W_OK):
+    datadir_base = os.path.join('/tmp', '.keras')
+  datadir = os.path.join(datadir_base, cache_subdir)
+  if not os.path.exists(datadir):
+    os.makedirs(datadir)
+
+  if untar:
+    untar_fpath = os.path.join(datadir, fname)
+    fpath = untar_fpath + '.tar.gz'
+  else:
+    fpath = os.path.join(datadir, fname)
+
+  download = False
+  if os.path.exists(fpath):
+    # File found; verify integrity if a hash was provided.
+    if md5_hash is not None:
+      if not validate_file(fpath, md5_hash):
+        print('A local file was found, but it seems to be '
+              'incomplete or outdated.')
+        download = True
+  else:
+    download = True
+
+  if download:
+    print('Downloading data from', origin)
+    progbar = None
+
+    def dl_progress(count, block_size, total_size, progbar=None):
+      if progbar is None:
+        progbar = Progbar(total_size)
+      else:
+        progbar.update(count * block_size)
+
+    error_msg = 'URL fetch failure on {}: {} -- {}'
+    try:
+      try:
+        urlretrieve(origin, fpath,
+                    functools.partial(dl_progress, progbar=progbar))
+      except URLError as e:
+        raise Exception(error_msg.format(origin, e.errno, e.reason))
+      except HTTPError as e:
+        raise Exception(error_msg.format(origin, e.code, e.msg))
+    except (Exception, KeyboardInterrupt) as e:
+      if os.path.exists(fpath):
+        os.remove(fpath)
+      raise
+    progbar = None
+
+  if untar:
+    if not os.path.exists(untar_fpath):
+      print('Untaring file...')
+      tfile = tarfile.open(fpath, 'r:gz')
+      try:
+        tfile.extractall(path=datadir)
+      except (Exception, KeyboardInterrupt) as e:
+        if os.path.exists(untar_fpath):
+          if os.path.isfile(untar_fpath):
+            os.remove(untar_fpath)
+          else:
+            shutil.rmtree(untar_fpath)
+        raise
+      tfile.close()
+    return untar_fpath
+
+  return fpath
+
+
+def validate_file(fpath, md5_hash):
+  """Validates a file against a MD5 hash.
+
+  Arguments:
+      fpath: path to the file being validated
+      md5_hash: the MD5 hash being validated against
+
+  Returns:
+      Whether the file is valid
+  """
+  hasher = hashlib.md5()
+  with open(fpath, 'rb') as f:
+    buf = f.read()
+    hasher.update(buf)
+  if str(hasher.hexdigest()) == str(md5_hash):
+    return True
+  else:
+    return False
diff --git a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
new file mode 100644
index 0000000000..c1e0296835
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
@@ -0,0 +1,337 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python utilities required by Keras."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import inspect
+import marshal
+import sys
+import time
+import types as python_types
+
+import numpy as np
+import six
+
+
+_GLOBAL_CUSTOM_OBJECTS = {}
+
+
+class CustomObjectScope(object):
+  """Provides a scope that changes to `_GLOBAL_CUSTOM_OBJECTS` cannot escape.
+
+  Code within a `with` statement will be able to access custom objects
+  by name. Changes to global custom objects persist
+  within the enclosing `with` statement. At end of the `with` statement,
+  global custom objects are reverted to state
+  at beginning of the `with` statement.
+
+  Example:
+
+  Consider a custom object `MyObject`
+
+  ```python
+      with CustomObjectScope({"MyObject":MyObject}):
+          layer = Dense(..., W_regularizer="MyObject")
+          # save, load, etc. will recognize custom object by name
+  ```
+  """
+
+  def __init__(self, *args):
+    self.custom_objects = args
+    self.backup = None
+
+  def __enter__(self):
+    self.backup = _GLOBAL_CUSTOM_OBJECTS.copy()
+    for objects in self.custom_objects:
+      _GLOBAL_CUSTOM_OBJECTS.update(objects)
+    return self
+
+  def __exit__(self, *args, **kwargs):
+    _GLOBAL_CUSTOM_OBJECTS.clear()
+    _GLOBAL_CUSTOM_OBJECTS.update(self.backup)
+
+
+def custom_object_scope(*args):
+  """Provides a scope that changes to `_GLOBAL_CUSTOM_OBJECTS` cannot escape.
+
+  Convenience wrapper for `CustomObjectScope`.
+  Code within a `with` statement will be able to access custom objects
+  by name. Changes to global custom objects persist
+  within the enclosing `with` statement. At end of the `with` statement,
+  global custom objects are reverted to state
+  at beginning of the `with` statement.
+
+  Example:
+
+  Consider a custom object `MyObject`
+
+  ```python
+      with custom_object_scope({"MyObject":MyObject}):
+          layer = Dense(..., W_regularizer="MyObject")
+          # save, load, etc. will recognize custom object by name
+  ```
+
+  Arguments:
+      *args: Variable length list of dictionaries of name,
+          class pairs to add to custom objects.
+
+  Returns:
+      Object of type `CustomObjectScope`.
+  """
+  return CustomObjectScope(*args)
+
+
+def get_custom_objects():
+  """Retrieves a live reference to the global dictionary of custom objects.
+
+  Updating and clearing custom objects using `custom_object_scope`
+  is preferred, but `get_custom_objects` can
+  be used to directly access `_GLOBAL_CUSTOM_OBJECTS`.
+
+  Example:
+
+  ```python
+      get_custom_objects().clear()
+      get_custom_objects()["MyObject"] = MyObject
+  ```
+
+  Returns:
+      Global dictionary of names to classes (`_GLOBAL_CUSTOM_OBJECTS`).
+  """
+  return _GLOBAL_CUSTOM_OBJECTS
+
+
+def serialize_keras_object(instance):
+  if instance is None:
+    return None
+  if hasattr(instance, 'get_config'):
+    return {
+        'class_name': instance.__class__.__name__,
+        'config': instance.get_config()
+    }
+  if hasattr(instance, '__name__'):
+    return instance.__name__
+  else:
+    raise ValueError('Cannot serialize', instance)
+
+
+def deserialize_keras_object(identifier,
+                             module_objects=None,
+                             custom_objects=None,
+                             printable_module_name='object'):
+  if isinstance(identifier, dict):
+    # In this case we are dealing with a Keras config dictionary.
+    config = identifier
+    if 'class_name' not in config or 'config' not in config:
+      raise ValueError('Improper config format: ' + str(config))
+    class_name = config['class_name']
+    if custom_objects and class_name in custom_objects:
+      cls = custom_objects[class_name]
+    elif class_name in _GLOBAL_CUSTOM_OBJECTS:
+      cls = _GLOBAL_CUSTOM_OBJECTS[class_name]
+    else:
+      module_objects = module_objects or {}
+      cls = module_objects.get(class_name)
+      if cls is None:
+        raise ValueError('Unknown ' + printable_module_name + ': ' + class_name)
+    if hasattr(cls, 'from_config'):
+      arg_spec = inspect.getargspec(cls.from_config)
+      if 'custom_objects' in arg_spec.args:
+        custom_objects = custom_objects or {}
+        return cls.from_config(
+            config['config'],
+            custom_objects=dict(
+                list(_GLOBAL_CUSTOM_OBJECTS.items()) + list(
+                    custom_objects.items())))
+      return cls.from_config(config['config'])
+    else:
+      # Then `cls` may be a function returning a class.
+      # in this case by convention `config` holds
+      # the kwargs of the function.
+      return cls(**config['config'])
+  elif isinstance(identifier, six.string_types):
+    function_name = identifier
+    if custom_objects and function_name in custom_objects:
+      fn = custom_objects.get(function_name)
+    elif function_name in _GLOBAL_CUSTOM_OBJECTS:
+      fn = _GLOBAL_CUSTOM_OBJECTS[function_name]
+    else:
+      fn = module_objects.get(function_name)
+      if fn is None:
+        raise ValueError('Unknown ' + printable_module_name, ':' + class_name)
+    return fn
+  else:
+    raise ValueError('Could not interpret serialized ' + printable_module_name +
+                     ': ' + identifier)
+
+
+def make_tuple(*args):
+  return args
+
+
+def func_dump(func):
+  """Serializes a user defined function.
+
+  Arguments:
+      func: the function to serialize.
+
+  Returns:
+      A tuple `(code, defaults, closure)`.
+  """
+  code = marshal.dumps(func.__code__).decode('raw_unicode_escape')
+  defaults = func.__defaults__
+  if func.__closure__:
+    closure = tuple(c.cell_contents for c in func.__closure__)
+  else:
+    closure = None
+  return code, defaults, closure
+
+
+def func_load(code, defaults=None, closure=None, globs=None):
+  """Deserializes a user defined function.
+
+  Arguments:
+      code: bytecode of the function.
+      defaults: defaults of the function.
+      closure: closure of the function.
+      globs: dictionary of global objects.
+
+  Returns:
+      A function object.
+  """
+  if isinstance(code, (tuple, list)):  # unpack previous dump
+    code, defaults, closure = code
+  code = marshal.loads(code.encode('raw_unicode_escape'))
+  if globs is None:
+    globs = globals()
+  return python_types.FunctionType(
+      code, globs, name=code.co_name, argdefs=defaults, closure=closure)
+
+
+class Progbar(object):
+  """Displays a progress bar.
+
+  Arguments:
+      target: Total number of steps expected.
+      interval: Minimum visual progress update interval (in seconds).
+  """
+
+  def __init__(self, target, width=30, verbose=1, interval=0.05):
+    self.width = width
+    self.target = target
+    self.sum_values = {}
+    self.unique_values = []
+    self.start = time.time()
+    self.last_update = 0
+    self.interval = interval
+    self.total_width = 0
+    self.seen_so_far = 0
+    self.verbose = verbose
+
+  def update(self, current, values=None, force=False):
+    """Updates the progress bar.
+
+    Arguments:
+        current: Index of current step.
+        values: List of tuples (name, value_for_last_step).
+            The progress bar will display averages for these values.
+        force: Whether to force visual progress update.
+    """
+    values = values or []
+    for k, v in values:
+      if k not in self.sum_values:
+        self.sum_values[k] = [
+            v * (current - self.seen_so_far), current - self.seen_so_far
+        ]
+        self.unique_values.append(k)
+      else:
+        self.sum_values[k][0] += v * (current - self.seen_so_far)
+        self.sum_values[k][1] += (current - self.seen_so_far)
+    self.seen_so_far = current
+
+    now = time.time()
+    if self.verbose == 1:
+      if not force and (now - self.last_update) < self.interval:
+        return
+
+      prev_total_width = self.total_width
+      sys.stdout.write('\b' * prev_total_width)
+      sys.stdout.write('\r')
+
+      numdigits = int(np.floor(np.log10(self.target))) + 1
+      barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
+      bar = barstr % (current, self.target)
+      prog = float(current) / self.target
+      prog_width = int(self.width * prog)
+      if prog_width > 0:
+        bar += ('=' * (prog_width - 1))
+        if current < self.target:
+          bar += '>'
+        else:
+          bar += '='
+      bar += ('.' * (self.width - prog_width))
+      bar += ']'
+      sys.stdout.write(bar)
+      self.total_width = len(bar)
+
+      if current:
+        time_per_unit = (now - self.start) / current
+      else:
+        time_per_unit = 0
+      eta = time_per_unit * (self.target - current)
+      info = ''
+      if current < self.target:
+        info += ' - ETA: %ds' % eta
+      else:
+        info += ' - %ds' % (now - self.start)
+      for k in self.unique_values:
+        info += ' - %s:' % k
+        if isinstance(self.sum_values[k], list):
+          avg = self.sum_values[k][0] / max(1, self.sum_values[k][1])
+          if abs(avg) > 1e-3:
+            info += ' %.4f' % avg
+          else:
+            info += ' %.4e' % avg
+        else:
+          info += ' %s' % self.sum_values[k]
+
+      self.total_width += len(info)
+      if prev_total_width > self.total_width:
+        info += ((prev_total_width - self.total_width) * ' ')
+
+      sys.stdout.write(info)
+      sys.stdout.flush()
+
+      if current >= self.target:
+        sys.stdout.write('\n')
+
+    if self.verbose == 2:
+      if current >= self.target:
+        info = '%ds' % (now - self.start)
+        for k in self.unique_values:
+          info += ' - %s:' % k
+          avg = self.sum_values[k][0] / max(1, self.sum_values[k][1])
+          if avg > 1e-3:
+            info += ' %.4f' % avg
+          else:
+            info += ' %.4e' % avg
+        sys.stdout.write(info + '\n')
+
+    self.last_update = now
+
+  def add(self, n, values=None):
+    self.update(self.seen_so_far + n, values)
diff --git a/tensorflow/contrib/keras/python/keras/utils/io_utils.py b/tensorflow/contrib/keras/python/keras/utils/io_utils.py
new file mode 100644
index 0000000000..7cef39b03f
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/io_utils.py
@@ -0,0 +1,133 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities related to disk I/O."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import defaultdict
+import sys
+
+import numpy as np
+
+
+try:
+  import h5py  # pylint:disable=g-import-not-at-top
+except ImportError:
+  h5py = None
+
+
+class HDF5Matrix(object):
+  """Representation of HDF5 dataset to be used instead of a Numpy array.
+
+  Example:
+
+  ```python
+      x_data = HDF5Matrix('input/file.hdf5', 'data')
+      model.predict(x_data)
+  ```
+
+  Providing `start` and `end` allows use of a slice of the dataset.
+
+  Optionally, a normalizer function (or lambda) can be given. This will
+  be called on every slice of data retrieved.
+
+  Arguments:
+      datapath: string, path to a HDF5 file
+      dataset: string, name of the HDF5 dataset in the file specified
+          in datapath
+      start: int, start of desired slice of the specified dataset
+      end: int, end of desired slice of the specified dataset
+      normalizer: function to be called on data when retrieved
+
+  Returns:
+      An array-like HDF5 dataset.
+  """
+  refs = defaultdict(int)
+
+  def __init__(self, datapath, dataset, start=0, end=None, normalizer=None):
+    if h5py is None:
+      raise ImportError('The use of HDF5Matrix requires '
+                        'HDF5 and h5py installed.')
+
+    if datapath not in list(self.refs.keys()):
+      f = h5py.File(datapath)
+      self.refs[datapath] = f
+    else:
+      f = self.refs[datapath]
+    self.data = f[dataset]
+    self.start = start
+    if end is None:
+      self.end = self.data.shape[0]
+    else:
+      self.end = end
+    self.normalizer = normalizer
+
+  def __len__(self):
+    return self.end - self.start
+
+  def __getitem__(self, key):
+    if isinstance(key, slice):
+      if key.stop + self.start <= self.end:
+        idx = slice(key.start + self.start, key.stop + self.start)
+      else:
+        raise IndexError
+    elif isinstance(key, int):
+      if key + self.start < self.end:
+        idx = key + self.start
+      else:
+        raise IndexError
+    elif isinstance(key, np.ndarray):
+      if np.max(key) + self.start < self.end:
+        idx = (self.start + key).tolist()
+      else:
+        raise IndexError
+    elif isinstance(key, list):
+      if max(key) + self.start < self.end:
+        idx = [x + self.start for x in key]
+      else:
+        raise IndexError
+    else:
+      raise IndexError
+    if self.normalizer is not None:
+      return self.normalizer(self.data[idx])
+    else:
+      return self.data[idx]
+
+  @property
+  def shape(self):
+    return (self.end - self.start,) + self.data.shape[1:]
+
+
+def ask_to_proceed_with_overwrite(filepath):
+  """Produces a prompt asking about overwriting a file.
+
+  Arguments:
+      filepath: the path to the file to be overwritten.
+
+  Returns:
+      True if we can proceed with overwrite, False otherwise.
+  """
+  get_input = input
+  if sys.version_info[:2] <= (2, 7):
+    get_input = raw_input
+  overwrite = get_input('[WARNING] %s already exists - overwrite? '
+                        '[y/n]' % (filepath))
+  while overwrite not in ['y', 'n']:
+    overwrite = get_input('Enter "y" (overwrite) or "n" (cancel).')
+  if overwrite == 'n':
+    return False
+  print('[TIP] Next time specify overwrite=True!')
+  return True
diff --git a/tensorflow/contrib/keras/python/keras/utils/layer_utils.py b/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
new file mode 100644
index 0000000000..32e0de7d3d
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
@@ -0,0 +1,238 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities related to Keras layers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras.utils.conv_utils import convert_kernel
+
+
+def print_summary(model, line_length=None, positions=None):
+  """Prints a summary of a model.
+
+  Arguments:
+      model: Keras model instance.
+      line_length: total length of printed lines
+      positions: relative or absolute positions of log elements in each line.
+          If not provided, defaults to `[.33, .55, .67, 1.]`.
+  """
+  if model.__class__.__name__ == 'Sequential':
+    sequential_like = True
+  else:
+    sequential_like = True
+    for v in model.nodes_by_depth.values():
+      if len(v) > 1:
+        sequential_like = False
+
+  if sequential_like:
+    line_length = line_length or 65
+    positions = positions or [.45, .85, 1.]
+    if positions[-1] <= 1:
+      positions = [int(line_length * p) for p in positions]
+    # header names for the different log elements
+    to_display = ['Layer (type)', 'Output Shape', 'Param #']
+  else:
+    line_length = line_length or 100
+    positions = positions or [.33, .55, .67, 1.]
+    if positions[-1] <= 1:
+      positions = [int(line_length * p) for p in positions]
+    # header names for the different log elements
+    to_display = ['Layer (type)', 'Output Shape', 'Param #', 'Connected to']
+    relevant_nodes = []
+    for v in model.nodes_by_depth.values():
+      relevant_nodes += v
+
+  def print_row(fields, positions):
+    line = ''
+    for i in range(len(fields)):
+      if i > 0:
+        line = line[:-1] + ' '
+      line += str(fields[i])
+      line = line[:positions[i]]
+      line += ' ' * (positions[i] - len(line))
+    print(line)
+
+  print('_' * line_length)
+  print_row(to_display, positions)
+  print('=' * line_length)
+
+  def print_layer_summary(layer):
+    try:
+      output_shape = layer.output_shape
+    except AttributeError:
+      output_shape = 'multiple'
+    name = layer.name
+    cls_name = layer.__class__.__name__
+    fields = [name + ' (' + cls_name + ')', output_shape, layer.count_params()]
+    print_row(fields, positions)
+
+  def print_layer_summary_with_connections(layer):
+    """Prints a summary for a single layer.
+
+    Arguments:
+        layer: target layer.
+    """
+    try:
+      output_shape = layer.output_shape
+    except AttributeError:
+      output_shape = 'multiple'
+    connections = []
+    for node_index, node in enumerate(layer.inbound_nodes):
+      if relevant_nodes:
+        node_key = layer.name + '_ib-' + str(node_index)
+        if node_key not in relevant_nodes:
+          # node is node part of the current network
+          continue
+      for i in range(len(node.inbound_layers)):
+        inbound_layer = node.inbound_layers[i].name
+        inbound_node_index = node.node_indices[i]
+        inbound_tensor_index = node.tensor_indices[i]
+        connections.append(inbound_layer + '[' + str(inbound_node_index) + ']['
+                           + str(inbound_tensor_index) + ']')
+
+    name = layer.name
+    cls_name = layer.__class__.__name__
+    if not connections:
+      first_connection = ''
+    else:
+      first_connection = connections[0]
+    fields = [
+        name + ' (' + cls_name + ')', output_shape, layer.count_params(),
+        first_connection
+    ]
+    print_row(fields, positions)
+    if len(connections) > 1:
+      for i in range(1, len(connections)):
+        fields = ['', '', '', connections[i]]
+        print_row(fields, positions)
+
+  layers = model.layers
+  for i in range(len(layers)):
+    if sequential_like:
+      print_layer_summary(layers[i])
+    else:
+      print_layer_summary_with_connections(layers[i])
+    if i == len(layers) - 1:
+      print('=' * line_length)
+    else:
+      print('_' * line_length)
+
+  trainable_count, non_trainable_count = count_total_params(
+      layers, layer_set=None)
+
+  print('Total params: {:,}'.format(trainable_count + non_trainable_count))
+  print('Trainable params: {:,}'.format(trainable_count))
+  print('Non-trainable params: {:,}'.format(non_trainable_count))
+  print('_' * line_length)
+
+
+def count_total_params(layers, layer_set=None):
+  """Counts the number of parameters in a list of layers.
+
+  Arguments:
+      layers: list of layers.
+      layer_set: set of layers already seen
+          (so that we don't count their weights twice).
+
+  Returns:
+      A tuple (count of trainable weights, count of non-trainable weights.)
+  """
+  if layer_set is None:
+    layer_set = set()
+  trainable_count = 0
+  non_trainable_count = 0
+  for layer in layers:
+    if layer in layer_set:
+      continue
+    layer_set.add(layer)
+    if hasattr(layer, 'layers'):
+      t, nt = count_total_params(layer.layers, layer_set)
+      trainable_count += t
+      non_trainable_count += nt
+    else:
+      trainable_count += np.sum(
+          [K.count_params(p) for p in layer.trainable_weights])
+      non_trainable_count += np.sum(
+          [K.count_params(p) for p in layer.non_trainable_weights])
+  return trainable_count, non_trainable_count
+
+
+def convert_all_kernels_in_model(model):
+  """Converts all convolution kernels in a model from Theano to TensorFlow.
+
+  Also works from TensorFlow to Theano.
+
+  Arguments:
+      model: target model for the conversion.
+  """
+  # Note: SeparableConvolution not included
+  # since only supported by TF.
+  conv_classes = {
+      'Conv1D',
+      'Conv2D',
+      'Conv3D',
+      'Conv2DTranspose',
+  }
+  to_assign = []
+  for layer in model.layers:
+    if layer.__class__.__name__ in conv_classes:
+      original_kernel = K.get_value(layer.kernel)
+      converted_kernel = convert_kernel(original_kernel)
+      to_assign.append((layer.kernel, converted_kernel))
+  K.batch_set_value(to_assign)
+
+
+def convert_dense_weights_data_format(dense,
+                                      previous_feature_map_shape,
+                                      target_data_format='channels_first'):
+  """Utility useful when changing a convnet's `data_format`.
+
+  When porting the weights of a convnet from one data format to the other,
+  if the convnet includes a `Flatten` layer
+  (applied to the last convolutional feature map)
+  followed by a `Dense` layer, the weights of that `Dense` layer
+  should be updated to reflect the new dimension ordering.
+
+  Arguments:
+      dense: The target `Dense` layer.
+      previous_feature_map_shape: A shape tuple of 3 integers,
+          e.g. `(512, 7, 7)`. The shape of the convolutional
+          feature map right before the `Flatten` layer that
+          came before the target `Dense` layer.
+      target_data_format: One of "channels_last", "channels_first".
+          Set it "channels_last"
+          if converting a "chnnels_first" model to "channels_last",
+          or reciprocally.
+  """
+  assert target_data_format in {'channels_last', 'channels_first'}
+  kernel, bias = dense.get_weights()
+  for i in range(kernel.shape[1]):
+    if target_data_format == 'channels_first':
+      c, h, w = previous_feature_map_shape
+      original_fm_shape = (h, w, c)
+      ki = kernel[:, i].reshape(original_fm_shape)
+      ki = np.transpose(ki, (2, 0, 1))  # last -> first
+    else:
+      h, w, c = previous_feature_map_shape
+      original_fm_shape = (c, h, w)
+      ki = kernel[:, i].reshape(original_fm_shape)
+      ki = np.transpose(ki, (1, 2, 0))  # first -> last
+    kernel[:, i] = np.reshape(ki, (np.prod(previous_feature_map_shape),))
+  dense.set_weights([kernel, bias])
diff --git a/tensorflow/contrib/keras/python/keras/utils/np_utils.py b/tensorflow/contrib/keras/python/keras/utils/np_utils.py
new file mode 100644
index 0000000000..a23172d342
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/np_utils.py
@@ -0,0 +1,58 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Numpy-related utilities."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def to_categorical(y, num_classes=None):
+  """Converts a class vector (integers) to binary class matrix.
+
+  E.g. for use with categorical_crossentropy.
+
+  Arguments:
+      y: class vector to be converted into a matrix
+          (integers from 0 to num_classes).
+      num_classes: total number of classes.
+
+  Returns:
+      A binary matrix representation of the input.
+  """
+  y = np.array(y, dtype='int').ravel()
+  if not num_classes:
+    num_classes = np.max(y) + 1
+  n = y.shape[0]
+  categorical = np.zeros((n, num_classes))
+  categorical[np.arange(n), y] = 1
+  return categorical
+
+
+def normalize(x, axis=-1, order=2):
+  """Normalizes a Numpy array.
+
+  Arguments:
+      x: Numpy array to normalize.
+      axis: axis along which to normalize.
+      order: Normalization order (e.g. 2 for L2 norm).
+
+  Returns:
+      A normalized copy of the array.
+  """
+  l2 = np.atleast_1d(np.linalg.norm(x, order, axis))
+  l2[l2 == 0] = 1
+  return x / np.expand_dims(l2, axis)
diff --git a/tensorflow/contrib/keras/python/keras/utils/vis_utils.py b/tensorflow/contrib/keras/python/keras/utils/vis_utils.py
new file mode 100644
index 0000000000..49efa6040d
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/vis_utils.py
@@ -0,0 +1,125 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities related to model visualization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+
+try:
+  # pydot-ng is a fork of pydot that is better maintained.
+  import pydot_ng as pydot  # pylint: disable=g-import-not-at-top
+except ImportError:
+  # Fall back on pydot if necessary.
+  try:
+    import pydot  # pylint: disable=g-import-not-at-top
+  except ImportError:
+    pydot = None
+
+
+def _check_pydot():
+  if not (pydot and pydot.find_graphviz()):
+    raise ImportError('Failed to import pydot. You must install pydot'
+                      ' and graphviz for `pydotprint` to work.')
+
+
+def model_to_dot(model, show_shapes=False, show_layer_names=True):
+  """Converts a Keras model to dot format.
+
+  Arguments:
+      model: A Keras model instance.
+      show_shapes: whether to display shape information.
+      show_layer_names: whether to display layer names.
+
+  Returns:
+      A `pydot.Dot` instance representing the Keras model.
+  """
+  from tensorflow.contrib.keras.python.keras.layers.wrappers import Wrapper  # pylint: disable=g-import-not-at-top
+  from tensorflow.contrib.keras.python.keras.models import Sequential  # pylint: disable=g-import-not-at-top
+
+  _check_pydot()
+  dot = pydot.Dot()
+  dot.set('rankdir', 'TB')
+  dot.set('concentrate', True)
+  dot.set_node_defaults(shape='record')
+
+  if isinstance(model, Sequential):
+    if not model.built:
+      model.build()
+    model = model.model
+  layers = model.layers
+
+  # Create graph nodes.
+  for layer in layers:
+    layer_id = str(id(layer))
+
+    # Append a wrapped layer's label to node's label, if it exists.
+    layer_name = layer.name
+    class_name = layer.__class__.__name__
+    if isinstance(layer, Wrapper):
+      layer_name = '{}({})'.format(layer_name, layer.layer.name)
+      child_class_name = layer.layer.__class__.__name__
+      class_name = '{}({})'.format(class_name, child_class_name)
+
+    # Create node's label.
+    if show_layer_names:
+      label = '{}: {}'.format(layer_name, class_name)
+    else:
+      label = class_name
+
+    # Rebuild the label as a table including input/output shapes.
+    if show_shapes:
+      try:
+        outputlabels = str(layer.output_shape)
+      except AttributeError:
+        outputlabels = 'multiple'
+      if hasattr(layer, 'input_shape'):
+        inputlabels = str(layer.input_shape)
+      elif hasattr(layer, 'input_shapes'):
+        inputlabels = ', '.join([str(ishape) for ishape in layer.input_shapes])
+      else:
+        inputlabels = 'multiple'
+      label = '%s\n|{input:|output:}|{{%s}|{%s}}' % (label, inputlabels,
+                                                     outputlabels)
+
+    node = pydot.Node(layer_id, label=label)
+    dot.add_node(node)
+
+  # Connect nodes with edges.
+  for layer in layers:
+    layer_id = str(id(layer))
+    for i, node in enumerate(layer.inbound_nodes):
+      node_key = layer.name + '_ib-' + str(i)
+      if node_key in model.container_nodes:
+        for inbound_layer in node.inbound_layers:
+          inbound_layer_id = str(id(inbound_layer))
+          layer_id = str(id(layer))
+          dot.add_edge(pydot.Edge(inbound_layer_id, layer_id))
+  return dot
+
+
+def plot_model(model,
+               to_file='model.png',
+               show_shapes=False,
+               show_layer_names=True):
+  dot = model_to_dot(model, show_shapes, show_layer_names)
+  _, extension = os.path.splitext(to_file)
+  if not extension:
+    extension = 'png'
+  else:
+    extension = extension[1:]
+  dot.write(to_file, format=extension)
diff --git a/tensorflow/contrib/keras/python/keras/wrappers/__init__.py b/tensorflow/contrib/keras/python/keras/wrappers/__init__.py
new file mode 100644
index 0000000000..51244ff681
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/wrappers/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras API wrappers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.wrappers import scikit_learn
+
diff --git a/tensorflow/contrib/keras/python/keras/wrappers/scikit_learn.py b/tensorflow/contrib/keras/python/keras/wrappers/scikit_learn.py
new file mode 100644
index 0000000000..ecda890fec
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/wrappers/scikit_learn.py
@@ -0,0 +1,324 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""API wrapper allowing to use certain Keras models with the Scikit-Learn API.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import inspect
+import types
+
+import numpy as np
+
+from tensorflow.contrib.keras.python.keras.models import Sequential
+from tensorflow.contrib.keras.python.keras.utils.np_utils import to_categorical
+
+
+class BaseWrapper(object):
+  """Base class for the Keras scikit-learn wrapper.
+
+  Warning: This class should not be used directly.
+  Use descendant classes instead.
+
+  Arguments:
+      build_fn: callable function or class instance
+      **sk_params: model parameters & fitting parameters
+
+  The build_fn should construct, compile and return a Keras model, which
+  will then be used to fit/predict. One of the following
+  three values could be passed to build_fn:
+  1. A function
+  2. An instance of a class that implements the __call__ method
+  3. None. This means you implement a class that inherits from either
+  `KerasClassifier` or `KerasRegressor`. The __call__ method of the
+  present class will then be treated as the default build_fn.
+
+  `sk_params` takes both model parameters and fitting parameters. Legal model
+  parameters are the arguments of `build_fn`. Note that like all other
+  estimators in scikit-learn, 'build_fn' should provide default values for
+  its arguments, so that you could create the estimator without passing any
+  values to `sk_params`.
+
+  `sk_params` could also accept parameters for calling `fit`, `predict`,
+  `predict_proba`, and `score` methods (e.g., `epochs`, `batch_size`).
+  fitting (predicting) parameters are selected in the following order:
+
+  1. Values passed to the dictionary arguments of
+  `fit`, `predict`, `predict_proba`, and `score` methods
+  2. Values passed to `sk_params`
+  3. The default values of the `keras.models.Sequential`
+  `fit`, `predict`, `predict_proba` and `score` methods
+
+  When using scikit-learn's `grid_search` API, legal tunable parameters are
+  those you could pass to `sk_params`, including fitting parameters.
+  In other words, you could use `grid_search` to search for the best
+  `batch_size` or `epochs` as well as the model parameters.
+  """
+
+  def __init__(self, build_fn=None, **sk_params):
+    self.build_fn = build_fn
+    self.sk_params = sk_params
+    self.check_params(sk_params)
+
+  def check_params(self, params):
+    """Checks for user typos in "params".
+
+    Arguments:
+        params: dictionary; the parameters to be checked
+
+    Raises:
+        ValueError: if any member of `params` is not a valid argument.
+    """
+    legal_params_fns = [
+        Sequential.fit, Sequential.predict, Sequential.predict_classes,
+        Sequential.evaluate
+    ]
+    if self.build_fn is None:
+      legal_params_fns.append(self.__call__)
+    elif (not isinstance(self.build_fn, types.FunctionType) and
+          not isinstance(self.build_fn, types.MethodType)):
+      legal_params_fns.append(self.build_fn.__call__)
+    else:
+      legal_params_fns.append(self.build_fn)
+
+    legal_params = []
+    for fn in legal_params_fns:
+      legal_params += inspect.getargspec(fn)[0]
+    legal_params = set(legal_params)
+
+    for params_name in params:
+      if params_name not in legal_params:
+        if params_name != 'nb_epoch':
+          raise ValueError('{} is not a legal parameter'.format(params_name))
+
+  def get_params(self, **params):  # pylint: disable=unused-argument
+    """Gets parameters for this estimator.
+
+    Arguments:
+        **params: ignored (exists for API compatiblity).
+
+    Returns:
+        Dictionary of parameter names mapped to their values.
+    """
+    res = copy.deepcopy(self.sk_params)
+    res.update({'build_fn': self.build_fn})
+    return res
+
+  def set_params(self, **params):
+    """Sets the parameters of this estimator.
+
+    Arguments:
+        **params: Dictionary of parameter names mapped to their values.
+
+    Returns:
+        self
+    """
+    self.check_params(params)
+    self.sk_params.update(params)
+    return self
+
+  def fit(self, x, y, **kwargs):
+    """Constructs a new model with `build_fn` & fit the model to `(x, y)`.
+
+    Arguments:
+        x : array-like, shape `(n_samples, n_features)`
+            Training samples where n_samples in the number of samples
+            and n_features is the number of features.
+        y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
+            True labels for X.
+        **kwargs: dictionary arguments
+            Legal arguments are the arguments of `Sequential.fit`
+
+    Returns:
+        history : object
+            details about the training history at each epoch.
+    """
+    if self.build_fn is None:
+      self.model = self.__call__(**self.filter_sk_params(self.__call__))
+    elif (not isinstance(self.build_fn, types.FunctionType) and
+          not isinstance(self.build_fn, types.MethodType)):
+      self.model = self.build_fn(
+          **self.filter_sk_params(self.build_fn.__call__))
+    else:
+      self.model = self.build_fn(**self.filter_sk_params(self.build_fn))
+
+    loss_name = self.model.loss
+    if hasattr(loss_name, '__name__'):
+      loss_name = loss_name.__name__
+    if loss_name == 'categorical_crossentropy' and len(y.shape) != 2:
+      y = to_categorical(y)
+
+    fit_args = copy.deepcopy(self.filter_sk_params(Sequential.fit))
+    fit_args.update(kwargs)
+
+    history = self.model.fit(x, y, **fit_args)
+
+    return history
+
+  def filter_sk_params(self, fn, override=None):
+    """Filters `sk_params` and return those in `fn`'s arguments.
+
+    Arguments:
+        fn : arbitrary function
+        override: dictionary, values to override sk_params
+
+    Returns:
+        res : dictionary dictionary containing variables
+            in both sk_params and fn's arguments.
+    """
+    override = override or {}
+    res = {}
+    fn_args = inspect.getargspec(fn)[0]
+    for name, value in self.sk_params.items():
+      if name in fn_args:
+        res.update({name: value})
+    res.update(override)
+    return res
+
+
+class KerasClassifier(BaseWrapper):
+  """Implementation of the scikit-learn classifier API for Keras.
+  """
+
+  def predict(self, x, **kwargs):
+    """Returns the class predictions for the given test data.
+
+    Arguments:
+        x: array-like, shape `(n_samples, n_features)`
+            Test samples where n_samples in the number of samples
+            and n_features is the number of features.
+        **kwargs: dictionary arguments
+            Legal arguments are the arguments
+            of `Sequential.predict_classes`.
+
+    Returns:
+        preds: array-like, shape `(n_samples,)`
+            Class predictions.
+    """
+    kwargs = self.filter_sk_params(Sequential.predict_classes, kwargs)
+    return self.model.predict_classes(x, **kwargs)
+
+  def predict_proba(self, x, **kwargs):
+    """Returns class probability estimates for the given test data.
+
+    Arguments:
+        x: array-like, shape `(n_samples, n_features)`
+            Test samples where n_samples in the number of samples
+            and n_features is the number of features.
+        **kwargs: dictionary arguments
+            Legal arguments are the arguments
+            of `Sequential.predict_classes`.
+
+    Returns:
+        proba: array-like, shape `(n_samples, n_outputs)`
+            Class probability estimates.
+            In the case of binary classification,
+            tp match the scikit-learn API,
+            will return an array of shape '(n_samples, 2)'
+            (instead of `(n_sample, 1)` as in Keras).
+    """
+    kwargs = self.filter_sk_params(Sequential.predict_proba, kwargs)
+    probs = self.model.predict_proba(x, **kwargs)
+
+    # check if binary classification
+    if probs.shape[1] == 1:
+      # first column is probability of class 0 and second is of class 1
+      probs = np.hstack([1 - probs, probs])
+    return probs
+
+  def score(self, x, y, **kwargs):
+    """Returns the mean accuracy on the given test data and labels.
+
+    Arguments:
+        x: array-like, shape `(n_samples, n_features)`
+            Test samples where n_samples in the number of samples
+            and n_features is the number of features.
+        y: array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
+            True labels for x.
+        **kwargs: dictionary arguments
+            Legal arguments are the arguments of `Sequential.evaluate`.
+
+    Returns:
+        score: float
+            Mean accuracy of predictions on X wrt. y.
+
+    Raises:
+        ValueError: If the underlying model isn't configured to
+            compute accuracy. You should pass `metrics=["accuracy"]` to
+            the `.compile()` method of the model.
+    """
+    kwargs = self.filter_sk_params(Sequential.evaluate, kwargs)
+
+    loss_name = self.model.loss
+    if hasattr(loss_name, '__name__'):
+      loss_name = loss_name.__name__
+    if loss_name == 'categorical_crossentropy' and len(y.shape) != 2:
+      y = to_categorical(y)
+
+    outputs = self.model.evaluate(x, y, **kwargs)
+    if not isinstance(outputs, list):
+      outputs = [outputs]
+    for name, output in zip(self.model.metrics_names, outputs):
+      if name == 'acc':
+        return output
+    raise ValueError('The model is not configured to compute accuracy. '
+                     'You should pass `metrics=["accuracy"]` to '
+                     'the `model.compile()` method.')
+
+
+class KerasRegressor(BaseWrapper):
+  """Implementation of the scikit-learn regressor API for Keras.
+  """
+
+  def predict(self, x, **kwargs):
+    """Returns predictions for the given test data.
+
+    Arguments:
+        x: array-like, shape `(n_samples, n_features)`
+            Test samples where n_samples in the number of samples
+            and n_features is the number of features.
+        **kwargs: dictionary arguments
+            Legal arguments are the arguments of `Sequential.predict`.
+
+    Returns:
+        preds: array-like, shape `(n_samples,)`
+            Predictions.
+    """
+    kwargs = self.filter_sk_params(Sequential.predict, kwargs)
+    return np.squeeze(self.model.predict(x, **kwargs))
+
+  def score(self, x, y, **kwargs):
+    """Returns the mean loss on the given test data and labels.
+
+    Arguments:
+        x: array-like, shape `(n_samples, n_features)`
+            Test samples where n_samples in the number of samples
+            and n_features is the number of features.
+        y: array-like, shape `(n_samples,)`
+            True labels for X.
+        **kwargs: dictionary arguments
+            Legal arguments are the arguments of `Sequential.evaluate`.
+
+    Returns:
+        score: float
+            Mean accuracy of predictions on X wrt. y.
+    """
+    kwargs = self.filter_sk_params(Sequential.evaluate, kwargs)
+    loss = self.model.evaluate(x, y, **kwargs)
+    if isinstance(loss, list):
+      return loss[0]
+    return loss
diff --git a/tensorflow/contrib/keras/python/keras/wrappers/scikit_learn_test.py b/tensorflow/contrib/keras/python/keras/wrappers/scikit_learn_test.py
new file mode 100644
index 0000000000..95e0b951eb
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/wrappers/scikit_learn_test.py
@@ -0,0 +1,190 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Scikit-learn API wrapper."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.contrib.keras.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+INPUT_DIM = 5
+HIDDEN_DIM = 5
+TRAIN_SAMPLES = 10
+TEST_SAMPLES = 5
+NUM_CLASSES = 2
+BATCH_SIZE = 5
+EPOCHS = 1
+
+
+def build_fn_clf(hidden_dim):
+  model = keras.models.Sequential()
+  model.add(keras.layers.Dense(INPUT_DIM, input_shape=(INPUT_DIM,)))
+  model.add(keras.layers.Activation('relu'))
+  model.add(keras.layers.Dense(hidden_dim))
+  model.add(keras.layers.Activation('relu'))
+  model.add(keras.layers.Dense(NUM_CLASSES))
+  model.add(keras.layers.Activation('softmax'))
+  model.compile(
+      optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
+  return model
+
+
+def assert_classification_works(clf):
+  np.random.seed(42)
+  (x_train, y_train), (x_test, _) = testing_utils.get_test_data(
+      train_samples=TRAIN_SAMPLES,
+      test_samples=TEST_SAMPLES,
+      input_shape=(INPUT_DIM,),
+      num_classes=NUM_CLASSES)
+
+  clf.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS)
+
+  score = clf.score(x_train, y_train, batch_size=BATCH_SIZE)
+  assert np.isscalar(score) and np.isfinite(score)
+
+  preds = clf.predict(x_test, batch_size=BATCH_SIZE)
+  assert preds.shape == (TEST_SAMPLES,)
+  for prediction in np.unique(preds):
+    assert prediction in range(NUM_CLASSES)
+
+  proba = clf.predict_proba(x_test, batch_size=BATCH_SIZE)
+  assert proba.shape == (TEST_SAMPLES, NUM_CLASSES)
+  assert np.allclose(np.sum(proba, axis=1), np.ones(TEST_SAMPLES))
+
+
+def build_fn_reg(hidden_dim):
+  model = keras.models.Sequential()
+  model.add(keras.layers.Dense(INPUT_DIM, input_shape=(INPUT_DIM,)))
+  model.add(keras.layers.Activation('relu'))
+  model.add(keras.layers.Dense(hidden_dim))
+  model.add(keras.layers.Activation('relu'))
+  model.add(keras.layers.Dense(1))
+  model.add(keras.layers.Activation('linear'))
+  model.compile(
+      optimizer='sgd', loss='mean_absolute_error', metrics=['accuracy'])
+  return model
+
+
+def assert_regression_works(reg):
+  np.random.seed(42)
+  (x_train, y_train), (x_test, _) = testing_utils.get_test_data(
+      train_samples=TRAIN_SAMPLES,
+      test_samples=TEST_SAMPLES,
+      input_shape=(INPUT_DIM,),
+      num_classes=NUM_CLASSES)
+
+  reg.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS)
+
+  score = reg.score(x_train, y_train, batch_size=BATCH_SIZE)
+  assert np.isscalar(score) and np.isfinite(score)
+
+  preds = reg.predict(x_test, batch_size=BATCH_SIZE)
+  assert preds.shape == (TEST_SAMPLES,)
+
+
+class ScikitLearnAPIWrapperTest(test.TestCase):
+
+  def test_classify_build_fn(self):
+    with self.test_session():
+      clf = keras.wrappers.scikit_learn.KerasClassifier(
+          build_fn=build_fn_clf,
+          hidden_dim=HIDDEN_DIM,
+          batch_size=BATCH_SIZE,
+          epochs=EPOCHS)
+
+      assert_classification_works(clf)
+
+  def test_classify_class_build_fn(self):
+
+    class ClassBuildFnClf(object):
+
+      def __call__(self, hidden_dim):
+        return build_fn_clf(hidden_dim)
+
+    with self.test_session():
+      clf = keras.wrappers.scikit_learn.KerasClassifier(
+          build_fn=ClassBuildFnClf(),
+          hidden_dim=HIDDEN_DIM,
+          batch_size=BATCH_SIZE,
+          epochs=EPOCHS)
+
+      assert_classification_works(clf)
+
+  def test_classify_inherit_class_build_fn(self):
+
+    class InheritClassBuildFnClf(keras.wrappers.scikit_learn.KerasClassifier):
+
+      def __call__(self, hidden_dim):
+        return build_fn_clf(hidden_dim)
+
+    with self.test_session():
+      clf = InheritClassBuildFnClf(
+          build_fn=None,
+          hidden_dim=HIDDEN_DIM,
+          batch_size=BATCH_SIZE,
+          epochs=EPOCHS)
+
+      assert_classification_works(clf)
+
+  def test_regression_build_fn(self):
+    with self.test_session():
+      reg = keras.wrappers.scikit_learn.KerasRegressor(
+          build_fn=build_fn_reg,
+          hidden_dim=HIDDEN_DIM,
+          batch_size=BATCH_SIZE,
+          epochs=EPOCHS)
+
+      assert_regression_works(reg)
+
+  def test_regression_class_build_fn(self):
+
+    class ClassBuildFnReg(object):
+
+      def __call__(self, hidden_dim):
+        return build_fn_reg(hidden_dim)
+
+    with self.test_session():
+      reg = keras.wrappers.scikit_learn.KerasRegressor(
+          build_fn=ClassBuildFnReg(),
+          hidden_dim=HIDDEN_DIM,
+          batch_size=BATCH_SIZE,
+          epochs=EPOCHS)
+
+      assert_regression_works(reg)
+
+  def test_regression_inherit_class_build_fn(self):
+
+    class InheritClassBuildFnReg(keras.wrappers.scikit_learn.KerasRegressor):
+
+      def __call__(self, hidden_dim):
+        return build_fn_reg(hidden_dim)
+
+    with self.test_session():
+      reg = InheritClassBuildFnReg(
+          build_fn=None,
+          hidden_dim=HIDDEN_DIM,
+          batch_size=BATCH_SIZE,
+          epochs=EPOCHS)
+
+      assert_regression_works(reg)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 1fa6986288..3e750ed059 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -209,6 +209,7 @@ def _get_default_do_not_descend_map():
           'select',
           'util'
       ],
+      'contrib.keras': ['api', 'python'],
       'contrib.layers': ['feature_column', 'summaries'],
       'contrib.learn': [
           'datasets',
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index f934d5c8a7..431c9f883f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -120,6 +120,7 @@ sh_binary(
             ":simple_console",
             "//tensorflow:tensorflow_py",
             "//tensorflow/contrib/graph_editor:graph_editor_pip",
+            "//tensorflow/contrib/keras:keras",
             "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
             "//tensorflow/contrib/ndlstm:ndlstm",
             "//tensorflow/contrib/nn:nn_py",
author	Francois Chollet <fchollet@google.com>	2017-03-15 12:53:51 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-03-15 14:12:41 -0700
commit	f49f801276154d0f693c5d57db6977a7eb32f017 (patch)
tree	c9344fa01dde810067b72f0ed8d95a28889bc5e9
parent	d2bf01f51781611294babe1dee38e2e56de70809 (diff)