141 files changed, 4407 insertions, 602 deletions
diff --git a/.gitignore b/.gitignore
index 07dd151380..01f06be1a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 .DS_Store
 .ipynb_checkpoints
 node_modules
+/.bazelrc
 /bazel-*
 /third_party/py/numpy/numpy_include
 /tools/bazel.rc
@@ -13,4 +14,4 @@ node_modules
 *.pyc
 __pycache__
 *.swp
-.vscode/
-\ No newline at end of file
+.vscode/
diff --git a/RELEASE.md b/RELEASE.md
index b223f51730..5f261a4543 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,10 @@
+# Release 1.0.1
+
+## Bug Fixes and Other Changes
+* Change GraphConstructor to not increase the version when importing, but instead take the min of all versions.
+* Google Cloud Storage fixes.
+* Removed `tf.core` and `tf.python` modules from the API. These were never intended to be exposed. Please use the same objects through the top-level `tf` module instead.
+
 # Release 1.0.0
 
 ## Major Features and Improvements
@@ -88,6 +95,8 @@ To help you upgrade your existing TensorFlow Python code to match the API change
   from the tensorflow::ops namespace to tensorflow.
 * Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args.
 * tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn.  They will be moved back into core for TF 1.1.
+* `tf.nn.sampled_softmax_loss` and `tf.nn.nce_loss` have both changed their API: the `inputs, labels` parameters must now be passed in the order `labels, inputs`.
+* The `shape` keyword argument of the `SparseTensor` constructor was renamed to `dense_shape` between TensorFlow 0.12 and TensorFlow 1.0.
 
 ## Bug Fixes and Other Changes
 * Numerous C++ API updates.
diff --git a/WORKSPACE b/WORKSPACE
index 72fa0d8949..6ec1a7df3e 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -14,12 +14,7 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
 
 closure_repositories()
 
-load("//tensorflow:workspace.bzl", "check_version", "tf_workspace")
-
-# We must check the bazel version before trying to parse any other BUILD files,
-# in case the parsing of those build files depends on the bazel version we
-# require here.
-check_version("0.4.2")
+load("//tensorflow:workspace.bzl", "tf_workspace")
 
 # Uncomment and update the paths in these entries to build the Android demo.
 #android_sdk_repository(
diff --git a/configure b/configure
index 05daa23d70..081db20d75 100755
--- a/configure
+++ b/configure
@@ -8,6 +8,9 @@ pushd `dirname $0` > /dev/null
 SOURCE_BASE_DIR=`pwd -P`
 popd > /dev/null
 
+# This file contains customized config settings.
+touch .bazelrc
+
 PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
 
 function is_linux() {
@@ -36,15 +39,11 @@ function is_windows() {
 }
 
 function bazel_clean_and_fetch() {
-  # bazel clean --expunge currently doesn't work on Windows
-  # TODO(pcloudy): Re-enable it after bazel clean --expunge is fixed.
-  if ! is_windows; then
-    bazel clean --expunge
-  fi
   if [ -z "$TF_BAZEL_TARGETS" ]; then
-    TF_BAZEL_TARGETS="//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
+    bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
+  else
+    bazel fetch $TF_BAZEL_TARGETS
   fi
-  bazel fetch "$TF_BAZEL_TARGETS"
 }
 
 function sed_hyphen_i() {
@@ -102,8 +101,8 @@ if false; then # Disable building with MKL for now
 
   if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL
     DST=`dirname $0`
-    ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170110.tgz
-    GITHUB_RELEASE_TAG=v0.3
+    ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170209.tgz
+    GITHUB_RELEASE_TAG=v0.5
     MKLURL="https://github.com/01org/mkl-dnn/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME"
     if ! [ -e "$DST/third_party/mkl/$ARCHIVE_BASENAME" ]; then
       wget --no-check-certificate -P $DST/third_party/mkl/ $MKLURL
@@ -182,13 +181,12 @@ else
   TF_NEED_JEMALLOC=0
 fi
 
-if [ "$TF_NEED_JEMALLOC" == "1" ]; then
-  sed_hyphen_i -e "s/WITH_JEMALLOC = False/WITH_JEMALLOC = True/" tensorflow/core/platform/default/build_config.bzl
-else
-  sed_hyphen_i -e "s/WITH_JEMALLOC = True/WITH_JEMALLOC = False/" tensorflow/core/platform/default/build_config.bzl
+sed_hyphen_i -e "/with_jemalloc/d" .bazelrc
+if [[ "$TF_NEED_JEMALLOC" == "1" ]]; then
+  echo 'build --define with_jemalloc=true' >>.bazelrc
 fi
 
-while [ "$TF_NEED_GCP" == "" ]; do
+while [[ "$TF_NEED_GCP" == "" ]]; do
   read -p "Do you wish to build TensorFlow with "\
 "Google Cloud Platform support? [y/N] " INPUT
   case $INPUT in
@@ -202,23 +200,12 @@ while [ "$TF_NEED_GCP" == "" ]; do
   esac
 done
 
-if [ "$TF_NEED_GCP" == "1" ]; then
-  ## Verify that libcurl header files are available.
-  # Only check Linux, since on MacOS the header files are installed with XCode.
-  if is_linux && [[ ! -f "/usr/include/curl/curl.h" ]]; then
-    echo "ERROR: It appears that the development version of libcurl is not "\
-"available. Please install the libcurl3-dev package."
-    exit 1
-  fi
-
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/WITH_GCP_SUPPORT = False/WITH_GCP_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl
-else
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/WITH_GCP_SUPPORT = True/WITH_GCP_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl
+sed_hyphen_i -e "/with_gcp_support/d" .bazelrc
+if [[ "$TF_NEED_GCP" == "1" ]]; then
+  echo 'build --define with_gcp_support=true' >>.bazelrc
 fi
 
-while [ "$TF_NEED_HDFS" == "" ]; do
+while [[ "$TF_NEED_HDFS" == "" ]]; do
   read -p "Do you wish to build TensorFlow with "\
 "Hadoop File System support? [y/N] " INPUT
   case $INPUT in
@@ -232,16 +219,13 @@ while [ "$TF_NEED_HDFS" == "" ]; do
   esac
 done
 
-if [ "$TF_NEED_HDFS" == "1" ]; then
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/WITH_HDFS_SUPPORT = False/WITH_HDFS_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl
-else
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/WITH_HDFS_SUPPORT = True/WITH_HDFS_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl
+sed_hyphen_i -e "/with_hdfs_support/d" .bazelrc
+if [[ "$TF_NEED_HDFS" == "1" ]]; then
+  echo 'build --define with_hdfs_support=true' >>.bazelrc
 fi
 
 ## Enable XLA.
-while [ "$TF_ENABLE_XLA" == "" ]; do
+while [[ "$TF_ENABLE_XLA" == "" ]]; do
   read -p "Do you wish to build TensorFlow with the XLA just-in-time compiler (experimental)? [y/N] " INPUT
   case $INPUT in
     [Yy]* ) echo "XLA JIT support will be enabled for TensorFlow"; TF_ENABLE_XLA=1;;
@@ -251,12 +235,9 @@ while [ "$TF_ENABLE_XLA" == "" ]; do
   esac
 done
 
-if [ "$TF_ENABLE_XLA" == "1" ]; then
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/^WITH_XLA_SUPPORT = [FT].*/WITH_XLA_SUPPORT = True/" tensorflow/core/platform/default/build_config_root.bzl
-else
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/^WITH_XLA_SUPPORT = [FT].*/WITH_XLA_SUPPORT = False/" tensorflow/core/platform/default/build_config_root.bzl
+sed_hyphen_i -e "/with_xla_support/d" .bazelrc
+if [[ "$TF_ENABLE_XLA" == "1" ]]; then
+  echo 'build --define with_xla_support=true' >>.bazelrc
 fi
 
 
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index a2e74f40c3..1956cb0763 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -110,6 +110,34 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+# TODO(jhseu): Enable on platforms other than Linux.
+config_setting(
+    name = "with_jemalloc",
+    values = {
+        "cpu": "k8",
+        "define": "with_jemalloc=true",
+    },
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_gcp_support",
+    values = {"define": "with_gcp_support=true"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_hdfs_support",
+    values = {"define": "with_hdfs_support=true"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_xla_support",
+    values = {"define": "with_xla_support=true"},
+    visibility = ["//visibility:public"],
+)
+
 package_group(
     name = "internal",
     packages = ["//tensorflow/..."],
@@ -321,6 +349,8 @@ cc_binary(
     deps = [
         "//tensorflow/c:c_api",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:scope",
         "//tensorflow/core:tensorflow",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 998ca7d21f..8f169cd036 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -138,7 +138,8 @@ tensorflow::Status AllocationTracker::DeallocateShape(
     TF_RET_CHECK(ShapeUtil::TupleElementCount(shape) == elements.size())
         << "tuple has unexpected number of elements: " << elements.size()
         << " != " << ShapeUtil::TupleElementCount(shape);
-    for (int i = 0; i < elements.size(); ++i) {
+    for (std::vector<se::DeviceMemoryBase>::size_type i = 0;
+         i < elements.size(); ++i) {
       VLOG(2) << "recursing onto the tuple elements";
       TF_RETURN_IF_ERROR(DeallocateShape(backend, device_ordinal, &elements[i],
                                          shape.tuple_shapes(i),
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index aa512f242a..715d3f33bc 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -118,10 +118,10 @@ GenericTransferManager::ShallowCopyTupleFromDevice(
 
   // Create a DeviceMemoryBase from each void* pointer.
   std::vector<se::DeviceMemoryBase> destination;
-  for (int i = 0; i < element_pointers.size(); ++i) {
+  for (std::vector<void*>::size_type i = 0; i < element_pointers.size(); ++i) {
     if (element_pointers[i] == nullptr &&
         !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
-      return FailedPrecondition("tuple contains nullptr at element %d", i);
+      return FailedPrecondition("tuple contains nullptr at element %lu", i);
     }
     int64 buffer_size = ShapeUtil::ByteSizeOf(shape.tuple_shapes(i),
                                               /*pointer_size=*/sizeof(void*));
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index d88315e747..60593afb8c 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -256,7 +256,8 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
     tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
     const Backend* backend, int device_ordinal) {
   std::vector<const Allocation*> allocations;
-  for (int i = 0; i < arguments.size(); ++i) {
+  for (tensorflow::gtl::ArraySlice<const GlobalDataHandle*>::size_type i = 0; 
+       i < arguments.size(); ++i) {
     auto allocation_status = allocation_tracker_.Resolve(*arguments[i]);
     if (!allocation_status.ok()) {
       return Status(allocation_status.status().code(),
@@ -269,7 +270,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
     if (allocation->backend() != backend ||
         allocation->device_ordinal() != device_ordinal) {
       return InvalidArgument(
-          "argument %d is on device %s but computation will be executed "
+          "argument %lu is on device %s but computation will be executed "
           "on device %s",
           i,
           allocation->backend()
@@ -295,13 +296,14 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
                            program_shape.parameters_size(), arguments.size());
   }
 
-  for (int i = 0; i < arguments.size(); ++i) {
+  for (tensorflow::gtl::ArraySlice<const Allocation*>::size_type i = 0;
+       i < arguments.size(); ++i) {
     // Verify that shape of arguments matches the shape of the arguments in the
     // ProgramShape.
     if (!ShapeUtil::Compatible(arguments[i]->shape(),
                                program_shape.parameters(i))) {
       return InvalidArgument(
-          "computation expects parameter %d to have shape %s, given shape %s",
+          "computation expects parameter %lu to have shape %s, given shape %s",
           i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
           ShapeUtil::HumanString(arguments[i]->shape()).c_str());
     }
@@ -383,7 +385,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
                           hlo_dumper, std::move(executors)));
 
   if (!other_directory_path.empty()) {
-    for (int64 i = 0; i < versioned_handles.size(); ++i) {
+    for (std::vector<VersionedComputationHandle>::size_type i = 0;
+         i < versioned_handles.size(); ++i) {
       executables[i]->set_session_module(std::move(session_modules[i]));
     }
   }
@@ -523,7 +526,8 @@ Service::ExecuteParallelAndRegisterResult(
 
   // Asynchronously launch all executables.
   std::vector<GlobalDataHandle> result_handles;
-  for (int64 i = 0; i < executables.size(); i++) {
+  for (tensorflow::gtl::ArraySlice<Executable*>::size_type i = 0;
+       i < executables.size(); i++) {
     TF_ASSIGN_OR_RETURN(
         perftools::gputools::DeviceMemoryBase result,
         executables[i]->ExecuteAsyncOnStream(&run_options[i], arguments[i]));
diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD
index acd82dc21e..952f24f34b 100644
--- a/tensorflow/contrib/android/BUILD
+++ b/tensorflow/contrib/android/BUILD
@@ -72,13 +72,17 @@ LINKER_SCRIPT = "//tensorflow/contrib/android:jni/version_script.lds"
 cc_binary(
     name = "libtensorflow_inference.so",
     srcs = [],
-    copts = tf_copts(),
+    copts = tf_copts() + [
+        "-ffunction-sections",
+        "-fdata-sections",
+    ],
     linkopts = if_android([
         "-landroid",
         "-llog",
         "-lm",
         "-z defs",
         "-s",
+        "-Wl,--gc-sections",
         "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
         LINKER_SCRIPT,
     ]),
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 043a69f264..3c8dc869af 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -56,9 +56,10 @@ mark_as_advanced(DOWNLOAD_LOCATION)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
-  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\")
+  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
   add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
   add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
+  add_definitions(-DTF_COMPILE_LIBRARY)
   add_definitions(-DNDEBUG /O2)  # Equivalent of -c opt in Bazel.
   add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
   # Suppress warnings to reduce build log size.
@@ -190,6 +191,7 @@ if (tensorflow_ENABLE_GPU)
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_HOME}/include/cudnn.h
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h
       DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
     )
     include_directories(${tensorflow_source_dir}/third_party/gpus)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 8e7f43b511..2641d5292d 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -13,7 +13,7 @@ Linux.
 Current Status
 --------------
 
-CMake can be used to build TensorFlow on Windows. See the [getting started documentation](https://www.tensorflow.org/get_started/os_setup.html#pip-installation-on-windows)
+CMake can be used to build TensorFlow on Windows. See the [getting started documentation](https://www.tensorflow.org/install/install_windows)
 for instructions on how to install a pre-built TensorFlow package on Windows.
 
 ### Current known limitations
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index bca700aca2..936196dd20 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -120,3 +120,43 @@ list(REMOVE_ITEM tf_cc_srcs ${tf_cc_test_srcs})
 
 add_library(tf_cc OBJECT ${tf_cc_srcs})
 add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
+
+set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
+add_custom_target(tf_extension_ops)
+
+function(AddUserOps)
+  cmake_parse_arguments(_AT "" "" "TARGET;SOURCES;GPUSOURCES;DEPENDS;DISTCOPY" ${ARGN})
+  if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
+    # if gpu build is enabled and we have gpu specific code,
+    # hint to cmake that this needs to go to nvcc
+    set (gpu_source ${_AT_GPUSOURCES})
+    set (gpu_lib "${_AT_TARGET}_gpu")
+    set_source_files_properties(${gpu_source} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+    cuda_compile(gpu_lib ${gpu_source})
+  endif()
+  # create shared library from source and cuda obj
+  add_library(${_AT_TARGET} SHARED ${_AT_SOURCES} ${gpu_lib})
+  target_link_libraries(${_AT_TARGET} ${pywrap_tensorflow_lib})
+  if(WIN32)
+    if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
+        # some ops call out to cuda directly; need to link libs for the cuda dlls
+        target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
+    endif()
+    if (_AT_DISTCOPY)
+        add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
+    endif()
+  endif()
+  if (_AT_DEPENDS)
+    add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
+  endif()
+  # make sure TF_COMPILE_LIBRARY is not defined for this target
+  get_target_property(target_compile_flags  ${_AT_TARGET} COMPILE_FLAGS)
+  if(target_compile_flags STREQUAL "target_compile_flags-NOTFOUND")
+    set(target_compile_flags "/UTF_COMPILE_LIBRARY")
+  else()
+    set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
+  endif()
+  set_target_properties(${_AT_TARGET} PROPERTIES COMPILE_FLAGS ${target_compile_flags})
+  add_dependencies(tf_extension_ops ${_AT_TARGET})
+endfunction(AddUserOps)
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 691dee9ef0..3787ac4c81 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -199,7 +199,6 @@ add_custom_command(OUTPUT
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
     --raw_generate ${VERSION_INFO_CC}
     DEPENDS __force_rebuild)
-
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
 ########################################################
@@ -238,3 +237,9 @@ add_dependencies(tf_core_framework
     tf_core_lib
     proto_text
 )
+
+if(WIN32)
+  # CMake > 3.6 will quote this as -D"__VERSION__=\"MSVC\"" which nvcc fails on.
+  # Instead of defining this globally, limit it to tf_core_framework where it's used.
+  target_compile_definitions(tf_core_framework PRIVATE __VERSION__="MSVC")
+endif()
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index dd28817b54..33384eed48 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -93,6 +93,12 @@ if(WIN32)
       "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
       "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
       "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
+      # not in tensorflow.dll - comes from .so
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc"
   )
   list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs})
 endif(WIN32)
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 2c21154217..2ecc08f421 100644..100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -623,12 +623,7 @@ add_custom_command(
       COMMENT "Running SWIG to generate Python wrappers"
       VERBATIM )
 
-# pywrap_tensorflow_internal is a shared library containing all of the
-# TensorFlow runtime and the standard ops and kernels. These are installed into
-# tf_python/tensorflow/python/.
-# TODO(mrry): Refactor this to expose a framework library that
-# facilitates `tf.load_op_library()`.
-add_library(pywrap_tensorflow_internal SHARED
+set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.h"
     "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.cc"
     "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.h"
@@ -652,6 +647,55 @@ add_library(pywrap_tensorflow_internal SHARED
     "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.cc"
     "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.h"
     "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.cc"
+)
+
+if(WIN32)
+    # Windows: build a static library with the same objects as tensorflow.dll.
+    # This can be used to build for a standalone exe and also helps us to
+    # find all symbols that need to be exported from the dll which is needed
+    # to provide the tensorflow c/c++ api in tensorflow.dll.
+    # From the static library we create the def file with all symbols that need to
+    # be exported from tensorflow.dll. Because there is a limit of 64K symbols
+    # that can be exported, we filter the symbols with a python script to the namespaces
+    # we need.
+    #
+    add_library(pywrap_tensorflow_internal_static STATIC
+        ${pywrap_tensorflow_internal_src}
+        $<TARGET_OBJECTS:tf_core_lib>
+        $<TARGET_OBJECTS:tf_core_cpu>
+        $<TARGET_OBJECTS:tf_core_framework>
+        $<TARGET_OBJECTS:tf_core_ops>
+        $<TARGET_OBJECTS:tf_core_direct_session>
+        $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
+        $<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
+        $<TARGET_OBJECTS:tf_core_kernels>
+        $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+        $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
+    )
+    target_include_directories(pywrap_tensorflow_internal_static PUBLIC
+        ${PYTHON_INCLUDE_DIR}
+        ${NUMPY_INCLUDE_DIR}
+    )
+    target_link_libraries(pywrap_tensorflow_internal_static
+        tf_protos_cc
+        tf_python_protos_cc
+    )
+    set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow.def")
+    set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE)
+
+    add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD
+        COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
+            --input $<TARGET_FILE:pywrap_tensorflow_internal_static>
+            --output ${pywrap_tensorflow_deffile}
+    )
+endif(WIN32)
+
+
+# pywrap_tensorflow_internal is a shared library containing all of the
+# TensorFlow runtime and the standard ops and kernels. These are installed into
+# tf_python/tensorflow/python/.
+add_library(pywrap_tensorflow_internal SHARED
+    ${pywrap_tensorflow_internal_src}
     $<TARGET_OBJECTS:tf_core_lib>
     $<TARGET_OBJECTS:tf_core_cpu>
     $<TARGET_OBJECTS:tf_core_framework>
@@ -662,7 +706,13 @@ add_library(pywrap_tensorflow_internal SHARED
     $<TARGET_OBJECTS:tf_core_kernels>
     $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
     $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
+    ${pywrap_tensorflow_deffile}
 )
+
+if(WIN32)
+    add_dependencies(pywrap_tensorflow_internal pywrap_tensorflow_internal_static)
+endif(WIN32)
+
 target_include_directories(pywrap_tensorflow_internal PUBLIC
     ${PYTHON_INCLUDE_DIR}
     ${NUMPY_INCLUDE_DIR}
@@ -675,6 +725,44 @@ target_link_libraries(pywrap_tensorflow_internal
     ${PYTHON_LIBRARIES}
 )
 
+if(WIN32)
+    # include contrib/rnn as .so
+    #
+    set(tf_gru_srcs
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.h"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.h"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
+    )
+    set(tf_gru_gpu_srcs
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops_gpu.cu.cc"
+    )
+
+    set(tf_lstm_srcs
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.h"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.h"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc"
+    )
+    set(tf_lstm_gpu_srcs
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc"
+    )
+
+    AddUserOps(TARGET _gru_ops
+        SOURCES "${tf_gru_srcs}"
+        GPUSOURCES ${tf_gru_gpu_srcs}
+        DEPENDS pywrap_tensorflow_internal tf_python_ops
+        DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
+
+    AddUserOps(TARGET _lstm_ops
+        SOURCES "${tf_lstm_srcs}"
+        GPUSOURCES ${tf_lstm_gpu_srcs}
+        DEPENDS pywrap_tensorflow_internal tf_python_ops
+        DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
+endif(WIN32)
+
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
 ############################################################
@@ -684,14 +772,17 @@ add_dependencies(tf_python_build_pip_package
     tensorboard_copy_dependencies
     tf_python_copy_scripts_to_destination
     tf_python_touchup_modules
-    tf_python_ops)
+    tf_python_ops
+    tf_extension_ops)
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/setup.py
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
 if(WIN32)
   add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.dll
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd)
+                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib
+                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
 else()
   add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 711b5c49f4..449a762a9a 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -115,7 +115,14 @@ if (tensorflow_BUILD_PYTHON_TESTS)
   #
 
   # include all test
+  if (WIN32)
+    file(GLOB_RECURSE tf_test_rnn_src_py
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/python/kernel_tests/*_test.py"
+    )
+  endif()
+ 
   file(GLOB_RECURSE tf_test_src_py
+    ${tf_test_rnn_src_py}
     "${tensorflow_source_dir}/tensorflow/python/debug/cli/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/lib/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/wrappers/*_test.py"
diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake
index 2aaa9ed53e..5151fdb444 100644
--- a/tensorflow/contrib/cmake/tf_tools.cmake
+++ b/tensorflow/contrib/cmake/tf_tools.cmake
@@ -106,3 +106,22 @@ target_link_libraries(${compare_graphs} PUBLIC
   ${tf_core_gpu_kernels_lib}
   ${tensorflow_EXTERNAL_LIBRARIES}
 )
+
+set(benchmark_model "benchmark_model")
+
+add_executable(${benchmark_model}
+    "${tensorflow_source_dir}/tensorflow/tools/benchmark/benchmark_model.cc"
+    "${tensorflow_source_dir}/tensorflow/tools/benchmark/benchmark_model_main.cc"
+    $<TARGET_OBJECTS:tf_core_lib>
+    $<TARGET_OBJECTS:tf_core_cpu>
+    $<TARGET_OBJECTS:tf_core_framework>
+    $<TARGET_OBJECTS:tf_core_ops>
+    $<TARGET_OBJECTS:tf_core_direct_session>
+    $<TARGET_OBJECTS:tf_core_kernels>
+)
+
+target_link_libraries(${benchmark_model} PUBLIC
+  tf_protos_cc
+  ${tf_core_gpu_kernels_lib}
+  ${tensorflow_EXTERNAL_LIBRARIES}
+)
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
new file mode 100644
index 0000000000..950c8f79bc
--- /dev/null
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -0,0 +1,134 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""
+create_def_file.py - tool to create a windows def file to export
+symbols from tensorflow.dll to enable tf.load_library().
+Because the linker allows only 64K symbols to be exported per dll
+we filter the symbols down to the essentials. The regular expressions
+we use for this are specific to tensorflow.
+
+TODO: this works fine but there is an issue with exporting
+'const char * const' and importing it from a user_ops. The problem is
+on the importing end and using __declspec(dllimport) works around it.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import io
+import os
+import re
+import sys
+import tempfile
+from subprocess import Popen, PIPE
+
+# External tools we use; they ship with the Visual Studio SDK, and
+# we assume that the caller has the correct PATH to the SDK.
+UNDNAME = "undname.exe"
+DUMPBIN = "dumpbin.exe"
+
+# Exclude if matched
+EXCLUDE_RE = re.compile(r"deleting destructor|::internal::")
+
+# Include if matched before exclude
+INCLUDEPRE_RE = re.compile(r"tensorflow::internal::LogMessage|" +
+                           r"tensorflow::internal::CheckOpMessageBuilder")
+
+# Include if matched after exclude
+INCLUDE_RE = re.compile(r"^(TF_\w*)$|" +
+                        r"tensorflow::|" +
+                        r"functor::|" +
+                        r"perftools::gputools")
+
+
+def get_args():
+  """Parse command line."""
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--input", help="input library", required=True)
+  parser.add_argument("--output", help="output deffile", required=True)
+  args = parser.parse_args()
+  return args
+
+
+def main():
+  """main."""
+  args = get_args()
+
+  # Pipe dumpbin to extract all linkable symbols from a lib.
+  # Good symbols are collected in candidates and also written to
+  # a temp file.
+  candidates = []
+  tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
+  proc = Popen([DUMPBIN, "/nologo", "/linkermember:1", args.input], stdout=PIPE)
+  for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
+    cols = line.split()
+    if len(cols) < 2:
+      continue
+    sym = cols[1]
+    tmpfile.file.write(sym + "\n")
+    candidates.append(sym)
+  tmpfile.file.close()
+  exit_code = proc.wait()
+  if exit_code != 0:
+    print("{} failed, exit={}".format(DUMPBIN, exit_code))
+    return exit_code
+
+  # Run the symbols through undname to get their undecorated name
+  # so we can filter on something readable.
+  with open(args.output, "w") as def_fp:
+    # track dupes
+    taken = set()
+
+    # Header for the def file. Since tensorflow.dll is actually called
+    # _pywrap_tensorflow_internal.pyd in the python wheel, hint that here.
+    def_fp.write("LIBRARY _pywrap_tensorflow_internal.pyd\n")
+    def_fp.write("EXPORTS\n")
+    def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
+
+    # Each symbol returned by undname matches the same position in candidates.
+    # We compare on undname but use the decorated name from candidates.
+    dupes = 0
+    proc = Popen([UNDNAME, tmpfile.name], stdout=PIPE)
+    for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")):
+      decorated = candidates[idx]
+      if decorated in taken:
+        # Symbol is already in output, done.
+        dupes += 1
+        continue
+
+      if not INCLUDEPRE_RE.search(line):
+        if EXCLUDE_RE.search(line):
+          continue
+        if not INCLUDE_RE.search(line):
+          continue
+
+      def_fp.write("\t" + decorated + "\n")
+      taken.add(decorated)
+  exit_code = proc.wait()
+  if exit_code != 0:
+    print("{} failed, exit={}".format(UNDNAME, exit_code))
+    return exit_code
+
+  os.unlink(tmpfile.name)
+
+  print("symbols={}, taken={}, dupes={}"
+        .format(len(candidates), len(taken), dupes))
+  return 0
+
+
+if __name__ == "__main__":
+  sys.exit(main())
diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py
index 4cd3efafa0..5d078236ac 100644
--- a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py
+++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py
@@ -280,10 +280,11 @@ def init_from_checkpoint(checkpoint_dir, assignment_map):
       for var_name in scope_variables:
         # Lookup name with specified prefix and suffix from current variable.
         # If tensor_name given is '/' (root), don't use it for full name.
+        full_tensor_name = var_name[len(scopes):]
+        if current_var_or_name != "/":
+          full_tensor_name = full_tensor_name[1:]
         if tensor_name_in_ckpt != "/":
-          full_tensor_name = tensor_name_in_ckpt + var_name[len(scopes) + 1:]
-        else:
-          full_tensor_name = var_name[len(scopes) + 1:]
+          full_tensor_name = tensor_name_in_ckpt + full_tensor_name
         if full_tensor_name not in variable_map:
           raise ValueError(
               "Tensor %s (%s in %s) is not found in %s checkpoint" % (
diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
index 09eecb56dc..51ca5ec125 100644
--- a/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
+++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
@@ -168,6 +168,29 @@ class CheckpointsTest(test.TestCase):
         self.assertAllEqual(my3.eval(session), v3)
         self.assertAllEqual(my4.eval(session), v4)
 
+  def testInitToRootCheckpoint(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)
+
+    # New graph and session.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        my1 = variable_scope.get_variable("var1", [1, 10])
+        my2 = variable_scope.get_variable("var2", [10, 10])
+        my3 = variable_scope.get_variable("var3", [100, 100])
+        with variable_scope.variable_scope("useful_scope"):
+          my4 = variable_scope.get_variable("var4", [9, 9])
+
+        checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                              {"/": "/",})
+
+        session.run(variables.global_variables_initializer())
+        self.assertAllEqual(my1.eval(session), v1)
+        self.assertAllEqual(my2.eval(session), v2)
+        self.assertAllEqual(my3.eval(session), v3)
+        self.assertAllEqual(my4.eval(session), v4)
+
   def testInitFromPartitionVar(self):
     checkpoint_dir = self.get_temp_dir()
     with self.test_session() as session:
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py
index b7ec9ba936..ad84cd681a 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope.py
@@ -30,11 +30,15 @@
     net = layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
     net = layers.conv2d(net, 256, [5, 5], scope='conv2')
   ```
-  The first call to conv2d will use predefined args:
-    layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID', ..., scope='conv1')
+  The first call to conv2d will behave as follows:
+    layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID',
+                  initializer=layers.variance_scaling_initializer(),
+                  regularizer=layers.l2_regularizer(0.05), scope='conv1')
 
-  The second call to conv2d will overwrite padding:
-    layers.conv2d(inputs, 256, [5, 5], padding='SAME', ..., scope='conv2')
+  The second call to conv2d will also use the arg_scope's default for padding:
+    layers.conv2d(inputs, 256, [5, 5], padding='SAME',
+                  initializer=layers.variance_scaling_initializer(),
+                  regularizer=layers.l2_regularizer(0.05), scope='conv2')
 
   Example of how to reuse an arg_scope:
 
@@ -49,7 +53,7 @@
     net = layers.conv2d(net, 256, [5, 5], scope='conv2')
   ```
 
-  Example of how to use tf.contrib.framework.add_arg_scope:
+  Example of how to use tf.contrib.framework.add_arg_scope to enable your function to be called within an arg_scope later:
 
   @tf.contrib.framework.add_arg_scope
   def conv2d(*args, **kwargs)
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index e746107e36..6ba8f7e8ae 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -40,6 +40,7 @@ See the @{$python/contrib.layers} guide.
 @@softmax
 @@stack
 @@unit_norm
+@@bow_encoder
 @@embed_sequence
 
 @@apply_regularization
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index adbbcea02f..07be8e9990 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -160,9 +160,8 @@ def _fused_batch_norm(
   they need to be added as a dependency to the `train_op`, example:
 
     update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-    if update_ops:
-      updates = tf.group(*update_ops)
-      total_loss = control_flow_ops.with_dependencies([updates], total_loss)
+    with tf.control_dependencies(update_ops):
+      train_op = optimizer.minimize(loss)
 
   One can set updates_collections=None to force the updates in place, but that
   can have speed penalty, especially in distributed settings.
@@ -393,9 +392,8 @@ def batch_norm(inputs,
   they need to be added as a dependency to the `train_op`, example:
 
     update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-    if update_ops:
-      updates = tf.group(*update_ops)
-      total_loss = control_flow_ops.with_dependencies([updates], total_loss)
+    with tf.control_dependencies(update_ops):
+      train_op = optimizer.minimize(loss)
 
   One can set updates_collections=None to force the updates in place, but that
   can have speed penalty, especially in distributed settings.
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py
index 85cef3d8db..bd56066b1b 100644
--- a/tensorflow/contrib/learn/__init__.py
+++ b/tensorflow/contrib/learn/__init__.py
@@ -33,6 +33,7 @@ See the @{$python/contrib.learn} guide.
 @@DNNLinearCombinedRegressor
 @@DNNLinearCombinedEstimator
 @@DNNLinearCombinedClassifier
+@@DynamicRnnEstimator
 @@LinearClassifier
 @@LinearEstimator
 @@LinearRegressor
diff --git a/tensorflow/contrib/learn/python/learn/README.md b/tensorflow/contrib/learn/python/learn/README.md
index f412c83a97..0aae178e9a 100644
--- a/tensorflow/contrib/learn/python/learn/README.md
+++ b/tensorflow/contrib/learn/python/learn/README.md
@@ -20,18 +20,17 @@ Optionally you can install [scikit-learn](http://scikit-learn.org/stable/) and [
 
 ### Tutorials
 
--   [TF Learn Quickstart](../../../../g3doc/tutorials/tflearn/index.md). Build,
+-   [TF Learn Quickstart](https://www.tensorflow.org/get_started/tflearn). Build,
     train, and evaluate a neural network with just a few lines of code.
--   [Input Functions](../../../../g3doc/tutorials/input_fn/index.md). Learn how
+-   [Input Functions](https://www.tensorflow.org/get_started/input_fn). Learn how
     to create input functions to feed data into your models.
--   [Linear Model](../../../../g3doc/tutorials/wide/index.md). Learn the basics
+-   [Linear Model](https://www.tensorflow.org/tutorials/wide). Learn the basics
     of building linear models.
--   [Wide and Deep
-    Learning](../../../../g3doc/tutorials/wide_and_deep/index.md). Jointly train
-    a linear model and a deep neural network.
--   [Logging and Monitoring](../../../../g3doc/tutorials/monitors/index.md). Use
-    the Monitor API to audit training of a neural network.
--   [Custom Estimators](../../../../g3doc/tutorials/estimators/index.md). Learn
+-   [Wide and Deep Learning](https://www.tensorflow.org/tutorials/wide_and_deep).
+    Jointly train a linear model and a deep neural network.
+-   [Logging and Monitoring](https://www.tensorflow.org/get_started/monitors).
+    Use the Monitor API to audit training of a neural network.
+-   [Custom Estimators](https://www.tensorflow.org/extend/estimators). Learn
     how to create a custom estimator.
 -   More coming soon.
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 6d591c42c6..7a95296945 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -1108,7 +1108,7 @@ class Estimator(BaseEstimator):
     if isinstance(model_fn_results, model_fn_lib.ModelFnOps):
       return model_fn_results
 
-    # Here model_fn_ops should be a tuple with 3 elements.
+    # Here model_fn_results should be a tuple with 3 elements.
     if len(model_fn_results) != 3:
       raise ValueError('Unrecognized value returned by model_fn, '
                        'please return ModelFnOps.')
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index faf78a3675..d7f1017a46 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -149,21 +149,16 @@ def _linear_model_fn(features, labels, mode, params, config=None):
       values=tuple(six.itervalues(features)),
       partitioner=partitioner) as scope:
     if joint_weights:
-      logits, _, _ = (
-          layers.joint_weighted_sum_from_feature_columns(
-              columns_to_tensors=features,
-              feature_columns=feature_columns,
-              num_outputs=head.logits_dimension,
-              weight_collections=[parent_scope],
-              scope=scope))
+      layer_fn = layers.joint_weighted_sum_from_feature_columns
     else:
-      logits, _, _ = (
-          layers.weighted_sum_from_feature_columns(
-              columns_to_tensors=features,
-              feature_columns=feature_columns,
-              num_outputs=head.logits_dimension,
-              weight_collections=[parent_scope],
-              scope=scope))
+      layer_fn = layers.weighted_sum_from_feature_columns
+        
+    logits, _, _ = layer_fn(
+            columns_to_tensors=features,
+            feature_columns=feature_columns,
+            num_outputs=head.logits_dimension,
+            weight_collections=[parent_scope],
+            scope=scope)
 
     def _train_op_fn(loss):
       global_step = contrib_variables.get_global_step()
diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
index c164a12b1d..09f19ad274 100644
--- a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
@@ -63,57 +63,54 @@ def _assert_df_equals_dict(expected_df, actual_dict):
                                                              actual_dict[col]))
 
 
-def _make_test_csv():
-  f = tempfile.NamedTemporaryFile(
-      dir=test.get_temp_dir(), delete=False, mode="w")
-  w = csv.writer(f)
-  w.writerow(["int", "float", "bool", "string"])
-  for _ in range(100):
-    intvalue = np.random.randint(-10, 10)
-    floatvalue = np.random.rand()
-    boolvalue = int(np.random.rand() > 0.3)
-    stringvalue = "S: %.4f" % np.random.rand()
-
-    row = [intvalue, floatvalue, boolvalue, stringvalue]
-    w.writerow(row)
-  f.close()
-  return f.name
-
-
-def _make_test_csv_sparse():
-  f = tempfile.NamedTemporaryFile(
-      dir=test.get_temp_dir(), delete=False, mode="w")
-  w = csv.writer(f)
-  w.writerow(["int", "float", "bool", "string"])
-  for _ in range(100):
-    # leave columns empty; these will be read as default value (e.g. 0 or NaN)
-    intvalue = np.random.randint(-10, 10) if np.random.rand() > 0.5 else ""
-    floatvalue = np.random.rand() if np.random.rand() > 0.5 else ""
-    boolvalue = int(np.random.rand() > 0.3) if np.random.rand() > 0.5 else ""
-    stringvalue = (("S: %.4f" % np.random.rand()) if np.random.rand() > 0.5 else
-                   "")
-
-    row = [intvalue, floatvalue, boolvalue, stringvalue]
-    w.writerow(row)
-  f.close()
-  return f.name
-
-
-def _make_test_tfrecord():
-  f = tempfile.NamedTemporaryFile(dir=test.get_temp_dir(), delete=False)
-  w = tf_record.TFRecordWriter(f.name)
-  for i in range(100):
-    ex = example_pb2.Example()
-    ex.features.feature["var_len_int"].int64_list.value.extend(range((i % 3)))
-    ex.features.feature["fixed_len_float"].float_list.value.extend(
-        [float(i), 2 * float(i)])
-    w.write(ex.SerializeToString())
-  return f.name
-
-
 class TensorFlowDataFrameTestCase(test.TestCase):
   """Tests for `TensorFlowDataFrame`."""
 
+  def _make_test_csv(self):
+    f = tempfile.NamedTemporaryFile(
+        dir=self.get_temp_dir(), delete=False, mode="w")
+    w = csv.writer(f)
+    w.writerow(["int", "float", "bool", "string"])
+    for _ in range(100):
+      intvalue = np.random.randint(-10, 10)
+      floatvalue = np.random.rand()
+      boolvalue = int(np.random.rand() > 0.3)
+      stringvalue = "S: %.4f" % np.random.rand()
+
+      row = [intvalue, floatvalue, boolvalue, stringvalue]
+      w.writerow(row)
+    f.close()
+    return f.name
+
+  def _make_test_csv_sparse(self):
+    f = tempfile.NamedTemporaryFile(
+        dir=self.get_temp_dir(), delete=False, mode="w")
+    w = csv.writer(f)
+    w.writerow(["int", "float", "bool", "string"])
+    for _ in range(100):
+      # leave columns empty; these will be read as default value (e.g. 0 or NaN)
+      intvalue = np.random.randint(-10, 10) if np.random.rand() > 0.5 else ""
+      floatvalue = np.random.rand() if np.random.rand() > 0.5 else ""
+      boolvalue = int(np.random.rand() > 0.3) if np.random.rand() > 0.5 else ""
+      stringvalue = (("S: %.4f" % np.random.rand()) if np.random.rand() > 0.5 else
+                     "")
+
+      row = [intvalue, floatvalue, boolvalue, stringvalue]
+      w.writerow(row)
+    f.close()
+    return f.name
+
+  def _make_test_tfrecord(self):
+    f = tempfile.NamedTemporaryFile(dir=self.get_temp_dir(), delete=False)
+    w = tf_record.TFRecordWriter(f.name)
+    for i in range(100):
+      ex = example_pb2.Example()
+      ex.features.feature["var_len_int"].int64_list.value.extend(range((i % 3)))
+      ex.features.feature["fixed_len_float"].float_list.value.extend(
+          [float(i), 2 * float(i)])
+      w.write(ex.SerializeToString())
+    return f.name
+
   def _assert_pandas_equals_tensorflow(self, pandas_df, tensorflow_df,
                                        num_batches, batch_size):
     self.assertItemsEqual(
@@ -190,7 +187,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
     batch_size = 8
     enqueue_size = 7
 
-    data_path = _make_test_csv()
+    data_path = self._make_test_csv()
     default_values = [0, 0.0, 0, ""]
 
     pandas_df = pd.read_csv(data_path)
@@ -211,7 +208,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
     num_epochs = 17
     expected_num_batches = (num_epochs * 100) // batch_size
 
-    data_path = _make_test_csv()
+    data_path = self._make_test_csv()
     default_values = [0, 0.0, 0, ""]
 
     tensorflow_df = df.TensorFlowDataFrame.from_csv(
@@ -234,7 +231,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
     num_batches = 100
     batch_size = 8
 
-    data_path = _make_test_csv_sparse()
+    data_path = self._make_test_csv_sparse()
     feature_spec = {
         "int": parsing_ops.FixedLenFeature(None, dtypes.int16, np.nan),
         "float": parsing_ops.VarLenFeature(dtypes.float16),
@@ -270,7 +267,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
     enqueue_size = 11
     batch_size = 13
 
-    data_path = _make_test_tfrecord()
+    data_path = self._make_test_tfrecord()
     features = {
         "fixed_len_float":
             parsing_ops.FixedLenFeature(
@@ -318,7 +315,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
     num_epochs = 17
     expected_num_batches = (num_epochs * 100) // batch_size
 
-    data_path = _make_test_csv()
+    data_path = self._make_test_csv()
     default_values = [0, 0.0, 0, ""]
 
     tensorflow_df = df.TensorFlowDataFrame.from_csv(
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 1e5795d035..c1ba9d4ead 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -261,7 +261,7 @@ def streaming_false_negatives(predictions, labels, weights=None,
                               metrics_collections=None,
                               updates_collections=None,
                               name=None):
-  """Computes the total number of false positives.
+  """Computes the total number of false negatives.
 
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
diff --git a/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py b/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
index 755ebd048b..f44302638e 100644
--- a/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
+++ b/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
@@ -13,7 +13,14 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Module implementing RNN Cells."""
+"""Module implementing RNN Cells.
+
+This module provides a number of basic commonly used RNN cells, such as LSTM
+(Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number of
+operators that allow adding dropouts, projections, or embeddings for inputs.
+Constructing multi-layer cells is supported by the class `MultiRNNCell`, or by
+calling the `rnn` ops several times.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -146,12 +153,12 @@ class GRUCell(RNNCell):
     with _checked_scope(self, scope or "gru_cell", reuse=self._reuse):
       with vs.variable_scope("gates"):  # Reset gate and update gate.
         # We start with bias of 1.0 to not reset and not update.
+        value = sigmoid(_linear(
+          [inputs, state], 2 * self._num_units, True, 1.0))
         r, u = array_ops.split(
-            value=_linear(
-                [inputs, state], 2 * self._num_units, True, 1.0),
+            value=value,
             num_or_size_splits=2,
             axis=1)
-        r, u = sigmoid(r), sigmoid(u)
       with vs.variable_scope("candidate"):
         c = self._activation(_linear([inputs, r * state],
                                      self._num_units, True))
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index c1ec46d763..318b552f4a 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -70,7 +70,7 @@ def _lstm_block_cell(x,
   cs = ci .* i + cs_prev .* f
   cs = clip(cs, cell_clip)
 
-  o = sigmoid(cs * wco + f)
+  o = sigmoid(cs * wco + o)
   co = tanh(cs)
   h = co .* o
   ```
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index 34367db01b..616de3199c 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -486,7 +486,7 @@ class GreedyEmbeddingHelper(Helper):
     # Outputs are logits, use argmax to get the most probable id
     if not isinstance(outputs, ops.Tensor):
       raise TypeError("Expected outputs to be a single Tensor, got: %s" %
-                      outputs)
+                      type(outputs))
     sample_ids = math_ops.cast(
         math_ops.argmax(outputs, axis=-1), dtypes.int32)
     return sample_ids
diff --git a/tensorflow/contrib/seq2seq/python/ops/loss.py b/tensorflow/contrib/seq2seq/python/ops/loss.py
index e14f07bc09..61852eda4f 100644
--- a/tensorflow/contrib/seq2seq/python/ops/loss.py
+++ b/tensorflow/contrib/seq2seq/python/ops/loss.py
@@ -44,8 +44,7 @@ def sequence_loss(logits, targets, weights,
       sequence. When using weights as masking set all valid timesteps to 1 and
       all padded timesteps to 0.
     average_across_timesteps: If set, sum the cost across the sequence
-      dimension and divide by the cost by the total label weight across
-      timesteps.
+      dimension and divide the cost by the total label weight across timesteps.
     average_across_batch: If set, sum the cost across the batch dimension and
       divide the returned cost by the batch size.
     softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 898d3a11d0..94b0263ae8 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -229,7 +229,7 @@ net = ...
 net = slim.conv2d(net, 256, [3, 3], scope='conv3_1')
 net = slim.conv2d(net, 256, [3, 3], scope='conv3_2')
 net = slim.conv2d(net, 256, [3, 3], scope='conv3_3')
-net = slim.max_pool2d(net, [2, 2], scope='pool3')
+net = slim.max_pool2d(net, [2, 2], scope='pool2')
 ```
 
 One way to reduce this code duplication would be via a `for` loop:
@@ -238,14 +238,14 @@ One way to reduce this code duplication would be via a `for` loop:
 net = ...
 for i in range(3):
   net = slim.conv2d(net, 256, [3, 3], scope='conv3_' % (i+1))
-net = slim.max_pool2d(net, [2, 2], scope='pool3')
+net = slim.max_pool2d(net, [2, 2], scope='pool2')
 ```
 
 This can be made even cleaner by using TF-Slim's `repeat` operation:
 
 ```python
 net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
-net = slim.max_pool(net, [2, 2], scope='pool2')
+net = slim.max_pool2d(net, [2, 2], scope='pool2')
 ```
 
 Notice that the `slim.repeat` not only applies the same argument in-line, it
diff --git a/tensorflow/contrib/util/loader.py b/tensorflow/contrib/util/loader.py
index 95657217a0..c2ae425b56 100644
--- a/tensorflow/contrib/util/loader.py
+++ b/tensorflow/contrib/util/loader.py
@@ -21,6 +21,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import re
 
 from tensorflow.python.framework import load_library
 from tensorflow.python.platform import resource_loader
@@ -29,9 +30,9 @@ from tensorflow.python.platform import resource_loader
 def load_op_library(path):
   """Loads a contrib op library from the given path.
 
-  NOTE(mrry): On Windows, we currently assume that contrib op
+  NOTE(mrry): On Windows, we currently assume that some contrib op
   libraries are statically linked into the main TensorFlow Python
-  extension DLL.
+  extension DLL; dynamically linked ops are used if the .dll is present.
 
   Args:
     path: An absolute path to a shared object file.
@@ -40,11 +41,17 @@ def load_op_library(path):
     A Python module containing the Python wrappers for Ops defined in the
     plugin.
   """
-  if os.name != 'nt':
-    path = resource_loader.get_path_to_datafile(path)
-    ret = load_library.load_op_library(path)
-    assert ret, 'Could not load %s' % path
-    return ret
-  else:
-    # NOTE(mrry):
-    return None
+  if os.name == 'nt':
+    # To avoid making every user_ops aware of Windows, rewrite
+    # the file extension from .so to .dll.
+    path = re.sub('\.so$', '.dll', path)
+
+    # TODO: currently only some user_ops are built as .dlls on Windows, so
+    #   don't try to load one if its dll is not found. Once all of them are
+    #   available as dlls, this check should be removed.
+    if not os.path.exists(path):
+      return None
+  path = resource_loader.get_path_to_datafile(path)
+  ret = load_library.load_op_library(path)
+  assert ret, 'Could not load %s' % path
+  return ret
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index ea434c3eb2..79d44c5a0c 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -339,6 +339,7 @@ tf_cuda_library(
     hdrs = [
         "example/feature_util.h",
         "framework/allocator.h",
+        "framework/allocator_registry.h",
         "framework/attr_value_util.h",
         "framework/bfloat16.h",
         "framework/cancellation.h",
@@ -408,7 +409,9 @@ tf_cuda_library(
             "util/memmapped_file_system.h",
             "util/memmapped_file_system_writer.h",
         ],
-    }),
+    }) + if_mkl([
+        "util/mkl_util.h",
+    ]),
     visibility = ["//visibility:public"],
     deps = [":framework_internal"],
 )
@@ -707,7 +710,9 @@ cc_library(
         "//tensorflow/core/kernels:math_not_windows",
         "//tensorflow/core/kernels:quantized_ops",
     ]) + if_mkl([
-        "//tensorflow/core/kernels:mkl_ops",
+        "//tensorflow/core/kernels:mkl_conv_op",
+        "//tensorflow/core/kernels:mkl_matmul_op",
+        "//tensorflow/core/kernels:mkl_tfconv_op",
     ]),
 )
 
@@ -772,7 +777,7 @@ cc_library(
         "//tensorflow/core/kernels:constant_op",
         "//tensorflow/core/kernels:ops_testutil",
         "//tensorflow/core/kernels:ops_util",
-        "//tensorflow/core/platform/default/build_config:gtest",  # + if_sycl([":sycl_runtime"]),
+        "//tensorflow/core/platform/default/build_config:gtest",  # + if_sycl([":sycl_runtime"])
     ],
 )
 
@@ -1393,7 +1398,7 @@ tf_cuda_library(
         ":version_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//third_party/eigen3",
-    ],
+    ] + if_mkl(["//third_party/mkl:intel_binary_blob"]),
     alwayslink = 1,
 )
 
@@ -1482,20 +1487,21 @@ tf_cuda_library(
     ),
     copts = tf_copts(),
     deps = [
-        ":framework",
-        ":framework_internal",
-        ":function_ops_op_lib",
-        ":functional_grad",
-        ":functional_ops_op_lib",
-        ":lib",
-        ":lib_internal",
-        ":proto_text",
-        ":protos_all_cc",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler/optimizers:meta_optimizer",
-        "//third_party/eigen3",
-        "//tensorflow/core/kernels:required",
-    ] + tf_additional_core_deps(),
+               ":framework",
+               ":framework_internal",
+               ":function_ops_op_lib",
+               ":functional_grad",
+               ":functional_ops_op_lib",
+               ":lib",
+               ":lib_internal",
+               ":proto_text",
+               ":protos_all_cc",
+               "//tensorflow/core/grappler:grappler_item",
+               "//tensorflow/core/grappler/optimizers:meta_optimizer",
+               "//third_party/eigen3",
+               "//tensorflow/core/kernels:required",
+           ] + if_mkl(["//third_party/mkl:intel_binary_blob"]) +
+           tf_additional_core_deps(),
     alwayslink = 1,
 )
 
@@ -2037,33 +2043,38 @@ tf_cc_tests(
     ],
 )
 
-if_mkl(
-    tf_cc_test_mkl(
-        name = "mkl_related_tests",
-        size = "small",
-        srcs = ["graph/mkl_optimizer_merge_test.cc"],
-        linkstatic = tf_kernel_tests_linkstatic(),
-        deps = [
-            ":core",
-            ":core_cpu",
-            ":core_cpu_internal",
-            ":direct_session_internal",
-            ":framework",
-            ":framework_internal",
-            ":lib",
-            ":lib_internal",
-            ":ops",
-            ":protos_all_cc",  # under if_mkl
-            ":test",
-            ":test_main",
-            ":testlib",
-            "//tensorflow/cc:cc_ops",
-            "//tensorflow/cc:scope",
-            "//tensorflow/cc:sendrecv_ops",
-            "//tensorflow/core/kernels:ops_util",
-            "//third_party/eigen3",
-        ],
-    ),
+tf_cc_test_mkl(
+    name = "mkl_related_tests",
+    size = "small",
+    srcs = [
+        "graph/mkl_layout_pass_test.cc",
+        "graph/mkl_optimizer_merge_test.cc",
+        "graph/mkl_tfconversion_pass_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/cc:sendrecv_ops",
+        "//tensorflow/core/kernels:mkl_conv_op",
+        "//tensorflow/core/kernels:mkl_matmul_op",
+        "//tensorflow/core/kernels:mkl_tfconv_op",
+        "//tensorflow/core/kernels:ops_util",
+        "//third_party/eigen3",
+    ],
 )
 
 tf_cc_tests_gpu(
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
new file mode 100644
index 0000000000..41bf23be27
--- /dev/null
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -0,0 +1,120 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// A simple CPU allocator that intercepts malloc/free calls from MKL library
+// and redirects them to Tensorflow allocator
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
+
+#ifdef INTEL_MKL
+
+#include <string>
+#include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/mem.h"
+
+#include "third_party/mkl/include/i_malloc.h"
+
+namespace tensorflow {
+
+class MklSubAllocator : public SubAllocator {
+ public:
+  ~MklSubAllocator() override {}
+  // Thin wrappers over port::AlignedMalloc / port::AlignedFree.
+  void* Alloc(size_t alignment, size_t num_bytes) override {
+    return port::AlignedMalloc(num_bytes, alignment);
+  }
+  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
+};
+
+/// CPU allocator for MKL that wraps BFC allocator and intercepts
+/// and redirects memory allocation calls from MKL.
+class MklCPUAllocator : public Allocator {
+ public:
+  // Creates the backing BFC allocator and installs the MKL i_malloc hooks.
+  MklCPUAllocator() {
+    VLOG(2) << "MklCPUAllocator: In MklCPUAllocator";
+    allocator_ =
+        new BFCAllocator(new MklSubAllocator, kMaxMemSize, kAllowGrowth, kName);
+
+    // For redirecting all allocations from MKL to this allocator
+    // From: http://software.intel.com/en-us/node/528565
+    i_malloc = MallocHook;
+    i_calloc = CallocHook;
+    i_realloc = ReallocHook;
+    i_free = FreeHook;
+  }
+
+  ~MklCPUAllocator() override { delete allocator_; }
+
+  inline string Name() override { return kName; }
+
+  inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+    return allocator_->AllocateRaw(alignment, num_bytes);
+  }
+
+  inline void DeallocateRaw(void* ptr) override {
+    allocator_->DeallocateRaw(ptr);
+  }
+
+ private:
+  // Hooks provided by this allocator for memory allocation routines from MKL.
+  static inline void* MallocHook(size_t size) {
+    VLOG(2) << "MklCPUAllocator: In MallocHook";
+    return cpu_allocator()->AllocateRaw(kAlignment, size);
+  }
+
+  static inline void FreeHook(void* ptr) {
+    VLOG(2) << "MklCPUAllocator: In FreeHook";
+    cpu_allocator()->DeallocateRaw(ptr);
+  }
+
+  static inline void* CallocHook(size_t num, size_t size) {
+    Status s = Status(error::Code::UNIMPLEMENTED,
+                      "Unimplemented case for hooking MKL function.");
+    TF_CHECK_OK(s);  // way to assert with an error message
+    return nullptr;  // every path of a non-void function must return
+  }
+
+  static inline void* ReallocHook(void* ptr, size_t size) {
+    Status s = Status(error::Code::UNIMPLEMENTED,
+                      "Unimplemented case for hooking MKL function.");
+    TF_CHECK_OK(s);  // way to assert with an error message
+    return nullptr;  // every path of a non-void function must return
+  }
+
+  // TODO(jbobba): We should ideally move this into CPUOptions in config.proto.
+  /// Memory limit - 64GB
+  static const size_t kMaxMemSize =
+      static_cast<size_t>(64) * 1024 * 1024 * 1024;
+
+  /// Do we allow growth in BFC Allocator
+  static const bool kAllowGrowth = true;
+
+  /// Name
+  static constexpr const char* kName = "mklcpu";
+
+  /// The alignment that we need for the allocations
+  static const size_t kAlignment = 64;
+
+  Allocator* allocator_;  // owned by this class
+};
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 686bc6885e..ca6ba7970f 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.pb_text.h"
@@ -27,6 +28,10 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
 
+#ifdef INTEL_MKL
+#include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
+#endif
+
 namespace tensorflow {
 
 ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
@@ -70,4 +75,8 @@ Status ThreadPoolDevice::MakeTensorFromProto(
                                  ProtoDebugString(tensor_proto));
 }
 
+#ifdef INTEL_MKL
+REGISTER_MEM_ALLOCATOR("MklCPUAllocator", 200, MklCPUAllocator);
+#endif
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index ff31ad965b..943dcab362 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/allocator.h"
 
+#include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/tracking_allocator.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -119,11 +120,13 @@ Allocator* MakeCpuAllocator() {
 }  // namespace
 
 Allocator* cpu_allocator() {
-  static Allocator* cpu_alloc = MakeCpuAllocator();
+  static Allocator* cpu_alloc = AllocatorRegistry::Global()->GetAllocator();
   if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) {
     cpu_alloc = new TrackingAllocator(cpu_alloc, true);
   }
   return cpu_alloc;
 }
 
+REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocator);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/allocator_registry.cc b/tensorflow/core/framework/allocator_registry.cc
new file mode 100644
index 0000000000..792b1ceb5a
--- /dev/null
+++ b/tensorflow/core/framework/allocator_registry.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/framework/allocator_registry.h"
+#include "tensorflow/core/platform/logging.h"
+
+
+namespace tensorflow {
+
+// static; the registry is heap-allocated on first call and never deleted.
+AllocatorRegistry* AllocatorRegistry::Global() {
+  static AllocatorRegistry* global_allocator_registry = new AllocatorRegistry;
+  return global_allocator_registry;
+}
+
+// Returns true if an allocator with both the same name and the same priority
+// has already been registered. Entries are read by reference, not copied.
+bool AllocatorRegistry::CheckForDuplicates(const string& name, int priority) {
+  for (const auto& entry : allocators_) {
+    if (name == entry.name && priority == entry.priority) return true;
+  }
+  return false;
+}
+
+// Adds 'allocator' (not owned) under 'name' with 'priority', then re-selects
+// the current allocator. CHECK-fails on an empty name, a negative priority,
+// or a (name, priority) pair that is already registered.
+void AllocatorRegistry::Register(const string& name, int priority,
+                                 Allocator* allocator) {
+  CHECK(!name.empty()) << "Need a valid name for Allocator";
+  CHECK_GE(priority, 0) << "Priority needs to be non-negative";
+  CHECK(!CheckForDuplicates(name, priority)) << "Allocator with name: [" << name
+                                             << "] and priority: [" << priority
+                                             << "] already registered";
+
+  allocators_.push_back({name, priority, allocator});
+
+  // Pick the highest-priority entry; on ties the earliest registered wins.
+  int high_pri = -1;
+  for (const auto& entry : allocators_) {
+    if (high_pri < entry.priority) {
+      m_curr_allocator_ = entry.allocator;
+      high_pri = entry.priority;
+    }
+  }
+}
+
+Allocator* AllocatorRegistry::GetAllocator() {
+  return CHECK_NOTNULL(m_curr_allocator_);  // set by Register()
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/allocator_registry.h b/tensorflow/core/framework/allocator_registry.h
new file mode 100644
index 0000000000..c419366ae1
--- /dev/null
+++ b/tensorflow/core/framework/allocator_registry.h
@@ -0,0 +1,77 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Classes to maintain a static registry of memory allocators
+#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
+#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+
+namespace tensorflow {
+
+// A global AllocatorRegistry is used to hold allocators for CPU backends
+class AllocatorRegistry {
+ public:
+  // Add an allocator to the registry. The registry does not take ownership.
+  void Register(const string& name, int priority, Allocator* allocator);
+
+  // Return allocator with highest priority
+  // If multiple allocators have the same high priority, return one of them
+  Allocator* GetAllocator();
+
+  // Returns the global registry of allocators.
+  static AllocatorRegistry* Global();
+
+ private:
+  typedef struct {
+    string name;
+    int priority;
+    Allocator* allocator;  // not owned
+  } AllocatorRegistryEntry;
+
+  bool CheckForDuplicates(const string& name, int priority);
+
+  std::vector<AllocatorRegistryEntry> allocators_;
+  Allocator* m_curr_allocator_ = nullptr;  // not owned; null until Register()
+};
+
+namespace allocator_registration {
+
+class AllocatorRegistration {
+ public:
+  AllocatorRegistration(const string& name, int priority,
+                        Allocator* allocator) {
+    AllocatorRegistry::Global()->Register(name, priority, allocator);
+  }
+};
+
+}  // namespace allocator_registration
+
+#define REGISTER_MEM_ALLOCATOR(name, priority, allocator) \
+  REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(__COUNTER__, name, priority, allocator)
+
+#define REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(ctr, name, priority, allocator) \
+  REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator)
+
+#define REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator) \
+  static allocator_registration::AllocatorRegistration              \
+      register_allocator_##ctr(name, priority, new allocator)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
diff --git a/tensorflow/core/framework/type_index.h b/tensorflow/core/framework/type_index.h
index dfde25c21e..b978d90fa8 100644
--- a/tensorflow/core/framework/type_index.h
+++ b/tensorflow/core/framework/type_index.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_TYPE_INDEX_H_
 
 #include <string>
-#ifdef __GXX_RTTI
+#if defined(__GXX_RTTI) || defined(_CPPRTTI)
 #include <typeindex>
 #include <typeinfo>
 #endif  // __GXX_RTTI
@@ -30,7 +30,7 @@ namespace tensorflow {
 // binary sizes. The following #ifdef section provides a non-RTTI
 // replacement for std::type_index (with a minimal set of functions needed by
 // the TensorFlow framework, and more can be added if necessary).
-#ifndef __GXX_RTTI
+#if !defined(__GXX_RTTI) && !defined(_CPPRTTI)
 
 // A thin TypeIndex class that mimics std::type_index but does not use RTTI. As
 // a result, it does not provide the actual name of the type, and only returns a
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index 589730baf1..932d788f23 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -68,9 +68,9 @@ class DeviceType {
 std::ostream& operator<<(std::ostream& os, const DeviceType& d);
 
 // Convenient constants that can be passed to a DeviceType constructor
-extern const char* const DEVICE_CPU;   // "CPU"
-extern const char* const DEVICE_GPU;   // "GPU"
-extern const char* const DEVICE_SYCL;  // "SYCL"
+TF_EXPORT extern const char* const DEVICE_CPU;   // "CPU"
+TF_EXPORT extern const char* const DEVICE_GPU;   // "GPU"
+TF_EXPORT extern const char* const DEVICE_SYCL;  // "SYCL"
 
 typedef gtl::InlinedVector<MemoryType, 4> MemoryTypeVector;
 typedef gtl::ArraySlice<MemoryType> MemoryTypeSlice;
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
new file mode 100644
index 0000000000..87850b3e9a
--- /dev/null
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -0,0 +1,548 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include <vector>
+#include <utility>
+#include <string>
+#include <memory>
+#include <unordered_set>
+#include <functional>
+
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+#include "tensorflow/core/graph/mkl_layout_pass.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+// This pass implements rewriting of graph for propagating Mkl
+// layout as an additional output tensor (we will loosely call a
+// tensor that carries Mkl layout as Mkl tensor henceforth.)
+// from every Mkl supported NN layer.
+//
+// As an example, consider Relu layer. Current definition of Relu
+// layer looks like:
+//
+//           O = Relu(A)
+//
+// Relu has 1 input (A), and 1 output (O).
+//
+// This rewrite pass will generate a new graph node for Relu
+// (new node is called MklRelu) as:
+//
+//          O, O_m = MklRelu(A, A_m)
+//
+// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m).
+// Here A input is same as A input of Relu; O output is same
+// as O output of Relu. O_m is the additional output tensor
+// that will be set by MklRelu, and it represents Mkl tensor
+// corresponding to O -- in other words, O_m is some kind of
+// metadata for O. A_m is additional input of MklRelu, and it
+// represents metadata for A - as O_m is metadata for O, A_m
+// is metadata for A. MklRelu receives this metadata from
+// previous layer (in the graph).
+//
+// When previous layer in the graph is an Mkl layer, A_m will
+// represent a valid Mkl tensor. But when previous layer
+// is not an Mkl layer, then A_m represents a dummy Mkl tensor.
+//
+// Rewriting rules:
+//   - Selection of an op for rewriting happens by registering
+//     an op with this pass. If an op is not registered, then
+//     it is not rewritten.
+//  - Number of inputs after rewriting:
+//      Since for every input Tensorflow tensor, the rewritten
+//      layer gets Mkl tensor, rewritten op gets 2*N inputs,
+//      where N is the number of inputs for original op.
+//  - Number of outputs after rewriting:
+//      Since for every output Tensorflow tensor, the rewritten
+//      layer generates Mkl tensor, rewritten op generates 2*N
+//      outputs, where N is the number of outputs of original op.
+//  - Ordering of Tensorflow tensors and Mkl tensors:
+//      Since every op generates twice the number of inputs and
+//      outputs, one could imagine different ordering among
+//      Tensorflow tensors and Mkl tensors. E.g., let's assume
+//      an op 'Conv2D' takes (A, B) as input, then new op
+//      'MklConv2D' can take (A, A_m, B, B_m) as input or it
+//      can also take (A, B, A_m, B_m) as input. Among N inputs
+//      one can get N! permutations.
+//
+//      So the question is: which one do we follow? Currently,
+//      we follow an intuitive order where Mkl tensor follows a
+//      corresponding Tensorflow tensor immediately. In the
+//      context of above example, it will be: (A, A_m, B, B_m).
+//      We follow same ordering rule for output tensors.
+//
+// NOTE: Current rewriting approach rewrites an op to Mkl op without
+//      any conditions. But in the future, it may be possible to
+//      consider conditions such as input shapes and sizes to rewrite
+//      an op.
+//
+// Graph rewrite algorithm:
+//      Algorithm: Graph Rewrite
+//      Input: Graph G, Names of nodes to rewrite and their new nodes
+//      Output: Modified Graph G' if nodes are modified, G otherwise.
+//      Start:
+//        N = Topological_Sort(G) // N is set of nodes in toposort order.
+//        foreach node n in N
+//        do
+//          if (Is_MKL_Layer(n))  // Can this layer accept Mkl layout as input.
+//          then
+//            E = set of <incoming edge and its src_output slot> of n
+//            E' = {}   // new set of edges for rewritten node
+//            foreach <e,s> in E
+//            do
+//              E' U {<e,s>}  // First copy edge which generates Tensorflow
+//                            // tensor as it is
+//              m = Source node of edge e
+//              if Is_Rewritten(m)  // Did we rewrite this node in this pass?
+//              then
+//                E' U {<m,s+1>}    // If yes, then m will generate Mkl tensor
+//                                  // as output.
+//              else
+//                d = Generate_Dummy_Mkl_Tensor()  // If not, generate dummy
+//                                                 // Mkl tensor.
+//                E' U {<d,0>}   // Dummy Mkl tensor has only 1 output slot.
+//              fi
+//            done
+//            n' = Build_New_Node(G,new_name,E')
+//            Mark_Rewritten(n')  // Mark new node as being rewritten.
+//          fi
+//        done
+//
+//      Explanation:
+//        For graph rewrite, we visit nodes of the graph in the topological
+//        sort order. With this ordering, we visit nodes in top-to-bottom
+//        fashion. We need this order because while visiting a node we want
+//        all of its input nodes (parents) visited (and rewritten if
+//        applicable). This is because if we need to rewrite a current node
+//        then all of its input nodes need to be fixed (in other words they
+//        cannot be removed later.)
+//
+//        While visiting each node, we first check if it is Mkl layer. If
+//        it is, then we rewrite that node after constructing new inputs to
+//        the node. If it is not Mkl layer, then we do not rewrite the node.
+//
+class MklLayoutRewritePass : public GraphOptimizationPass {
+ public:
+  MklLayoutRewritePass() {
+    csinfo_.conv2d          = "Conv2D";
+    // Register Conv2D (2 inputs) for rewriting to its Mkl-prefixed op.
+    ninfo_.push_back({csinfo_.conv2d,   GetMklOpName(csinfo_.conv2d),
+                      2, CopyAttrsConv2D});
+  }
+
+  // Standard interface to run pass
+  Status Run(const GraphOptimizationPassOptions& options) override;
+
+  // Helper function which does most of heavy lifting for rewriting
+  // Mkl nodes to propagate Mkl tensor as additional output
+  //
+  // Extracts common functionality between Run public interface and
+  // test interface.
+  //
+  // @return true, if and only if graph is mutated; false otherwise.
+  bool RunPass(std::unique_ptr<Graph>* g);
+
+ private:
+  /// Structure to specify name of original op, its new name after rewrite,
+  /// the number of inputs to the original op, and the function to be used
+  /// to copy attributes for the op
+  typedef struct {
+    string name;   // Original name of the op in the graph
+    string newname;   // New name of op in the graph
+    int    numins;  // Number of inputs to the original op
+    std::function<void(Node*, NodeBuilder*)> copyattrs;  // Function handler
+                    // to copy attributes from old node to new node.
+  } NodesInfo;
+
+  /// Structure to store all constant strings
+  struct {
+    string relu;
+    string relugrad;
+    string conv2d;
+  } csinfo_;
+
+  /// Maintain info about nodes to rewrite
+  std::vector<NodesInfo> ninfo_;
+
+  /// Hash table to maintain nodes visited in the graph.
+  std::unordered_set<const Node*> visited_nodes_;
+
+ private:
+  // Predicate to check if we rewrote node 'n'
+  //
+  // If we rewrote the node, then the rewritten node will produce
+  // Mkl tensor as output. If we did not rewrite the node, then
+  // we need to insert dummy Mkl node on the input side.
+  //
+  // Returns true if node is rewritten, false otherwise.
+  inline bool IsRewrittenNode(Node* n) const {
+    return visited_nodes_.find(n) != visited_nodes_.end();
+  }
+
+  // Mark the node as rewritten
+  inline void MarkRewrittenNode(Node* n) {
+    visited_nodes_.insert(n);
+  }
+
+  // Get the name of Mkl op from original TensorFlow op
+  // We prefix 'Mkl' to the original op to get Mkl op.
+  // TODO(nhasabni) We should move this to mkl_util.h.
+  inline string GetMklOpName(const string& name) const {
+    // Prefix that we add to Tensorflow op name to construct Mkl op name.
+    const char* const kMklOpPrefix = "Mkl";
+    return string(kMklOpPrefix) + name;
+  }
+
+  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+  // in graph 'g'. Original node is input in 'orign'.
+  //
+  // For details, refer to 'Number of inputs after rewriting' section in the
+  // documentation above.
+  //
+  // Returns Status::OK() if setting up inputs is successful, otherwise
+  // returns appropriate status code.
+  Status SetUpInputs(std::unique_ptr<Graph>* g,
+                    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+                    NodeBuilder* nb, Node* orign);
+
+  // Rewrite Node 'n' in graph 'g' with rewrite information specified in 'ni'
+  // Returns Status::OK() if node rewrite is successful, otherwise returns
+  // appropriate error status
+  Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const NodesInfo& ni);
+
+  // Functions specific to operators to copy attributes
+  // We need operator-specific function to copy attributes because the framework
+  // does not provide any generic function for it.
+  static void CopyAttrsConv2D(Node* orign, NodeBuilder* nb);
+
+  // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
+  // using node for original node 'orign' and return it in '*out'.
+  // TODO(nhasabni) We should move this to mkl_util.h
+  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
+                             Node* orign);
+};
+
+
+// We register Mkl rewrite pass for phase 1 in pre-placement group.
+// Do not change the ordering of the Mkl passes.
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1,
+                      MklLayoutRewritePass);
+
+
+static void FillInputs(const Node* n,
+                       gtl::InlinedVector<Node*, 4>* control_edges,
+                       gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
+  DCHECK_EQ(in->size(), n->num_inputs());
+  control_edges->clear();
+  for (const Edge* e : n->in_edges()) {
+    if (e->IsControlEdge()) {
+      control_edges->push_back(e->src());
+    } else {
+      (*in)[e->dst_input()] = std::make_pair(e->src(), e->src_output());
+    }
+  }
+  std::sort(control_edges->begin(), control_edges->end());
+  if (n->op_def().is_commutative()) {
+    // For commutative ops, sort the inputs (by Node* and then output slot)
+    // to get a canonical ordering. NOTE(review): the resulting order depends
+    // on pointer values -- confirm this is wanted in a rewrite pass.
+    std::sort(in->begin(), in->end());
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Macros to build new node with different number of inputs.
+// We need this way because we need to specify all the inputs when
+// building a node. Comment at core/graph/node_builder.h, line 85-86.
+
+#define SETUP_INPUTS1(nb, op1) do {        \
+  nb->Input(op1.node, op1.index);          \
+}while(0)
+
+#define SETUP_INPUTS2(nb, op1, op2) do {   \
+  nb->Input(op1.node, op1.index);          \
+  nb->Input(op2.node, op2.index);          \
+}while(0)
+
+#define SETUP_INPUTS3(nb, op1, op2, op3) do {      \
+  nb->Input(op1.node, op1.index);          \
+  nb->Input(op2.node, op2.index);          \
+  nb->Input(op3.node, op3.index);          \
+}while(0)
+
+#define SETUP_INPUTS4(nb, op1, op2, op3, op4) do {  \
+  nb->Input(op1.node, op1.index);          \
+  nb->Input(op2.node, op2.index);          \
+  nb->Input(op3.node, op3.index);          \
+  nb->Input(op4.node, op4.index);          \
+}while(0)
+
+#define SETUP_INPUTS5(nb, op1, op2, op3, op4, op5) do {\
+  nb->Input(op1.node, op1.index);          \
+  nb->Input(op2.node, op2.index);          \
+  nb->Input(op3.node, op3.index);          \
+  nb->Input(op4.node, op4.index);          \
+  nb->Input(op5.node, op5.index);          \
+}while(0)
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyMklTensorNode(
+    std::unique_ptr<Graph>* g, Node** out, Node* orign) {
+  // A dummy Mkl tensor is a uint8 Const of shape {8} whose bytes are all
+  // zero (original note: 8 = 2*size_t).
+  const uint8 kZeroBytes[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+  const DataType dtype = DataTypeToEnum<uint8>::v();
+  TensorProto content;
+  content.set_dtype(dtype);
+  content.set_tensor_content(kZeroBytes, sizeof(kZeroBytes));
+  TensorShape dummy_shape({8});
+  dummy_shape.AsProto(content.mutable_tensor_shape());
+
+  // The Const node is placed on the same device as the original node.
+  Graph* graph = g->get();
+  TF_CHECK_OK(NodeBuilder(graph->NewName("DMT"), "Const")
+                  .Attr("value", content)
+                  .Attr("dtype", dtype)
+                  .Device(orign->def().device())
+                  .Finalize(graph, out));
+}
+
+Status MklLayoutRewritePass::SetUpInputs(std::unique_ptr<Graph>* g,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+    NodeBuilder* nb, Node* orign) {
+  std::vector<NodeBuilder::NodeOut> new_inputs;
+
+  // 1. Collect the inputs for the new node: a <TF tensor, Mkl tensor> pair
+  for (int i = 0; i < inputs.size(); i++) {
+    Node* n = inputs[i].first;
+    // First let's copy original TF tensor input as it is.
+    new_inputs.push_back(NodeBuilder::NodeOut(n, inputs[i].second));
+
+    // Second, let's add edge to propagate Mkl tensors from input Mkl layers,
+    // or generate a dummy Mkl tensor representing not-mkl-tensor case.
+    if (IsRewrittenNode(n)) {
+      // The producer was rewritten by this pass, so we take its Mkl tensor
+      // output as the second input of the pair.
+      // Attr "T" is fetched only to assert that it exists; its value is unused.
+      DataType T;
+      TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T));
+      // If this op has been rewritten, then its op type must be registered
+      // as an Mkl layer.
+      CHECK_EQ(mkl_layer_registry::IsMklLayer(n->type_string()), true);
+      // src slot number for Mkl tensor would be the one next to TF tensor
+      // slot number.
+      new_inputs.push_back(NodeBuilder::NodeOut(n, inputs[i].second+1));
+    } else {
+      // If we have not visited the node and rewritten it, then we need
+      // to create a dummy node that will feed a dummy Mkl tensor to this node.
+      // DummyMklTensor node has no input and generates only 1 output
+      // (dummy Mkl tensor) as output slot number 0.
+      Node* dmt = nullptr;
+      GetDummyMklTensorNode(g, &dmt, orign);
+      CHECK_NOTNULL(dmt);
+      new_inputs.push_back(NodeBuilder::NodeOut(dmt, 0));
+    }
+  }
+
+  // The new node must take exactly 2 inputs per original input: the
+  // Tensorflow tensor and its Mkl tensor. The count is therefore even, so
+  // the odd cases of the switch below (1, 3, 5) cannot be reached.
+  CHECK_EQ(new_inputs.size(), inputs.size() * 2);
+
+  // 2. Let's build the node with new inputs.
+  switch (new_inputs.size()) {
+    case 0:  // We don't need to do anything for no input as we have
+             // already built node.
+            break;
+    case 1: SETUP_INPUTS1(nb, new_inputs[0]); break;
+    case 2: SETUP_INPUTS2(nb, new_inputs[0],
+                              new_inputs[1]); break;
+    case 3: SETUP_INPUTS3(nb, new_inputs[0],
+                              new_inputs[1],
+                              new_inputs[2]); break;
+    case 4: SETUP_INPUTS4(nb, new_inputs[0],
+                              new_inputs[1],
+                              new_inputs[2],
+                              new_inputs[3]); break;
+    case 5: SETUP_INPUTS5(nb, new_inputs[0],
+                              new_inputs[1],
+                              new_inputs[2],
+                              new_inputs[3],
+                              new_inputs[4]); break;
+    default: {
+      return Status(error::Code::UNIMPLEMENTED,
+                    "Could not create node with given number of inputs");
+    }
+  }
+
+  return Status::OK();
+}
+
+void MklLayoutRewritePass::CopyAttrsConv2D(Node* orign, NodeBuilder* nb) {
+  // Read each Conv2D attribute from the original node; any miss is fatal.
+  const auto& src_def = orign->def();
+  DataType dtype;
+  std::vector<int32> stride_list;
+  string pad_mode;
+  string layout;
+  bool cudnn_on_gpu;
+  TF_CHECK_OK(GetNodeAttr(src_def, "T", &dtype));
+  TF_CHECK_OK(GetNodeAttr(src_def, "strides", &stride_list));
+  TF_CHECK_OK(GetNodeAttr(src_def, "padding", &pad_mode));
+  TF_CHECK_OK(GetNodeAttr(src_def, "data_format", &layout));
+  TF_CHECK_OK(GetNodeAttr(src_def, "use_cudnn_on_gpu", &cudnn_on_gpu));
+
+  // Set the same attributes, unchanged, on the node being built.
+  nb->Attr("T", dtype);
+  nb->Attr("strides", stride_list);
+  nb->Attr("padding", pad_mode);
+  nb->Attr("data_format", layout);
+  nb->Attr("use_cudnn_on_gpu", cudnn_on_gpu);
+}
+
+// Replaces 'orign' in graph 'g' by a node running the Mkl op 'ni.newname'.
+// Returns the SetUpInputs error, without removing 'orign', if that fails.
+Status MklLayoutRewritePass::RewriteNode(
+    std::unique_ptr<Graph>* g, Node* orign, const NodesInfo& ni) {
+  VLOG(1) << "MKLLayoutRewritePass: Original node:" << orign->DebugString();
+
+  // Get all inputs.
+  const int num = orign->num_inputs();
+  CHECK_EQ(num, ni.numins);
+  gtl::InlinedVector<Node*, 4> control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num);
+  FillInputs(orign, &control_edges, &inputs);
+
+  // Same name and user-specified device as the original; only the op changes.
+  NodeBuilder nb(orign->name().c_str(), ni.newname.c_str());
+  nb.Device(orign->def().device());
+  Status s = SetUpInputs(g, inputs, &nb, orign);
+  if (!s.ok()) return s;
+  ni.copyattrs(orign, &nb);
+  // Set the Mkl layer label for this op.
+  nb.Attr("_kernel", mkl_layer_registry::kMklLayerLabel);
+
+  Node* newn = nullptr;
+  TF_CHECK_OK(nb.Finalize(&**g, &newn));
+  CHECK_NOTNULL(newn);
+
+  // Re-attach incoming control edges; otherwise they are lost when 'orign'
+  // is removed below. NOTE(review): assumes SetUpInputs adds none - confirm.
+  for (const Edge* e : orign->in_edges()) {
+    if (e->IsControlEdge()) {
+      (*g)->AddEdge(e->src(), e->src_output(), newn, e->dst_input());
+    }
+  }
+  // Redirect every outgoing edge of 'orign' so that it starts at 'newn'.
+  for (const Edge* e : orign->out_edges()) {
+    (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
+  }
+
+  // Copy the runtime device assigned to the original node, then swap nodes.
+  newn->set_assigned_device_name(orign->assigned_device_name());
+  (*g)->RemoveNode(orign);
+  MarkRewrittenNode(newn);
+
+  VLOG(1) << "MKLLayoutRewritePass: New node:" << newn->DebugString();
+  return Status::OK();
+}
+
+// Rewrites, in topological order, each op node of 'g' that matches an entry
+// of ninfo_. Returns true if and only if at least one node was rewritten.
+bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
+  bool result = false;
+  CHECK_NOTNULL(g);
+
+  DumpGraph("Before running MklLayoutRewritePass", &**g);
+
+  std::vector<Node*> order;
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+
+  for (Node* n : order) {
+    if (!n->IsOp()) continue;
+
+    // Checked once per node since neither depends on the NodesInfo entry: an
+    // op needs a data type (T) attribute and a supported Mkl op name.
+    DataType dtype = DT_INVALID;
+    if (!GetNodeAttr(n->def(), "T", &dtype).ok() ||
+        !mkl_layer_registry::IsMklLayer(GetMklOpName(n->type_string()))) {
+      continue;
+    }
+
+    // RewriteNode removes 'n' on success, so copy the names used for logging.
+    const string node_name = n->name();
+    const string op_name = n->type_string();
+
+    for (const NodesInfo& ni : ninfo_) {
+      if (op_name.compare(ni.name) != 0) continue;
+      VLOG(1) << "MKLLayoutRewritePass: Scheduled node " << node_name
+              << " with op " << op_name << " for rewrite using"
+              << " layout optimization.";
+      if (RewriteNode(g, n, ni).ok()) {
+        VLOG(1) << "MKLLayoutRewritePass: Successfully rewrote node "
+                << node_name << " with op " << op_name
+                << " for Mkl layout optimization.";
+        result = true;
+        break;  // We found matching nodesinfo so no need to search next.
+      }
+    }
+  }
+
+  DumpGraph("After running MklLayoutRewritePass", &**g);
+  return result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//              Run function for the pass
+///////////////////////////////////////////////////////////////////////////////
+// Runs a default-constructed MklLayoutRewritePass on 'g' via RunPass.
+bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
+  return MklLayoutRewritePass().RunPass(g);
+}
+
+// Standard optimization-pass entry point: runs the rewrite on options.graph.
+// A missing graph is not an error; there is simply nothing to rewrite.
+Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
+  if (options.graph == nullptr) {
+    return Status::OK();
+  }
+
+  // options.graph is a pointer to the caller's unique_ptr<Graph>, so the
+  // pass mutates the graph in place through it and ownership stays with the
+  // caller. Whether anything was rewritten does not affect the status, so
+  // the boolean result of RunPass is intentionally ignored.
+  RunPass(options.graph);
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/core/graph/mkl_layout_pass.h b/tensorflow/core/graph/mkl_layout_pass.h
new file mode 100644
index 0000000000..ffe5c1ecfc
--- /dev/null
+++ b/tensorflow/core/graph/mkl_layout_pass.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// A graph pass that rewrites graph for propagating MKL layout as a tensor
+
+#ifndef TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
+#define TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
+
+#ifdef INTEL_MKL
+
+#include <sys/types.h>
+#include <memory>
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+// Interface to invoke the Mkl layout rewrite pass on graph 'g' directly;
+// intended for unit tests.
+// Returns true if and only if 'g' is mutated.
+extern bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g);
+}  // namespace tensorflow
+
+#endif
+
+#endif  // TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
new file mode 100644
index 0000000000..10671ee2e9
--- /dev/null
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -0,0 +1,199 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/graph/mkl_layout_pass.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+#include <vector>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+// Parses the text-format GraphDef 's' and converts it into '*graph'; a
+// parse or conversion failure is fatal.
+static void InitGraph(const string& s, Graph* graph) {
+  protobuf::TextFormat::Parser text_parser;
+  GraphDef parsed_def;
+  CHECK(text_parser.MergeFromString(s, &parsed_def)) << s;
+  GraphConstructorOptions ctor_opts;
+  TF_CHECK_OK(ConvertGraphDefToGraph(ctor_opts, parsed_def, graph));
+}
+
+// Fixture: builds a graph from text, runs the pass, canonicalizes the result.
+class MklLayoutPassTest : public ::testing::Test {
+ public:
+  MklLayoutPassTest() : graph_(new Graph(OpRegistry::Global())) {}
+
+  // Loads the text-format GraphDef 's' and records its canonical string.
+  void InitGraph(const string& s) {
+    ::tensorflow::InitGraph(s, graph_.get());
+    original_ = CanonicalGraphString(graph_.get());
+  }
+
+  static bool IncludeNode(const Node* n) { return n->IsOp(); }
+
+  // "name" for slot 0, "name:control" for the control slot, else "name:index".
+  static string EdgeId(const Node* n, int index) {
+    if (index == 0) return n->name();
+    const bool is_control = (index == Graph::kControlSlot);
+    return is_control ? strings::StrCat(n->name(), ":control")
+                      : strings::StrCat(n->name(), ":", index);
+  }
+
+  // Returns "sorted op nodes|sorted edges between them", each ';'-joined.
+  string CanonicalGraphString(Graph* g) {
+    std::vector<string> nodes;
+    std::vector<string> edges;
+    for (const Node* n : g->nodes()) {
+      if (IncludeNode(n)) {
+        nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
+      }
+    }
+    for (const Edge* e : g->edges()) {
+      if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
+        edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
+                                        EdgeId(e->dst(), e->dst_input())));
+      }
+    }
+    std::sort(nodes.begin(), nodes.end());
+    std::sort(edges.begin(), edges.end());
+    return strings::StrCat(str_util::Join(nodes, ";"), "|",
+                           str_util::Join(edges, ";"));
+  }
+
+  // Runs the pass on the fixture's graph and returns its canonical string.
+  string DoMklLayoutOptimizationPass() {
+    string before = CanonicalGraphString(graph_.get());
+    LOG(ERROR) << "Before MKL layout rewrite pass: " << before;
+    // graph_ is already a unique_ptr<Graph>, which is what the pass expects,
+    // so no temporary wrapper has to be allocated (and leaked) per call.
+    RunMklLayoutRewritePass(&graph_);
+    string result = CanonicalGraphString(graph_.get());
+    LOG(ERROR) << "After MKL layout rewrite pass:  " << result;
+    return result;
+  }
+
+  const string& OriginalGraph() const { return original_; }
+
+  std::unique_ptr<Graph> graph_;
+  string original_;
+};
+
+REGISTER_OP("Input").Output("o: float").SetIsStateful();
+
+// Single Conv2D Op; No Mkl layer on the input and on the output.
+// We will generate a dummy Mkl tensor for each of the 2 inputs of Conv2D.
+TEST_F(MklLayoutPassTest, Conv2D_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(MklConv2D);D(Mul);DMT/_0(Const);DMT/_1(Const)|"
+            "A->C;B->C:2;B->D;C->D:1;DMT/_0->C:1;DMT/_1->C:3");
+}
+
+// 2 Conv2D Ops in sequence. Both get rewritten; output slots 0 and 1 of the
+// 1st MklConv2D (C) feed input slots 2 and 3 of the 2nd MklConv2D (D).
+TEST_F(MklLayoutPassTest, Conv2D_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'C']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(MklConv2D);D(MklConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;B->C:2;C->D:2;C->E;"
+            "C:1->D:3;D->E:1;DMT/_0->C:1;DMT/_1->C:3;DMT/_2->D:1");
+}
+
+// Benchmarks the pass on a graph of 10 'Input' nodes feeding 'op_nodes'
+// 'Mul' nodes with randomly chosen inputs. Graph construction is not timed.
+static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
+  testing::StopTiming();
+  string graph_text;
+  for (int in = 0; in < 10; in++) {
+    graph_text += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
+  }
+  random::PhiloxRandom philox(301, 17);
+  random::SimplePhilox rnd(&philox);
+  for (int op = 0; op < op_nodes; op++) {
+    graph_text += strings::Printf(
+        "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { "
+        "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
+        op, rnd.Uniform(10), rnd.Uniform(10));
+  }
+
+  bool label_set = false;
+  while (iters > 0) {
+    // The unique_ptr owns the graph and frees it at the end of the iteration.
+    std::unique_ptr<Graph> ug(new Graph(OpRegistry::Global()));
+    InitGraph(graph_text, ug.get());
+    const int num_ids = ug->num_node_ids();
+    if (!label_set) {
+      testing::SetLabel(strings::StrCat("Per graph node.  Nodes: ", num_ids));
+      label_set = true;
+    }
+    testing::StartTiming();
+    RunMklLayoutRewritePass(&ug);
+    testing::StopTiming();
+
+    // Our benchmark units are individual graph nodes, not whole graphs.
+    iters -= num_ids;
+  }
+}
+BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif /* INTEL_MKL */
diff --git a/tensorflow/core/graph/mkl_optimizer_merge.cc b/tensorflow/core/graph/mkl_optimizer_merge.cc
index 98fc268d28..bc5915eda2 100644
--- a/tensorflow/core/graph/mkl_optimizer_merge.cc
+++ b/tensorflow/core/graph/mkl_optimizer_merge.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include <vector>
 #include <queue>
 #include <utility>
+#include <string>
+#include <memory>
 
 #include "tensorflow/core/graph/mkl_optimizer_merge.h"
 
@@ -33,6 +35,8 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
 
 namespace tensorflow {
 
@@ -58,8 +62,8 @@ static size_t kNodeMergeContextMaxDepth = 10;
 class NodeMergeRewritePass : public GraphOptimizationPass {
  public:
   NodeMergeRewritePass() {
-    csinfo_.conv2d                     = "Conv2D";
-    csinfo_.conv2dwithbias             = "Conv2DWithBias";
+    csinfo_.conv2d                     = "MklConv2D";
+    csinfo_.conv2dwithbias             = "MklConv2DWithBias";
     csinfo_.conv2dwithbiasbackpropbias = "Conv2DWithBiasBackpropBias";
     csinfo_.biasadd                    = "BiasAdd";
     csinfo_.matmul                     = "MatMul";
@@ -72,6 +76,9 @@ class NodeMergeRewritePass : public GraphOptimizationPass {
     // maxhops in backward data-flow graph. Since input of forward nodes
     // (Conv2D) directly goes to backward nodes, we do not expect the
     // hop-distance would be more than few nodes.
+    // TODO(nhasabni) Temporarily disabling rewrite of BiasAddGrad.
+    // Will enable it once we support Conv2DWithBiasBackpropBias op.
+#if 0
     rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.conv2dwithbiasbackpropbias,
                   {csinfo_.conv2dwithbias, kNodeMergeContextMaxDepth}});
     rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.conv2dwithbiasbackpropbias,
@@ -80,6 +87,7 @@ class NodeMergeRewritePass : public GraphOptimizationPass {
     // because we do not have a separate Op for MatMulwithBias.
     rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.biasaddgrad,
                       {csinfo_.matmul, kNodeMergeContextMaxDepth}});
+#endif
   }
 
   // Standard interface to run optimization pass
@@ -182,10 +190,16 @@ class NodeMergeRewritePass : public GraphOptimizationPass {
   // @return Matching rewriteinfo in case a match is found; null otherwise.
   const RewriteInfo* FindMatchingRewriteInfo(const Node* n,
                                              const Node** fwdn) const;
+
+  // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
+  // and return it in '*out'.
+  // TODO(nhasabni) We should move this to mkl_util.h
+  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out);
 };
 
-/// We register merge optimizer for phase 1 and MKLToTF insertion for phase 2.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1,
+// We register merge optimizer for phase 2 in pre-placement group.
+// Do not change the ordering of the Mkl passes.
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 2,
                       NodeMergeRewritePass);
 
 static void FillInputs(const Node* n,
@@ -219,8 +233,6 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const {
     }
   }
 
-  VLOG(1) << "FindNodeForMerge: " << a->type_string();
-
   for (const MergeInfo* mi : matching_mi) {
     const int N_in = a->num_inputs();
     if (mi->op >= N_in) {
@@ -240,8 +252,6 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const {
       continue;
     }
 
-    VLOG(1) << "     FindNode: " << b->type_string();
-
     gtl::InlinedVector<Node*, 4> b_control_edges;
     gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(N_in);
     FillInputs(b, &b_control_edges, &b_in);
@@ -258,6 +268,22 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const {
   return nullptr;
 }
 
+// Creates a uint8 "Const" node of shape {8}, all zeros, and returns it.
+void NodeMergeRewritePass::GetDummyMklTensorNode(
+    std::unique_ptr<Graph>* g, Node** out) {
+  const uint8 zero_bytes[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+  const DataType dt = DataTypeToEnum<uint8>::v();
+  TensorProto proto;
+  proto.set_dtype(dt);
+  proto.set_tensor_content(zero_bytes, sizeof(zero_bytes));
+  TensorShape dummy_shape({8});
+  dummy_shape.AsProto(proto.mutable_tensor_shape());
+  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
+                 .Attr("value", proto)
+                 .Attr("dtype", dt)
+                 .Finalize(&**g, out));
+}
+
 Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
                                      Node* succ, Node* pred) {
   CHECK_NOTNULL(succ);
@@ -271,7 +297,6 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
     std::vector<int32> strides;
     string data_format_pred, data_format_succ;
     bool use_cudnn_on_gnu;
-    int groups = 1;
     TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
     TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
     TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
@@ -280,25 +305,28 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
     TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
     TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu",
                             &use_cudnn_on_gnu));
-    // Groups attribute may not be there on the input node. So we do not
-    // check for error in GetNodeAttr call.
-    GetNodeAttr(pred->def(), "groups", &groups);
     // We check to ensure that data formats of both succ and pred are same.
     // We expect them to be same, so we can enforce this as assert.
     // But assert can be too strict, so we enforce this as a check.
     // If the check fails, then we do not merge two nodes.
+    // We also do same check for devices.
     if (data_format_pred != data_format_succ ||
-        T_pred != T_succ) {
+        T_pred != T_succ ||
+        pred->assigned_device_name() != succ->assigned_device_name() ||
+        pred->def().device() != succ->def().device()) {
       return Status(error::Code::INVALID_ARGUMENT,
-                    "data_format or T attribute of Conv2D and BiasAdd"
-                    "do not match. Will skip node merge optimization");
+                    "data_format or T attribute or devices of Conv2D and "
+                    "BiasAdd do not match. Will skip node merge optimization");
     }
 
     // 2. Get inputs from both the nodes.
     // Find the 2 inputs from the conv and the bias from the add Bias.
     Node* oper1 = nullptr;
+    Node* oper1_mkl = nullptr;  // Mkl tensor corresponding to oper1
     Node* oper2 = nullptr;
+    Node* oper2_mkl = nullptr;  // Mkl tensor corresponding to oper2
     Node* oper3 = nullptr;
+    Node* oper3_mkl = nullptr;  // Mkl tensor corresponding to oper3
 
     const int succ_num = succ->num_inputs();
     gtl::InlinedVector<Node*, 4> succ_control_edges;
@@ -326,24 +354,35 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
       }
     }
 
-    // Get operand 0, 1 of conv2D
-    oper1 = pred_in[0].first;
-    oper2 = pred_in[1].first;
+    // Get operand 0, 1 of conv2D and their Mkl tensors.
+    CHECK_EQ(pred->in_edges().size(), 4);  // MklConv2D must have 4 inputs.
+    oper1     = pred_in[0].first;
+    oper1_mkl = pred_in[1].first;
+    oper2     = pred_in[2].first;
+    oper2_mkl = pred_in[3].first;
     // Get operand 1 of add_bias
-    oper3 = succ_in[1].first;
+    // BiasAdd must have 2 inputs: Conv, bias
+    CHECK_EQ(succ->in_edges().size(), 2);
+    oper3     = succ_in[1].first;
+    GetDummyMklTensorNode(g, &oper3_mkl);  // Get dummy Mkl tensor node
+    // as BiasAdd does not have Mkl tensor as input.
+    CHECK_NOTNULL(oper3_mkl);
 
     Node* ret;
     // We will use the node name of BiasAdd as the name of new node
     TF_CHECK_OK(NodeBuilder(succ->name(), csinfo_.conv2dwithbias)
                   .Input(oper1)
+                  .Input(oper1_mkl)
                   .Input(oper2)
+                  .Input(oper2_mkl)
                   .Input(oper3)
+                  .Input(oper3_mkl)
                   .Attr("T", T_pred)
                   .Attr("strides", strides)
                   .Attr("padding", padding)
                   .Attr("data_format", data_format_pred)
                   .Attr("use_cudnn_on_gpu", use_cudnn_on_gnu)
-                  .Attr("groups", groups)
+                  .Device(succ->def().device())
                   .Finalize(&**g, &ret));
     CHECK_NOTNULL(ret);
 
@@ -352,6 +391,15 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
       (*g)->AddEdge(ret, e->src_output(), e->dst(), e->dst_input());
     }
 
+    // Copy device assigned to old node to new node.
+    // It's ok to use pred or succ as we have enforced a check that
+    // both have same device assigned.
+    ret->set_assigned_device_name(pred->assigned_device_name());
+
+    VLOG(1) << "NodeMergeRewritePass: Merged old node:" << pred->DebugString()
+            << ", and node: " << succ->DebugString() << ", into node:"
+            << ret->DebugString();
+
     (*g)->RemoveNode(succ);
     (*g)->RemoveNode(pred);
 
@@ -369,13 +417,14 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
   const Node* fwdn = nullptr;
   const RewriteInfo* ri = FindMatchingRewriteInfo(n, &fwdn);
   if (ri == nullptr || fwdn == nullptr) {
-    VLOG(1) << "Rewriteinfo not found for: " << n->type_string();
+    VLOG(2) << "NodeMergeRewritePass: Rewriteinfo not found for: "
+            << n->type_string();
     return Status(error::Code::INVALID_ARGUMENT,
                   "Rewrite info not found for the node."
                   "Will skip node rewrite optimization");
   }
 
-  VLOG(1) << "Rewrite called for: " << n->type_string();
+  VLOG(1) << "NodeMergeRewritePass: Rewrite called for: " << n->type_string();
 
   if (n->type_string() == csinfo_.biasaddgrad &&
       ri->node         == csinfo_.biasaddgrad &&
@@ -407,6 +456,7 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
                     .Attr("T", T)
                     .Attr("data_format", data_format)
                     .Attr("strides", strides)
+                    .Device(n->def().device())
                     .Finalize(&**g, &ret));
     } else {
       CHECK_EQ(ri->rewrite, csinfo_.biasaddgrad);
@@ -414,6 +464,7 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
                     .Input(op)
                     .Attr("T", T)
                     .Attr("data_format", data_format)
+                    .Device(n->def().device())
                     .Finalize(&**g, &ret));
     }
 
@@ -424,7 +475,11 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
       (*g)->AddEdge(ret, e->src_output(), e->dst(), e->dst_input());
     }
 
-    VLOG(1) << "Rewrite node: " << n->type_string() << " successful";
+    // Copy device assigned to old node to new node.
+    ret->set_assigned_device_name(n->assigned_device_name());
+
+    VLOG(1) << "MKLOptimizerMergePass: Rewrote old node:" << n->DebugString()
+            << ", into node:" << ret->DebugString();
     (*g)->RemoveNode(n);
 
     return Status::OK();
@@ -450,7 +505,8 @@ NodeMergeRewritePass::FindMatchingRewriteInfo(const Node* n,
     }
   }
 
-  VLOG(1) << "Searching graph for: " << n->type_string() << " in backwards.";
+  VLOG(1) << "NodeMergeRewritePass: Searching graph for: "
+          << n->type_string() << " in backwards.";
 
   // Now we will check for forward op name for rewrite info in data
   // flow graph. Get the max hops we should search for the fwd node
@@ -473,7 +529,8 @@ NodeMergeRewritePass::FindMatchingRewriteInfo(const Node* n,
     curr_depth = curr_pair.second;
     CHECK_NOTNULL(curr_node);
 
-    VLOG(1) << "Visiting node: " << curr_node->type_string()
+    VLOG(1) << "NodeMergeRewritePass: Visiting node: "
+            << curr_node->type_string()
             << " at depth: " << curr_depth
             << " for node: " << n->type_string();
 
@@ -528,17 +585,16 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr<Graph>* g) {
   std::vector<std::pair<Node*, Node*>> nodes_to_be_merged;
   std::vector<Node*> nodes_to_be_rewritten;
 
-  VLOG(1) << "Running NodeMerge Optimization";
-
   for (Node* n : order) {
     if (!n->IsOp()) continue;
     Node* n1 = nullptr;
     if ((n1 = FindNodeForMerge(n)) != nullptr) {
-      VLOG(1) << "Scheduled nodes " << n->name() << " and "
-              << n1->name() << " for merging";
+      VLOG(1) << "NodeMergeRewritePass: Scheduled nodes "
+              << n->name() << " and " << n1->name() << " for merging";
       nodes_to_be_merged.push_back(std::make_pair(n, n1));
     } else if (IsApplicableRewriteNode(n)) {
-      VLOG(1) << "Scheduled node " << n->name() << " for rewrite";
+      VLOG(1) << "NodeMergeRewritePass: Scheduled node " << n->name()
+              << " for rewrite";
       nodes_to_be_rewritten.push_back(n);
     }
   }
@@ -549,7 +605,8 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr<Graph>* g) {
     string n1_name = i.first->name();
     string n2_name = i.second->name();
     if (MergeNode(g, i.first, i.second) == Status::OK()) {
-      VLOG(1) << "Merged nodes " << n1_name << " and " << n2_name;
+      VLOG(1) << "NodeMergeRewritePass: Merged nodes " << n1_name
+              << " and " << n2_name;
       result = true;
     }
   }
@@ -559,7 +616,8 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr<Graph>* g) {
   for (Node* i : nodes_to_be_rewritten) {
     string name = i->name();
     if (RewriteNode(g, i) == Status::OK()) {
-      VLOG(1) << "Rewrite node: " << name << " successful.";
+      VLOG(1) << "NodeMergeRewritePass: Rewrite node: "
+              << name << " successful.";
       result = true;
     }
   }
@@ -574,8 +632,6 @@ bool OptimizeNodeMerge(std::unique_ptr<Graph>* g) {
 }
 
 Status NodeMergeRewritePass::Run(const GraphOptimizationPassOptions& options) {
-  // Currently checking only for two cases - Conv2D+Bias and Matmul+Bias.
-  // It is possible to extend it to other operators in future.
   if (options.graph == nullptr) {
     return Status::OK();
   }
diff --git a/tensorflow/core/graph/mkl_optimizer_merge.h b/tensorflow/core/graph/mkl_optimizer_merge.h
index 554709e9dd..b2caec58af 100644
--- a/tensorflow/core/graph/mkl_optimizer_merge.h
+++ b/tensorflow/core/graph/mkl_optimizer_merge.h
@@ -21,20 +21,14 @@ limitations under the License.
 #ifdef INTEL_MKL
 
 #include <sys/types.h>
-#include <vector>
-#include <string>
 #include <memory>
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/common_runtime/optimization_registry.h"
 
 namespace tensorflow {
-
 // Interface to invoke the pass for unit test
 //
 // Returns true if and only if 'g' is mutated.
 extern bool OptimizeNodeMerge(std::unique_ptr<Graph>* g);
-
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/graph/mkl_optimizer_merge_test.cc b/tensorflow/core/graph/mkl_optimizer_merge_test.cc
index da3b01955c..5aae61ad19 100644
--- a/tensorflow/core/graph/mkl_optimizer_merge_test.cc
+++ b/tensorflow/core/graph/mkl_optimizer_merge_test.cc
@@ -105,6 +105,7 @@ class OptimizerMergeTest : public ::testing::Test {
 };
 
 REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
 
 TEST_F(OptimizerMergeTest, Basic) {
   InitGraph(
@@ -121,10 +122,40 @@ TEST_F(OptimizerMergeTest, Basic) {
 
 // Test set 1: Conv2D + AddBias
 
-// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y)
+// C=MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y)
 TEST_F(OptimizerMergeTest, Conv2DWithBias_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'M', 'B', 'N']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoNodeMerge(),
+            "A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
+            "M(MklInput);N(MklInput);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
+            "DMT/_0->E:5;E->Z;M->E:1;N->E:3;Y->Z:1");
+}
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
+// We do not merge in this case as op is Conv2D and not MklConv2D.
+TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_NoMklConv2D) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
       "node { name: 'B' op: 'Input'}"
       "node { name: 'C' op: 'Conv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -143,63 +174,69 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Positive) {
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
       " input: ['E', 'Y']}");
   EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);D(Input);E(Conv2DWithBias);Y(Input);Z(Sub)|"
-             "A->E;B->E:1;D->E:2;E->Z;Y->Z:1");
+            "A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd);Y(Input);Z(Sub)|"
+             "A->C;B->C:1;C->E;D->E:1;E->Z;Y->Z:1");
 }
 
-// Graph contains only Conv2D, no AddBias.
+// Graph contains only MklConv2D, no AddBias.
 TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_NoAddBias) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}");
+      " input: ['A', 'M', 'B', 'N']}");
   EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D)|"
-             "A->C;B->C:1");
+            "A(Input);B(Input);C(MklConv2D);M(MklInput);N(MklInput)|"
+             "A->C;B->C:2;M->C:1;N->C:3");
 }
 
-// Conv2D output does not go to BiasAdd.
+// MklConv2D output does not go to BiasAdd.
 TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow1) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
+      " input: ['A', 'M', 'B', 'N']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'Input'}"
       "node { name: 'F' op: 'BiasAdd'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D', 'E'] }");  // Output of Conv2D does not go to BiasAdd.
+      " input: ['D', 'E'] }");  // Output of MklConv2D does not go to BiasAdd.
   EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D);D(Input);E(Input);F(BiasAdd)|"
-             "A->C;B->C:1;D->F;E->F:1");
+            "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
+            "M(MklInput);N(MklInput)|A->C;B->C:2;D->F;E->F:1;M->C:1;N->C:3");
 }
 
-// Conv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
+// MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
 // Merge should not be done in such case.
 TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
+      " input: ['A', 'M', 'B', 'N']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'Input'}"
       "node { name: 'F' op: 'BiasAdd'"
@@ -211,8 +248,9 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['C', 'E'] }");
   EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D);D(Input);E(Input);F(BiasAdd);G(Add)|"
-             "A->C;B->C:1;C->G;D->F;E->F:1;E->G:1");
+            "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
+            "G(Add);M(MklInput);N(MklInput)|A->C;B->C:2;C->G;D->F;"
+            "E->F:1;E->G:1;M->C:1;N->C:3");
 }
 
 // data_format attribute value mismatch. Merge should not be done
@@ -220,30 +258,65 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) {
 TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_AttrMismatch) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
+      " input: ['A', 'M', 'B', 'N']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'BiasAdd'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NHCW' } }"
       " input: ['C', 'D'] }");
   EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd)|"
-            "A->C;B->C:1;C->E;D->E:1");
+            "A(Input);B(Input);C(MklConv2D);D(Input);E(BiasAdd);M(MklInput);"
+            "N(MklInput)|A->C;B->C:2;C->E;D->E:1;M->C:1;N->C:3");
 }
 
-// Test set 2: Conv2D..BiasAddGrad -> Conv2DWithBiasBackpropBias rewrite tests
+#if 0
+// This test set is disabled temporarily as we do not enable node rewrite.
+// This test set will be enabled when we support Mkl-specific kernels for
+// backward bias.
+//
+// Test set 2: MklConv2D..BiasAddGrad -> Conv2DWithBiasBackpropBias
+// rewrite tests
 
-// C=Conv2D(A,B); D=Sub(C,A); F=BiasAddGrad(D)
+// C=MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D)
 TEST_F(OptimizerMergeTest, Conv2DBackprop_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'M', 'B', 'N']}"
+      "node { name: 'D' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoNodeMerge(),
+            "A(Input);B(Input);C(MklConv2D);D(Sub);E(Conv2DWithBiasBackpropBias);"
+            "M(MklInput);N(MklInput)|A->C;A->D:1;B->C:2;C->D;D->E;M->C:1;N->C:3");
+}
+
+// No MklConv2D in context, but Conv2D in context. No rewrite should happen.
+// C=Conv2D(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoMklConv2D) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
       "node { name: 'B' op: 'Input'}"
       "node { name: 'C' op: 'Conv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -260,12 +333,12 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Positive) {
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " input: ['D'] }");
   EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D);D(Sub);E(Conv2DWithBiasBackpropBias)|"
+            "A(Input);B(Input);C(Conv2D);D(Sub);E(BiasAddGrad)|"
              "A->C;A->D:1;B->C:1;C->D;D->E");
 }
 
 // No Conv2D in the context for BiasAddGrad. No rewrite should happen.
-// C=Add(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
+// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
 TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -287,7 +360,7 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D) {
 
 // No Conv2D in the context for BiasAddGrad, but MatMul in context.
 // Rewrite should happen, but name of BiasAddGrad does not change.
-// C=MatMul(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
+// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
 TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D_MatMul) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -310,7 +383,7 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D_MatMul) {
 }
 
 // Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests
-// C=MatMul(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
+// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
 TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -333,7 +406,7 @@ TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Positive) {
 }
 
 // No MatMul in the context for BiasAddGrad. No rewrite should happen.
-// C=Add(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
+// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
 TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Negative_NoMatMul) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -352,7 +425,7 @@ TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Negative_NoMatMul) {
             "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
              "A->C;A->D:1;B->C:1;C->D;D->E");
 }
-
+#endif
 
 static void BM_NodeMerge(int iters, int op_nodes) {
   testing::StopTiming();
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
new file mode 100644
index 0000000000..1e7b5e7094
--- /dev/null
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -0,0 +1,271 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include <set>
+#include <vector>
+#include <queue>
+#include <utility>
+#include <string>
+#include <memory>
+
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+// This pass inserts Mkl to Tf tensor conversion nodes (represented by C)
+// in the graph in between A and B, where A and B match any one
+// of the following
+// cases:
+//  1) A = layer/Op that generates output in Mkl format and,
+//     B = layer/Op that does not accept input in Mkl format and,
+//     A -> B (there is a direct edge between A and B), then
+//     we will insert C such that A->C->B.
+//
+//  2) A = layer/Op that generates output in Mkl format and,
+//     B = NULL (in other words, A is the last layer in the graph), then
+//     we will insert C such that A->C. (C will be the last layer.)
+//
+//  Note that case 1 applies to all outputs of A that are input to B.
+//  In other words, the conversions will be required for every output
+//  of A that is input to B. For example, let us say the output of A
+//  is A1, A2, A3, of which A1 and A2 are in Mkl format, but A3 is not
+//  in Mkl format, and all of them are input to B. In such case, we will
+//  do the conversion for A1 and A2 only. We do not need to do any conversion
+//  for A3.
+//
+// This pass relies on layers registering themselves as Mkl compliant.
+// An Mkl compliant layer can accept inputs in Mkl format, and produce output
+// in Mkl format. A non-compliant layer accepts inputs and produces outputs in
+// TensorFlow format.
+//
+class MklToTfConversionPass : public GraphOptimizationPass {
+ public:
+  MklToTfConversionPass() {}
+  // Runs the pass on the graph(s) carried in 'options'.
+  Status Run(const GraphOptimizationPassOptions& options) override;
+
+  // Insert layout conversion nodes in the graph pointed to by g.
+  // The function scans the graph for candidate edges on which
+  // conversion nodes need to be inserted.
+  //
+  // @return true if at least one conversion node is inserted;
+  // false, otherwise.
+  bool RunPass(std::unique_ptr<Graph>* g);
+
+ private:
+  // Is the input Op supported by Mkl-specific layout?
+  //
+  // @input op_name string of the op
+  // @return true if op is Mkl supported; false, otherwise.
+  inline bool IsMklSupportedOp(const string& op_name) const {
+    return mkl_layer_registry::IsMklLayer(op_name);
+  }
+
+  // Insert layout conversion node on the edge pointed by 'e' from graph 'g'.
+  //
+  // Edge will be deleted once a call to this function is successful.
+  // Any attempt to use the edge after this call
+  // will lead to undefined behaviors.
+  //
+  // @return Success:OK() if insertion is successful, otherwise returns
+  //         appropriate error status code.
+  Status InsertConversionNodeOnEdge(std::unique_ptr<Graph>* g, Edge* e);
+};
+
+// We register the MklToTf insertion pass as phase 1 of the post-partitioning
+// grouping. We register this pass after partitioning so that we get a
+// complete picture of inputs and outputs of the nodes in the graphs.
+const OptimizationPassRegistry::Grouping kMklTfConvPassGroup =
+  OptimizationPassRegistry::POST_PARTITIONING;
+REGISTER_OPTIMIZATION(kMklTfConvPassGroup, 1, MklToTfConversionPass);
+
+// Inserts an MklToTf node on edge 'e'; see the declaration for the contract.
+Status MklToTfConversionPass::InsertConversionNodeOnEdge(
+    std::unique_ptr<Graph>* g, Edge* e) {
+  CHECK_NOTNULL(e);
+
+  Node* src = e->src();
+  Node* dst = e->dst();
+
+  CHECK_NOTNULL(src);
+  CHECK_NOTNULL(dst);
+
+  Node* conversion_node = nullptr;
+  DataType src_datatype = DT_INVALID;
+  DataType dst_datatype = DT_INVALID;
+  string data_format;
+
+  // RunPass guarantees that src has "T", but dst is a non-Mkl op that may
+  // lack it. Compare the types only when dst defines "T"; do not crash.
+  TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype));
+  const bool dst_has_type = GetNodeAttr(dst->def(), "T", &dst_datatype).ok();
+  if (dst_has_type && src_datatype != dst_datatype) {
+    string err_msg = "T attribute of " + src->name() + " and " +
+                     dst->name() + " do not match. Will not insert" +
+                     " MklToTf node in such case.";
+    return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str());
+  }
+
+  // Build the conversion node. Its inputs are the Tf tensor on this edge and
+  // the Mkl tensor, which immediately follows the Tf tensor in src's outputs.
+  TF_CHECK_OK(NodeBuilder((*g)->NewName("Mkl2Tf"), "MklToTf")
+                  .Input(src, e->src_output())
+                  .Input(src, e->src_output() + 1)
+                  .Device(src->def().device())  // Same device as src.
+                  .Attr("T", src_datatype)
+                  .Finalize(&**g, &conversion_node));
+
+  CHECK_NOTNULL(conversion_node);
+  if (GetNodeAttr(src->def(), "data_format", &data_format).ok()) {
+    conversion_node->AddAttr("data_format", data_format);
+  }
+
+  // Place the conversion node on the same assigned device as the source.
+  conversion_node->set_assigned_device_name(src->assigned_device_name());
+
+  // Set the Mkl layer label for this op.
+  conversion_node->AddAttr("_kernel", mkl_layer_registry::kMklLayerLabel);
+
+  // Connect the conversion node to dst. The conversion node has a single
+  // output, so its src_output is 0.
+  CHECK_NOTNULL((*g)->AddEdge(conversion_node, 0, dst, e->dst_input()));
+
+  VLOG(1) << "MklToTfConversionPass: Inserting Conversion node on: "
+          << src->type_string() << " and " << dst->type_string()
+          << " successful.";
+
+  // Remove the original src->dst edge; 'e' is invalid after this call.
+  (*g)->RemoveEdge(e);
+  return Status::OK();
+}
+
+bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
+  bool result = false;
+
+  CHECK_NOTNULL(g);
+
+  DumpGraph("Before MklToTfConversionPass", &**g);
+
+  // Since we are looking for an Mkl-supported op node immediately
+  // followed by a non-Mkl op node, we just iterate over the edge
+  // set of the graph.
+  // Candidate edges are collected here and processed afterwards, so that
+  // the edge set is not modified while it is being iterated.
+  std::vector<Edge*> candidate_edges;
+
+  for (const Edge* e : (*g)->edges()) {
+    Node* src = e->src();
+    Node* dst = e->dst();
+
+    // We skip control edges.
+    if (e->IsControlEdge()) {
+      continue;
+    }
+
+    VLOG(1) << "MklToTfConversionPass: InsertConversionNodes: "
+            << src->type_string() << " and " << dst->type_string();
+
+    // The source node must have a "T" attribute; skip the edge otherwise.
+    DataType src_datatype = DT_INVALID;
+    if (!GetNodeAttr(src->def(), "T", &src_datatype).ok()) {
+      continue;
+    }
+    // We do not look up the datatype of the destination node here because
+    // the destination node may not be an Mkl node, so it may not have a
+    // "T" attribute. InsertConversionNodeOnEdge compares the datatypes
+    // of the two nodes before inserting anything.
+
+    // Check if src is Mkl-compliant, while dst is not Mkl-compliant.
+    if (IsMklSupportedOp(src->type_string()) &&
+        !IsMklSupportedOp(dst->type_string())) {
+      VLOG(1) << "MklToTfConversionPass: Scheduled nodes " << src->name()
+              << " and " << dst->name() << " for inserting conversion nodes";
+      candidate_edges.push_back(const_cast<Edge*>(e));
+    }
+  }
+
+  // Process all candidate edges and insert conversion nodes on them.
+  for (Edge* e : candidate_edges) {
+    // Names are copied up front because the edge is removed from the graph
+    // by a successful insertion and must not be used afterwards.
+    string src_name = e->src()->name();
+    string dst_name = e->dst()->name();
+    if (InsertConversionNodeOnEdge(g, e).ok()) {
+      VLOG(1) << "MklToTfConversionPass: Inserted conversion "
+              << "node on edge between " << src_name << " and " << dst_name;
+      result = true;
+    }
+  }
+
+  DumpGraph("After MklToTfConversionPass", &**g);
+
+  // We need to return true even if we insert one conversion node
+  // anywhere in the graph.
+  return result;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//              Run function for the pass
+//////////////////////////////////////////////////////////////////////////////
+
+bool InsertMklToTfConversionNodes(std::unique_ptr<Graph>* g) {
+  return MklToTfConversionPass().RunPass(g);  // True iff a node was inserted.
+}
+
+Status MklToTfConversionPass::Run(
+  const GraphOptimizationPassOptions& options) {
+  if (options.graph == nullptr && options.partition_graphs == nullptr) {
+    return Status::OK();
+  }
+
+  // RunPass mutates the graph in place through the pointer, so ownership
+  // of the graph never needs to leave the caller's unique_ptr.
+  auto process_graph = [&](std::unique_ptr<Graph>* g) {
+    if (g != nullptr && *g != nullptr) {
+      RunPass(g);
+    }
+  };
+
+  if (kMklTfConvPassGroup != OptimizationPassRegistry::POST_PARTITIONING) {
+    // For any pre-partitioning phase, graph is stored in options.graph.
+    process_graph(options.graph);
+  } else if (options.partition_graphs != nullptr) {
+    // For post partitioning phase, graphs are stored in
+    // options.partition_graphs, which may be null when only graph is set.
+    for (auto& pg : *options.partition_graphs) {
+      process_graph(&pg.second);
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.h b/tensorflow/core/graph/mkl_tfconversion_pass.h
new file mode 100644
index 0000000000..0562d8b3cd
--- /dev/null
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// An optimization pass that inserts MklToTf conversion nodes in the graph
+
+#ifndef TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_
+#define TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_
+
+#ifdef INTEL_MKL
+
+#include <sys/types.h>
+#include <memory>
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+// Interface to invoke the pass for unit test
+//
+// Returns true if and only if 'g' is mutated.
+extern bool InsertMklToTfConversionNodes(std::unique_ptr<Graph>* g);
+}  // namespace tensorflow
+
+#endif
+
+#endif  // TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
new file mode 100644
index 0000000000..103ff295b3
--- /dev/null
+++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
@@ -0,0 +1,243 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
+
+#include <vector>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+// Test fixture that parses a text GraphDef and runs the MklToTf pass on it.
+class MklToTfConversionPass : public ::testing::Test {
+ public:
+  MklToTfConversionPass() : graph_(OpRegistry::Global()) {}
+
+  static void InitGraph(const string& s, Graph* graph) {
+    GraphDef graph_def;
+    auto parser = protobuf::TextFormat::Parser();
+    CHECK(parser.MergeFromString(s, &graph_def)) << s;
+    GraphConstructorOptions opts;
+    TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
+  }
+
+  void InitGraph(const string& s) {
+    InitGraph(s, &graph_);
+    original_ = CanonicalGraphString(&graph_);
+  }
+
+  static bool IncludeNode(const Node* n) { return n->IsOp(); }
+
+  static string EdgeId(const Node* n, int index) {
+    if (index == 0) return n->name();  // Output 0 is written without ":0".
+    if (index == Graph::kControlSlot) {
+      return strings::StrCat(n->name(), ":control");
+    }
+    return strings::StrCat(n->name(), ":", index);
+  }
+
+  string CanonicalGraphString(Graph* g) {
+    std::vector<string> nodes, edges;
+    for (const Node* n : g->nodes()) {
+      if (IncludeNode(n)) {
+        nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
+      }
+    }
+    for (const Edge* e : g->edges()) {
+      if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
+        edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
+                                        EdgeId(e->dst(), e->dst_input())));
+      }
+    }
+    // Canonicalize: sort so the string is independent of iteration order.
+    std::sort(nodes.begin(), nodes.end());
+    std::sort(edges.begin(), edges.end());
+    return strings::StrCat(str_util::Join(nodes, ";"), "|",
+                           str_util::Join(edges, ";"));
+  }
+
+  string DoRunMklToTfConversionPass() {
+    string before = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "Before MklToTf conversion pass: " << before;
+
+    // The pass takes a std::unique_ptr<Graph>*, but graph_ is a member that
+    // must not be deleted: wrap it temporarily and release it afterwards.
+    std::unique_ptr<Graph> ug(&graph_);
+    InsertMklToTfConversionNodes(&ug);
+    ug.release();
+
+    string result = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "After MklToTf conversion pass:  " << result;
+    return result;
+  }
+
+  const string& OriginalGraph() const { return original_; }
+
+  Graph graph_;      // Graph under test.
+  string original_;  // Canonical string of graph_ taken in InitGraph().
+};
+
+REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
+REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
+
+TEST_F(MklToTfConversionPass, Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoRunMklToTfConversionPass(),
+            "A(Input);B(Input);C(Mul);D(Mul)|"
+            "A->C;A->D;B->C:1;B->D:1");
+}
+
+// MklConv2D followed by a non-Mkl layer: an MklToTf node should be inserted.
+// C=MklConv2D(A,M,B,N); E=Sub(C,D)
+TEST_F(MklToTfConversionPass, Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'M', 'B', 'N']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}");
+  EXPECT_EQ(DoRunMklToTfConversionPass(),
+            "A(Input);B(Input);C(MklConv2D);D(Input);E(Sub);M(MklInput);"
+            "Mkl2Tf/_0(MklToTf);N(MklInput)|A->C;B->C:2;C->Mkl2Tf/_0;"
+            "C:1->Mkl2Tf/_0:1;D->E:1;M->C:1;Mkl2Tf/_0->E;N->C:3");
+}
+
+// MklConv2D followed by Non-Mkl layer, and MklConv2D uses half type
+// C=MklConv2D(A,M,B,N); E=Sub(C,D)
+// MklToTf node should be inserted.
+TEST_F(MklToTfConversionPass, Positive_Type) {
+  InitGraph(
+      "node { name: 'A' op: 'HalfInput'}"
+      "node { name: 'M' op: 'MklInput'}"
+      "node { name: 'B' op: 'HalfInput'}"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
+      " attr { key: 'T'                value { type: DT_HALF } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'M', 'B', 'N']}"
+      "node { name: 'D' op: 'HalfInput'}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_HALF } }"
+      " input: ['C', 'D']}");
+  EXPECT_EQ(DoRunMklToTfConversionPass(),
+            "A(HalfInput);B(HalfInput);C(MklConv2D);D(HalfInput);"
+            "E(Sub);M(MklInput);Mkl2Tf/_0(MklToTf);N(MklInput)|"
+            "A->C;B->C:2;C->Mkl2Tf/_0;C:1->Mkl2Tf/_0:1;D->E:1;"
+            "M->C:1;Mkl2Tf/_0->E;N->C:3");
+}
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
+// There is no Mkl layer so no conversion op should be inserted.
+TEST_F(MklToTfConversionPass, Negative_NoMklLayer) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoRunMklToTfConversionPass(),
+            "A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd);Y(Input);Z(Sub)|"
+             "A->C;B->C:1;C->E;D->E:1;E->Z;Y->Z:1");
+}
+
+static void BM_RunMklToTfConversionPass(int iters, int op_nodes) {
+  testing::StopTiming();  // Building the GraphDef text below is not timed.
+  string s;
+  for (int in = 0; in < 10; in++) {
+    s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
+  }
+  random::PhiloxRandom philox(301, 17);
+  random::SimplePhilox rnd(&philox);
+  for (int op = 0; op < op_nodes; op++) {
+    s += strings::Printf(
+        "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { "
+        "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
+        op, rnd.Uniform(10), rnd.Uniform(10));
+  }
+
+  bool first = true;
+  while (iters > 0) {
+    Graph* graph = new Graph(OpRegistry::Global());
+    MklToTfConversionPass::InitGraph(s, graph);
+    int N = graph->num_node_ids();
+    if (first) {
+      testing::SetLabel(strings::StrCat("Per graph node.  Nodes: ", N));
+      first = false;
+    }
+    {
+      testing::StartTiming();
+      std::unique_ptr<Graph> ug(graph);
+      InsertMklToTfConversionNodes(&ug);
+      testing::StopTiming();
+    }
+    iters -= N;  // Our benchmark units are individual graph nodes,
+                 // not whole graphs
+    // 'graph' is owned by 'ug', which deleted it at the end of its scope.
+  }
+}
+BENCHMARK(BM_RunMklToTfConversionPass)->Arg(1000)->Arg(10000);
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif /* INTEL_MKL */
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 9740f96a6d..3b79d4c3db 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -688,8 +688,15 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "transpose_op",
-    prefix = "transpose_op",
-    deps = ARRAY_DEPS,
+    srcs = [
+        "transpose_op.cc",
+    ] + if_mkl([
+        "mkl_transpose_op.cc",
+    ]),
+    hdrs = ["transpose_op.h"],
+    deps = ARRAY_DEPS + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+    ]),
 )
 
 tf_kernel_library(
@@ -1735,6 +1742,22 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cuda_cc_test(
+    name = "resize_benchmark_test",
+    srcs = ["resize_op_benchmark_test.cc"],
+    deps = [
+        ":image",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 cc_library(
     name = "io",
     deps = [
@@ -4376,7 +4399,7 @@ tf_cc_test(
 
 if_mkl(
     tf_kernel_library(
-        name = "mkl_ops",
+        name = "mkl_matmul_op",
         prefix = "mkl_matmul",
         deps = [
             ":math",
@@ -4385,6 +4408,40 @@ if_mkl(
     ),
 )
 
+if_mkl(
+    tf_kernel_library(
+        name = "mkl_conv_op",
+        prefix = "mkl_conv",
+        deps = [
+            ":bounds_check",
+            ":ops_util",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:lib_internal",
+            "//tensorflow/core:nn_ops_op_lib",
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    ),
+)
+
+if_mkl(
+    tf_kernel_library(
+        name = "mkl_tfconv_op",
+        prefix = "mkl_tfconv",
+        deps = [
+            ":bounds_check",
+            ":ops_util",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:lib_internal",
+            "//tensorflow/core:nn_ops_op_lib",
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    ),
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
 
diff --git a/tensorflow/core/kernels/adjust_hue_op.cc b/tensorflow/core/kernels/adjust_hue_op.cc
index 09300737c7..e8f32693f7 100644
--- a/tensorflow/core/kernels/adjust_hue_op.cc
+++ b/tensorflow/core/kernels/adjust_hue_op.cc
@@ -1,5 +1,4 @@
 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -12,16 +11,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
 #include <memory>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/adjust_hue_op.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -77,6 +84,7 @@ template <class Device>
 class AdjustHueOp;
 
 namespace internal {
+
 // Helper function to convert a RGB color to H-and-V-range. H is in the range
 // of [0, 6] instead of the normal [0, 1]
 static void rgb_to_hv_range(float r, float g, float b, float* h, float* v_min,
@@ -185,6 +193,7 @@ static void hv_range_to_rgb(float h, float v_min, float v_max, float* r,
 }
 }  // namespace internal
 
+
 template <>
 class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
  public:
@@ -237,4 +246,34 @@ class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
 REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_CPU),
                         AdjustHueOp<CPUDevice>);
 
+#if GOOGLE_CUDA
+// GPU specialization of AdjustHue. DoCompute() only validates the stream and
+// forwards raw float buffers; the kernel launch itself lives in
+// functor::AdjustHueGPU.
+template <>
+class AdjustHueOp<GPUDevice> : public AdjustHueOpBase {
+ public:
+  explicit AdjustHueOp(OpKernelConstruction* context)
+      : AdjustHueOpBase(context) {}
+
+  void DoCompute(OpKernelContext* context,
+                 const ComputeOptions& options) override {
+    const int64 number_of_elements = options.input->NumElements();
+    GPUDevice device = context->eigen_gpu_device();
+    OP_REQUIRES(context, device.stream(),
+                errors::Internal("No GPU stream available."));
+    if (number_of_elements <= 0) return;  // Empty input: nothing to launch.
+    const float* rgb_in = options.input->flat<float>().data();
+    const float* hue_shift = options.delta->flat<float>().data();
+    float* const rgb_out = options.output->flat<float>().data();
+    functor::AdjustHueGPU()(&device, number_of_elements, rgb_in, hue_shift,
+                            rgb_out);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_GPU),
+                        AdjustHueOp<GPUDevice>);
+
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/adjust_hue_op.h b/tensorflow/core/kernels/adjust_hue_op.h
new file mode 100644
index 0000000000..5b30bd8540
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_hue_op.h
@@ -0,0 +1,42 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H_
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/platform/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+// Runs the CUDA hue adjustment on `device`. `input` and `output` hold
+// `number_of_elements` floats of interleaved RGB; `delta` points at the hue
+// offset. All three pointers are dereferenced on the device.
+struct AdjustHueGPU {
+  void operator()(GPUDevice* device, const int64 number_of_elements,
+                  const float* const input, const float* const delta,
+                  float* const output);
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H_
diff --git a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
new file mode 100644
index 0000000000..2fc69ed101
--- /dev/null
+++ b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
@@ -0,0 +1,141 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/adjust_hue_op.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+namespace internal {
+
+namespace {
+
+// One pixel as red, green and blue channel values.
+struct RgbTuple {
+  float r, g, b;
+};
+
+// One pixel as hue, saturation and value.
+struct HsvTuple {
+  float h, s, v;
+};
+
+}  // namespace
+
+// Converts one RGB pixel to HSV. Hue is returned as a fraction of a full
+// turn (the usual 0..6 sextant value divided by 6); hue and saturation stay
+// 0 when the pixel has no chroma / no brightness respectively.
+__device__ HsvTuple rgb2hsv_cuda(const float r, const float g, const float b)
+{
+  HsvTuple tuple;
+  const float M = fmaxf(r, fmaxf(g, b));
+  const float m = fminf(r, fminf(g, b));
+  const float chroma = M - m;
+  float h = 0.0f, s = 0.0f;
+  // hue
+  if (chroma > 0.0f) {
+    if (M == r) {
+      const float num = (g - b) / chroma;
+      const float sign = copysignf(1.0f, num);
+      h = ((sign < 0.0f) * 6.0f + sign * fmodf(sign * num, 6.0f)) / 6.0f;
+    } else if (M == g) {
+      h = ((b - r) / chroma + 2.0f) / 6.0f;
+    } else {
+      h = ((r - g) / chroma + 4.0f) / 6.0f;
+    }
+  }
+  // saturation; compare against a float literal so the test is not
+  // promoted to double precision on the device.
+  if (M > 0.0f) {
+    s = chroma / M;
+  }
+  tuple.h = h;
+  tuple.s = s;
+  tuple.v = M;
+  return tuple;
+}
+
+// HSV -> RGB, with hue given as a fraction of a full turn. A hue outside
+// [0, 1) selects no sextant, so all three channels collapse to v - v * s.
+__device__ RgbTuple hsv2rgb_cuda(const float h, const float s, const float v)
+{
+  const float hue6 = h * 6.0f;
+  const float c = v * s;
+  const float x = c * (1.0f - fabsf(fmodf(hue6, 2.0f) - 1.0f));
+  const float base = v - c;
+  // sxN is true when hue6 lies in sextant [N, N + 1).
+  const bool sx0 = hue6 >= 0.0f && hue6 < 1.0f;
+  const bool sx1 = hue6 >= 1.0f && hue6 < 2.0f;
+  const bool sx2 = hue6 >= 2.0f && hue6 < 3.0f;
+  const bool sx3 = hue6 >= 3.0f && hue6 < 4.0f;
+  const bool sx4 = hue6 >= 4.0f && hue6 < 5.0f;
+  const bool sx5 = hue6 >= 5.0f && hue6 < 6.0f;
+  RgbTuple rgb;
+  rgb.r = c * (sx0 || sx5) + x * (sx1 || sx4) + base;
+  rgb.g = c * (sx1 || sx2) + x * (sx0 || sx3) + base;
+  rgb.b = c * (sx3 || sx4) + x * (sx2 || sx5) + base;
+  return rgb;
+}
+
+__global__ void adjust_hue_nhwc(const int64 number_elements,
+                                const float * const __restrict__ input,
+                                float * const output,
+                                const float * const hue_delta)
+{
+  // One thread per RGB pixel (3 contiguous floats, NHWC). Index in 64 bits:
+  // the 32-bit product wraps on huge tensors and hits a misaligned pixel.
+  const int64 pixel = static_cast<int64>(blockDim.x) * blockIdx.x + threadIdx.x;
+  const int64 idx = pixel * 3;
+  // bounds check: the whole RGB triple must lie inside the buffer
+  if (idx + 2 >= number_elements) return;
+  const float delta = hue_delta[0];
+  const HsvTuple hsv = rgb2hsv_cuda(input[idx], input[idx + 1], input[idx + 2]);
+  // hue adjustment: wrap the shifted hue back into [0, 1)
+  float new_h = fmodf(hsv.h + delta, 1.0f);
+  if (new_h < 0.0f) {
+    new_h = fmodf(1.0f + new_h, 1.0f);
+  }
+  const RgbTuple rgb = hsv2rgb_cuda(new_h, hsv.s, hsv.v);
+  output[idx] = rgb.r;
+  output[idx + 1] = rgb.g;
+  output[idx + 2] = rgb.b;
+}
+} // namespace internal
+
+
+namespace functor {
+
+// Launches adjust_hue_nhwc with one thread per RGB pixel. The kernel turns a
+// thread id into a float offset by multiplying by 3, so sizing the grid by
+// float count would start three times more threads than there are pixels.
+void AdjustHueGPU::operator()(GPUDevice* device,
+                              const int64 number_of_elements,
+                              const float* const input,
+                              const float* const delta, float* const output) {
+  const auto stream = device->stream();
+  const int64 pixels = (number_of_elements + 2) / 3;  // ceil(n / 3)
+  const CudaLaunchConfig config = GetCudaLaunchConfig(pixels, *device);
+  const int threads_per_block = config.thread_per_block;
+  const int block_count = (pixels + threads_per_block - 1) / threads_per_block;
+  internal::adjust_hue_nhwc<<<block_count, threads_per_block, 0, stream>>>(
+      number_of_elements, input, output, delta);
+}
+} // namespace functor
+}  // namespace tensorflow
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 492c358a52..f93921d4a5 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -338,6 +338,7 @@ struct AvgPoolMeanReducer {
 // In the case below, 0xd8 implies (false_mask) ? (b) : (a)
 // For details, refer to the vpternlogd instruction table at
 // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
+
 #define psel(a, b, false_mask)                        \
   _mm512_castsi512_ps(_mm512_ternarylogic_epi32(      \
       _mm512_castps_si512(a), _mm512_castps_si512(b), \
diff --git a/tensorflow/core/kernels/fixed_length_record_reader_op.cc b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
index 008ea11017..637a6cef95 100644
--- a/tensorflow/core/kernels/fixed_length_record_reader_op.cc
+++ b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
@@ -40,8 +40,8 @@ class FixedLengthRecordReader : public ReaderBase {
 
   // On success:
   // * input_buffer_ != nullptr,
-  // * input_buffer_->Tell() == footer_bytes_
-  // * file_pos_limit_ == file size - header_bytes_
+  // * input_buffer_->Tell() == header_bytes_
+  // * file_pos_limit_ == file size - footer_bytes_
   Status OnWorkStartedLocked() override {
     record_number_ = 0;
     uint64 file_size = 0;
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
new file mode 100644
index 0000000000..93791851b1
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -0,0 +1,457 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+#ifdef INTEL_MKL
+
+#include <string.h>
+#include <map>
+#include <vector>
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#include "tensorflow/core/util/mkl_util.h"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T, bool biasEnabled>
+class MklConv2DOp : public OpKernel {
+ public:
+  ~MklConv2DOp() {}
+
+  explicit MklConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    string format_name;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &format_name));
+    OP_REQUIRES(context, FormatFromString(format_name, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES(context, strides_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    // Only the two spatial dimensions may carry a stride other than 1.
+    const bool unit_n = GetTensorDim(strides_, data_format_, 'N') == 1;
+    const bool unit_c = GetTensorDim(strides_, data_format_, 'C') == 1;
+    OP_REQUIRES(
+        context, unit_n && unit_c,
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  // Validates input/filter (and bias) shapes, builds an MKL forward
+  // convolution primitive for them, and runs it into an MKL-layout output.
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = MklGetInput(context, 0);
+    GetMklShape(context, 0, &(mkl_params_.input_shape));
+    bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
+
+    const Tensor& filter = MklGetInput(context, 1);
+    MklShape mkl_filter_shape;
+    GetMklShape(context, 1, &mkl_filter_shape);
+    CHECK(!mkl_filter_shape.IsMklTensor())
+        << "Conv filter should not be in MKL Layout";
+
+    if (biasEnabled) {
+      const Tensor& bias = MklGetInput(context, 2);
+      OP_REQUIRES(context, bias.dims() == 1,
+                  errors::InvalidArgument("bias must be 1-dimensional: ",
+                                          bias.shape().DebugString()));
+    }
+
+    if (!input_in_mkl_format) {
+      OP_REQUIRES(context, input.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional: ",
+                                          input.shape().DebugString()));
+    }
+
+    OP_REQUIRES(context, filter.dims() == 4,
+                errors::InvalidArgument("filter must be 4-dimensional: ",
+                                        filter.shape().DebugString()));
+
+    for (int i = 0; i < 3; i++) {
+      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
+                                           std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("filter too large"));
+    }
+
+    const int64 input_depth = input_in_mkl_format
+                                  ? mkl_params_.input_shape.GetSizes()[2]
+                                  : GetTensorDim(input, data_format_, 'C');
+    OP_REQUIRES(
+        context, input_depth == filter.dim_size(2),
+        errors::InvalidArgument("input and filter must have the same depth: ",
+                                input_depth, " vs ", filter.dim_size(2)));
+    // The last dimension for filter is out_depth.
+    const int out_depth = static_cast<int>(filter.dim_size(3));
+
+    // The second dimension for input is rows/height.
+    // The first dimension for filter is rows/height.
+    const int64 input_rows_raw = input_in_mkl_format
+                                     ? mkl_params_.input_shape.GetSizes()[1]
+                                     : GetTensorDim(input, data_format_, 'H');
+    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("Input rows too large"));
+    const int input_rows = static_cast<int>(input_rows_raw);
+    const int filter_rows = static_cast<int>(filter.dim_size(0));
+
+    // The third dimension for input is columns/width.
+    // The second dimension for filter is columns/width.
+    const int64 input_cols_raw = input_in_mkl_format
+                                     ? mkl_params_.input_shape.GetSizes()[0]
+                                     : GetTensorDim(input, data_format_, 'W');
+    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("Input cols too large"));
+    const int input_cols = static_cast<int>(input_cols_raw);
+    const int filter_cols = static_cast<int>(filter.dim_size(1));
+
+    // The first dimension for input is batch.
+    const int64 input_batch_raw = input_in_mkl_format
+                                      ? mkl_params_.input_shape.GetSizes()[3]
+                                      : GetTensorDim(input, data_format_, 'N');
+    OP_REQUIRES(context, FastBoundsCheck(input_batch_raw,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("batch is too large"));
+    const int batch = static_cast<int>(input_batch_raw);
+
+    // For now we take the stride from the second and third dimensions only (we
+    // do not support striding on the batch or depth dimension).
+    const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
+    const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
+
+    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
+                                         padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
+                                         padding_, &out_cols, &pad_cols));
+    TensorShape out_shape =
+        ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
+
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    Tensor* output = nullptr;
+
+    // Nothing to compute (batch == 0 also lands here, since it makes
+    // out_shape empty). Still allocate output 0, as a plain TF-layout tensor.
+    if (out_shape.num_elements() == 0) {
+      // TODO(jbobba): Verify correctness here
+      //               Need semantics for Null MKL tensor
+      MklShape mkl_output_mkl_shape;
+      mkl_output_mkl_shape.SetMklTensor(false);
+      AllocateOutputSetMklshape(context, 0, &output, out_shape,
+                                mkl_output_mkl_shape);
+      return;
+    }
+
+    // NOTE(review): the primitive, layouts and mkl_params_ are members written
+    // on every call -- confirm Compute() is never run concurrently on one op.
+    // Create MKL convolution primitives
+    mkl_params_.in_dims = input_in_mkl_format
+                              ? mkl_params_.input_shape.GetDimension()
+                              : input.dims();
+    mkl_params_.filter_dims = filter.dims();
+    mkl_params_.in_sizes[0] = static_cast<size_t>(input_cols);
+    mkl_params_.in_sizes[1] = static_cast<size_t>(input_rows);
+    mkl_params_.in_sizes[2] = static_cast<size_t>(input_depth);
+    mkl_params_.in_sizes[3] = static_cast<size_t>(batch);
+    mkl_params_.out_sizes[0] = static_cast<size_t>(out_cols);
+    mkl_params_.out_sizes[1] = static_cast<size_t>(out_rows);
+    mkl_params_.out_sizes[2] = static_cast<size_t>(out_depth);
+    mkl_params_.out_sizes[3] = static_cast<size_t>(batch);
+    mkl_params_.input_offset[0] = static_cast<int>(-pad_cols);
+    mkl_params_.input_offset[1] = static_cast<int>(-pad_rows);
+    mkl_params_.conv_stride[0] = static_cast<size_t>(stride_cols);
+    mkl_params_.conv_stride[1] = static_cast<size_t>(stride_rows);
+
+    GetStridesFromSizes(data_format_, mkl_params_.out_strides,
+                        mkl_params_.out_sizes);
+    GetStridesFromSizes(data_format_, mkl_params_.in_strides,
+                        mkl_params_.in_sizes);
+
+    // TF filter dims are (rows, cols, in_depth, out_depth); the MKL size and
+    // stride arrays are indexed (cols, rows, in_depth, out_depth).
+    mkl_params_.filter_sizes[0] = filter.dim_size(1);  // cols
+    mkl_params_.filter_sizes[1] = filter.dim_size(0);  // rows
+    mkl_params_.filter_sizes[2] = filter.dim_size(2);  // in_depth
+    mkl_params_.filter_sizes[3] = filter.dim_size(3);  // out_depth
+
+    // TF filter layout - (rows, cols, in_depth, out_depth)
+    mkl_params_.filter_strides[0] =
+        filter.dim_size(2) * filter.dim_size(3);  // cols
+    mkl_params_.filter_strides[1] =
+        filter.dim_size(1) * filter.dim_size(2) * filter.dim_size(3);  // rows
+    mkl_params_.filter_strides[2] = filter.dim_size(3);  // in_depth
+    mkl_params_.filter_strides[3] = 1;                   // out_depth
+
+    if (biasEnabled) {
+      const Tensor& bias = MklGetInput(context, 2);
+      mkl_params_.bias_sizes[0] = {static_cast<size_t>(bias.dim_size(0))};
+      mkl_params_.bias_strides[0] = {1};
+    }
+
+    // Create Convolution Primitive
+    if (biasEnabled) {
+      CHECK_EQ(dnnConvolutionCreateForwardBias_F32(
+                   &mkl_prim_convolution_fwd_, nullptr,
+                   dnnAlgorithmConvolutionDirect, mkl_params_.in_dims,
+                   mkl_params_.in_sizes, mkl_params_.out_sizes,
+                   mkl_params_.filter_sizes, mkl_params_.conv_stride,
+                   mkl_params_.input_offset, dnnBorderZeros),
+               E_SUCCESS);
+    } else {
+      CHECK_EQ(dnnConvolutionCreateForward_F32(
+                   &mkl_prim_convolution_fwd_, nullptr,
+                   dnnAlgorithmConvolutionDirect, mkl_params_.in_dims,
+                   mkl_params_.in_sizes, mkl_params_.out_sizes,
+                   mkl_params_.filter_sizes, mkl_params_.conv_stride,
+                   mkl_params_.input_offset, dnnBorderZeros),
+               E_SUCCESS);
+    }
+
+    TensorShape mkl_output_tf_shape;
+    MklShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(true);
+    mkl_output_mkl_shape.SetMklLayout(mkl_prim_convolution_fwd_,
+                                      dnnResourceDst);
+    mkl_output_mkl_shape.SetTfLayout(mkl_params_.in_dims, mkl_params_.out_sizes,
+                                     mkl_params_.out_strides);
+    mkl_output_tf_shape.AddDim(
+        dnnLayoutGetMemorySize_F32(
+            static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
+        sizeof(T));
+    AllocateOutputSetMklshape(context, 0, &output, mkl_output_tf_shape,
+                              mkl_output_mkl_shape);
+    mkl_conv_res_[dnnResourceDst] =
+        static_cast<void*>(output->flat<T>().data());
+
+    MklCreateInputLayouts(context);
+
+    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
+        mkl_tmp_bias_buf_tensor;  // Temp tensor used to allocate tmp
+                                  // buffers
+    MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
+                                &mkl_tmp_filter_buf_tensor,
+                                &mkl_tmp_bias_buf_tensor);
+
+    // Execute convolution
+    CHECK_EQ(dnnExecute_F32(mkl_prim_convolution_fwd_, mkl_conv_res_),
+             E_SUCCESS);
+
+    MklCleanup();
+  }
+
+ private:
+  struct MklConv2DOpParams {
+    int in_dims;
+    size_t in_sizes[4];       // (cols, rows, depth, batch)
+    size_t in_strides[4];
+    size_t out_sizes[4];      // (cols, rows, depth, batch)
+    size_t out_strides[4];
+    int filter_dims;
+    size_t filter_sizes[4];   // (cols, rows, in_depth, out_depth)
+    size_t filter_strides[4];
+    size_t bias_sizes[1];
+    size_t bias_strides[1];
+    int input_offset[2];      // (-pad_cols, -pad_rows)
+    size_t conv_stride[2];    // (stride_cols, stride_rows)
+    MklShape input_shape;
+  };
+
+  // Set up the dnnLayout_t for input, filter and (if biasEnabled) bias.
+  void MklCreateInputLayouts(OpKernelContext* context) {
+    bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
+    if (input_in_mkl_format) {  // Reuse the layout carried by the MklShape.
+      mkl_lt_input_ =
+          static_cast<dnnLayout_t>(mkl_params_.input_shape.GetCurLayout());
+    } else {  // Plain TF tensor: describe it from in_sizes / in_strides.
+      CHECK_EQ(
+          dnnLayoutCreate_F32(&mkl_lt_input_, mkl_params_.in_dims,
+                              mkl_params_.in_sizes, mkl_params_.in_strides),
+          E_SUCCESS);
+    }
+
+    CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_filter_, mkl_params_.filter_dims,
+                                 mkl_params_.filter_sizes,
+                                 mkl_params_.filter_strides),
+             E_SUCCESS);
+
+    if (biasEnabled) {  // Bias is described as a 1-dimensional layout.
+      CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_bias_, 1, mkl_params_.bias_sizes,
+                                   mkl_params_.bias_strides),
+               E_SUCCESS);
+    }
+  }
+
+  // For input, filter and (if enabled) bias: convert the data into the layout
+  // the primitive prefers when it differs, then record it in mkl_conv_res_.
+  void MklPrepareConvolutionInputs(OpKernelContext* context,
+                                   Tensor* mkl_tmp_input_buf_tensor,
+                                   Tensor* mkl_tmp_filter_buf_tensor,
+                                   Tensor* mkl_tmp_bias_buf_tensor) {
+    bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias;
+    dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias,
+        mkl_prim_convert_input;
+    dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias,
+        mkl_lt_internal_input;
+    void *mkl_buf_convert_input, *mkl_buf_convert_filter,
+         *mkl_buf_convert_bias;
+    mkl_prim_convert_filter = nullptr;
+    mkl_prim_convert_bias   = nullptr;
+    mkl_prim_convert_input  = nullptr;
+    mkl_lt_internal_filter  = nullptr;
+    mkl_lt_internal_bias    = nullptr;
+    mkl_lt_internal_input   = nullptr;
+    mkl_buf_convert_input   = nullptr;
+    mkl_buf_convert_filter  = nullptr;
+    mkl_buf_convert_bias    = nullptr;
+    // Compare with internal layouts and convert if needed. NOTE(review): the
+    // tmp tensors presumably keep the converted buffers alive -- confirm.
+    const Tensor& input = MklGetInput(context, 0);
+    void* mkl_buf_input =
+        const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+    CHECK_EQ(
+        dnnLayoutCreateFromPrimitive_F32(
+            &mkl_lt_internal_input, mkl_prim_convolution_fwd_, dnnResourceSrc),
+        E_SUCCESS);
+    mkl_convert_input =
+        !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input_);
+    if (mkl_convert_input) {
+      CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input_,
+                                       mkl_lt_internal_input),
+               E_SUCCESS);
+      AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
+                     &mkl_buf_convert_input);
+      CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
+                                        mkl_buf_convert_input),
+               E_SUCCESS);
+      dnnDelete_F32(mkl_prim_convert_input);  // Conversion is one-shot.
+    }
+    dnnLayoutDelete_F32(mkl_lt_internal_input);
+
+    mkl_conv_res_[dnnResourceSrc] =
+        (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
+
+    const Tensor& filter = MklGetInput(context, 1);
+    void* mkl_buf_filter =
+        const_cast<void*>(static_cast<const void*>(filter.flat<T>().data()));
+    CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_filter,
+                                              mkl_prim_convolution_fwd_,
+                                              dnnResourceFilter),
+             E_SUCCESS);
+    mkl_convert_filter =
+        !dnnLayoutCompare_F32(mkl_lt_internal_filter, mkl_lt_filter_);
+    if (mkl_convert_filter) {
+      CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, mkl_lt_filter_,
+                                       mkl_lt_internal_filter),
+               E_SUCCESS);
+      AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor, mkl_lt_internal_filter,
+                     &mkl_buf_convert_filter);
+      CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter,
+                                        mkl_buf_convert_filter),
+               E_SUCCESS);
+      dnnDelete_F32(mkl_prim_convert_filter);  // Conversion is one-shot.
+    }
+    dnnLayoutDelete_F32(mkl_lt_internal_filter);
+
+    mkl_conv_res_[dnnResourceFilter] =
+        (mkl_convert_filter) ? mkl_buf_convert_filter : mkl_buf_filter;
+
+    if (biasEnabled) {
+      const Tensor& bias = MklGetInput(context, 2);
+      void* mkl_buf_bias =
+          const_cast<void*>(static_cast<const void*>(bias.flat<T>().data()));
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_bias,
+                                                mkl_prim_convolution_fwd_,
+                                                dnnResourceBias),
+               E_SUCCESS);
+      mkl_convert_bias =
+          !dnnLayoutCompare_F32(mkl_lt_internal_bias, mkl_lt_bias_);
+      if (mkl_convert_bias) {
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_bias, mkl_lt_bias_,
+                                         mkl_lt_internal_bias),
+                 E_SUCCESS);
+        AllocTmpBuffer(context, mkl_tmp_bias_buf_tensor, mkl_lt_internal_bias,
+                       &mkl_buf_convert_bias);
+        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_bias, mkl_buf_bias,
+                                          mkl_buf_convert_bias),
+                 E_SUCCESS);
+        dnnDelete_F32(mkl_prim_convert_bias);  // Conversion is one-shot.
+      }
+      dnnLayoutDelete_F32(mkl_lt_internal_bias);
+
+      mkl_conv_res_[dnnResourceBias] =
+          (mkl_convert_bias) ? mkl_buf_convert_bias : mkl_buf_bias;
+    }
+  }
+
+  void MklCleanup() {
+    dnnDelete_F32(mkl_prim_convolution_fwd_);
+    // A layout borrowed from an MKL-format input is not ours to free.
+    if (!mkl_params_.input_shape.IsMklTensor()) {
+      dnnLayoutDelete_F32(mkl_lt_input_);
+    }
+    dnnLayoutDelete_F32(mkl_lt_filter_);
+    if (biasEnabled) dnnLayoutDelete_F32(mkl_lt_bias_);
+  }
+
+  std::vector<int32> strides_;
+  Padding padding_;
+  TensorFormat data_format_;
+
+  MklConv2DOpParams mkl_params_;
+  dnnPrimitive_t mkl_prim_convolution_fwd_ = nullptr;
+  void* mkl_conv_res_[dnnResourceNumber];
+  dnnLayout_t mkl_lt_filter_ = nullptr, mkl_lt_bias_ = nullptr;
+  dnnLayout_t mkl_lt_input_ = nullptr;
+};
+
+#define REGISTER_MKL_CPU(T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("MklConv2D").Device(DEVICE_CPU)                                 \
+      .TypeConstraint<T>("T")                                              \
+      .Label(mkl_layer_registry::kMklLayerLabel),                          \
+      MklConv2DOp<CPUDevice, T, false>);                                   \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("MklConv2DWithBias").Device(DEVICE_CPU)                         \
+      .TypeConstraint<T>("T")                                              \
+      .Label(mkl_layer_registry::kMklLayerLabel),                          \
+      MklConv2DOp<CPUDevice, T, true>);
+
+TF_CALL_float(REGISTER_MKL_CPU);
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
new file mode 100644
index 0000000000..5925a5b7c1
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@@ -0,0 +1,135 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include <vector>
+#include <algorithm>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/macros.h"
+
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+  typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// MklToTfOp converts a tensor held in MKL layout (input 0, described by its
+// MklShape metadata) into a plain TensorFlow tensor. An input that is not an
+// MKL tensor is forwarded to output 0 unchanged.
+
+template <typename Device, typename T>
+class MklToTfOp : public OpKernel {
+ public:
+  explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // 1. Fetch the input tensor and its MKL shape metadata.
+    const Tensor& input_tensor = MklGetInput(context, 0);
+    MklShape input_shape;
+    GetMklShape(context, 0, &input_shape);
+
+    // If the input is already in TF format, forward it to the output as is.
+    if (!input_shape.IsMklTensor()) {
+      context->set_output(0, input_tensor);
+      VLOG(1) << "MKLToTFConversion: No conversion needed, "
+              << "copying input to output";
+      return;
+    }
+
+    // 2. Check that the input data type matches the operator data type and
+    // the output data type.
+    DataType input_data_type  = input_type(0);
+    DataType output_data_type = output_type(0);
+    CHECK_EQ(op_data_type, input_data_type);
+    CHECK_EQ(op_data_type, output_data_type);
+
+    // Recreate the TF tensor shape from the MKL sizes and strides. Ideally we
+    // would use data_format, but that attribute of this op is not reliable,
+    // so we rely on sorting instead: dimensions are ordered by decreasing
+    // stride, with ties broken by decreasing size.
+    TensorShape output_shape;
+    std::vector<std::pair<int, int>> shape_size;
+    for (size_t i = 0; i < input_shape.GetDimension(); i++) {
+      VLOG(1) << "Size: " << input_shape.GetSizes()[i]
+              << ", Strides: " << input_shape.GetStrides()[i];
+      shape_size.push_back(std::make_pair(input_shape.GetSizes()[i],
+                                          input_shape.GetStrides()[i]));
+    }
+
+    std::sort(shape_size.begin(), shape_size.end(), [](
+      std::pair<int, int > a, std::pair<int, int> b) {
+      return (a.second > b.second) ||
+             (a.second == b.second && a.first > b.first);
+    });
+
+    for (std::pair<int, int> s_s : shape_size) {
+      VLOG(1) << "Added dimension: " << s_s.first;
+      output_shape.AddDim(s_s.first);
+    }
+
+    // Allocate the output tensor with the reconstructed shape.
+    Tensor* output_tensor = NULL;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, output_shape, &output_tensor));
+
+    // 3. Use the TF layout recorded in the MKL shape as the output layout.
+    dnnLayout_t output_layout = static_cast<dnnLayout_t>(
+                                  input_shape.GetTfLayout());
+
+    // 4. Execute the conversion from the input buffer into the output buffer.
+    void *input_buffer  = static_cast<void*>(const_cast<T*>(
+                              input_tensor.flat<T>().data()));
+    void *output_buffer = static_cast<void*>(const_cast<T*>(
+                              output_tensor->flat<T>().data()));
+    input_shape.GetConvertedFlatData(output_layout, input_buffer,
+                                     output_buffer);
+
+    VLOG(1) << "MKLToTFConversion complete successfully.";
+  }
+
+ private:
+    /// Value of the "data_format" attribute (not read by Compute).
+    string data_format_str;
+
+    /// Value of the "T" attribute; checked against the input/output types.
+    DataType op_data_type;
+};
+
+// Registers MklToTfOp as the CPU kernel for the "MklToTf" op, constrained on
+// type attribute "T" and labelled with mkl_layer_registry::kMklLayerLabel.
+// Only float is instantiated (TF_CALL_float below).
+
+#define REGISTER_CPU(T) \
+  REGISTER_KERNEL_BUILDER( \
+    Name("MklToTf").Device(DEVICE_CPU).TypeConstraint<T>("T") \
+    .Label(mkl_layer_registry::kMklLayerLabel), \
+    MklToTfOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_CPU);
+#undef REGISTER_CPU
+}  // namespace tensorflow
+#endif /* INTEL_MKL */
diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc
new file mode 100644
index 0000000000..c00674d72f
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_transpose_op.cc
@@ -0,0 +1,67 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/transpose_op.h"
+#include "tensorflow/core/kernels/transpose_functor.h"
+#include "third_party/mkl/include/mkl_trans.h"
+
+namespace tensorflow {
+
+// output = TransposeOp(T<any> input, T<int32> perm) takes a tensor
+// of type T and rank N, and a permutation of 0, 1, ..., N-1. It
+// shuffles the dimensions of the input tensor according to permutation.
+//
+// Specifically, the returned tensor output meets the following condition:
+// 1) output.dims() == input.dims();
+// 2) output.dim_size(i) == input.dim_size(perm[i]);
+// 3) output.tensor<T, N>(i_0, i_1, ..., i_N-1) ==
+//      input.tensor<T, N>(j_0, j_1, ..., j_N-1),
+//    where i_s == j_{perm[s]}
+//
+// REQUIRES: perm is a vector of int32.
+// REQUIRES: input.dims() == perm.size().
+// REQUIRES: perm is a permutation.
+
+Status MklTransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
+                                      gtl::ArraySlice<int32> perm,
+                                      Tensor* out) {
+  // MKL fast path: a real 2-D float transpose. mkl_somatcopy('T') always swaps
+  // the two axes, so an identity perm must take the generic path below.
+  if (in.dims() == 2 && in.dtype() == DT_FLOAT && perm[0] == 1) {
+    float* user_o = out->flat<float>().data();
+    const float* user_i = in.flat<float>().data();
+    // Documentation here: https://software.intel.com/en-us/node/520863
+    // Parameters: (ordering:row-major, operation:transpose, num_rows, num_cols,
+    //              alpha (for scaling), array, dist_bet_adjacent_cols/rows
+    //              (source), array, dist_bet_adjacent_cols/rows (dest))
+    mkl_somatcopy('R', 'T', in.dim_size(0), in.dim_size(1), 1,
+                  user_i, in.dim_size(1),
+                  user_o, in.dim_size(0));
+    return Status::OK();
+  }
+
+  // Fall back to Eigen when the transpose is not handled by the MKL path.
+  typedef Eigen::ThreadPoolDevice CPUDevice;
+  return ::tensorflow::DoTranspose(ctx->eigen_device<CPUDevice>(), in, perm,
+                                   out);
+} // MklTransposeCpuOp::DoTranspose
+} // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index ddc9c9823b..3fe16c66b8 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -64,6 +64,8 @@ PoolParameters::PoolParameters(OpKernelContext* context,
     OP_REQUIRES_OK(
         context, GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
                                        padding, &out_width, &pad_cols));
+    pad_depth = 0;
+    out_depth = depth;
   } else {
     // Our current version of depthwise max pooling does not support
     // any padding, and expects the depth_window to equal the
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
index 9bfbe2a61a..f1627135c5 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
@@ -66,9 +66,7 @@ class ResizeNearestNeighborOp : public OpKernel {
           const int64 in_x =
               std::min(static_cast<int64>(floorf(x * st.width_scale)),
                        (st.in_width - 1));
-          for (int c = 0; c < st.channels; ++c) {
-            output_data(b, y, x, c) = input_data(b, in_y, in_x, c);
-          }
+          std::copy_n(&input_data(b, in_y, in_x, 0), st.channels, &output_data(b, y, x, 0));
         }
       }
     }
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_benchmark_test.cc b/tensorflow/core/kernels/resize_op_benchmark_test.cc
index 07cf653c2f..4d0805a737 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op_benchmark_test.cc
+++ b/tensorflow/core/kernels/resize_op_benchmark_test.cc
@@ -21,7 +21,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-static Graph* BM_ResizeNearestNeighbor(int batches, int width, int height) {
+static Graph* BM_Resize(const char* algorithm,
+                        int batches, int width, int height) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor in(DT_FLOAT, TensorShape({batches, width, height, 3}));
   in.flat<float>().setRandom();
@@ -32,21 +33,26 @@ static Graph* BM_ResizeNearestNeighbor(int batches, int width, int height) {
   out_size_flat(1) = height * 2;
 
   Node* ret;
-  NodeBuilder(g->NewName("n"), "ResizeNearestNeighbor")
-      .Input(test::graph::Constant(g, in))
-      .Input(test::graph::Constant(g, out_size))
-      .Finalize(g, &ret);
+  Status s = NodeBuilder(g->NewName("n"), algorithm)
+                 .Input(test::graph::Constant(g, in))
+                 .Input(test::graph::Constant(g, out_size))
+                 .Finalize(g, &ret);
+  assert(s.ok());
   return g;
 }
 
-#define BM_ResizeNearestNeighborDev(DEVICE, B, W, H)                           \
-  static void BM_ResizeNearestNeighbor_##DEVICE##_##B##_##W##_##H(int iters) { \
+#define BM_ResizeDev(DEVICE, ALGORITHM, B, W, H)                               \
+  static void BM_Resize_##ALGORITHM##_##DEVICE##_##B##_##W##_##H(int iters) {  \
     testing::ItemsProcessed(iters* B* W* H * 3);                               \
-    test::Benchmark(#DEVICE, BM_ResizeNearestNeighbor(B, W, H)).Run(iters);    \
+    test::Benchmark(#DEVICE, BM_Resize(#ALGORITHM, B, W, H)).Run(iters);       \
   }                                                                            \
-  BENCHMARK(BM_ResizeNearestNeighbor_##DEVICE##_##B##_##W##_##H)
+  BENCHMARK(BM_Resize_##ALGORITHM##_##DEVICE##_##B##_##W##_##H)
 
-BM_ResizeNearestNeighborDev(cpu, 1, 499, 499);
-BM_ResizeNearestNeighborDev(gpu, 1, 499, 499);
+BM_ResizeDev(cpu, ResizeNearestNeighbor, 10, 499, 499);
+BM_ResizeDev(gpu, ResizeNearestNeighbor, 10, 499, 499);
+
+BM_ResizeDev(cpu, ResizeBilinear, 10, 499, 499);
+BM_ResizeDev(gpu, ResizeBilinear, 10, 499, 499);
 
 }  // namespace tensorflow
+
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 4d303f0173..fb2ceb4a4a 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -180,6 +180,20 @@ Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
                                    out);
 }
 
+#ifdef INTEL_MKL  // MKL builds register MklTransposeCpuOp for CPU Transpose.
+#define REGISTER(T)                                           \
+  REGISTER_KERNEL_BUILDER(Name("Transpose")                   \
+                              .Device(DEVICE_CPU)             \
+                              .TypeConstraint<T>("T")         \
+                              .TypeConstraint<int32>("Tperm") \
+                              .HostMemory("perm"),            \
+                          MklTransposeCpuOp);
+TF_CALL_ALL_TYPES(REGISTER);
+REGISTER(bfloat16);
+#undef REGISTER
+
+#else  // INTEL_MKL
+
 #define REGISTER(T)                                           \
   REGISTER_KERNEL_BUILDER(Name("Transpose")                   \
                               .Device(DEVICE_CPU)             \
@@ -190,6 +204,7 @@ Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
 TF_CALL_ALL_TYPES(REGISTER)
 REGISTER(bfloat16);
 #undef REGISTER
+#endif  // INTEL_MKL
 
 #if GOOGLE_CUDA
 Status TransposeGpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h
index 5f40bcecc1..a69eecc2f8 100644
--- a/tensorflow/core/kernels/transpose_op.h
+++ b/tensorflow/core/kernels/transpose_op.h
@@ -41,6 +41,17 @@ class TransposeCpuOp : public TransposeOp {
                      gtl::ArraySlice<int32> perm, Tensor* out) override;
 };
 
+#ifdef INTEL_MKL
+class MklTransposeCpuOp : public TransposeOp {  // MKL-backed CPU transpose.
+ public:
+  explicit MklTransposeCpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
+
+ protected:
+  Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
+                     gtl::ArraySlice<int32> perm, Tensor* out) override;
+};
+#endif  // INTEL_MKL
+
 class TransposeGpuOp : public TransposeOp {
  public:
   explicit TransposeGpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index eee9961b28..e56b27b0c0 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -2502,4 +2502,45 @@ scale_after_normalization: A bool indicating whether the resulted tensor
   needs to be multiplied with gamma.
 )doc");
 
+#ifdef INTEL_MKL  // Each data tensor below is paired with a uint8 mkl_* tensor.
+REGISTER_OP("MklConv2D")
+    .Input("input: T")
+    .Input("mkl_input: uint8")
+    .Input("filter: T")
+    .Input("mkl_filter: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+MKL version of Conv2D
+)doc");
+
+REGISTER_OP("MklConv2DWithBias")
+    .Input("input: T")
+    .Input("mkl_input: uint8")
+    .Input("filter: T")
+    .Input("mkl_filter: uint8")
+    .Input("bias: T")
+    .Input("mkl_bias: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString());  // NOTE(review): no SetShapeFn/Doc.
+
+REGISTER_OP("MklToTf")
+    .Input("input: T")
+    .Input("mkl_input: uint8")
+    .Output("output: T")
+    .Attr("T: {half, float, double}")
+    .Attr(GetConvnetDataFormatAttrString());  // NOTE(review): no SetShapeFn/Doc.
+#endif  // INTEL_MKL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index d17b52306d..aa2177dba4 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -25759,6 +25759,59 @@ op {
   description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
 }
 op {
+  name: "UnsortedSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    description: "A tensor whose shape is a prefix of `data.shape`."
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Computes the max along segments of a tensor."
+  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\n  range of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
+}
+op {
   name: "Unstage"
   output_arg {
     name: "values"
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 5db8b68048..f21a646ca1 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -4,11 +4,6 @@ load("@protobuf//:protobuf.bzl", "cc_proto_library")
 load("@protobuf//:protobuf.bzl", "py_proto_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")
 
-# configure may change the following lines
-WITH_GCP_SUPPORT = False
-WITH_HDFS_SUPPORT = False
-WITH_JEMALLOC = True
-
 # Appends a suffix to a list of deps.
 def tf_deps(deps, suffix):
   tf_deps = []
@@ -196,61 +191,54 @@ def tf_additional_test_srcs():
 def tf_kernel_tests_linkstatic():
   return 0
 
-# jemalloc only enabled on Linux for now.
-# TODO(jhseu): Enable on other platforms.
 def tf_additional_lib_defines():
-  defines = []
-  if WITH_JEMALLOC:
-    defines += select({
-        "//tensorflow:linux_x86_64": [
-            "TENSORFLOW_USE_JEMALLOC"
-        ],
-        "//conditions:default": [],
-    })
-  return defines
+  return select({
+      "//tensorflow:with_jemalloc": ["TENSORFLOW_USE_JEMALLOC"],
+      "//conditions:default": [],
+  })
 
 def tf_additional_lib_deps():
-  deps = []
-  if WITH_JEMALLOC:
-    deps += select({
-        "//tensorflow:linux_x86_64": ["@jemalloc"],
-        "//conditions:default": [],
-    })
-  return deps
+  return select({
+      "//tensorflow:with_jemalloc": ["@jemalloc"],
+      "//conditions:default": [],
+  })
 
 def tf_additional_core_deps():
-  deps = []
-  if WITH_GCP_SUPPORT:
-    deps.append("//tensorflow/core/platform/cloud:gcs_file_system")
-  if WITH_HDFS_SUPPORT:
-    deps.append("//tensorflow/core/platform/hadoop:hadoop_file_system")
-  return deps
+  return select({
+      "//tensorflow:with_gcp_support": [
+          "//tensorflow/core/platform/cloud:gcs_file_system",
+      ],
+      "//conditions:default": [],
+  }) + select({
+      "//tensorflow:with_hdfs_support": [
+          "//tensorflow/core/platform/hadoop:hadoop_file_system",
+      ],
+      "//conditions:default": [],
+  })
 
 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_op_deps():
-  deps = []
-  if WITH_GCP_SUPPORT:
-    deps = select({
+  return select({
       "//tensorflow:windows": [],
       "//tensorflow:android": [],
       "//tensorflow:ios": [],
-      "//conditions:default":
-        ["//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib"],
-    })
-  return deps
+      "//tensorflow:with_gcp_support": [
+        "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
+      ],
+      "//conditions:default": [],
+  })
 
 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_kernel_deps():
-  deps = []
-  if WITH_GCP_SUPPORT:
-    deps = select({
+  return select({
       "//tensorflow:windows": [],
       "//tensorflow:android": [],
       "//tensorflow:ios": [],
-      "//conditions:default":
-        ["//tensorflow/contrib/cloud/kernels:bigquery_reader_ops"],
-    })
-  return deps
+      "//tensorflow:with_gcp_support": [
+        "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
+      ],
+      "//conditions:default": [],
+  })
 
 def tf_lib_proto_parsing_deps():
   return [
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index 23a7b9065a..79f97c1234 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -2,8 +2,6 @@
 # The functions in this file might be referred by tensorflow.bzl. They have to
 # be separate to avoid cyclic references.
 
-WITH_XLA_SUPPORT = False
-
 def tf_cuda_tests_tags():
   return ["local"]
 
@@ -11,16 +9,16 @@ def tf_sycl_tests_tags():
   return ["local"]
 
 def tf_additional_plugin_deps():
-  deps = []
-  if WITH_XLA_SUPPORT:
-    deps.append("//tensorflow/compiler/jit")
-  return deps
+  return select({
+      "//tensorflow:with_xla_support": ["//tensorflow/compiler/jit"],
+      "//conditions:default": [],
+  })
 
 def tf_additional_xla_deps_py():
   return []
 
 def tf_additional_license_deps():
-  licenses = []
-  if WITH_XLA_SUPPORT:
-    licenses.append("@llvm//:LICENSE.TXT")
-  return licenses
+  return select({
+      "//tensorflow:with_xla_support": ["@llvm//:LICENSE.TXT"],
+      "//conditions:default": [],
+  })
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 1d0c9dc8cd..66bda85b2f 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -58,6 +58,7 @@ class LibHDFS {
   std::function<hdfsFS(hdfsBuilder*)> hdfsBuilderConnect;
   std::function<hdfsBuilder*()> hdfsNewBuilder;
   std::function<void(hdfsBuilder*, const char*)> hdfsBuilderSetNameNode;
+  std::function<int(const char*, char**)> hdfsConfGetStr;
   std::function<void(hdfsBuilder*, const char* kerbTicketCachePath)>
       hdfsBuilderSetKerbTicketCachePath;
   std::function<int(hdfsFS, hdfsFile)> hdfsCloseFile;
@@ -85,6 +86,7 @@ class LibHDFS {
       BIND_HDFS_FUNC(hdfsBuilderConnect);
       BIND_HDFS_FUNC(hdfsNewBuilder);
       BIND_HDFS_FUNC(hdfsBuilderSetNameNode);
+      BIND_HDFS_FUNC(hdfsConfGetStr);
       BIND_HDFS_FUNC(hdfsBuilderSetKerbTicketCachePath);
       BIND_HDFS_FUNC(hdfsCloseFile);
       BIND_HDFS_FUNC(hdfsPread);
@@ -147,6 +149,18 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
   hdfsBuilder* builder = hdfs_->hdfsNewBuilder();
   if (scheme == "file") {
     hdfs_->hdfsBuilderSetNameNode(builder, nullptr);
+  } else if (scheme == "viewfs") {
+    char *defaultFS = NULL;
+    hdfs_->hdfsConfGetStr("fs.defaultFS", &defaultFS);
+    StringPiece defaultScheme, defaultCluster, defaultPath;
+    io::ParseURI(defaultFS, &defaultScheme, &defaultCluster, &defaultPath);
+
+    if (scheme != defaultScheme || namenode != defaultCluster) {
+      return errors::Unimplemented("viewfs is only supported as a fs.defaultFS.");
+    }
+    // The default NameNode configuration will be used (from the XML configuration files). See:
+    // https://github.com/tensorflow/tensorflow/blob/v1.0.0/third_party/hadoop/hdfs.h#L259
+    hdfs_->hdfsBuilderSetNameNode(builder, "default");
   } else {
     hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str());
   }
@@ -478,5 +492,6 @@ Status HadoopFileSystem::Stat(const string& fname, FileStatistics* stats) {
 }
 
 REGISTER_FILE_SYSTEM("hdfs", HadoopFileSystem);
+REGISTER_FILE_SYSTEM("viewfs", HadoopFileSystem);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index aad35890af..b6fb18bd99 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -53,6 +53,17 @@ limitations under the License.
 #define TF_SCANF_ATTRIBUTE(string_index, first_to_check)
 #endif
 
+// Control visibility outside .so
+#if defined(COMPILER_MSVC)
+# ifdef TF_COMPILE_LIBRARY
+#  define TF_EXPORT __declspec(dllexport)
+# else
+#  define TF_EXPORT __declspec(dllimport)
+# endif   // TF_COMPILE_LIBRARY
+#else
+# define TF_EXPORT __attribute__((visibility("default")))
+#endif  // COMPILER_MSVC
+
 // GCC can be told that a certain branch is not likely to be taken (for
 // instance, a CHECK failure), and use that information in static analysis.
 // Giving it this information can help it optimize for the common case in
diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index 77a1946e61..d6e78dbc8f 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -16,6 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
 #define TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
 
+// included so __cpuidex function is available for GETCPUID on Windows
+#include <intrin.h>
+
 // Byte order defines provided by gcc. MSVC doesn't define those so
 // we define them here.
 // We assume that all windows platform out there are little endian.
diff --git a/tensorflow/core/platform/windows/intrinsics_port.h b/tensorflow/core/platform/windows/intrinsics_port.h
index a4fa1e9971..e52f5b1646 100644
--- a/tensorflow/core/platform/windows/intrinsics_port.h
+++ b/tensorflow/core/platform/windows/intrinsics_port.h
@@ -24,6 +24,9 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 #define _mm_load_pd1 _mm_load1_pd
+
+// only define these intrinsics if immintrin.h doesn't have them (VS2015 and earlier)
+#if _MSC_VER < 1910
 static inline int
 _mm256_extract_epi32(__m256i a, const int i)
 {
@@ -39,3 +42,4 @@ _mm256_insert_epi32(__m256i a, int b, const int i)
 }
 #endif
 #endif
+#endif
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index facadc7f57..72e7e06e65 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -230,11 +230,9 @@ Status WindowsFileSystem::NewRandomAccessFile(
   result->reset();
 
   // Open the file for read-only random access
-  // Random access is to disable read-ahead as the system reads too much data
   // Open in async mode which makes Windows allow more parallelism even
   // if we need to do sync I/O on top of it.
-  DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS |
-      FILE_FLAG_OVERLAPPED;
+  DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_OVERLAPPED;
   // Shared access is necessary for tests to pass
   // almost all tests would work with a possible exception of fault_injection.
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
@@ -306,8 +304,8 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
   result->reset();
   Status s = Status::OK();
 
-  // Open the file for read-only random access
-  DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS;
+  // Open the file for read-only
+  DWORD file_flags = FILE_ATTRIBUTE_READONLY;
 
   // Open in async mode which makes Windows allow more parallelism even
   // if we need to do sync I/O on top of it.
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
new file mode 100644
index 0000000000..6d09995b51
--- /dev/null
+++ b/tensorflow/core/util/mkl_util.h
@@ -0,0 +1,296 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
+#define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
+#ifdef INTEL_MKL
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "third_party/mkl/include/mkl_service.h"
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+
+// The file contains a number of utility classes and functions used by MKL
+// enabled kernels
+
+namespace tensorflow {
+
+// This class encapsulates all the meta data that is associated with an MKL
+// tensor. A tensor is an MKL tensor if it was created as the result of an
+// MKL operation, and did not go through a conversion to a standard
+// Tensorflow tensor.
+
+class MklShape {
+ public:
+  MklShape() {}
+  TF_DISALLOW_COPY_AND_ASSIGN(MklShape);  // Cannot copy
+
+  ~MklShape() {
+    delete[] sizes_;  // delete[] of a null pointer is a no-op
+    delete[] strides_;
+    // The MKL layouts are released only if they were actually created.
+    if (mklLayout_) CHECK_EQ(dnnLayoutDelete_F32(mklLayout_), E_SUCCESS);
+    if (tfLayout_) CHECK_EQ(dnnLayoutDelete_F32(tfLayout_), E_SUCCESS); }
+
+  const bool IsMklTensor() const { return isMklTensor_; }
+
+  void SetMklTensor(const bool isMklTensor) { isMklTensor_ = isMklTensor; }
+
+  void SetMklLayout(const void* primitive, size_t resourceType) {
+    CHECK_EQ(
+        dnnLayoutCreateFromPrimitive_F32(&mklLayout_, (dnnPrimitive_t)primitive,
+                                         (dnnResourceType_t)resourceType),
+        E_SUCCESS);
+  }
+
+  void SetTfLayout(const size_t dimension, const size_t* sizes,
+                   const size_t* strides) {
+    dimension_ = dimension;
+    if (dimension > 0) {  // MKL doesn't support dimension 0
+      sizes_ = new size_t[dimension];
+      strides_ = new size_t[dimension];
+      // Keep private copies of the caller's arrays (size_t index: no sign mix).
+      for (size_t ii = 0; ii < dimension; ii++) {
+        sizes_[ii] = sizes[ii];
+        strides_[ii] = strides[ii];
+      }
+      CHECK_EQ(dnnLayoutCreate_F32(&tfLayout_, dimension, sizes, strides),
+               E_SUCCESS);
+    }
+  }
+
+  const dnnLayout_t GetMklLayout() const { return mklLayout_; }
+  const dnnLayout_t GetTfLayout() const { return tfLayout_; }
+  const dnnLayout_t GetCurLayout() const {
+    return isMklTensor_ ? mklLayout_ : tfLayout_;
+  }
+  size_t GetDimension() const { return dimension_; }
+  const size_t* GetSizes() const { return sizes_; }
+  const size_t* GetStrides() const { return strides_; }
+
+  // Converts the data in `input`, laid out in this tensor's current layout,
+  // into `targetLayout` and stores the result in `output`.
+  void GetConvertedFlatData(dnnLayout_t targetLayout, void* input,
+                            void* output) const {
+    dnnLayout_t sourceLayout = isMklTensor_ ? mklLayout_ : tfLayout_;
+
+    // Build a one-shot conversion primitive, run it, then release it.
+    dnnPrimitive_t conversion;
+    CHECK_EQ(dnnConversionCreate_F32(&conversion, sourceLayout, targetLayout),
+             E_SUCCESS);
+    CHECK_EQ(dnnConversionExecute_F32(conversion, input, output), E_SUCCESS);
+    CHECK_EQ(dnnDelete_F32(conversion), E_SUCCESS);
+  }
+
+// The following methods are used for serializing and de-serializing the
+// contents of the mklshape object.
+// The data is serialized in this order
+// isMklTensor_
+// dimension_
+// sizes
+// strides
+// mklLayout_
+// tfLayout_
+
+#define SIZE_OF_MKL_DNN_BUF \
+  (dnnLayoutSerializationBufferSize_F32())  // Size of buffer needed to
+                                            // serialize dnn_layout pointer
+
+// Size of buffer to hold the serialized object, the size is computed as follows
+// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes) + sizeof(strides)
+// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
+
+#define SIZE_OF_MKL_SERIAL_DATA(dims) \
+  (2 * sizeof(size_t) + 2 * (dims) * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
+
+// First we need to define some macro for offsets into the serial buffer where
+// different elements of Mklshape is written/read from
+
+#define IS_MKL_TENSOR_OFFSET 0
+// Location from start of buffer where isMklTensor_ is serialized
+#define DIMS_OFFSET \
+  (IS_MKL_TENSOR_OFFSET + sizeof(size_t))  // Location of dimension_
+#define SIZES_OFFSET(dims) \
+  (DIMS_OFFSET +           \
+  sizeof(size_t))  // Location of sizes. Note dim is not used here, left here
+                    // to make macros consistent.
+#define STRIDES_OFFSET(dims) \
+  (SIZES_OFFSET(dims) + (dims) * sizeof(size_t))  // Location of strides
+#define MKL_LAYOUT_OFFSET(dims) \
+  (STRIDES_OFFSET(dims) + (dims) * sizeof(size_t))  // Location of mklLayout_
+#define TF_LAYOUT_OFFSET(dims) \
+  (MKL_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF)  // Location of tfLayout_
+
+  // TODO(agramesh1) make sure to create a const to share with rewrite pass
+  // for min size of MKL metadata tensor.
+
+  void DeSerializeMklShape(const unsigned char* buf, size_t buf_size) {
+    // Make sure buffer holds at least isMklTensor_
+    CHECK(buf_size >= sizeof(size_t)) << "Bufsize too small in DeSerialize";
+    isMklTensor_ =
+        *reinterpret_cast<const size_t*>(buf + IS_MKL_TENSOR_OFFSET) != 0;
+    if (!isMklTensor_) return;  // The rest is only read for MKL tensors
+    // dimension_ is stored after the flag; bounds-check before reading it.
+    CHECK(buf_size >= 2 * sizeof(size_t)) << "Bufsize too small in DeSerialize";
+    dimension_ = *(reinterpret_cast<const size_t*>(buf + DIMS_OFFSET));
+    CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_))
+        << "Bufsize too small in DeSerialize";
+    sizes_ = new size_t[dimension_];
+    strides_ = new size_t[dimension_];
+    for (size_t i = 0; i < dimension_; i++) {
+      sizes_[i] =
+          reinterpret_cast<const size_t*>(buf + SIZES_OFFSET(dimension_))[i];
+      strides_[i] =
+          reinterpret_cast<const size_t*>(buf + STRIDES_OFFSET(dimension_))[i];
+    }
+    CHECK_EQ(dnnLayoutDeserialize_F32(&mklLayout_,
+                                      buf + MKL_LAYOUT_OFFSET(dimension_)),
+             E_SUCCESS);
+    CHECK_EQ(dnnLayoutDeserialize_F32(&tfLayout_,
+                                      buf + TF_LAYOUT_OFFSET(dimension_)),
+             E_SUCCESS);
+  }
+
+  // Writes the fields into buf at the *_OFFSET positions defined above.
+  void SerializeMklShape(unsigned char* buf, size_t buf_size) const {
+    CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_))
+        << "Bufsize too small to Serialize";
+    *reinterpret_cast<size_t*>(buf + IS_MKL_TENSOR_OFFSET) =
+        isMklTensor_ ? 1 : 0;
+    if (!isMklTensor_) return;  // Only the flag is written for non-MKL tensors
+    *(reinterpret_cast<size_t*>(buf + DIMS_OFFSET)) = dimension_;
+    // size_t index: dimension_ is unsigned, avoid a signed/unsigned compare.
+    for (size_t i = 0; i < dimension_; i++) {
+      reinterpret_cast<size_t*>(buf + SIZES_OFFSET(dimension_))[i] = sizes_[i];
+      reinterpret_cast<size_t*>(buf + STRIDES_OFFSET(dimension_))[i] =
+          strides_[i];
+    }
+    CHECK_EQ(dnnLayoutSerialize_F32(mklLayout_,
+                                    buf + MKL_LAYOUT_OFFSET(dimension_)),
+             E_SUCCESS);
+    CHECK_EQ(
+        dnnLayoutSerialize_F32(tfLayout_, buf + TF_LAYOUT_OFFSET(dimension_)),
+        E_SUCCESS);
+  }
+
+ private:
+  bool isMklTensor_ =
+      false;  // Flag to indicate if the tensor is an  MKL tensor or not
+  dnnLayout_t mklLayout_ = nullptr;  // Pointer to the MKL layout
+  dnnLayout_t tfLayout_ = nullptr;   // Pointer to layout of corresponding
+  // Tensorflow tensor, used when conversion from MKL to standard tensor
+  size_t dimension_ = 0;
+  size_t* sizes_ = nullptr;    // Required by MKL for conversions
+  size_t* strides_ = nullptr;  // Required by MKL for conversions
+};
+
+// Index of the nth input/output data tensor. Data tensors take the even
+// slots; each is followed by its metadata tensor in the next (odd) slot.
+inline int GetTensorDataIndex(int n) { return n * 2; }
+
+// Index of the metadata tensor that accompanies the nth input/output
+// tensor: the odd slot directly after that tensor's data slot.
+inline int GetTensorMetaDataIndex(int n) { return n * 2 + 1; }
+
+// Deserializes into mklshape the MKL metadata stored in the uint8 tensor
+// that accompanies the nth input.
+inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
+  const Tensor& meta_tensor = ctext->input(GetTensorMetaDataIndex(n));
+  auto meta = meta_tensor.flat<uint8>();
+  mklshape->DeSerializeMklShape(meta.data(), meta.size() * sizeof(uint8));
+}
+
+// Returns the data tensor (as opposed to its metadata tensor) of the nth input
+inline const Tensor& MklGetInput(OpKernelContext* ctext, int n) {
+  return ctext->input(GetTensorDataIndex(n));
+}
+
+// Allocates the nth data output with shape tfshape and a companion metadata
+// output into which mklshape is serialized.
+inline void AllocateOutputSetMklshape(OpKernelContext* ctext, int n,
+                                      Tensor** output,
+                                      const TensorShape& tfshape,
+                                      const MklShape& mklshape) {
+  // The metadata output is 1-D, sized to hold the serialized MklShape.
+  TensorShape meta_shape;
+  meta_shape.AddDim(SIZE_OF_MKL_SERIAL_DATA(mklshape.GetDimension()));
+  OP_REQUIRES_OK(
+      ctext, ctext->allocate_output(GetTensorDataIndex(n), tfshape, output));
+  Tensor* meta_tensor = nullptr;
+  OP_REQUIRES_OK(ctext, ctext->allocate_output(GetTensorMetaDataIndex(n),
+                                               meta_shape, &meta_tensor));
+  auto meta = meta_tensor->flat<uint8>();
+  mklshape.SerializeMklShape(meta.data(), meta.size() * sizeof(uint8));
+}
+
+// Allocates a temp float tensor big enough for the MKL layout lt_buff and
+// returns its data buffer through buf_out. Currently we only support F32,
+// will need to templatize if other types are added.
+inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
+                           dnnLayout_t lt_buff, void** buf_out) {
+  // Element count is bytes / sizeof(float) plus one extra element, which
+  // presumably covers a byte size that is not a multiple of sizeof(float).
+  const auto num_bytes = dnnLayoutGetMemorySize_F32(lt_buff);
+
+  TensorShape tf_shape;
+  tf_shape.AddDim(num_bytes / sizeof(float) + 1);
+  OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<float>::v(),
+                                                 tf_shape, tensor_out));
+  *buf_out = static_cast<void*>(tensor_out->flat<float>().data());
+}
+
+// Fills strides[0..3] from sizes[0..2]. FORMAT_NHWC gets its own stride
+// ordering; every other data_format uses the ordering in which index 0
+// has stride 1.
+inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides,
+                                const size_t* sizes) {
+  // MKL requires strides in NCHW
+  const bool is_nhwc = (data_format == FORMAT_NHWC);
+
+  strides[0] = is_nhwc ? sizes[2] : 1;
+  strides[1] = is_nhwc ? sizes[0] * sizes[2] : sizes[0];
+  strides[2] = is_nhwc ? 1 : sizes[0] * sizes[1];
+
+  // The last stride does not depend on the format.
+  strides[3] = sizes[0] * sizes[1] * sizes[2];
+}
+
+namespace mkl_layer_registry {
+
+static const char* kMklLayerLabel = "MklLayer";
+static const string kMklLayerLabelPattern = "label='MklLayer'";
+
+// Reports whether op_name is registered as an MKL layer op.
+//
+// Returns true if the string produced by KernelsRegisteredForOp(op_name)
+// contains kMklLayerLabelPattern, false otherwise.
+static inline bool IsMklLayer(const std::string& op_name) {
+  const string registered_kernels = KernelsRegisteredForOp(op_name);
+  return registered_kernels.find(kMklLayerLabelPattern) != string::npos;
+}
+
+}  // namespace mkl_layer_registry
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL
+#endif  // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index f95298d377..4fc4c2faa2 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -1056,7 +1056,7 @@ cuda_op_kernel.cu.o -I $TF_INC -fPIC -lcudart
 
 Note that if your CUDA libraries are not installed in `/usr/local/lib64`,
 you'll need to specify the path explicitly in the second (g++) command above.
-For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in 
+For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in
 `/usr/local/cuda-8.0`.
 
 ### Implement the gradient in Python {#implement-gradient}
@@ -1160,7 +1160,9 @@ for ZeroOut:
 ```
 
 `c->set_output(0, c->input(0));` declares that the first output's shape should
-be set to the first input's shape. There are a number of common shape functions
+be set to the first input's shape. If the output is selected by its index as in the above example, the second parameter of `set_output` should be a `ShapeHandle` object. You can create an empty `ShapeHandle` object by its default constructor. The `ShapeHandle` object for an input with index `idx` can be obtained by `c->input(idx)`.
+
+There are a number of common shape functions
 that apply to many ops, such as `shape_inference::UnchangedShape` which can be
 found in [common_shape_fns.h](https://www.tensorflow.org/code/tensorflow/core/framework/common_shape_fns.h) and used as follows:
 
@@ -1220,7 +1222,15 @@ particular dimension has a very specific value using `InferenceContext::Dim` and
 `InferenceContext::WithValue`; you can specify that an output dimension is the
 sum / product of two input dimensions using `InferenceContext::Add` and
 `InferenceContext::Multiply`. See the `InferenceContext` class for
-all of the various shape manipulations you can specify.
+all of the various shape manipulations you can specify. The following example sets
+the shape of the first output to (n, 3), where the first input has shape (n, ...):
+
+```c++
+.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+    c->set_output(0, c->Matrix(c->Dim(c->input(0), 0), 3));
+    return Status::OK();
+});
+```
 
 If you have a complicated shape function, you should consider adding a test for
 validating that various input shape combinations produce the expected output
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index ae0007359d..b71249de0a 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -374,7 +374,7 @@ estimator.fit(input_fn=input_fn, steps=1000)
 
 # Here we evaluate how well our model did. In a real example, we would want
 # to use a separate validation and testing data set to avoid overfitting.
-estimator.evaluate(input_fn=input_fn)
+print(estimator.evaluate(input_fn=input_fn))
 ```
 When run, it produces
 ```
diff --git a/tensorflow/docs_src/get_started/mnist/mechanics.md b/tensorflow/docs_src/get_started/mnist/mechanics.md
index afd9039017..b55a5c19ff 100644
--- a/tensorflow/docs_src/get_started/mnist/mechanics.md
+++ b/tensorflow/docs_src/get_started/mnist/mechanics.md
@@ -351,7 +351,7 @@ training.
 
 ```python
 if step % 100 == 0:
-    print 'Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration)
+    print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
 ```
 
 #### Visualize the Status
@@ -421,19 +421,19 @@ the training and test datasets.  The `do_eval()` function is called thrice, for
 the training, validation, and test datasets.
 
 ```python
-print 'Training Data Eval:'
+print('Training Data Eval:')
 do_eval(sess,
         eval_correct,
         images_placeholder,
         labels_placeholder,
         data_sets.train)
-print 'Validation Data Eval:'
+print('Validation Data Eval:')
 do_eval(sess,
         eval_correct,
         images_placeholder,
         labels_placeholder,
         data_sets.validation)
-print 'Test Data Eval:'
+print('Test Data Eval:')
 do_eval(sess,
         eval_correct,
         images_placeholder,
diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md
index a400d91654..fa8b6fb7f1 100644
--- a/tensorflow/docs_src/programmers_guide/faq.md
+++ b/tensorflow/docs_src/programmers_guide/faq.md
@@ -92,12 +92,12 @@ two following snippets of code are equivalent:
 # Using `Session.run()`.
 sess = tf.Session()
 c = tf.constant(5.0)
-print sess.run(c)
+print(sess.run(c))
 
 # Using `Tensor.eval()`.
 c = tf.constant(5.0)
 with tf.Session():
-  print c.eval()
+  print(c.eval())
 ```
 
 In the second example, the session acts as a
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index 9189618368..04bfca5f3b 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -144,6 +144,11 @@ specified list, of the variables in the graph.  The saver object provides
 methods to run these ops, specifying paths for the checkpoint files to write to
 or read from.
 
+Note that to restore a model checkpoint without a graph one must first import
+the graph from the meta graph file (typical extension is `.meta`). This is
+done with @{tf.train.import_meta_graph}, which in turn returns a `Saver` from
+which one can then perform a `restore`.
+
 ### Checkpoint Files
 
 Variables are saved in binary files that, roughly, contain a map from variable
diff --git a/tensorflow/docs_src/tutorials/linear.md b/tensorflow/docs_src/tutorials/linear.md
index 3569d47efd..30daf335bf 100644
--- a/tensorflow/docs_src/tutorials/linear.md
+++ b/tensorflow/docs_src/tutorials/linear.md
@@ -217,7 +217,7 @@ results = e.evaluate(input_fn=input_fn_test, steps=1)
 
 # Print the stats for the evaluation.
 for key in sorted(results):
-    print "%s: %s" % (key, results[key])
+    print("%s: %s" % (key, results[key]))
 ```
 
 ### Wide and deep learning
diff --git a/tensorflow/docs_src/tutorials/using_gpu.md b/tensorflow/docs_src/tutorials/using_gpu.md
index e4e342adfe..d64cdafdef 100644
--- a/tensorflow/docs_src/tutorials/using_gpu.md
+++ b/tensorflow/docs_src/tutorials/using_gpu.md
@@ -28,7 +28,7 @@ c = tf.matmul(a, b)
 # Creates a session with log_device_placement set to True.
 sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
 # Runs the op.
-print sess.run(c)
+print(sess.run(c))
 ```
 
 You should see the following output:
@@ -61,7 +61,7 @@ with tf.device('/cpu:0'):
 # Creates a session with log_device_placement set to True.
 sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
 # Runs the op.
-print sess.run(c)
+print(sess.run(c))
 ```
 
 You will see that now `a` and `b` are assigned to `cpu:0`.
@@ -131,7 +131,7 @@ with tf.device('/gpu:2'):
 # Creates a session with log_device_placement set to True.
 sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
 # Runs the op.
-print sess.run(c)
+print(sess.run(c))
 ```
 
 If the device you have specified does not exist, you will get
@@ -160,7 +160,7 @@ with tf.device('/gpu:2'):
 sess = tf.Session(config=tf.ConfigProto(
       allow_soft_placement=True, log_device_placement=True))
 # Runs the op.
-print sess.run(c)
+print(sess.run(c))
 ```
 
 ## Using multiple GPUs
@@ -182,7 +182,7 @@ with tf.device('/cpu:0'):
 # Creates a session with log_device_placement set to True.
 sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
 # Runs the op.
-print sess.run(sum)
+print(sess.run(sum))
 ```
 
 You will see the following output.
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index 079efb201e..471811ea1a 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -188,7 +188,7 @@ def input_fn(df):
   categorical_cols = {k: tf.SparseTensor(
       indices=[[i, 0] for i in range(df[k].size)],
       values=df[k].values,
-      shape=[df[k].size, 1])
+      dense_shape=[df[k].size, 1])
                       for k in CATEGORICAL_COLUMNS}
   # Merges the two dictionaries into one.
   feature_cols = dict(continuous_cols.items() + categorical_cols.items())
@@ -261,6 +261,8 @@ learned through the model training process we'll go through later.
 We'll do the similar trick to define the other categorical features:
 
 ```python
+race = tf.contrib.layers.sparse_column_with_hash_bucket("race", hash_bucket_size=100)
+marital_status = tf.contrib.layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100)
 relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
 workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
 occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
@@ -377,7 +379,7 @@ the labels of the holdout data:
 ```python
 results = m.evaluate(input_fn=eval_input_fn, steps=1)
 for key in sorted(results):
-    print "%s: %s" % (key, results[key])
+    print("%s: %s" % (key, results[key]))
 ```
 
 The first line of the output should be something like `accuracy: 0.83557522`,
diff --git a/tensorflow/docs_src/tutorials/wide_and_deep.md b/tensorflow/docs_src/tutorials/wide_and_deep.md
index b5e5981fe1..dd830eeca9 100644
--- a/tensorflow/docs_src/tutorials/wide_and_deep.md
+++ b/tensorflow/docs_src/tutorials/wide_and_deep.md
@@ -255,7 +255,7 @@ After reading in the data, you can train and evaluate the model:
 m.fit(input_fn=train_input_fn, steps=200)
 results = m.evaluate(input_fn=eval_input_fn, steps=1)
 for key in sorted(results):
-    print "%s: %s" % (key, results[key])
+    print("%s: %s" % (key, results[key]))
 ```
 
 The first line of the output should be something like `accuracy: 0.84429705`. We
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
index a95e93ce69..c1a893e9ee 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
@@ -432,7 +432,7 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
         // Everything else is 0, so just pick a suitable slider to push up when the
         // selected one goes down.
         if (adapter.items[lastOtherStyle] == slider) {
-          lastOtherStyle = lastOtherStyle + 1 % NUM_STYLES;
+          lastOtherStyle = (lastOtherStyle + 1) % NUM_STYLES;
         }
         adapter.items[lastOtherStyle].setValue(1.0f - value);
       }
diff --git a/tensorflow/examples/learn/README.md b/tensorflow/examples/learn/README.md
index b36986855f..37157fc296 100644
--- a/tensorflow/examples/learn/README.md
+++ b/tensorflow/examples/learn/README.md
@@ -1,7 +1,7 @@
 # TF Learn Examples
 
 Learn is a high-level API for TensorFlow that allows you to create,
-train, and use deep learning models easily. See the [Quickstart tutorial](../../g3doc/tutorials/tflearn/index.md)
+train, and use deep learning models easily. See the [Quickstart tutorial](https://www.tensorflow.org/get_started/tflearn)
 for an introduction to the API.
 
 To run most of these examples, you need to install the `scikit learn` library (`sudo pip install sklearn`).
diff --git a/tensorflow/examples/learn/boston.py b/tensorflow/examples/learn/boston.py
index 2986ff9106..19cfdee513 100644
--- a/tensorflow/examples/learn/boston.py
+++ b/tensorflow/examples/learn/boston.py
@@ -16,19 +16,22 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from sklearn import cross_validation
+
+from sklearn import datasets
+from sklearn import model_selection
 from sklearn import metrics
 from sklearn import preprocessing
+
 import tensorflow as tf
 
 
 def main(unused_argv):
   # Load dataset
-  boston = tf.contrib.learn.datasets.load_dataset('boston')
+  boston = datasets.load_boston()
   x, y = boston.data, boston.target
 
   # Split dataset into train / test
-  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
+  x_train, x_test, y_train, y_test = model_selection.train_test_split(
       x, y, test_size=0.2, random_state=42)
 
   # Scale data (training set) to 0 mean and unit standard deviation.
diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py
index 7b65eb521a..ec2aa9b573 100644
--- a/tensorflow/examples/learn/iris.py
+++ b/tensorflow/examples/learn/iris.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
+from sklearn import datasets
 from sklearn import metrics
 from sklearn import model_selection
 
@@ -26,7 +26,7 @@ import tensorflow as tf
 
 def main(unused_argv):
   # Load dataset.
-  iris = tf.contrib.learn.datasets.load_dataset('iris')
+  iris = datasets.load_iris()
   x_train, x_test, y_train, y_test = model_selection.train_test_split(
       iris.data, iris.target, test_size=0.2, random_state=42)
 
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index c3d00a11b9..7e10014c39 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -24,6 +24,7 @@ import numpy as np
 import pandas
 from sklearn import metrics
 import tensorflow as tf
+from tensorflow.contrib.layers.python.layers import encoders
 
 learn = tf.contrib.learn
 
@@ -37,7 +38,7 @@ n_words = 0
 def bag_of_words_model(features, target):
   """A bag-of-words model. Note it disregards the word order in the text."""
   target = tf.one_hot(target, 15, 1, 0)
-  features = tf.contrib.layers.bow_encoder(
+  features = encoders.bow_encoder(
       features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
   logits = tf.contrib.layers.fully_connected(features, 15, activation_fn=None)
   loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
diff --git a/tensorflow/examples/tutorials/deepdream/deepdream.ipynb b/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
index cbcc54ce3c..016b21cd12 100644
--- a/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
+++ b/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
@@ -278,7 +278,7 @@
     "            tensor = n.attr['value'].tensor\n",
     "            size = len(tensor.tensor_content)\n",
     "            if size > max_const_size:\n",
-    "                tensor.tensor_content = bytes(\"<stripped %d bytes>\"%size, 'utf-8')\n",
+    "                tensor.tensor_content = (\"<stripped %d bytes>\"%size).encode('utf-8')\n",
     "    return strip_def\n",
     "  \n",
     "def rename_nodes(graph_def, rename_func):\n",
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 25800c109e..f54a7c37a1 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -62,7 +62,7 @@ print('Data size', len(words))
 vocabulary_size = 50000
 
 
-def build_dataset(words):
+def build_dataset(words, vocabulary_size):
   count = [['UNK', -1]]
   count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
   dictionary = dict()
@@ -81,7 +81,7 @@ def build_dataset(words):
   reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
   return data, count, dictionary, reverse_dictionary
 
-data, count, dictionary, reverse_dictionary = build_dataset(words)
+data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)
 del words  # Hint to reduce memory.
 print('Most common words (+UNK)', count[:5])
 print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
@@ -181,7 +181,7 @@ with graph.as_default():
       valid_embeddings, normalized_embeddings, transpose_b=True)
 
   # Add variable initializer.
-  init = tf.initialize_all_variables()
+  init = tf.global_variables_initializer()
 
 # Step 5: Begin training.
 num_steps = 100001
diff --git a/tensorflow/go/genop/generate.sh b/tensorflow/go/genop/generate.sh
index b961f7200a..d791e39c40 100644
--- a/tensorflow/go/genop/generate.sh
+++ b/tensorflow/go/genop/generate.sh
@@ -20,11 +20,17 @@ go get github.com/golang/protobuf/proto
 go get github.com/golang/protobuf/protoc-gen-go
 
 cd $(dirname $0)
-TF_DIR=${GOPATH}/src/github.com/tensorflow/tensorflow
-PROTOC="${TF_DIR}/bazel-out/host/bin/external/protobuf/protoc"
+for g in $(echo $GOPATH | sed "s/:/ /g"); do
+    TF_DIR="${g}/src/github.com/tensorflow/tensorflow"
+    PROTOC="${TF_DIR}/bazel-out/host/bin/external/protobuf/protoc"
+    if [ -x "${PROTOC}" ]; then
+        break
+    fi
+done
 
 if [ ! -x "${PROTOC}" ]
 then
+  set +e
   PATH_PROTOC=$(which protoc)
   if [ ! -x "${PATH_PROTOC}" ]
   then
@@ -34,6 +40,7 @@ then
     exit 1
   fi
   PROTOC=$PATH_PROTOC
+  set -e
 fi
 
 # Ensure that protoc-gen-go is available in $PATH
diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md
index 26377ba0d2..20eb6a8265 100644
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@@ -110,7 +110,7 @@ libraries will need to be built from source.
     brew install swig
     ```
 
-3.  [Configure](https://www.tensorflow.org/get_started/os_setup#configure_the_installation)
+3.  [Configure](https://www.tensorflow.org/install/install_sources#configure_the_installation)
     (e.g., enable GPU support) and build:
 
     ```sh
@@ -120,8 +120,8 @@ libraries will need to be built from source.
       //tensorflow/java:libtensorflow_jni
     ```
 
-The JAR (`libtensorflow.jar`) and native library (`libtensorflow_jni.so`) will 
-be in `bazel-bin/tensorflow/java`.
+The JAR (`libtensorflow.jar`) and native library (`libtensorflow_jni.so` on Linux or `libtensorflow_jni.dylib` on OS X) will
+be in `bazel-bin/tensorflow/java`. To use these artifacts, follow steps 3 and 4 in the [quickstart](#quickstart) section to get your application up and running.
 
 ### Maven
 
diff --git a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
index c3938fe23f..b4591dd869 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
@@ -27,7 +27,8 @@ package org.tensorflow;
 public class SavedModelBundle implements AutoCloseable {
 
   /**
-   * Load a saved model from an export directory.
+   * Load a saved model from an export directory. The model that is being loaded should be created using
+   * the <a href="https://www.tensorflow.org/api_docs/python/tf/saved_model">Saved Model API</a>.
    *
    * @param exportDir the directory path containing a saved model.
    * @param tags the tags identifying the specific metagraphdef to load.
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
index efd6c81b30..692de2289d 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
@@ -172,8 +172,7 @@ public final class Tensor implements AutoCloseable {
    *
    * <p>Creates a Tensor with the provided shape of any type where the tensor's data has been
    * encoded into {@code data} as per the specification of the TensorFlow <a
-   * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C
-   * API</a>.
+   * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C API</a>.
    *
    * @param dataType the tensor datatype.
    * @param shape the tensor shape.
diff --git a/tensorflow/java/src/main/java/org/tensorflow/package-info.java b/tensorflow/java/src/main/java/org/tensorflow/package-info.java
index 3b7b8079f9..dd4859e1b1 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/package-info.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/package-info.java
@@ -19,8 +19,8 @@ limitations under the License.
  * <p><b>WARNING</b>: The API is currently experimental and is not covered by TensorFlow <a
  * href="https://www.tensorflow.org/programmers_guide/version_semantics">API stability
  * guarantees</a>. See <a
- * href="https://www.tensorflow.org/code/tensorflow/java/README.md">README.md</a>
- * for installation instructions.
+ * href="https://www.tensorflow.org/code/tensorflow/java/README.md">README.md</a> for installation
+ * instructions.
  *
  * <p>The <a
  * href="https://www.tensorflow.org/code/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java">LabelImage</a>
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index cc31555690..038dc4147a 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -711,14 +711,14 @@ class BaseSession(SessionInterface):
        # v is the numpy array [10, 20]
        # 'fetches' can be a list.
        v = session.run([a, b])
-       # v a Python list with 2 numpy arrays: the numpy array [10, 20] and the
+       # v is a Python list with 2 numpy arrays: the 1-D array [10, 20] and the
        # 1-D array [1.0, 2.0]
        # 'fetches' can be arbitrary lists, tuples, namedtuple, dicts:
        MyData = collections.namedtuple('MyData', ['a', 'b'])
        v = session.run({'k1': MyData(a, b), 'k2': [b, a]})
        # v is a dict with
-       # v['k1'] is a MyData namedtuple with 'a' the numpy array [10, 20] and
-       # 'b' the numpy array [1.0, 2.0]
+       # v['k1'] is a MyData namedtuple with 'a' (the numpy array [10, 20]) and
+       # 'b' (the numpy array [1.0, 2.0])
        # v['k2'] is a list with the numpy array [1.0, 2.0] and the numpy array
        # [10, 20].
     ```
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 84bcd8e701..952c4adbfa 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -15,6 +15,7 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 py_library(
     name = "debug_py",
@@ -33,11 +34,12 @@ py_library(
 py_library(
     name = "debug_pip",
     deps = [
-        ":debug_examples",
         ":debug_py",
         ":offline_analyzer",
         ":session_debug_testlib",
-    ],
+    ] + if_not_windows([
+        ":debug_examples",
+    ]),
 )
 
 py_library(
diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index bce7e30b68..71230ba000 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -84,9 +84,7 @@ class TensordotTest(test_lib.TestCase):
                                    b_ph: b,
                                    axes_ph: axes_value})
 
-  def test_no_partial_shape_inference(self):
-    # If one of the shapes is only partially defined, the output shape is
-    # unknown.
+  def test_partial_shape_inference(self):
     a = array_ops.placeholder(dtypes.float32)
     b = array_ops.placeholder(dtypes.float32)
     axes = ([1], [0])
@@ -95,13 +93,21 @@ class TensordotTest(test_lib.TestCase):
     a.set_shape([None, 2])
     b.set_shape([2, 3])
     output = math_ops.tensordot(a, b, axes)
-    self.assertEqual(output.get_shape().ndims, None)
+    output_shape = output.get_shape()
+    self.assertEqual(output_shape.ndims, 2)
+    output_shape = output_shape.as_list()
+    self.assertEqual(output_shape[0], None)
+    self.assertEqual(output_shape[1], 3)
     a = array_ops.placeholder(dtypes.float32)
     b = array_ops.placeholder(dtypes.float32)
     a.set_shape([2, 2])
     b.set_shape([2, None])
     output = math_ops.tensordot(a, b, axes)
-    self.assertEqual(output.get_shape().ndims, None)
+    output_shape = output.get_shape()
+    self.assertEqual(output_shape.ndims, 2)
+    output_shape = output_shape.as_list()
+    self.assertEqual(output_shape[0], 2)
+    self.assertEqual(output_shape[1], None)
 
 
 def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 2601c61c47..3e40423ad6 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -294,7 +294,7 @@ class AveragePooling2D(_Pooling2D):
     data_format: A string. The ordering of the dimensions in the inputs.
       `channels_last` (default) and `channels_first` are supported.
       `channels_last` corresponds to inputs with shape
-      `(batch, height, channels, width)` while `channels_first` corresponds to
+      `(batch, height, width, channels)` while `channels_first` corresponds to
       inputs with shape `(batch, channels, height, width)`.
     name: A string, the name of the layer.
   """
@@ -329,7 +329,7 @@ def average_pooling2d(inputs,
     data_format: A string. The ordering of the dimensions in the inputs.
       `channels_last` (default) and `channels_first` are supported.
       `channels_last` corresponds to inputs with shape
-      `(batch, height, channels, width)` while `channels_first` corresponds to
+      `(batch, height, width, channels)` while `channels_first` corresponds to
       inputs with shape `(batch, channels, height, width)`.
     name: A string, the name of the layer.
 
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index f9fd5d77c9..c4a27009c3 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -275,7 +275,7 @@ def exit(data, name=None):
 def switch(data, pred, dtype=None, name=None):
   """Forwards `data` to an output determined by `pred`.
 
-  If `pred` is true, the `data` input is forwared to the first output.
+  If `pred` is false, the `data` input is forwarded to the first output.
   Otherwise, the data goes to the second output.
 
   This op handles `Tensor`s and `IndexedSlices`.
@@ -323,7 +323,7 @@ def switch(data, pred, dtype=None, name=None):
 def _SwitchRefOrTensor(data, pred, name="Switch"):
   """Forwards `data` to an output determined by `pred`.
 
-  If `pred` is true, the `data` input is forwared to the first output.
+  If `pred` is false, the `data` input is forwarded to the first output.
   Otherwise, the data goes to the second output.
 
   This op handles `Tensor`s and `IndexedSlices`.
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 62072e1279..0a2d4e4792 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -90,22 +90,23 @@ def _is_tensor(x):
   return isinstance(x, (ops.Tensor, variables.Variable))
 
 
-def _ImageDimensions(image):
+def _ImageDimensions(image, rank):
   """Returns the dimensions of an image tensor.
 
   Args:
-    image: A 3-D Tensor of shape `[height, width, channels]`.
+    image: A Tensor of rank `rank`, e.g. `[height, width, channels]` for 3-D.
+    rank: The expected rank of the image
 
   Returns:
-    A list of `[height, width, channels]` corresponding to the dimensions of the
+    A list corresponding to the dimensions of the
     input image.  Dimensions that are statically known are python integers,
     otherwise they are integer scalar tensors.
   """
   if image.get_shape().is_fully_defined():
     return image.get_shape().as_list()
   else:
-    static_shape = image.get_shape().with_rank(3).as_list()
-    dynamic_shape = array_ops.unstack(array_ops.shape(image), 3)
+    static_shape = image.get_shape().with_rank(rank).as_list()
+    dynamic_shape = array_ops.unstack(array_ops.shape(image), rank)
     return [s if s is not None else d
             for s, d in zip(static_shape, dynamic_shape)]
 
@@ -144,22 +145,39 @@ def _Check3DImage(image, require_static=True):
     return []
 
 
-def _CheckAtLeast3DImage(image):
+def _CheckAtLeast3DImage(image, require_static=True):
   """Assert that we are working with properly shaped image.
 
   Args:
     image: >= 3-D Tensor of size [*, height, width, depth]
+    require_static: If `True`, requires that all dimensions of `image` are
+      known and non-zero.
 
   Raises:
     ValueError: if image.shape is not a [>= 3] vector.
+
+  Returns:
+    An empty list, if `image` has fully defined dimensions. Otherwise, a list
+    containing an assert op is returned.
   """
-  if not image.get_shape().is_fully_defined():
+  try:
+    if image.get_shape().ndims is None:
+      image_shape = image.get_shape().with_rank(3)
+    else:
+      image_shape = image.get_shape().with_rank_at_least(3)
+  except ValueError:
+    raise ValueError("'image' must be at least three-dimensional.")
+  if require_static and not image_shape.is_fully_defined():
     raise ValueError('\'image\' must be fully defined.')
-  if image.get_shape().ndims < 3:
-    raise ValueError('\'image\' must be at least three-dimensional.')
-  if not all(x > 0 for x in image.get_shape()):
+  if any(x == 0 for x in image_shape):
     raise ValueError('all dims of \'image.shape\' must be > 0: %s' %
-                     image.get_shape())
+                     image_shape)
+  if not image_shape.is_fully_defined():
+    return [check_ops.assert_positive(array_ops.shape(image),
+                                      ["all dims of 'image.shape' "
+                                       "must be > 0."])]
+  else:
+    return []
 
 
 def fix_image_flip_shape(image, result):
@@ -397,14 +415,18 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
   `target_height` by `target_width`.
 
   Args:
-    image: 3-D tensor with shape `[height, width, channels]`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
     offset_height: Number of rows of zeros to add on top.
     offset_width: Number of columns of zeros to add on the left.
     target_height: Height of output image.
     target_width: Width of output image.
 
   Returns:
-    3-D tensor of shape `[target_height, target_width, channels]`
+    If `image` was 4-D, a 4-D float Tensor of shape
+    `[batch, target_height, target_width, channels]`
+    If `image` was 3-D, a 3-D float Tensor of shape
+    `[target_height, target_width, channels]`
 
   Raises:
     ValueError: If the shape of `image` is incompatible with the `offset_*` or
@@ -414,9 +436,22 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
   image = ops.convert_to_tensor(image, name='image')
 
   assert_ops = []
-  assert_ops += _Check3DImage(image, require_static=False)
+  assert_ops += _CheckAtLeast3DImage(image, require_static=False)
+
+  is_batch = True
+  image_shape = image.get_shape()
+  if image_shape.ndims == 3:
+    is_batch = False
+    image = array_ops.expand_dims(image, 0)
+  elif image_shape.ndims is None:
+    is_batch = False
+    image = array_ops.expand_dims(image, 0)
+    image.set_shape([None] * 4)
+  elif image_shape.ndims != 4:
+    raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+
+  batch, height, width, depth = _ImageDimensions(image, rank=4)
 
-  height, width, depth = _ImageDimensions(image)
   after_padding_width = target_width - offset_width - width
   after_padding_height = target_height - offset_height - height
 
@@ -433,15 +468,18 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
   # Do not pad on the depth dimensions.
   paddings = array_ops.reshape(
       array_ops.stack([
-          offset_height, after_padding_height, offset_width,
+          0, 0, offset_height, after_padding_height, offset_width,
           after_padding_width, 0, 0
-      ]), [3, 2])
+      ]), [4, 2])
   padded = array_ops.pad(image, paddings)
 
   padded_shape = [None if _is_tensor(i) else i
-                  for i in [target_height, target_width, depth]]
+                  for i in [batch, target_height, target_width, depth]]
   padded.set_shape(padded_shape)
 
+  if not is_batch:
+    padded = array_ops.squeeze(padded, squeeze_dims=[0])
+
   return padded
 
 
@@ -455,7 +493,8 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
   `offset_height + target_height, offset_width + target_width`.
 
   Args:
-    image: 3-D tensor with shape `[height, width, channels]`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
     offset_height: Vertical coordinate of the top-left corner of the result in
                    the input.
     offset_width: Horizontal coordinate of the top-left corner of the result in
@@ -464,7 +503,10 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
     target_width: Width of the result.
 
   Returns:
-    3-D tensor of image with shape `[target_height, target_width, channels]`
+    If `image` was 4-D, a 4-D float Tensor of shape
+    `[batch, target_height, target_width, channels]`
+    If `image` was 3-D, a 3-D float Tensor of shape
+    `[target_height, target_width, channels]`
 
   Raises:
     ValueError: If the shape of `image` is incompatible with the `offset_*` or
@@ -474,9 +516,21 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
   image = ops.convert_to_tensor(image, name='image')
 
   assert_ops = []
-  assert_ops += _Check3DImage(image, require_static=False)
+  assert_ops += _CheckAtLeast3DImage(image, require_static=False)
+
+  is_batch = True
+  image_shape = image.get_shape()
+  if image_shape.ndims == 3:
+    is_batch = False
+    image = array_ops.expand_dims(image, 0)
+  elif image_shape.ndims is None:
+    is_batch = False
+    image = array_ops.expand_dims(image, 0)
+    image.set_shape([None] * 4)
+  elif image_shape.ndims != 4:
+    raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
-  height, width, depth = _ImageDimensions(image)
+  batch, height, width, depth = _ImageDimensions(image, rank=4)
 
   assert_ops += _assert(offset_width >= 0, ValueError,
                         'offset_width must be >= 0.')
@@ -493,13 +547,16 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
   image = control_flow_ops.with_dependencies(assert_ops, image)
 
   cropped = array_ops.slice(image,
-                            array_ops.stack([offset_height, offset_width, 0]),
-                            array_ops.stack([target_height, target_width, -1]))
+                            array_ops.stack([0, offset_height, offset_width, 0]),
+                            array_ops.stack([-1, target_height, target_width, -1]))
 
   cropped_shape = [None if _is_tensor(i) else i
-                   for i in [target_height, target_width, depth]]
+                   for i in [batch, target_height, target_width, depth]]
   cropped.set_shape(cropped_shape)
 
+  if not is_batch:
+    cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
+
   return cropped
 
 
@@ -516,7 +573,8 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
   dimension.
 
   Args:
-    image: 3-D tensor of shape `[height, width, channels]`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
     target_height: Target height.
     target_width: Target width.
 
@@ -524,13 +582,27 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
     ValueError: if `target_height` or `target_width` are zero or negative.
 
   Returns:
-    Cropped and/or padded image of shape
-    `[target_height, target_width, channels]`
+    Cropped and/or padded image.
+    If `image` was 4-D, a 4-D float Tensor of shape
+    `[batch, target_height, target_width, channels]`.
+    If `image` was 3-D, a 3-D float Tensor of shape
+    `[target_height, target_width, channels]`.
   """
   image = ops.convert_to_tensor(image, name='image')
+  image_shape = image.get_shape()
+  is_batch = True
+  if image_shape.ndims == 3:
+    is_batch = False
+    image = array_ops.expand_dims(image, 0)
+  elif image_shape.ndims is None:
+    is_batch = False
+    image = array_ops.expand_dims(image, 0)
+    image.set_shape([None] * 4)
+  elif image_shape.ndims != 4:
+    raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
   assert_ops = []
-  assert_ops += _Check3DImage(image, require_static=False)
+  assert_ops += _CheckAtLeast3DImage(image, require_static=False)
   assert_ops += _assert(target_width > 0, ValueError,
                         'target_width must be > 0.')
   assert_ops += _assert(target_height > 0, ValueError,
@@ -563,7 +635,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
     else:
       return x == y
 
-  height, width, _ = _ImageDimensions(image)
+  _, height, width, _ = _ImageDimensions(image, rank=4)
   width_diff = target_width - width
   offset_crop_width = max_(-width_diff // 2, 0)
   offset_pad_width = max_(width_diff // 2, 0)
@@ -585,7 +657,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
   if resized.get_shape().ndims is None:
     raise ValueError('resized contains no shape.')
 
-  resized_height, resized_width, _ = _ImageDimensions(resized)
+  _, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4)
 
   assert_ops = []
   assert_ops += _assert(equal_(resized_height, target_height), ValueError,
@@ -594,6 +666,10 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
                         'resized width is not correct.')
 
   resized = control_flow_ops.with_dependencies(assert_ops, resized)
+
+  if not is_batch:
+    resized = array_ops.squeeze(resized, squeeze_dims=[0])
+
   return resized
 
 
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index c8691f4eb8..799f7e4935 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -299,7 +299,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     return y_v.reshape(x_np.shape)
 
   def _adjustHueTf(self, x_np, delta_h):
-    with self.test_session(use_gpu=False):
+    with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_hue(x, delta_h)
       y_tf = y.eval()
@@ -1185,9 +1185,13 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     offset_height, offset_width = [0, 0]
     target_height, target_width = [2, 2]
 
-    for x_shape in ([1, 3, 5, 1], [3, 5]):
+    for x_shape in ([3, 5],):
+      self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
+                         target_width, "'image' must be at least three-dimensional.")
+
+    for x_shape in ([1, 3, 5, 1, 1],):
       self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
-                         target_width, "must be three-dimensional")
+                         target_width, "'image' must have either 3 or 4 dimensions.")
 
   def testZeroLengthInput(self):
     # Input image has 0-length dimension(s).
@@ -1430,9 +1434,13 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     offset_height, offset_width = [0, 0]
     target_height, target_width = [2, 2]
 
-    for x_shape in ([1, 3, 5, 1], [3, 5]):
+    for x_shape in ([3, 5],):
       self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
-                         target_width, "must be three-dimensional")
+                         target_width, "'image' must be at least three-dimensional")
+
+    for x_shape in ([1, 3, 5, 1, 1],):
+      self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
+                         target_width, "'image' must have either 3 or 4 dimensions.")
 
   def testZeroLengthInput(self):
     # Input image has 0-length dimension(s).
@@ -2220,9 +2228,13 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
     x = [0] * 15
     target_height, target_width = [4, 4]
 
-    for x_shape in ([1, 3, 5, 1], [3, 5]):
+    for x_shape in ([3, 5],):
+      self._assertRaises(x, x_shape, target_height, target_width,
+                         "'image' must have either 3 or 4 dimensions.")
+
+    for x_shape in ([1, 3, 5, 1, 1],):
       self._assertRaises(x, x_shape, target_height, target_width,
-                         "must be three-dimensional")
+                         "'image' must have either 3 or 4 dimensions.")
 
   def testZeroLengthInput(self):
     # Input image has 0-length dimension(s).
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index d3d954f33d..fe4a47b9ae 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -2298,12 +2298,14 @@ def tensordot(a, b, axes, name=None):
         assumes that `a` is the second argument in the contraction operation.
 
     Returns:
-      A pair `(reshaped_a, free_dims)` where `reshaped_a` is the tensor `a`
-      reshaped to allow contraction via `matmul` and `free_dims` is either a
-      list of integers or an `int32` `Tensor`, depending on if `axes` is a list
-      and the shape of `a`  is fully defined.
+      A tuple `(reshaped_a, free_dims, free_dims_static)` where `reshaped_a` is
+      the tensor `a` reshaped to allow contraction via `matmul`, `free_dims` is
+      either a list of integers or an `int32` `Tensor`, depending on whether
+      the shape of `a` is fully specified, and `free_dims_static` is either a
+      list of integers and `None` values, or `None`, representing the inferred
+      static shape of the free dimensions.
+
     """
-    # TODO(b/33084409): Implement partial shape inference.
     if a.get_shape().is_fully_defined() and isinstance(axes, (list, tuple)):
       shape_a = a.get_shape().as_list()
       axes = [i if i >= 0 else i + len(shape_a) for i in axes]
@@ -2314,8 +2316,15 @@ def tensordot(a, b, axes, name=None):
       perm = list(axes) + free if flipped else free + list(axes)
       new_shape = [prod_axes, prod_free] if flipped else [prod_free, prod_axes]
       reshaped_a = array_ops.reshape(array_ops.transpose(a, perm), new_shape)
-      return reshaped_a, free_dims
+      return reshaped_a, free_dims, free_dims
     else:
+      if a.get_shape().ndims is not None and isinstance(axes, (list, tuple)):
+        shape_a = a.get_shape().as_list()
+        axes = [i if i >= 0 else i + len(shape_a) for i in axes]
+        free = [i for i in xrange(len(shape_a)) if i not in axes]
+        free_dims_static = [shape_a[i] for i in free]
+      else:
+        free_dims_static = None
       shape_a = array_ops.shape(a)
       rank_a = array_ops.rank(a)
       axes = ops.convert_to_tensor(axes, dtype=dtypes.int32, name="axes")
@@ -2334,7 +2343,7 @@ def tensordot(a, b, axes, name=None):
         perm = array_ops.concat([free, axes], 0)
         new_shape = array_ops.stack([prod_free_dims, prod_axes_dims])
       reshaped_a = array_ops.reshape(array_ops.transpose(a, perm), new_shape)
-      return reshaped_a, free_dims
+      return reshaped_a, free_dims, free_dims_static
 
   def _tensordot_axes(a, axes):
     """Generates two sets of contraction axes for the two tensor arguments."""
@@ -2366,16 +2375,19 @@ def tensordot(a, b, axes, name=None):
     a = ops.convert_to_tensor(a, name="a")
     b = ops.convert_to_tensor(b, name="b")
     a_axes, b_axes = _tensordot_axes(a, axes)
-    a_reshape, a_free_dims = _tensordot_reshape(a, a_axes)
-    b_reshape, b_free_dims = _tensordot_reshape(b, b_axes, True)
+    a_reshape, a_free_dims, a_free_dims_static = _tensordot_reshape(a, a_axes)
+    b_reshape, b_free_dims, b_free_dims_static = _tensordot_reshape(b, b_axes, True)
     ab_matmul = matmul(a_reshape, b_reshape)
     if isinstance(a_free_dims, list) and isinstance(b_free_dims, list):
       return array_ops.reshape(ab_matmul, a_free_dims + b_free_dims, name=name)
     else:
-      a_free_dims = ops.convert_to_tensor(a_free_dims)
-      b_free_dims = ops.convert_to_tensor(b_free_dims)
-      return array_ops.reshape(
+      a_free_dims = ops.convert_to_tensor(a_free_dims, dtype=dtypes.int32)
+      b_free_dims = ops.convert_to_tensor(b_free_dims, dtype=dtypes.int32)
+      product = array_ops.reshape(
           ab_matmul, array_ops.concat([a_free_dims, b_free_dims], 0), name=name)
+      if a_free_dims_static is not None and b_free_dims_static is not None:
+        product.set_shape(a_free_dims_static + b_free_dims_static)
+      return product
 
 
 # FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index c267fb8ccd..bdb34dd78e 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -1473,7 +1473,7 @@ def false_negatives(labels, predictions, weights=None,
                     metrics_collections=None,
                     updates_collections=None,
                     name=None):
-  """Computes the total number of false positives.
+  """Computes the total number of false negatives.
 
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 51ec1c313b..4a8ac42161 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -278,7 +278,8 @@ def with_space_to_batch(
       For N=3, the valid values are "NDHWC" (default) and "NCDHW".
 
   Returns:
-    The output Tensor as described above.
+    The output Tensor as described above; its dimensions vary based on the op
+    provided.
 
   Raises:
     ValueError: if `padding` is invalid or the arguments are incompatible.
@@ -529,17 +530,16 @@ def convolution(input, filter,  # pylint: disable=redefined-builtin
   of N `strides` (defaulting [1]*N), this computes for each N-D spatial output
   position (x[0], ..., x[N-1]):
 
+  ```
     output[b, x[0], ..., x[N-1], k] =
-
         sum_{z[0], ..., z[N-1], q}
-
             filter[z[0], ..., z[N-1], q, k] *
             padded_input[b,
                          x[0]*strides[0] + dilation_rate[0]*z[0],
                          ...,
                          x[N-1]*strides[N-1] + dilation_rate[N-1]*z[N-1],
                          q]
-
+  ```
   where `padded_input` is obtained by zero padding the input using an effective
   spatial filter shape of `(spatial_filter_shape-1) * dilation_rate + 1` and
   output striding `strides` as described in the
@@ -682,6 +682,7 @@ def pool(input,  # pylint: disable=redefined-builtin
       0 <= x[i] < output_spatial_shape[i],
       0 <= c < num_channels:
 
+  ```
     output[b, x[0], ..., x[N-1], c] =
       REDUCE_{z[0], ..., z[N-1]}
         input[b,
@@ -689,6 +690,7 @@ def pool(input,  # pylint: disable=redefined-builtin
               ...
               x[N-1]*strides[N-1] - pad_before[N-1] + dilation_rate[N-1]*z[N-1],
               c],
+  ```
 
   where the reduction function REDUCE depends on the value of `pooling_type`,
   and pad_before is defined based on the value of `padding` as described in the
@@ -698,10 +700,12 @@ def pool(input,  # pylint: disable=redefined-builtin
   In the case that `data_format` starts with `"NC"`, the `input` and output are
   simply transposed as follows:
 
+  ```
     pool(input, data_format, **kwargs) =
       tf.transpose(pool(tf.transpose(input, [0] + range(2,N+2) + [1]),
                         **kwargs),
                    [0, N+1] + range(1, N+1))
+  ```
 
   Args:
     input: Tensor of rank N+2, of shape
@@ -740,6 +744,7 @@ def pool(input,  # pylint: disable=redefined-builtin
 
     If padding = "SAME":
       output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides[i])
+
     If padding = "VALID":
       output_spatial_shape[i] =
         ceil((input_spatial_shape[i] - (window_shape[i] - 1) * dilation_rate[i])
@@ -844,9 +849,14 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
 
   More specifically:
 
-      output[b, i, j, k] = sum_{di, dj, q} filters[di, dj, q, k] *
-            value[b, i + rate * di, j + rate * dj, q]
-
+  ```
+  output[batch, height, width, out_channel] =
+      sum_{dheight, dwidth, in_channel} (
+          filters[dheight, dwidth, in_channel, out_channel] * 
+          value[batch, height + rate * dheight, width + rate * dwidth, in_channel]
+      )
+  ```
+  
   Atrous convolution allows us to explicitly control how densely to compute
   feature responses in fully convolutional networks. Used in conjunction with
   bilinear interpolation, it offers an alternative to `conv2d_transpose` in
@@ -932,6 +942,14 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
 
   Returns:
     A `Tensor` with the same type as `value`.
+    Output shape with `'VALID'` padding is:
+
+        [batch, height - rate * (filter_height - 1),
+         width - rate * (filter_width - 1), out_channels].
+    
+    Output shape with `'SAME'` padding is:
+
+        [batch, height, width, out_channels].
 
   Raises:
     ValueError: If input/output depth does not match `filters`' shape, or if
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 076c6d41d9..c3dddf85f3 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Module implementing RNN Cells."""
+"""Module implementing RNN Cells.
+
+This module contains the abstract definition of an RNN cell: `_RNNCell`.
+Actual implementations of various types of RNN cells are located in
+`tensorflow.contrib`.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -72,10 +77,12 @@ def _zero_state_tensors(state_size, batch_size, dtype):
 class _RNNCell(object):
   """Abstract object representing an RNN cell.
 
-  The definition of cell in this package differs from the definition used in the
-  literature. In the literature, cell refers to an object with a single scalar
-  output. The definition in this package refers to a horizontal array of such
-  units.
+  Every `RNNCell` must have the properties below and implement `__call__` with
+  the following signature.
+
+  This definition of cell differs from the definition used in the literature.
+  In the literature, 'cell' refers to an object with a single scalar output.
+  This definition refers to a horizontal array of such units.
 
   An RNN cell, in the most abstract setting, is anything that has
   a state and performs some operation that takes a matrix of inputs.
@@ -84,13 +91,6 @@ class _RNNCell(object):
   state matrix with `self.state_size` columns.  If `self.state_size` is a
   tuple of integers, then it results in a tuple of `len(state_size)` state
   matrices, each with a column size corresponding to values in `state_size`.
-
-  This module provides a number of basic commonly used RNN cells, such as
-  LSTM (Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number
-  of operators that allow add dropouts, projections, or embeddings for inputs.
-  Constructing multi-layer cells is supported by the class `MultiRNNCell`,
-  or by calling the `rnn` ops several times. Every `RNNCell` must have the
-  properties below and implement `__call__` with the following signature.
   """
 
   def __call__(self, inputs, state, scope=None):
@@ -140,7 +140,7 @@ class _RNNCell(object):
 
       If `state_size` is a nested list or tuple, then the return value is
       a nested list or tuple (of the same structure) of `2-D` tensors with
-    the shapes `[batch_size x s]` for each s in `state_size`.
+      the shapes `[batch_size x s]` for each s in `state_size`.
     """
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
       state_size = self.state_size
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 70ecda1dda..335fd110e7 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -37,6 +37,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 
 # Determine whether we are in an interactive environment
+_interactive = False
 try:
   # This is only defined in interactive shells
   if _sys.ps1: _interactive = True
diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md
index b9addd4b68..0c21bb508f 100644
--- a/tensorflow/tensorboard/README.md
+++ b/tensorflow/tensorboard/README.md
@@ -91,7 +91,7 @@ produce a consistent history of what happened.
 ### Runs: Comparing different executions of your model
 
 You may want to visually compare multiple executions of your model; for example,
-suppose you've changed the hyperparameters and want to see if its converging
+suppose you've changed the hyperparameters and want to see if it's converging
 faster. TensorBoard enables this through different "runs". When TensorBoard is
 passed a `logdir` at startup, it recursively walks the directory tree rooted at
 `logdir` looking for subdirectories that contain tfevents data. Every time it
diff --git a/tensorflow/tensorboard/defs.bzl b/tensorflow/tensorboard/defs.bzl
index 7ad97f91f8..bae7078c5b 100644
--- a/tensorflow/tensorboard/defs.bzl
+++ b/tensorflow/tensorboard/defs.bzl
@@ -36,7 +36,7 @@ def tensorboard_typescript_genrule(name, srcs, typings=[], **kwargs):
   # data attribute won't be considered when --genrule_strategy=sandboxed. See
   # https://github.com/bazelbuild/bazel/issues/1147 and its linked issues.
   data = [
-      "@org_nodejs//:bin/node",
+      "@org_nodejs",
       "@com_microsoft_typescript",
   ]
   native.genrule(
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 471a2173aa..aebdfed837 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -124,6 +124,7 @@ def tf_copts():
                 "/DLANG_CXX11",
                 "/D__VERSION__=\\\"MSVC\\\"",
                 "/DPLATFORM_WINDOWS",
+                "/DTF_COMPILE_LIBRARY",
                 "/DEIGEN_HAS_C99_MATH",
                 "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
               ],
@@ -392,7 +393,10 @@ def tf_cc_tests(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
 
 def tf_cc_test_mkl(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
                     args=None):
-  tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)
+  # Pass linkstatic by keyword: the third positional parameter of tf_cc_tests
+  # is `name`, so a positional linkstatic would be silently misrouted.
+  if_mkl(tf_cc_tests(srcs, deps, linkstatic=linkstatic, tags=tags, size=size,
+                     args=args))
 
 def tf_cc_tests_gpu(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
                     args=None):
diff --git a/tensorflow/tools/benchmark/BUILD b/tensorflow/tools/benchmark/BUILD
index 3b1901fd56..a2ffca97ec 100644
--- a/tensorflow/tools/benchmark/BUILD
+++ b/tensorflow/tools/benchmark/BUILD
@@ -34,6 +34,7 @@ cc_library(
             "//tensorflow/core:lib",
             "//tensorflow/core:framework",
             "//tensorflow/core:framework_internal",
+            "//tensorflow/core:framework_lite",
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:tensorflow",
             "//tensorflow/core:test",
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index 0fcfaf747b..db2ac31baf 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -272,7 +273,11 @@ Status TimeMultipleRuns(double sleep_seconds, int num_runs,
     // This can be helpful to determine the effect of mobile processor
     // scaling and thermal throttling.
     if (sleep_seconds > 0.0) {
+#ifdef PLATFORM_WINDOWS
+      Sleep(sleep_seconds * 1000);
+#else
       nanosleep(&req, nullptr);
+#endif
     }
   }
   std::stringstream stream;
diff --git a/tensorflow/tools/ci_build/Dockerfile.android b/tensorflow/tools/ci_build/Dockerfile.android
index 887589bc93..4d46c672ab 100644
--- a/tensorflow/tools/ci_build/Dockerfile.android
+++ b/tensorflow/tools/ci_build/Dockerfile.android
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:14.04
 
 MAINTAINER Jan Prach <jendap@google.com>
 
@@ -10,9 +10,8 @@ RUN add-apt-repository -y ppa:openjdk-r/ppa && \
 RUN /install/install_deb_packages.sh
 RUN /install/install_bazel.sh
 
-# Set up bazelrc.
-COPY install/.bazelrc /root/.bazelrc
-ENV BAZELRC /root/.bazelrc
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
 
 # Install extra libraries for android sdk.
 RUN apt-get update && apt-get install -y \
diff --git a/tensorflow/tools/ci_build/Dockerfile.cmake b/tensorflow/tools/ci_build/Dockerfile.cmake
index 8a28fe6cdf..22eaf11b91 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cmake
+++ b/tensorflow/tools/ci_build/Dockerfile.cmake
@@ -7,9 +7,11 @@ COPY install/*.sh /install/
 RUN /install/install_bootstrap_deb_packages.sh
 RUN /install/install_deb_packages.sh
 
+RUN apt-get update
+RUN apt-get install -y --no-install-recommends python-pip
 RUN pip install --upgrade numpy
 
 # Install golang
 RUN add-apt-repository -y ppa:ubuntu-lxc/lxd-stable
 RUN apt-get update
 RUN apt-get install -y golang
diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu b/tensorflow/tools/ci_build/Dockerfile.cpu
index 8e0be14ca6..206108930a 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.cpu
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:14.04
 
 MAINTAINER Jan Prach <jendap@google.com>
 
@@ -15,6 +15,5 @@ RUN /install/install_buildifier.sh
 RUN /install/install_auditwheel.sh
 RUN /install/install_golang.sh
 
-# Set up bazelrc.
-COPY install/.bazelrc /root/.bazelrc
-ENV BAZELRC /root/.bazelrc
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
diff --git a/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu b/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu
index 79cf1844f2..b914f51918 100644
--- a/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu
@@ -22,6 +22,5 @@ RUN /install/install_golang.sh
 # Fix a virtualenv install issue specific to Debian Jessie.
 RUN pip install --upgrade virtualenv
 
-# Set up bazelrc.
-COPY install/.bazelrc /root/.bazelrc
-ENV BAZELRC /root/.bazelrc
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index 1cf1e40404..68493965fa 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -1,7 +1,12 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
 
 MAINTAINER Jan Prach <jendap@google.com>
 
+# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
+# /usr/local/cuda
+RUN cp /usr/include/cudnn.h /usr/local/cuda/include
+RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
+
 # Copy and run the install scripts.
 COPY install/*.sh /install/
 RUN /install/install_bootstrap_deb_packages.sh
@@ -12,9 +17,8 @@ RUN /install/install_pip_packages.sh
 RUN /install/install_bazel.sh
 RUN /install/install_golang.sh
 
-# Set up bazelrc.
-COPY install/.bazelrc /root/.bazelrc
-ENV BAZELRC /root/.bazelrc
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 
 # Configure the build for our CUDA configuration.
diff --git a/tensorflow/tools/ci_build/Dockerfile.hadoop b/tensorflow/tools/ci_build/Dockerfile.hadoop
index 7af9f38708..489493c26e 100644
--- a/tensorflow/tools/ci_build/Dockerfile.hadoop
+++ b/tensorflow/tools/ci_build/Dockerfile.hadoop
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:14.04
 
 MAINTAINER Jonathan Hseu <jhseu@google.com>
 
@@ -14,6 +14,5 @@ RUN /install/install_proto3.sh
 RUN /install/install_buildifier.sh
 RUN /install/install_hadoop.sh
 
-# Set up bazelrc.
-COPY install/.bazelrc /root/.bazelrc
-ENV BAZELRC /root/.bazelrc
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
diff --git a/tensorflow/tools/ci_build/Dockerfile.tensorboard b/tensorflow/tools/ci_build/Dockerfile.tensorboard
index 12b8aa18da..9795872e2c 100644
--- a/tensorflow/tools/ci_build/Dockerfile.tensorboard
+++ b/tensorflow/tools/ci_build/Dockerfile.tensorboard
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:14.04
 
 MAINTAINER Jan Prach <jendap@google.com>
 
diff --git a/tensorflow/tools/ci_build/README.md b/tensorflow/tools/ci_build/README.md
index 4b7858ca89..1fa618e698 100644
--- a/tensorflow/tools/ci_build/README.md
+++ b/tensorflow/tools/ci_build/README.md
@@ -20,20 +20,20 @@ run continuous integration [ci.tensorflow.org](https://ci.tensorflow.org).
 2. Clone tensorflow repository.
 
    ```bash
-git clone https://github.com/tensorflow/tensorflow.git
-```
+   git clone https://github.com/tensorflow/tensorflow.git
+   ```
 
 3. Go to tensorflow directory
 
    ```bash
-cd tensorflow
-```
+   cd tensorflow
+   ```
 
 4. Build what you want, for example
 
    ```bash
-tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
-```
+   tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
+   ```
 
 
 
diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
index be076cd4c0..10bed0b786 100755
--- a/tensorflow/tools/ci_build/builds/run_pip_tests.sh
+++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
@@ -104,28 +104,26 @@ export TF_NEED_CUDA=$IS_GPU
 yes "" | ./configure
 
 # Figure out how many concurrent tests we can run and do run the tests.
+BAZEL_PARALLEL_TEST_FLAGS=""
 if [[ $IS_GPU == 1 ]]; then
   # Number of test threads is the number of GPU cards available.
   if [[ $IS_MAC == 1 ]]; then
-    PAR_TEST_JOBS=1
+    BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=1"
   else
     PAR_TEST_JOBS=$TF_GPU_COUNT
+    BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=${TF_GPU_COUNT} \
+        --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute"
   fi
-
-  # Actually run the tests.
-  bazel test ${BAZEL_FLAGS} --local_test_jobs=${PAR_TEST_JOBS} \
-    --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
-    -- ${BAZEL_TEST_TARGETS}
-
 else
   # Number of test threads is the number of physical CPUs.
   if [[ $IS_MAC == 1 ]]; then
-    PAR_TEST_JOBS=$(sysctl -n hw.ncpu)
+    BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(sysctl -n hw.ncpu)"
   else
-    PAR_TEST_JOBS=$(grep -c ^processor /proc/cpuinfo)
+    BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(grep -c ^processor /proc/cpuinfo)"
   fi
-
-  # Actually run the tests.
-  bazel test ${BAZEL_FLAGS} --local_test_jobs=${PAR_TEST_JOBS} \
-    -- ${BAZEL_TEST_TARGETS}
 fi
+
+# Actually run the tests.
+bazel test ${BAZEL_FLAGS} ${BAZEL_PARALLEL_TEST_FLAGS} -- \
+    ${BAZEL_TEST_TARGETS}
+
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index e1a312b858..cb204bc25f 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -19,7 +19,7 @@
 #
 # The script obeys the following required environment variables:
 #   TF_BUILD_CONTAINER_TYPE:   (CPU | GPU | ANDROID | ANDROID_FULL)
-#   TF_BUILD_PYTHON_VERSION:   (PYTHON2 | PYTHON3)
+#   TF_BUILD_PYTHON_VERSION:   (PYTHON2 | PYTHON3 | PYTHON3.5)
 #   TF_BUILD_IS_PIP:           (NO_PIP | PIP | BOTH)
 #
 # The below environment variable is required, but will be deprecated together
@@ -33,7 +33,8 @@
 #   ANDROID & PIP    (Android and PIP builds are mutually exclusive)
 #
 #   2) TF_BUILD_PYTHON_VERSION is set to PYTHON3, the build will use the version
-# pointed to by "which python3" on the system.
+# pointed to by "which python3" on the system, which is typically python3.4. To
+# build for python3.5, set the environment variable to PYTHON3.5
 #
 #
 # Additionally, the script follows the directions of optional environment
@@ -426,7 +427,9 @@ fi
 # Process Python version
 if [[ ${TF_BUILD_PYTHON_VERSION} == "python2" ]]; then
   :
-elif [[ ${TF_BUILD_PYTHON_VERSION} == "python3" ]]; then
+elif [[ ${TF_BUILD_PYTHON_VERSION} == "python3" || \
+        ${TF_BUILD_PYTHON_VERSION} == "python3.4" || \
+        ${TF_BUILD_PYTHON_VERSION} == "python3.5" ]]; then
   # Supply proper environment variable to select Python 3
   if [[ "${DO_DOCKER}" == "1" ]]; then
     EXTRA_PARAMS="${EXTRA_PARAMS} -e CI_BUILD_PYTHON=${TF_BUILD_PYTHON_VERSION}"
@@ -493,6 +496,30 @@ echo ""
 
 TMP_DIR=""
 DOCKERFILE_FLAG=""
+if [[ "${TF_BUILD_PYTHON_VERSION}" == "python3.5" ]]; then
+  # Modify Dockerfile for Python3.5 build
+  TMP_DIR=$(mktemp -d)
+  echo "Docker build will occur in temporary directory: ${TMP_DIR}"
+
+  # Copy the files required for the docker build
+  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+  cp -r "${SCRIPT_DIR}/install" "${TMP_DIR}/install" || \
+      die "ERROR: Failed to copy directory ${SCRIPT_DIR}/install"
+
+  DOCKERFILE="${SCRIPT_DIR}/Dockerfile.${TF_BUILD_CONTAINER_TYPE}"
+  cp "${DOCKERFILE}" "${TMP_DIR}/" || \
+      die "ERROR: Failed to copy Dockerfile at ${DOCKERFILE}"
+  DOCKERFILE="${TMP_DIR}/Dockerfile.${TF_BUILD_CONTAINER_TYPE}"
+
+  # Replace a line in the Dockerfile
+  sed -i \
+      's|RUN /install/install_pip_packages\.sh|RUN /install/install_python3.5_pip_packages.sh|g' \
+      "${DOCKERFILE}" && \
+      echo "Copied and modified Dockerfile for Python 3.5 build: ${DOCKERFILE}" || \
+      die "ERROR: Failed to copy and modify Dockerfile: ${DOCKERFILE}"
+
+  DOCKERFILE_FLAG="--dockerfile ${DOCKERFILE}"
+fi
 
 chmod +x ${TMP_SCRIPT}
 
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 44aaed8ae9..9ecf16c46f 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -105,7 +105,7 @@ do_pylint() {
   if [[ $1 == "PYTHON2" ]]; then
     PYLINT_BIN="python /usr/local/lib/python2.7/dist-packages/pylint/lint.py"
   elif [[ $1 == "PYTHON3" ]]; then
-    PYLINT_BIN="python3 /usr/local/lib/python3.5/dist-packages/pylint/lint.py"
+    PYLINT_BIN="python3 /usr/local/lib/python3.4/dist-packages/pylint/lint.py"
   else
     echo "Unrecognized python version (PYTHON2 | PYTHON3): $1"
     return 1
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 23dc6d42c4..a62a6f8a3c 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -42,14 +42,14 @@ apt-get install -y --no-install-recommends \
     openjdk-8-jre-headless \
     pkg-config \
     python-dev \
-    python-pip \
+    python-setuptools \
+    python-virtualenv \
     python3-dev \
-    python3-pip \
+    python3-setuptools \
     rsync \
     sudo \
     swig \
     unzip \
-    virtualenv \
     wget \
     zip \
     zlib1g-dev
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 19c46bbcd4..8011f8de24 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -16,56 +16,64 @@
 
 set -e
 
+# We don't apt-get install so that we can install a newer version of pip. Not
+# needed after we upgrade to Ubuntu 16.04
+easy_install -U pip
+easy_install3 -U pip
+
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
 
-pip install wheel
+pip2 install wheel
 pip3 install wheel
 
 # Install six.
-pip install --upgrade six==1.10.0
+pip2 install --upgrade six==1.10.0
 pip3 install --upgrade six==1.10.0
 
 # Install werkzeug.
-pip install --upgrade werkzeug==0.11.10
+pip2 install --upgrade werkzeug==0.11.10
 pip3 install --upgrade werkzeug==0.11.10
 
 # Install protobuf.
-pip install --upgrade protobuf==3.2.0
+pip2 install --upgrade protobuf==3.2.0
 pip3 install --upgrade protobuf==3.2.0
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
-pip install --upgrade numpy==1.12.0
-pip3 install --upgrade numpy==1.12.0
+# numpy needs to be installed from source to fix segfaults. See:
+# https://github.com/tensorflow/tensorflow/issues/6968
+# This workaround isn't needed for Ubuntu 16.04 or later.
+pip2 install --no-binary=:all: --upgrade numpy==1.12.0
+pip3 install --no-binary=:all: --upgrade numpy==1.12.0
 
-pip install scipy==0.18.1
+pip2 install scipy==0.18.1
 pip3 install scipy==0.18.1
 
-pip install scikit-learn==0.18.1
+pip2 install scikit-learn==0.18.1
 pip3 install scikit-learn==0.18.1
 
 # pandas required by tf.learn/inflow
-pip install pandas==0.19.2
+pip2 install pandas==0.19.2
 pip3 install pandas==0.19.2
 
 # Benchmark tests require the following:
-pip install psutil
+pip2 install psutil
 pip3 install psutil
-pip install py-cpuinfo
+pip2 install py-cpuinfo
 pip3 install py-cpuinfo
 
 # pylint tests require the following:
-pip install pylint
+pip2 install pylint
 pip3 install pylint
 
 # pep8 tests require the following:
-pip install pep8
+pip2 install pep8
 pip3 install pep8
 
 # tf.mock require the following for python2:
-pip install mock
+pip2 install mock
 
-pip install portpicker
+pip2 install portpicker
 pip3 install portpicker
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
new file mode 100755
index 0000000000..e7e2d256cd
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Install packages required by Python3.5 build
+
+# TODO(cais): Remove this file once we upgrade to ubuntu:16.04 docker images for
+# Python 3.5 builds.
+
+# fkrull/deadsnakes is for Python3.5
+add-apt-repository -y ppa:fkrull/deadsnakes
+apt-get update
+
+set +e
+# Upgrade swig to 3.0.8
+SWIG_VERSION="3.0.8"
+swig_ver_flat=$(echo $SWIG_VERSION | sed 's/\.//g' | sed 's/^0*//g')
+local_swig_ver=$(swig -version | grep -i version | awk '{print $3}')
+local_swig_ver_flat=$(echo $local_swig_ver | sed 's/\.//g' | sed 's/^0*//g')
+if [[ -z $local_swig_ver_flat ]]; then
+  local_swig_ver_flat=0
+fi
+if (( $local_swig_ver_flat < $swig_ver_flat )); then
+  set -e
+  wget -q http://downloads.sourceforge.net/swig/swig-3.0.8.tar.gz
+  tar xzf swig-3.0.8.tar.gz
+  pushd swig-3.0.8
+  apt-get install -y --no-install-recommends libpcre3-dev
+  ./configure
+  make
+  make install
+  rm -f /usr/bin/swig
+  ln -s /usr/local/bin/swig /usr/bin/swig
+  popd
+  rm -rf swig-3.0.8 swig-3.0.8.tar.gz
+fi
+set -e
+# Install Python 3.5 and dev library
+apt-get install -y --no-install-recommends python3.5 libpython3.5-dev
+
+# Install pip3.5
+set +e
+pip35_version=$(pip3.5 --version | grep "python 3.5")
+if [[ -z $pip35_version ]]; then
+  set -e
+  wget -q https://bootstrap.pypa.io/get-pip.py
+  python3.5 get-pip.py
+  rm -f get-pip.py
+fi
+
+set -e
+# Install six.
+pip3.5 install --upgrade six==1.10.0
+
+# Install protobuf.
+pip3.5 install --upgrade protobuf==3.2.0
+
+# Remove obsolete version of six, which can sometimes confuse virtualenv.
+rm -rf /usr/lib/python3/dist-packages/six*
+
+# Install numpy, scipy and scikit-learn required by the builds
+
+# numpy needs to be installed from source to fix segfaults. See:
+# https://github.com/tensorflow/tensorflow/issues/6968
+# This workaround isn't needed for Ubuntu 16.04 or later.
+pip3.5 install --no-binary=:all: --upgrade numpy==1.12.0
+
+pip3.5 install scipy==0.18.1
+
+pip3.5 install scikit-learn==0.18.1
+
+# pandas required by tf.learn/inflow
+pip3.5 install pandas==0.19.2
+
+# Install recent-enough version of wheel for Python 3.5 wheel builds
+pip3.5 install wheel==0.29.0
+
+pip3.5 install portpicker
+
+pip3.5 install werkzeug
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 73c08e5d0b..1488e8d78c 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -101,11 +101,8 @@ exclude_gpu_cc_tests="${extra_failing_gpu_cc_tests} + ${exclude_cpu_cc_tests}"
 function get_failing_cpu_py_tests() {
     echo "
     //$1/tensorflow/python:basic_session_run_hooks_test + \
-    //$1/tensorflow/python:bigquery_reader_ops_test + \
     //$1/tensorflow/python:contrib_test + \
     //$1/tensorflow/python:dequantize_op_test + \
-    //$1/tensorflow/python:directory_watcher_test + \
-    //$1/tensorflow/python:event_multiplexer_test + \
     //$1/tensorflow/python:file_io_test + \
     //$1/tensorflow/python:file_system_test + \
     //$1/tensorflow/python:framework_meta_graph_test + \
diff --git a/tensorflow/tools/compatibility/README.md b/tensorflow/tools/compatibility/README.md
index 9dba070a4f..aabc7b253d 100644
--- a/tensorflow/tools/compatibility/README.md
+++ b/tensorflow/tools/compatibility/README.md
@@ -11,7 +11,10 @@ It will print a list of errors it finds that it can't fix. You can also run
 it on a directory tree:
 
 ```
+# just upgrade the .py files
 tf_upgrade.py --intree coolcode --outtree coolcode-upgraded
+# upgrade the .py files and also copy all the other files to the outtree
+tf_upgrade.py --intree coolcode --outtree coolcode-upgraded --copyotherfiles True
 ```
 
 In either case, it will also dump out a report e.g. which will detail changes
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 26bf117256..80439f835a 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -140,6 +140,7 @@ class APIChangeSpec(object):
         "tf.batch_svd": "tf.svd",
         "tf.batch_fft": "tf.fft",
         "tf.batch_ifft": "tf.ifft",
+        "tf.batch_fft2d": "tf.fft2d",
         "tf.batch_ifft2d": "tf.ifft2d",
         "tf.batch_fft3d": "tf.fft3d",
         "tf.batch_ifft3d": "tf.ifft3d",
@@ -566,7 +567,7 @@ class TensorFlowCodeUpgrader(object):
     return 1, text, process_errors
   # pylint: enable=broad-except
 
-  def process_tree(self, root_directory, output_root_directory):
+  def process_tree(self, root_directory, output_root_directory, copy_other_files):
     """Processes upgrades on an entire tree of python files in place.
 
     Note that only Python files. If you have custom code in other languages,
@@ -596,13 +597,21 @@ class TensorFlowCodeUpgrader(object):
     # Collect list of files to process (we do this to correctly handle if the
     # user puts the output directory in some sub directory of the input dir)
     files_to_process = []
+    files_to_copy = []
     for dir_name, _, file_list in os.walk(root_directory):
       py_files = [f for f in file_list if f.endswith(".py")]
+      copy_files = [f for f in file_list if not f.endswith(".py")]
       for filename in py_files:
         fullpath = os.path.join(dir_name, filename)
         fullpath_output = os.path.join(
             output_root_directory, os.path.relpath(fullpath, root_directory))
         files_to_process.append((fullpath, fullpath_output))
+      if copy_other_files:
+        for filename in copy_files:
+          fullpath = os.path.join(dir_name, filename)
+          fullpath_output = os.path.join(
+              output_root_directory, os.path.relpath(fullpath, root_directory))
+          files_to_copy.append((fullpath, fullpath_output))
 
     file_count = 0
     tree_errors = []
@@ -619,6 +628,11 @@ class TensorFlowCodeUpgrader(object):
       _, l_report, l_errors = self.process_file(input_path, output_path)
       tree_errors += l_errors
       report += l_report
+    for input_path, output_path in files_to_copy:
+      output_directory = os.path.dirname(output_path)
+      if not os.path.isdir(output_directory):
+        os.makedirs(output_directory)
+      shutil.copy(input_path, output_path)
     return file_count, report, tree_errors
 
 
@@ -651,6 +665,13 @@ Simple usage:
       help="If converting a whole tree of files, the output "
       "directory (relative or absolute).")
   parser.add_argument(
+      "--copyotherfiles",
+      dest="copy_other_files",
+      help=("If converting a whole tree of files, whether to "
+            "copy the other files."),
+      type=lambda v: v.lower() in ("true", "1", "yes"),  # bool("False") is True
+      default=False)
+  parser.add_argument(
       "--reportfile",
       dest="report_filename",
       help=("The name of the file where the report log is "
@@ -669,7 +690,7 @@ Simple usage:
     files_processed = 1
   elif args.input_tree:
     files_processed, report_text, errors = upgrade.process_tree(
-        args.input_tree, args.output_tree)
+        args.input_tree, args.output_tree, args.copy_other_files)
   else:
     parser.print_help()
   if report_text:
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index a67f1af2bd..dd18b61017 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -61,12 +61,11 @@ RUN add-apt-repository -y ppa:openjdk-r/ppa && \
 # Running bazel inside a `docker build` command causes trouble, cf:
 #   https://github.com/bazelbuild/bazel/issues/134
 # The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/root/.bazelrc
+RUN echo "startup --batch" >>/etc/bazel.bazelrc
 # Similarly, we need to workaround sandboxing issues:
 #   https://github.com/bazelbuild/bazel/issues/418
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/root/.bazelrc
-ENV BAZELRC /root/.bazelrc
+    >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
 ENV BAZEL_VERSION 0.4.5
 WORKDIR /
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index d1a733458d..8ead2f15ae 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -61,12 +61,11 @@ RUN add-apt-repository -y ppa:openjdk-r/ppa && \
 # Running bazel inside a `docker build` command causes trouble, cf:
 #   https://github.com/bazelbuild/bazel/issues/134
 # The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/root/.bazelrc
+RUN echo "startup --batch" >>/etc/bazel.bazelrc
 # Similarly, we need to workaround sandboxing issues:
 #   https://github.com/bazelbuild/bazel/issues/418
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/root/.bazelrc
-ENV BAZELRC /root/.bazelrc
+    >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
 ENV BAZEL_VERSION 0.4.5
 WORKDIR /
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index c97ce7561f..299d50c359 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -76,7 +76,11 @@ def configure(src_base_path, debug=False):
   # Remove and recreate the path
   if os.path.exists(gen_path):
     if os.path.isdir(gen_path):
-      shutil.rmtree(gen_path)
+      try:
+        shutil.rmtree(gen_path)
+      except OSError as e:  # PermissionError does not exist on Python 2
+        raise RuntimeError("Cannot delete directory %s (%s), inspect and "
+                           "remove manually" % (gen_path, e))
     else:
-      raise RuntimeError("Cannot delete non-directory %s, inspect ",
-                         "and remove manually" % gen_path)
+      raise RuntimeError("Cannot delete non-directory %s, inspect "
+                         "and remove manually" % gen_path)
diff --git a/tensorflow/tools/graph_transforms/summarize_graph_main.cc b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
index 8c23ae7a74..f45dfbba0c 100644
--- a/tensorflow/tools/graph_transforms/summarize_graph_main.cc
+++ b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
@@ -109,7 +109,7 @@ Status SummarizeGraph(const GraphDef& graph, const string& graph_path) {
     if (node.op() == "Placeholder") {
       placeholders.push_back(&node);
     }
-    if (node.op() == "Variable") {
+    if (node.op() == "Variable" || node.op() == "VariableV2") {
       variables.push_back(&node);
     }
   }
@@ -168,7 +168,8 @@ Status SummarizeGraph(const GraphDef& graph, const string& graph_path) {
     if (node.device() != "") {
       ++device_counts[node.device()];
     }
-    if ((node.op() == "Const") || (node.op() == "Variable")) {
+    if ((node.op() == "Const") || (node.op() == "Variable") ||
+        (node.op() == "VariableV2")) {
       Tensor tensor;
       if (node.attr().count("value") &&
           tensor.FromProto(node.attr().at("value").tensor())) {
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 74a7818967..d9c67862e7 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -66,23 +66,21 @@ py_binary(
         "README",
         "setup.py",
         ":included_headers",
-        "//tensorflow/contrib/ndlstm",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/session_bundle:session_bundle_pip",
-        "//tensorflow/contrib/slim",
         "//tensorflow/contrib/slim/python/slim/data:data_pip",
-        "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
-        "//tensorflow/contrib/specs",
-        "//tensorflow/contrib/tensor_forest:init_py",
-        "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
         "//tensorflow/python:util_example_parser_configuration",
         "//tensorflow/python/debug:debug_pip",
         "//tensorflow/python/saved_model",
         "//tensorflow/python/tools:tools_pip",
-        # The following target has an issue when archiving them into the python
-        # zip, exclude them for now.
-        # "//tensorflow/tensorboard",
-        # This package does not build. Exclude it in windows for now.
+        "//tensorflow/tensorboard",
+        # These targets don't build on Windows yet. Exclude them for now.
+        # "//tensorflow/contrib/ndlstm",
+        # "//tensorflow/contrib/slim",
+        # "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
+        # "//tensorflow/contrib/specs",
+        # "//tensorflow/contrib/tensor_forest:init_py",
+        # "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
         # "//tensorflow/examples/tutorials/mnist:package",
     ],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in
index 22b00c4284..fe21f221b1 100644
--- a/tensorflow/tools/pip_package/MANIFEST.in
+++ b/tensorflow/tools/pip_package/MANIFEST.in
@@ -1,4 +1,6 @@
 include README
 recursive-include * *.py
 recursive-include * *.so
+recursive-include * *.dll
+recursive-include * *.lib
 recursive-include * *.csv
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 25aecb5707..4c4973080f 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -64,6 +64,10 @@ temp_workaround_http_archive = repository_rule(
 # If TensorFlow is linked as a submodule.
 # path_prefix and tf_repo_name are no longer used.
 def tf_workspace(path_prefix = "", tf_repo_name = ""):
+  # We must check the bazel version before trying to parse any other BUILD
+  # files, in case the parsing of those build files depends on the bazel
+  # version we require here.
+  check_version("0.4.5")
   cuda_configure(name = "local_config_cuda")
   sycl_configure(name = "local_config_sycl")
   if path_prefix:
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 01e070f2be..a2b3e7d79e 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -826,8 +826,17 @@ def _cuda_autoconf_impl(repository_ctx):
 
 cuda_configure = repository_rule(
     implementation = _cuda_autoconf_impl,
-    local = True,
+    environ = [
+        _GCC_HOST_COMPILER_PATH,
+        "TF_NEED_CUDA",
+        _CUDA_TOOLKIT_PATH,
+        _CUDNN_INSTALL_PATH,
+        _TF_CUDA_VERSION,
+        _TF_CUDNN_VERSION,
+        _TF_CUDA_COMPUTE_CAPABILITIES,
+    ],
 )
+
 """Detects and configures the local CUDA toolchain.
 
 Add the following to your WORKSPACE FILE:
diff --git a/third_party/sycl/crosstool/computecpp.tpl b/third_party/sycl/crosstool/computecpp.tpl
index 66dd9aea7b..595e7136a6 100755
--- a/third_party/sycl/crosstool/computecpp.tpl
+++ b/third_party/sycl/crosstool/computecpp.tpl
@@ -65,7 +65,7 @@ def main():
       # strip asan for the device
       computecpp_device_compiler_flags = ['-sycl-compress-name', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-isystem',
           COMPUTECPP_INCLUDE, '-std=c++11', '-sycl', '-emit-llvm', '-no-serial-memop', '-Xclang', '-cl-denorms-are-zero', '-Xclang', '-cl-fp32-correctly-rounded-divide-sqrt']
-      computecpp_device_compiler_flags += [flag for flag in compiler_flags if not flag.startswith(('-fsanitize'))]
+      computecpp_device_compiler_flags += [flag for flag in compiler_flags if not flag.startswith(('-fsanitize', '-march=native', '-mavx'))]
 
       x = subprocess.call([COMPUTECPP_DRIVER] + computecpp_device_compiler_flags )
       if(x == 0):
diff --git a/util/python/python_config.sh b/util/python/python_config.sh
index 789c4b35b3..4b18bf3578 100755
--- a/util/python/python_config.sh
+++ b/util/python/python_config.sh
@@ -181,7 +181,7 @@ function setup_python {
   # Write tools/bazel.rc
   echo "# Autogenerated by configure: DO NOT EDIT" > tools/bazel.rc
   sed -e "s/\$PYTHON_MAJOR_VERSION/$python_major_version/g" \
-      -e "s[\$PYTHON_BINARY[\"$PYTHON_BIN_PATH\"[g" \
+      -e "s|\$PYTHON_BINARY|\"$PYTHON_BIN_PATH\"|g" \
       tools/bazel.rc.template >> tools/bazel.rc
   # Write tools/python_bin_path.sh
   echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh