From 0912bc8cc7f491cdcc5b8a74600292c6e810247b Mon Sep 17 00:00:00 2001 From: Ilya Biryukov Date: Mon, 11 Jun 2018 14:16:30 -0700 Subject: Fix 'cc_op_gen' to use static storage for constant arrays. Previously, the generator would emit code like this: struct Attrs { ArraySlice<int> dilations_ = {1, 1, 1, 1}; }; This code is incorrect, since the array slice references a temporary object that dies after initialization finishes. After this change, the generator will produce static functions to initialize the values: struct Attrs { ArraySlice<int> dilations_ = Default_dilations(); private: ArraySlice<int> Default_dilations() { static int kStorage[] = {1, 1, 1, 1}; return ArraySlice<int>(kStorage); } }; Presumably, it used to work because all compilers chose to use static storage in those cases anyway. However, new versions of clang tend to miscompile this code, causing test failures. (This error was found when trying to upgrade our clang revision from r328903 to r331746). PiperOrigin-RevId: 200110952 --- tensorflow/cc/framework/cc_op_gen.cc | 71 +++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 10 deletions(-) (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc index d6a4f141b6..dfdef88945 100644 --- a/tensorflow/cc/framework/cc_op_gen.cc +++ b/tensorflow/cc/framework/cc_op_gen.cc @@ -273,6 +273,12 @@ string PrintAttrValue(const string& op, const AttrValue& attr_value) { return ""; // Prevent missing return warning } +bool IsEmptyList(const AttrValue::ListValue& list) { + return list.s_size() == 0 && list.i_size() == 0 && list.f_size() == 0 && + list.b_size() == 0 && list.type_size() == 0 && + list.shape_size() == 0 && list.tensor_size() == 0; +} + string ToCamelCase(const string& str) { string result; const char joiner = '_'; @@ -297,9 +303,9 @@ // indicate whether to treat the type as const when accepting the C++ type as an // argument to a function.
std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) { - static const std::unordered_map<StringPiece, std::pair<const char*, bool>, - StringPieceHasher> - attr_type_map{ + static const auto* attr_type_map = + new std::unordered_map<StringPiece, std::pair<const char*, bool>, + StringPieceHasher>{ {"string", {"StringPiece", false}}, {"list(string)", {"gtl::ArraySlice<string>", true}}, {"int", {"int64", false}}, @@ -317,14 +323,34 @@ std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) { {"func", {"NameAttrList", true}}, }; - auto entry = attr_type_map.find(attr_type); - if (entry == attr_type_map.end()) { + auto entry = attr_type_map->find(attr_type); + if (entry == attr_type_map->end()) { LOG(FATAL) << "Unsupported Attr type: " << attr_type; return {"", false}; } return entry->second; } +const char* ListElementTypeName(StringPiece attr_type) { + static const auto* attr_list_type_map = + new std::unordered_map<StringPiece, const char*, StringPieceHasher>{ + {"list(string)", "string"}, + {"list(int)", "int"}, + {"list(float)", "float"}, + {"list(bool)", "bool"}, + {"list(type)", "DataType"}, + {"list(shape)", "PartialTensorShape"}, + {"list(tensor)", "TensorProto"}, + }; + + auto entry = attr_list_type_map->find(attr_type); + if (entry == attr_list_type_map->end()) { + LOG(FATAL) << "Unsupported or non-list Attr type: " << attr_type; + return ""; + } + return entry->second; +} + bool IsCPPKeyword(StringPiece name) { static const std::unordered_set<StringPiece, StringPieceHasher> // Keywords obtained from http://en.cppreference.com/w/cpp/keyword @@ -668,6 +694,7 @@ OpInfo::OpInfo(const OpDef& graph_op_def, const ApiDef& api_def, string OpInfo::GetOpAttrStruct() const { string struct_fields; string setters; + string defaults_static_storage; for (int i = 0; i < graph_op_def.attr_size(); ++i) { const auto& attr(graph_op_def.attr(i)); @@ -705,11 +732,32 @@ string OpInfo::GetOpAttrStruct() const { "_ = x;\n"); strings::StrAppend(&setters, " return ret;\n }\n\n"); - strings::StrAppend( - &struct_fields, " ", attr_type_name, " ", api_def_attr.rename_to(), - "_ = ", - PrintAttrValue(graph_op_def.name(), api_def_attr.default_value()), - ";\n"); + string field_initializer; + auto& default_value = api_def_attr.default_value(); + if (default_value.value_case() == AttrValue::kList && + !IsEmptyList(default_value.list())) { + // Non-empty lists need static storage for their defaults. Define a + // function with a static local variable that stores the array. + strings::StrAppend(&defaults_static_storage, " static ", + attr_type_name, " Default_", api_def_attr.rename_to(), + "() {\n"); + strings::StrAppend( + &defaults_static_storage, " static const ", + ListElementTypeName(attr.type()), " kStorage[] = ", + PrintAttrValue(graph_op_def.name(), api_def_attr.default_value()), + ";\n"); + strings::StrAppend(&defaults_static_storage, " return ", + attr_type_name, "(kStorage);\n }\n"); + // Set the field_initializer to call the defined function. + strings::StrAppend(&field_initializer, "Default_", + api_def_attr.rename_to(), "()"); + } else { + field_initializer = + PrintAttrValue(graph_op_def.name(), api_def_attr.default_value()); + } + strings::StrAppend(&struct_fields, " ", attr_type_name, " ", + api_def_attr.rename_to(), "_ = ", field_initializer, + ";\n"); } if (struct_fields.empty()) { @@ -721,6 +769,9 @@ string OpInfo::GetOpAttrStruct() const { string struct_decl = MakeComment(attrs_comment, " "); strings::StrAppend(&struct_decl, " struct Attrs {\n"); strings::StrAppend(&struct_decl, setters, struct_fields); + if (!defaults_static_storage.empty()) { + strings::StrAppend(&struct_decl, " private:\n", defaults_static_storage); + } strings::StrAppend(&struct_decl, " };\n"); return struct_decl; -- cgit v1.2.3
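The lifetime bug this commit fixes is easy to reproduce outside the generator. Below is a minimal, self-contained sketch; IntSlice is a hypothetical stand-in for gtl::ArraySlice<int> (a non-owning pointer-plus-length view) and is illustration, not TensorFlow code:

#include <cstddef>
#include <initializer_list>
#include <iostream>

// Non-owning view over an int array, standing in for ArraySlice<int>.
struct IntSlice {
  const int* data = nullptr;
  std::size_t size = 0;
  IntSlice() = default;
  IntSlice(std::initializer_list<int> init)  // binds to a temporary array
      : data(init.begin()), size(init.size()) {}
  IntSlice(const int* d, std::size_t n) : data(d), size(n) {}
};

struct BadAttrs {
  // BUG: the initializer_list's backing array is a temporary that is
  // destroyed once initialization finishes, so data points at dead storage.
  IntSlice dilations_ = {1, 1, 1, 1};
};

struct GoodAttrs {
  IntSlice dilations_ = Default_dilations();

 private:
  static IntSlice Default_dilations() {
    // Static storage duration: the array outlives every GoodAttrs instance.
    static const int kStorage[] = {1, 1, 1, 1};
    return IntSlice(kStorage, 4);
  }
};

int main() {
  GoodAttrs attrs;
  for (std::size_t i = 0; i < attrs.dilations_.size; ++i)
    std::cout << attrs.dilations_.data[i] << ' ';  // prints: 1 1 1 1
  std::cout << '\n';
  return 0;
}

GoodAttrs mirrors the shape of the code the generator now emits: a static member function whose static local array gives the default value well-defined, program-long storage.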
From e80732c9895d1283af9b98d6277ad1a1015e2e9a Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 18 Jun 2018 09:57:19 -0700 Subject: Merge changes from github. PiperOrigin-RevId: 201011811 --- CONTRIBUTING.md | 2 +- README.md | 1 + RELEASE.md | 67 +++- configure.py | 5 + tensorflow/BUILD | 4 +- tensorflow/c/generate-pc.sh | 11 +- tensorflow/cc/gradients/math_grad.cc | 1 + tensorflow/cc/gradients/nn_grad.cc | 47 +++ tensorflow/cc/gradients/nn_grad_test.cc | 84 ++++- tensorflow/compiler/aot/codegen_test_h.golden | 4 +- .../compiler/aot/embedded_protocol_buffers.h | 2 +- tensorflow/compiler/aot/runtime.h | 4 +- tensorflow/compiler/aot/runtime_test.cc | 16 +- tensorflow/compiler/xla/service/cpu/BUILD | 18 +- tensorflow/compiler/xla/service/cpu/cpu_runtime.cc | 2 + tensorflow/compiler/xla/service/cpu/cpu_runtime.h | 1 + tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 8 +- .../compiler/xla/service/cpu/runtime_fft_impl.h | 20 +- .../xla/service/cpu/runtime_single_threaded_fft.cc | 32 ++ .../xla/service/cpu/runtime_single_threaded_fft.h | 31 ++ .../compiler/xla/service/cpu/simple_orc_jit.cc | 2 + tensorflow/compiler/xla/service/pattern_matcher.h | 2 +- .../compiler/xla/service/tuple_simplifier.cc | 7 + tensorflow/compiler/xla/service/tuple_simplifier.h | 9 +- .../compiler/xla/service/tuple_simplifier_test.cc | 77 ++++ tensorflow/contrib/autograph/__init__.py | 3 + tensorflow/contrib/cmake/tf_c.cmake | 22 +- tensorflow/contrib/cmake/tf_cc_ops.cmake | 2 +- tensorflow/contrib/cmake/tf_python.cmake | 3 +- tensorflow/contrib/cmake/tools/create_def_file.py | 9 +- .../bijectors/sinh_arcsinh_bijector_test.py | 28 +- tensorflow/contrib/eager/python/datasets.py | 3 +- .../python/examples/notebooks/4_high_level.ipynb | 4 +- .../feature_column/sequence_feature_column.py | 22 +- .../feature_column/sequence_feature_column_test.py | 41 ++ tensorflow/contrib/ffmpeg/__init__.py | 1 - tensorflow/contrib/ffmpeg/ffmpeg_ops.py | 1 - tensorflow/contrib/framework/__init__.py | 3 +- .../ops/fused_conv2d_bias_activation_op_test.py | 11 +- .../src_impl/hexagon_controller.c | 2 +- tensorflow/contrib/lite/download_dependencies.sh | 4 +- .../contrib/lite/examples/minimal/minimal.cc | 2 +- .../contrib/lite/g3doc/tf_ops_compatibility.md | 14 +- tensorflow/contrib/lite/java/ovic/README.md | 4 +- .../kernels/internal/reference/reference_ops.h | 4 +- tensorflow/contrib/lite/python/interpreter.py | 2 +- .../interpreter_wrapper/interpreter_wrapper.cc | 9 +- .../interpreter_wrapper/interpreter_wrapper.h | 3 +- tensorflow/contrib/lite/python/lite.py | 11 + tensorflow/contrib/lite/toco/import_tensorflow.cc | 2 +- tensorflow/contrib/lite/toco/toco_port.cc | 6 + tensorflow/contrib/lite/toco/toco_port.h | 18 + tensorflow/contrib/makefile/compile_nsync.sh | 2 +-
.../contrib/makefile/download_dependencies.sh | 4 +- .../contrib/metrics/python/ops/metric_ops.py | 2 +- tensorflow/contrib/mpi_collectives/kernels/ring.h | 2 +- .../contrib/opt/python/training/adamax_test.py | 6 +- .../opt/python/training/model_average_optimizer.py | 2 +- tensorflow/contrib/periodic_resample/BUILD | 20 +- .../kernels/periodic_resample_op.cc | 5 + .../kernels/periodic_resample_op.h | 415 +++++++++++++++------ .../contrib/periodic_resample/ops/array_ops.cc | 53 ++- .../periodic_resample/ops/array_ops_test.cc | 41 ++ .../kernel_tests/periodic_resample_op_test.py | 27 +- .../python/ops/periodic_resample_op.py | 8 +- .../predictor/contrib_estimator_predictor.py | 5 +- .../contrib/predictor/core_estimator_predictor.py | 5 +- .../contrib/predictor/predictor_factories.py | 24 +- .../contrib/predictor/predictor_factories_test.py | 19 + .../contrib/predictor/saved_model_predictor.py | 6 +- tensorflow/contrib/quantize/README.md | 2 +- .../contrib/slim/python/slim/evaluation_test.py | 25 +- tensorflow/contrib/summary/summary.py | 5 +- .../contrib/tensor_forest/client/eval_metrics.py | 45 +-- .../contrib/tensor_forest/python/tensor_forest.py | 34 +- .../tensor_forest/python/tensor_forest_test.py | 45 +++ .../contrib/tensorrt/convert/convert_graph.cc | 66 ++-- .../contrib/tensorrt/convert/convert_nodes.cc | 97 +++-- tensorflow/contrib/tpu/python/tpu/datasets.py | 16 +- tensorflow/contrib/tpu/python/tpu/datasets_test.py | 26 ++ tensorflow/core/BUILD | 9 +- .../core/api_def/base_api/api_def_Selu.pbtxt | 4 + .../api_def/base_api/api_def_StringSplitV2.pbtxt | 48 +++ .../api_def/python_api/api_def_StringSplitV2.pbtxt | 4 + tensorflow/core/common_runtime/bfc_allocator.cc | 8 +- tensorflow/core/common_runtime/bfc_allocator.h | 3 +- .../direct_session_with_tracking_alloc_test.cc | 16 + .../common_runtime/mkl_threadpool_device_test.cc | 53 +++ tensorflow/core/common_runtime/process_util.cc | 11 +- .../core/common_runtime/threadpool_device.cc | 25 +- .../rpc/grpc_master_service_impl.cc | 4 +- .../core/distributed_runtime/rpc/grpc_testlib.cc | 10 +- tensorflow/core/framework/allocator.h | 5 - tensorflow/core/framework/op_gen_lib.cc | 1 + .../remote_fused_graph_execute_info.proto | 2 +- tensorflow/core/framework/tensor_test.cc | 24 +- tensorflow/core/graph/mkl_layout_pass.cc | 148 +++++++- tensorflow/core/graph/mkl_layout_pass_test.cc | 31 ++ tensorflow/core/grappler/costs/graph_properties.cc | 1 - tensorflow/core/grappler/optimizers/BUILD | 2 +- tensorflow/core/grappler/optimizers/remapper.cc | 4 +- tensorflow/core/kernels/as_string_op.cc | 2 + tensorflow/core/kernels/cwise_op_clip.cc | 43 +-- .../core/kernels/dense_update_functor_gpu.cu.cc | 1 + tensorflow/core/kernels/gather_functor.cc | 1 + tensorflow/core/kernels/gather_functor_gpu.cu.cc | 1 + tensorflow/core/kernels/gather_nd_op.cc | 4 + tensorflow/core/kernels/gather_nd_op_gpu.cu.cc | 2 + tensorflow/core/kernels/gather_op.cc | 1 + tensorflow/core/kernels/mkl_concat_op.cc | 213 ++++++++--- tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc | 2 + tensorflow/core/kernels/mkl_pooling_ops_common.h | 6 +- tensorflow/core/kernels/scatter_nd_op.cc | 4 + tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc | 1 + .../core/kernels/scoped_allocator_ops_test.cc | 9 +- tensorflow/core/kernels/segment_reduction_ops.h | 10 +- tensorflow/core/kernels/sparse_matmul_op.cc | 2 +- tensorflow/core/kernels/string_split_op.cc | 130 +++++++ tensorflow/core/ops/candidate_sampling_ops.cc | 5 +- tensorflow/core/ops/dataset_ops.cc | 24 +- tensorflow/core/ops/image_ops.cc | 4 
+- tensorflow/core/ops/math_ops.cc | 2 +- tensorflow/core/ops/nn_ops.cc | 1 + tensorflow/core/ops/string_ops.cc | 20 +- tensorflow/core/platform/cpu_info.cc | 23 ++ tensorflow/core/platform/cpu_info.h | 7 + tensorflow/core/platform/default/build_config.bzl | 2 + .../core/platform/hadoop/hadoop_file_system.cc | 21 +- tensorflow/core/platform/posix/port.cc | 5 + tensorflow/core/public/version.h | 4 +- tensorflow/core/util/mkl_util.h | 50 ++- tensorflow/docs_src/community/groups.md | 29 +- tensorflow/docs_src/get_started/eager.md | 2 +- tensorflow/docs_src/get_started/index.md | 4 +- tensorflow/docs_src/install/install_c.md | 2 +- tensorflow/docs_src/install/install_go.md | 2 +- tensorflow/docs_src/install/install_java.md | 24 +- tensorflow/docs_src/install/install_linux.md | 24 +- tensorflow/docs_src/install/install_mac.md | 10 +- tensorflow/docs_src/install/install_sources.md | 17 +- tensorflow/docs_src/mobile/linking_libs.md | 2 +- tensorflow/docs_src/mobile/prepare_models.md | 4 +- tensorflow/docs_src/performance/quantization.md | 2 +- .../docs_src/programmers_guide/estimators.md | 19 +- .../docs_src/programmers_guide/feature_columns.md | 4 +- tensorflow/examples/learn/iris.py | 7 +- tensorflow/go/op/wrappers.go | 12 +- tensorflow/java/src/gen/cc/op_generator.cc | 11 +- tensorflow/java/src/gen/cc/op_specs.cc | 1 + tensorflow/python/eager/backprop.py | 4 +- tensorflow/python/estimator/BUILD | 5 +- tensorflow/python/estimator/exporter.py | 4 +- tensorflow/python/estimator/inputs/numpy_io.py | 8 +- .../python/estimator/inputs/numpy_io_test.py | 5 +- tensorflow/python/estimator/inputs/pandas_io.py | 7 +- .../python/estimator/inputs/pandas_io_test.py | 5 +- .../estimator/inputs/queues/feeding_functions.py | 2 +- tensorflow/python/estimator/keras.py | 4 +- tensorflow/python/estimator/keras_test.py | 14 +- .../python/grappler/layout_optimizer_test.py | 4 +- tensorflow/python/keras/activations.py | 2 + tensorflow/python/keras/callbacks.py | 21 +- tensorflow/python/keras/callbacks_test.py | 2 + tensorflow/python/keras/engine/network.py | 2 +- tensorflow/python/keras/engine/saving_test.py | 4 +- tensorflow/python/keras/engine/training.py | 7 +- tensorflow/python/keras/engine/training_eager.py | 2 +- tensorflow/python/keras/initializers_test.py | 26 +- tensorflow/python/keras/layers/core.py | 26 +- tensorflow/python/keras/models_test.py | 14 + .../python/kernel_tests/as_string_op_test.py | 10 + tensorflow/python/kernel_tests/betainc_op_test.py | 4 +- tensorflow/python/kernel_tests/clip_ops_test.py | 13 + tensorflow/python/kernel_tests/conv_ops_test.py | 32 +- .../python/kernel_tests/gather_nd_op_test.py | 32 +- tensorflow/python/kernel_tests/gather_op_test.py | 20 +- tensorflow/python/kernel_tests/init_ops_test.py | 27 ++ tensorflow/python/kernel_tests/pooling_ops_test.py | 4 +- tensorflow/python/kernel_tests/py_func_test.py | 31 +- .../python/kernel_tests/scatter_nd_ops_test.py | 6 +- tensorflow/python/kernel_tests/scatter_ops_test.py | 14 +- .../kernel_tests/segment_reduction_ops_test.py | 4 +- .../python/kernel_tests/string_split_op_test.py | 96 +++++ tensorflow/python/ops/array_ops.py | 4 + tensorflow/python/ops/gradient_checker.py | 8 +- tensorflow/python/ops/image_ops_impl.py | 74 ++-- tensorflow/python/ops/image_ops_test.py | 261 ++++++++++--- tensorflow/python/ops/init_ops.py | 3 +- tensorflow/python/ops/logging_ops.py | 5 +- tensorflow/python/ops/math_ops.py | 28 +- tensorflow/python/ops/nn_impl.py | 5 +- tensorflow/python/ops/nn_ops.py | 4 +- tensorflow/python/ops/nn_test.py | 10 + 
tensorflow/python/ops/script_ops.py | 35 +- tensorflow/python/ops/sparse_ops.py | 4 + tensorflow/python/ops/string_ops.py | 53 +++ tensorflow/python/ops/variable_scope.py | 21 +- .../python/tools/import_pb_to_tensorboard.py | 0 tensorflow/tensorflow.bzl | 2 +- .../tools/api/generator/create_python_api.py | 8 +- tensorflow/tools/api/golden/tensorflow.image.pbtxt | 2 +- tensorflow/tools/api/golden/tensorflow.pbtxt | 4 + .../tools/api/golden/tensorflow.strings.pbtxt | 4 + tensorflow/tools/ci_build/builds/pip.sh | 4 + .../tools/ci_build/builds/with_the_same_user | 2 +- tensorflow/tools/ci_build/ci_build.sh | 7 + tensorflow/tools/ci_build/copy_binary.py | 3 +- .../tools/ci_build/install/install_pip_packages.sh | 4 + .../install/install_python3.5_pip_packages.sh | 4 +- .../install/install_python3.6_pip_packages.sh | 5 +- .../tools/ci_build/linux/mkl/basic-mkl-test.sh | 29 ++ tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 8 +- .../def_file_filter/def_file_filter_configure.bzl | 6 +- tensorflow/tools/dist_test/local_test.sh | 12 +- tensorflow/tools/dist_test/remote_test.sh | 11 +- tensorflow/tools/docker/Dockerfile.devel | 2 +- tensorflow/tools/docker/Dockerfile.devel-cpu-mkl | 2 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 6 +- tensorflow/tools/docker/Dockerfile.gpu | 2 +- tensorflow/tools/pip_package/BUILD | 1 + tensorflow/tools/pip_package/build_pip_package.sh | 160 +++++--- tensorflow/tools/pip_package/setup.py | 3 +- .../proto_text/gen_proto_text_functions_lib.cc | 3 + .../tools/quantization/quantize_graph_test.py | 12 +- tensorflow/tools/test/upload_test_benchmarks.py | 1 - tensorflow/workspace.bzl | 40 +- third_party/eigen.BUILD | 1 + third_party/highwayhash.BUILD | 1 + third_party/jpeg/jpeg.BUILD | 2 + third_party/png.BUILD | 9 +- third_party/py/python_configure.bzl | 24 +- third_party/repo.bzl | 5 +- 232 files changed, 3343 insertions(+), 909 deletions(-) create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h create mode 100644 tensorflow/contrib/periodic_resample/ops/array_ops_test.cc create mode 100644 tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt create mode 100644 tensorflow/core/common_runtime/mkl_threadpool_device_test.cc mode change 100755 => 100644 tensorflow/python/tools/import_pb_to_tensorboard.py create mode 100755 tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh (limited to 'tensorflow/cc') diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8669c25c45..db4b1581ae 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,7 +90,7 @@ Bazel BUILD files also need to include a license section, e.g., Changes to TensorFlow C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). -Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do: +Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on ubuntu:16.04, do: ```bash apt-get install -y clang-tidy diff --git a/README.md b/README.md index 6fb4486d0d..63853137cf 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ $ python 42 >>> sess.close() ``` +Learn more examples about how to do specific tasks in TensorFlow at the [tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/). 
## Contribution guidelines diff --git a/RELEASE.md b/RELEASE.md index 84d9d52868..e09e9c6190 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,62 @@ +# Release 1.9.0 + +## Major Features And Improvements +* Update tf.keras to the Keras 2.1.6 API. +* `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`. +* Adding support for core feature columns and losses to gradient boosted trees estimators. +* The distributions.Bijector API supports broadcasting for Bijectors with new API changes. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/distributions/bijectors/Bijector) for more details. +* Layered variable names have changed in the following conditions: + * Using `tf.keras.layers` with custom variable scopes. + * Using `tf.layers` in a subclassed `tf.keras.Model` class. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers) for more details. + +## Breaking Changes + * If you're opening empty variable scopes, replace `variable_scope`('', ...) by `variable_scope`(`tf.get_variable_scope()`, ...). + +## Bug Fixes and Other Changes +* `tf.data`: + * The `DatasetBase::DebugString()` method is now `const`. + * Added the `tf.contrib.data.sample_from_datasets()` API for randomly sampling from multiple datasets. +* Eager Execution: +* `tf.keras`: + * Move Keras code out of _impl folder and remove API files. + * `tf.keras.Model.save_weights` now saves in TensorFlow format by default. + * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods. +* Accelerated Linear Algebra (XLA): +* TensorFlow Debugger (tfdbg): fix an issue in which the TensorBoard Debugger Plugin could not handle total source file size exceeding gRPC message size limit (4 MB). +* `tf.contrib`: + * Add `tf.contrib.data.choose_from_datasets()`. + * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings. Two arguments were removed from `make_csv_dataset`. + * `tf.contrib.framework.zero_initializer` supports ResourceVariable. + * Adding "constrained_optimization" to tensorflow/contrib. +* Other: + * Add GCS Configuration Ops. + * Changing signature of `MakeIterator` to enable propagating error status. + * KL divergence for two Dirichlet distributions. + * More consistent GcsFileSystem behavior for certain reads past EOF. + * Update benchmark for tf.scan to match ranges across eager and graph modes. + * Fixed bug in `tf.reduce_prod` gradient for complex dtypes. + * Add optional `args` argument to `Dataset.from_generator()`. + * Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"), which would previously raise an error. This will correspond to an attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only be accessed indirectly (e.g. through getattr and setattr). To set this up the user will first need to explicitly add the variable to the hparam object (e.g. "hparams.add_hparam(name='a.b', value=0.0)"). + * Benchmark for tf.scan in graph and eager modes. + * Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D. + * Making ids unique in `nn.embedding_lookup_sparse`. This helps to reduce RPC calls for looking up the embeddings when there are repeated ids in the batch. + * Support indicator column in boosted trees. + * Prevent `tf.gradients()` from backpropagating through integer tensors. + * LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`. + * Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now support arbitrary.
+ * Added `tf.train.Checkpoint` for reading/writing object-based checkpoints. + * `Dataset.list_files()` now produces deterministic results when `shuffle=False` or a `seed` is passed. + * Added LinearOperatorKronecker, a dense-free implementation of the Kronecker Product. + * Allow LinearOperator to broadcast. + * SavedModelBuilder will now deduplicate asset names that point to files with the same basename and the same contents. Note that this may result in new asset files included in SavedModels in cases where assets with the same name but different contents were previously overwriting each other. + + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +Abdullah Alrasheed, Achal Shah, Ad-530, ADiegoCAlonso, Aditya Yogi, Ag Ramesh, akindyakov, Andy Kernahan, Anya Petrova, Aurelien Geron, Ben, Ben Barsdell, Bhavani-Subramanian, braincodercn, Brett Koonce, Brian Nemsick, Brian Zier, Bryan Heden, candy.dc, cclauss, Clayne Robison, ctiijima, Dalmo Cirne, David Norman, David T.H. Kao, DosLin, ekelsen, Elson Rodriguez, Erik Smistad, Felix Abecassis, Fergal Cotter, fo40225, foo0x29a, Freedom" Koan-Sin Tan, FréDéRic Branchaud-Charron, gdh1995, Geoffrey Irving, Giuseppe, gracehoney, Guido Zuidhof, Guillaume Klein, Guozhong Zhuang, Haggai, Harald Husum, imsheridan, Ivan Zhang, Jan Zikes, Jayaram Bobba, Jesse Benson, Jesse Gumz, Jiajia Li, Jie, jinghuangintel, Jingwen, jjsjann123, Joe Yearsley, Joel Hestness, Joel Shor, josephyearsley, Junpeng Lao, Karol M. Langner, Kb Sriram, krantideep95, Krish Ravindranath, Letian Feng, Loo Rong Jie, Lukas Geiger, Maciej, Mahmoud Abuzaina, ManHyuk, Mark Ryan, mbhuiyan, Michal Turek, Mostafa Alaa, Myungsung Kwak, Nand Dalal, Nehal J Wani, Neil Tenenholtz, ngc92, Nicholas Nadeau, P.Eng., Avs, Niranjan Hasabnis, P-Hidringer, Paul Van Eck, Peng Yu, Qing Zhao, Qingying Chen, Quanlong, Rajendra Arora, Rholais Lii, rmanyari, Robin Richtsfeld, Russell Klopfer, Sagi, Sam Sendelbach, Sandeep N Gupta, Sandip Giri, Sarah Edkins, Scott Tseng, Sdalbsoo, Sergii Khomenko, Seungwoo Choi (Biggie), Seyed Majid Azimi, Shaoning Zeng, shengfuintel, Siu Kei, Muk, Smit Shilu, soonson, Stefan Schweter, Sukhwan Kim, Sunitha Kambhampati, Taehoon Lee, tamimaddari82, Tang, Wenyi, Ted Chang, u2takey, Utkarsh Upadhyay, Vadim Markovtsev, voegtlel, Wai Hon Law, wangsiyu, Wenhao Hu, wenhao.hu, William D. Irons, Yan Facai (颜发才), Yanbo Liang, Yihong Wang, Yilei (Dolee) Yang, Yong Tang, Yuan (Terry) Tang + # Release 1.8.0 ## Major Features And Improvements @@ -404,14 +463,6 @@ answered questions, and were part of inspiring discussions. # Release 1.4.0 -## Major Features And Improvements -* `tf.keras` is now part of the core TensorFlow API. -* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of - the core TensorFlow API. - * The API is now subject to backwards compatibility guarantees. - -# Release 1.4.0 - ## Major Features And Improvements * `tf.keras` is now part of the core TensorFlow API.
* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of diff --git a/configure.py b/configure.py index bde7af8c0e..ada342a50a 100644 --- a/configure.py +++ b/configure.py @@ -1397,6 +1397,10 @@ def set_grpc_build_flags(): write_to_bazelrc('build --define grpc_no_ares=true') +def set_build_strip_flag(): + write_to_bazelrc('build --strip=always') + + def set_windows_build_flags(): if is_windows(): # The non-monolithic build is not supported yet @@ -1519,6 +1523,7 @@ def main(): set_grpc_build_flags() set_cc_opt_flags(environ_cp) + set_build_strip_flag() set_windows_build_flags() if get_var( diff --git a/tensorflow/BUILD b/tensorflow/BUILD index a73c4ca3aa..6d134dbb80 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -475,7 +475,7 @@ tf_cc_shared_object( # excludes all but a subset of function names. # On MacOS, the linker does not support version_script, but has an # an "-exported_symbols_list" command. -z defs disallows undefined -# symbols in object files and -s strips the output. +# symbols in object files. tf_cc_shared_object( name = "libtensorflow.so", @@ -489,7 +489,6 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow/c:version_script.lds)", ], @@ -515,7 +514,6 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow:tf_version_script.lds)", ], diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh index 02a6a58b61..7184ad68fb 100755 --- a/tensorflow/c/generate-pc.sh +++ b/tensorflow/c/generate-pc.sh @@ -15,10 +15,12 @@ # ============================================================================== TF_PREFIX='/usr/local' +LIBDIR='lib' usage() { echo "Usage: $0 OPTIONS" echo -e "-p, --prefix\tset installation prefix (default: /usr/local)" + echo -e "-l, --libdir\tset lib directory (default: lib)" echo -e "-v, --version\tset TensorFlow version" echo -e "-h, --help\tdisplay this message" } @@ -26,7 +28,7 @@ usage() { [ $# == 0 ] && usage && exit 0 # read the options -ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@") +ARGS=$(getopt -o p:l:v:h --long prefix:,libdir:,version:,help -n $0 -- "$@") eval set -- "$ARGS" # extract options and their arguments into variables. @@ -38,6 +40,11 @@ while true ; do "") shift 2 ;; *) TF_PREFIX=$2 ; shift 2 ;; esac ;; + -l|--libdir) + case "$2" in + "") shift 2 ;; + *) LIBDIR=$2 ; shift 2 ;; + esac ;; -v|--version) case "$2" in "") shift 2 ;; @@ -55,7 +62,7 @@ echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX" cat << EOF > tensorflow.pc prefix=${TF_PREFIX} exec_prefix=\${prefix} -libdir=\${exec_prefix}/lib +libdir=\${exec_prefix}/${LIBDIR} includedir=\${prefix}/include Name: TensorFlow diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 52c177212a..35a01e0341 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -38,6 +38,7 @@ REGISTER_NO_GRADIENT_OP("NotEqual"); REGISTER_NO_GRADIENT_OP("LogicalAnd"); REGISTER_NO_GRADIENT_OP("LogicalOr"); REGISTER_NO_GRADIENT_OP("LogicalNot"); +REGISTER_NO_GRADIENT_OP("Floor"); // Conjugate helper function returns the conjugate of an Output if it // is complex valued. 
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index 0cb3132e94..c73482d5f4 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -255,6 +255,53 @@ Status LRNGradHelper(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("LRN", LRNGradHelper); +Status SoftplusGradHelper(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + auto dx = internal::SoftplusGrad(scope, grad_inputs[0], op.input(0)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Softplus", SoftplusGradHelper); + +Status SoftsignGradHelper(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + auto dx = internal::SoftsignGrad(scope, grad_inputs[0], op.input(0)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Softsign", SoftsignGradHelper); + +Status FractionalAvgPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + bool overlapping; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); + auto dx = internal::FractionalAvgPoolGrad( + scope, Shape(scope, op.input(0), Shape::OutType(DT_INT64)), + grad_inputs[0], op.output(1), op.output(2), + internal::FractionalAvgPoolGrad::Overlapping(overlapping)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("FractionalAvgPool", FractionalAvgPoolGradHelper); + +Status FractionalMaxPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + bool overlapping; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); + auto dx = internal::FractionalMaxPoolGrad( + scope, op.input(0), op.output(0), grad_inputs[0], op.output(1), + op.output(2), internal::FractionalMaxPoolGrad::Overlapping(overlapping)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("FractionalMaxPool", FractionalMaxPoolGradHelper); + } // anonymous namespace } // namespace ops } // namespace tensorflow
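The nn_grad_test.cc change below replaces the bumped-max initialization with well-separated, shuffled values; the reason is how a numeric gradient checker interacts with max(). A standalone sketch (the delta here is illustrative, not the checker's actual step size):

#include <algorithm>
#include <cstdio>

int main() {
  // A checker estimates d max(a, b) / da by central differences with step
  // delta. If a and b are closer together than 2 * delta, the perturbation
  // flips which input is the max and the estimate is wrong.
  const double delta = 1e-2;
  double a = 1.000, b = 0.995;  // closer together than 2 * delta
  double numeric =
      (std::max(a + delta, b) - std::max(a - delta, b)) / (2 * delta);
  std::printf("close values:  numeric d/da = %.3f (analytic = 1.000)\n",
              numeric);  // prints 0.750
  b = 0.900;  // separated by more than 2 * delta
  numeric = (std::max(a + delta, b) - std::max(a - delta, b)) / (2 * delta);
  std::printf("spread values: numeric d/da = %.3f (analytic = 1.000)\n",
              numeric);  // prints 1.000
  return 0;
}

Spacing the initial values 5e-2 apart and then shuffling, as the new helper does, keeps every pool's runner-up far enough from its max that the analytic and numeric gradients agree.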
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index c4eba7ecb0..b4d457a9d1 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -28,6 +28,8 @@ namespace { using ops::BiasAdd; using ops::Conv2D; using ops::Elu; +using ops::FractionalAvgPool; +using ops::FractionalMaxPool; using ops::L2Loss; using ops::LogSoftmax; using ops::LRN; @@ -41,6 +43,8 @@ using ops::Relu; using ops::Relu6; using ops::Selu; using ops::Softmax; +using ops::Softplus; +using ops::Softsign; class NNGradTest : public ::testing::Test { protected: @@ -71,22 +75,30 @@ class NNGradTest : public ::testing::Test { EXPECT_LT(max_error, 1e-3); } - // Sets tensor with random values, ensuring that the max value is largest by - // a reasonable amount. - // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which - // perturbations by the numeric gradient computation in the gradient checker - // can change the max value if values are too close together. + // Sets tensor with random values, ensuring that every pair of elements is at + // least a reasonable amount apart. + // This is an issue for max pooling operations, in which perturbations by the + // numeric gradient computation in the gradient checker can change the max + // value if a pool has values that are too close together. template <typename T> - void SetRandomValuesWithBumpedMax(Tensor* tensor) { + void SetRandomValuesForMaxPooling(Tensor* tensor) { auto tensor_flat = tensor->flat<T>(); - tensor_flat.setRandom(); - int32 max_index = 0; - for (size_t i = 1; i < tensor->NumElements(); i++) { - if (tensor_flat(i) > tensor_flat(max_index)) { - max_index = i; - } + // First set the array to an increasing sequence of values spaced + // a reasonable amount apart + T cur = 0; + for (size_t i = 0; i < tensor->NumElements(); i++) { + tensor_flat(i) = cur; + cur += 5e-2; + } + // Fisher-Yates shuffle the array + for (size_t i = tensor->NumElements() - 1; i >= 1; i--) { + // j <- random integer 0 <= j <= i + size_t j = random::New64() % (i + 1); + // swap values at i, j + T tmp = tensor_flat(i); + tensor_flat(i) = tensor_flat(j); + tensor_flat(j) = tmp; } - tensor_flat(max_index) += 1e-2; } Scope scope_; @@ -189,7 +201,7 @@ TEST_F(NNGradTest, MaxPoolGradHelper) { const std::vector<int> strides{1, 2, 2, 1}; auto y = MaxPool(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax<float>(&x_init_value); + SetRandomValuesForMaxPooling<float>(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -202,7 +214,7 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) { Tensor strides = test::AsTensor<int32>({1, 2, 2, 1}, {4}); auto y = MaxPoolV2(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax<float>(&x_init_value); + SetRandomValuesForMaxPooling<float>(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -215,7 +227,7 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) { const std::vector<int> strides{1, 3, 3, 3, 1}; auto y = MaxPool3D(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax<float>(&x_init_value); + SetRandomValuesForMaxPooling<float>(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -248,5 +260,45 @@ TEST_F(NNGradTest, LRN){ RunTest(x, x_shape, y, x_shape); } +TEST_F(NNGradTest, SoftplusGrad) { + TensorShape shape({3, 7}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Softplus(scope_, x); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, SoftsignGrad) { + TensorShape shape({3, 7}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Softsign(scope_, x); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, FractionalAvgPoolGradHelper) { + TensorShape x_shape({1, 3, 7, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Force consistent pooling regions for unit testing. + auto y = FractionalAvgPool( + scope_, x, {1, 1.2, 1.9, 1}, + FractionalAvgPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( + 2)); + TensorShape y_shape({1, 2, 3, 1}); + RunTest(x, x_shape, y.output, y_shape); +} + +TEST_F(NNGradTest, FractionalMaxPoolGradHelper) { + TensorShape x_shape({1, 3, 7, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Force consistent pooling regions for unit testing.
+ auto y = FractionalMaxPool( + scope_, x, {1, 1.2, 1.9, 1}, + FractionalMaxPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( + 2)); + Tensor x_init_value = Tensor(DT_FLOAT, x_shape); + SetRandomValuesForMaxPooling<float>(&x_init_value); + TensorShape y_shape({1, 2, 3, 1}); + RunTest(x, x_init_value, y.output, y_shape); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 6e050cf564..6641d45e83 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -56,9 +56,9 @@ namespace bar { // // Memory stats: // arg bytes total: 104 -// arg bytes aligned: 128 +// arg bytes aligned: 192 // temp bytes total: 126 -// temp bytes aligned: 224 +// temp bytes aligned: 320 class MyClass : public tensorflow::XlaCompiledCpuFunction { public: // Number of input arguments for the compiled computation. diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h index ebfe4806c2..4e194a6aba 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.h +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -71,7 +71,7 @@ struct ProtobufToEmbed { const ::tensorflow::protobuf::MessageLite* message; }; -// Embeds a a sequence of protocol buffers into an object file. +// Embeds a sequence of protocol buffers into an object file. // // `target_triple` is the target triple for the target architecture for the // generated object file. diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h index d085864f00..d1a669ceb1 100644 --- a/tensorflow/compiler/aot/runtime.h +++ b/tensorflow/compiler/aot/runtime.h @@ -25,8 +25,8 @@ namespace tensorflow { namespace tfcompile { namespace runtime { -// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. -static constexpr size_t kAlign = 32; +// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. +static constexpr size_t kAlign = 64; // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1 // values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc index 6d603a02eb..06ec623eb2 100644 --- a/tensorflow/compiler/aot/runtime_test.cc +++ b/tensorflow/compiler/aot/runtime_test.cc @@ -24,7 +24,7 @@ namespace runtime { namespace { TEST(Runtime, AlignmentValue) { - // We've chosen 32 byte alignment for the tfcompile runtime to mimic the + // We've chosen 64 byte alignment for the tfcompile runtime to mimic the // regular tensorflow allocator, which was chosen to play nicely with Eigen. // The tfcompile runtime also has a requirement that comes from the xla // generated code, on the relation: buffer_size >= 16 ?
2 * sizeof(void*) : 8 @@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) { EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0); static constexpr intptr_t sizesB[1] = {3}; - EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64); static constexpr intptr_t sizesC[1] = {32}; - EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64); static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192); + EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320); } void* add_ptr(void* base, uintptr_t delta) { @@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) { EXPECT_NE(base, nullptr); EXPECT_EQ(bufD[0], add_ptr(base, 0)); EXPECT_EQ(bufD[1], nullptr); - EXPECT_EQ(bufD[2], add_ptr(base, 32)); + EXPECT_EQ(bufD[2], add_ptr(base, 64)); EXPECT_EQ(bufD[3], nullptr); - EXPECT_EQ(bufD[4], add_ptr(base, 64)); - EXPECT_EQ(bufD[5], add_ptr(base, 128)); - EXPECT_EQ(bufD[6], add_ptr(base, 160)); + EXPECT_EQ(bufD[4], add_ptr(base, 128)); + EXPECT_EQ(bufD[5], add_ptr(base, 192)); + EXPECT_EQ(bufD[6], add_ptr(base, 256)); for (int i = 0; i < 7; ++i) { const intptr_t size = sizesD[i]; if (size != -1) { diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index d82922a359..1067b38f93 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -178,6 +178,7 @@ cc_library( ":runtime_matmul", ":runtime_matmul_mkl", ":runtime_single_threaded_conv2d", + ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", "@llvm//:execution_engine", "@llvm//:core", @@ -516,7 +517,6 @@ cc_library( deps = [ "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//third_party/eigen3", ], @@ -578,6 +578,22 @@ cc_library( ], ) +cc_library( + name = "runtime_single_threaded_fft", + srcs = [ + "runtime_fft_impl.h", + "runtime_single_threaded_fft.cc", + ], + hdrs = ["runtime_single_threaded_fft.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:framework_lite", + "//third_party/eigen3", + ], +) + cc_library( name = "runtime_single_threaded_matmul", srcs = ["runtime_single_threaded_matmul.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 215405f680..54c52bc08f 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -51,6 +51,8 @@ extern const char* const kEigenConvF16SymbolName = extern const char* const kEigenConvF32SymbolName = "__xla_cpu_runtime_EigenConvF32"; extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft"; +extern const char* const kEigenSingleThreadedFftSymbolName = + "__xla_cpu_runtime_EigenSingleThreadedFft"; extern const char* const kEigenSingleThreadedMatMulF16SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF16"; extern const char* const kEigenSingleThreadedMatMulF32SymbolName = diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index 1dce6efa5c..aa0e967123 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -52,6 +52,7 @@ extern const char* const 
kMKLSingleThreadedMatMulF64SymbolName; extern const char* const kEigenConvF16SymbolName; extern const char* const kEigenConvF32SymbolName; extern const char* const kEigenFftSymbolName; +extern const char* const kEigenSingleThreadedFftSymbolName; extern const char* const kEigenSingleThreadedMatMulF16SymbolName; extern const char* const kEigenSingleThreadedMatMulF32SymbolName; extern const char* const kEigenSingleThreadedMatMulF64SymbolName; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 2c20be155f..758b8c62b4 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1172,7 +1172,13 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); - const char* fn_name = runtime::kEigenFftSymbolName; + + bool multi_threaded_eigen = + hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen(); + const char* fn_name = multi_threaded_eigen + ? runtime::kEigenFftSymbolName + : runtime::kEigenSingleThreadedFftSymbolName; + llvm::Function* fft_func = llvm::cast( module_->getOrInsertFunction(fn_name, fft_type)); fft_func->setCallingConv(llvm::CallingConv::C); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h index 984cb0616e..0bf693edd0 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h @@ -21,8 +21,6 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/numeric_types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/types.h" // 'tensorflow' namespace is used so that int64 and other types don't require @@ -71,11 +69,9 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; - TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = fft_shape[i]; out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; - temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -88,8 +84,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank); // Compute the full FFT using a temporary tensor. - Tensor temp(DataTypeToEnum::v(), temp_shape); - auto full_fft = temp.flat_inner_dims(); + Eigen::Tensor full_fft(in_dims); + const Eigen::DSizes zero_start_indices; full_fft.device(device) = input.template fft(axes); @@ -112,11 +108,9 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; - TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; out_dims[i + 1] = fft_shape[i]; - temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -129,8 +123,7 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, // region we will slice from input given fft_shape. 
We slice input to // fft_shape on its inner-most dimensions, except the last (which we // slice to fft_shape[-1] / 2 + 1). - Tensor temp(DataTypeToEnum::v(), temp_shape); - auto full_fft = temp.flat_inner_dims(); + Eigen::Tensor full_fft(out_dims); // Calculate the starting point and range of the source of // negative frequency part. @@ -179,7 +172,6 @@ template void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, int32 fft_type, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { - CHECK(::xla::FftType_IsValid(fft_type)) << fft_type; switch (fft_type) { case ::xla::FftType::FFT: EigenFftC2C( @@ -204,7 +196,8 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, input_batch, fft_length0, fft_length1, fft_length2); break; default: - LOG(FATAL) << "Unsupported FFT type: " << fft_type; + // Unsupported FFT type + abort(); } } @@ -230,7 +223,8 @@ void EigenFftImpl(const EigenDevice& device, void* out, void* operand, fft_length1, fft_length2); break; default: - LOG(FATAL) << "Unsupported FFT rank " << fft_rank; + // Unsupported FFT rank + abort(); } } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc new file mode 100644 index 0000000000..2613ddb127 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" + +#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h" +#include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/types.h" + +using tensorflow::int32; +using tensorflow::int64; + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft( + const void* run_options_ptr, void* out, void* operand, int32 fft_type, + int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, + int64 fft_length2) { + tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, fft_type, + fft_rank, input_batch, fft_length0, fft_length1, + fft_length2); +} diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h new file mode 100644 index 0000000000..dcd133d012 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ + +#include "tensorflow/core/platform/types.h" + +extern "C" { + +extern void __xla_cpu_runtime_EigenSingleThreadedFft( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, + void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, + tensorflow::int64 input_batch, tensorflow::int64 fft_length0, + tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); + +} // extern "C" + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 8d8c5e4c44..c4c90515ac 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -38,6 +38,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h" +#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h" #include "tensorflow/compiler/xla/types.h" @@ -202,6 +203,7 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32); + REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedFft); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64); diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index d3bc47e61e..2515222cf2 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -204,7 +204,7 @@ class LayoutPattern { // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. constexpr LayoutPattern> EqualTo( - const Layout* layout) const { + const ::xla::Layout* layout) const { return LayoutPattern>( LayoutPatternEqualImpl(impl_, layout), matched_layout_); } diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc index e536c8afbf..77bdcc9de0 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc @@ -30,10 +30,17 @@ limitations under the License. 
namespace xla { +TupleSimplifier::TupleSimplifier(bool exclude_entry_computation) : + exclude_entry_computation_(exclude_entry_computation) {} + StatusOr<bool> TupleSimplifier::Run(HloModule* module) { // Initially add all GTE and Tuple instructions to the worklist. std::queue<HloInstruction*> worklist; for (auto* computation : module->computations()) { + if (exclude_entry_computation_ && + computation == module->entry_computation()) { + continue; + } for (auto* instruction : computation->instructions()) { if (instruction->opcode() == HloOpcode::kTuple || instruction->opcode() == HloOpcode::kGetTupleElement) { diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.h b/tensorflow/compiler/xla/service/tuple_simplifier.h index e5e9b10b5b..7509501883 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.h +++ b/tensorflow/compiler/xla/service/tuple_simplifier.h @@ -27,13 +27,20 @@ namespace xla { // the module. class TupleSimplifier : public HloPassInterface { public: - TupleSimplifier() {} + TupleSimplifier() : TupleSimplifier(/*exclude_entry_computation=*/false) {} + explicit TupleSimplifier(bool exclude_entry_computation); ~TupleSimplifier() override {} tensorflow::StringPiece name() const override { return "tuple-simplifier"; } // Run tuple simplification on the given computation. Returns whether the // computation was changed. StatusOr<bool> Run(HloModule* module) override; + + private: + // When set, this pipeline stage will perform optimization of all computations + // apart from the module's entry computation. This is used by Graphcore's + // backend. + bool exclude_entry_computation_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc index ca9ae91281..d3635eae81 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc @@ -42,6 +42,12 @@ class TupleSimplifierTest : public HloTestBase { TF_ASSERT_OK(changed_status.status()); EXPECT_EQ(change_expected, changed_status.ValueOrDie()); } + void Run(HloModule* module, bool change_expected, bool exclude_entry) { + TupleSimplifier simplifier(exclude_entry); + auto changed_status = simplifier.Run(module); + TF_ASSERT_OK(changed_status.status()); + EXPECT_EQ(change_expected, changed_status.ValueOrDie()); + } const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); const Shape tuple_shape_ = ShapeUtil::MakeTupleShape( @@ -211,5 +217,76 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) { EXPECT_THAT(computation->root_instruction(), tuple); } +TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) { + // Verify that the root computation can be excluded + auto module = CreateNewModule(); + + HloInstruction* p0; + HloInstruction* p1; + HloComputation* c0; + HloComputation* c1; + HloComputation* entry; + + { + HloComputation::Builder builder(TestName() + "_1"); + p0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 1)); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 2)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); + + c0 = module->AddEmbeddedComputation(builder.Build()); + } + { + HloComputation::Builder builder(TestName() + "_2"); + p1 =
builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 1)); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 2)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); + + c1 = module->AddEmbeddedComputation(builder.Build()); + } + { + HloComputation::Builder builder(TestName() + "_Entry"); + HloInstruction* tuple_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* call0 = builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c0)); + HloInstruction* call1 = builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c1)); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, call0, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, call1, 1)); + HloInstruction* tuple0 = + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1})); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 0)); + HloInstruction* gte3 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 1)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte2, gte3})); + + entry = module->AddEntryComputation(builder.Build()); + } + + Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/ true); + + EXPECT_THAT(c0->root_instruction(), p0); + EXPECT_THAT(c1->root_instruction(), p1); + EXPECT_THAT(entry->instruction_count(), 9); +} + } // namespace } // namespace xla diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py index 637e49c082..dbdbad8f4c 100644 --- a/tensorflow/contrib/autograph/__init__.py +++ b/tensorflow/contrib/autograph/__init__.py @@ -23,6 +23,7 @@ from __future__ import print_function # TODO(mdan): Bring only the relevant symbols to the top level. 
from tensorflow.contrib.autograph import utils +from tensorflow.contrib.autograph import operators from tensorflow.contrib.autograph.impl.api import convert from tensorflow.contrib.autograph.impl.api import converted_call from tensorflow.contrib.autograph.impl.api import do_not_convert @@ -43,6 +44,8 @@ _allowed_symbols = [ 'do_not_convert', 'to_code', 'to_graph', + # Overloaded operators + 'operators', # Special functions and directives 'set_element_type', 'set_loop_options', diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake index bda5e26f43..2e0a2fcef4 100644 --- a/tensorflow/contrib/cmake/tf_c.cmake +++ b/tensorflow/contrib/cmake/tf_c.cmake @@ -37,13 +37,15 @@ add_dependencies( tf_core_lib tf_protos_cc) -add_library(tf_c_python_api OBJECT - "${tensorflow_source_dir}/tensorflow/c/python_api.cc" - "${tensorflow_source_dir}/tensorflow/c/python_api.h" -) -add_dependencies( - tf_c_python_api - tf_c - tf_core_lib - tf_core_framework - tf_protos_cc) +if(tensorflow_BUILD_PYTHON_BINDINGS) + add_library(tf_c_python_api OBJECT + "${tensorflow_source_dir}/tensorflow/c/python_api.cc" + "${tensorflow_source_dir}/tensorflow/c/python_api.h" + ) + add_dependencies( + tf_c_python_api + tf_c + tf_core_lib + tf_core_framework + tf_protos_cc) +endif() diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake index f73da0b8ab..6c90cf398c 100644 --- a/tensorflow/contrib/cmake/tf_cc_ops.cmake +++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake @@ -155,7 +155,7 @@ if (WIN32) set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib") endif() else (WIN32) - set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so") + set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}") endif (WIN32) add_custom_target(tf_extension_ops) diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index a0c3ddd28b..9244604489 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -715,7 +715,7 @@ if(WIN32) endif() else() add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so) endif() @@ -832,7 +832,6 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/) - add_custom_command(TARGET tf_python_build_pip_package POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README ${CMAKE_CURRENT_BINARY_DIR}/tf_python/) diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py index cffe069aa3..4f957f1e0b 100644 --- a/tensorflow/contrib/cmake/tools/create_def_file.py +++ b/tensorflow/contrib/cmake/tools/create_def_file.py @@ -44,7 +44,8 @@ UNDNAME = "undname.exe" DUMPBIN = "dumpbin.exe" # Exclude if matched -EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::") 
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::|Internal|" + r"python_op_gen_internal|grappler") # Include if matched before exclude INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" @@ -56,6 +57,10 @@ INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" r"tensorflow::ops::internal::Enter|" r"tensorflow::strings::internal::AppendPieces|" r"tensorflow::strings::internal::CatPieces|" + r"tensorflow::errors::Internal|" + r"tensorflow::Tensor::CopyFromInternal|" + r"tensorflow::kernel_factory::" + r"OpKernelRegistrar::InitInternal|" r"tensorflow::io::internal::JoinPathImpl") # Include if matched after exclude @@ -64,7 +69,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|" r"tensorflow::|" r"functor::|" r"\?nsync_|" - r"perftools::gputools") + r"stream_executor::") # We want to identify data members explicitly in the DEF file, so that no one # can implicitly link against the DLL if they use one of the variables exported diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py index 45760a29ee..795f1993ba 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py @@ -151,16 +151,24 @@ class SinhArcsinhBijectorTest(test.TestCase): self.assertAllClose(y, bijector.forward(x).eval(), rtol=1e-4, atol=0.) self.assertAllClose(x, bijector.inverse(y).eval(), rtol=1e-4, atol=0.) - # Do the numpy calculation in float128 to avoid inf/nan. - y_float128 = np.float128(y) - self.assertAllClose( - np.log(np.cosh( - np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( - y_float128**2 + 1)) - - np.log(tailweight), - bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), - rtol=1e-4, - atol=0.) + # On IBM PPC systems, longdouble (np.float128) is same as double except that it can have more precision. + # Type double being of 8 bytes, can't hold square of max of float64 (which is also 8 bytes) and + # below test fails due to overflow error giving inf. So this check avoids that error by skipping square + # calculation and corresponding assert. + + if np.amax(y) <= np.sqrt(np.finfo(np.float128).max) and \ + np.fabs(np.amin(y)) <= np.sqrt(np.fabs(np.finfo(np.float128).min)): + + # Do the numpy calculation in float128 to avoid inf/nan. + y_float128 = np.float128(y) + self.assertAllClose( + np.log(np.cosh( + np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( + y_float128**2 + 1)) - + np.log(tailweight), + bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), + rtol=1e-4, + atol=0.) 
self.assertAllClose( -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), bijector.forward_log_det_jacobian(x, event_ndims=0).eval(), diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py index d7909dd5a2..adf92c27ea 100644 --- a/tensorflow/contrib/eager/python/datasets.py +++ b/tensorflow/contrib/eager/python/datasets.py @@ -106,7 +106,8 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase): target_device=target, buffer_size=10, container="", - shared_name=_generate_shared_name("function_buffer_resource")) + shared_name=_generate_shared_name( + "contrib_eager_iterator_function_buffer_resource")) self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter( # pylint: disable=line-too-long handle=self._buffer_resource_handle, handle_device=self._device) diff --git a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb index 4fe3a0e3f3..5749f22ac5 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb @@ -68,7 +68,7 @@ "# simply construct the object. Most layers take as a first argument the number\n", "# of output dimensions / channels.\n", "layer = tf.keras.layers.Dense(100)\n", - "# The number of input dimensionss is often unnecessary, as it can be inferred\n", + "# The number of input dimensions is often unnecessary, as it can be inferred\n", "# the first time the layer is used, but it can be provided if you want to \n", "# specify it manually, which is useful in some complex models.\n", "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))" @@ -267,7 +267,7 @@ " * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n", " * `call`, where you do the forward computation\n", "\n", - "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes requires to create the variables will need to be explicitly specified." + "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified." ] }, { diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index 84a413c791..05bcdac2ca 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -346,7 +346,8 @@ def sequence_numeric_column( key, shape=(1,), default_value=0., - dtype=dtypes.float32): + dtype=dtypes.float32, + normalizer_fn=None): """Returns a feature column that represents sequences of numeric data. 
Example: @@ -370,6 +371,12 @@ def sequence_numeric_column( default_value: A single value compatible with `dtype` that is used for padding the sparse data into a dense `Tensor`. dtype: The type of values. + normalizer_fn: If not `None`, a function that can be used to normalize the + value of the tensor after `default_value` is applied for parsing. The + normalizer function takes the input `Tensor` as its argument and returns + the output `Tensor` (e.g. lambda x: (x - 3.0) / 4.2). Please note that + even though the most common use case of this function is normalization, it + can be used for any kind of TensorFlow transformation. Returns: A `_SequenceNumericColumn`. @@ -383,12 +390,16 @@ def sequence_numeric_column( if not (dtype.is_integer or dtype.is_floating): raise ValueError('dtype must be convertible to float. ' 'dtype: {}, key: {}'.format(dtype, key)) + if normalizer_fn is not None and not callable(normalizer_fn): + raise TypeError( + 'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn)) return _SequenceNumericColumn( key, shape=shape, default_value=default_value, - dtype=dtype) + dtype=dtype, + normalizer_fn=normalizer_fn) def _assert_all_equal_and_return(tensors, name=None): @@ -407,7 +418,7 @@ class _SequenceNumericColumn( fc._SequenceDenseColumn, collections.namedtuple( '_SequenceNumericColumn', - ['key', 'shape', 'default_value', 'dtype'])): + ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])): """Represents sequences of numeric data.""" @property @@ -419,7 +430,10 @@ class _SequenceNumericColumn( return {self.key: parsing_ops.VarLenFeature(self.dtype)} def _transform_feature(self, inputs): - return inputs.get(self.key) + input_tensor = inputs.get(self.key) + if self.normalizer_fn is not None: + input_tensor = self.normalizer_fn(input_tensor) + return input_tensor @property def _variable_shape(self): diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py index ee74cf56dc..45d7b74046 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test from tensorflow.python.training import monitored_session @@ -947,6 +948,7 @@ class SequenceNumericColumnTest(test.TestCase): self.assertEqual((1,), a.shape) self.assertEqual(0., a.default_value) self.assertEqual(dtypes.float32, a.dtype) + self.assertIsNone(a.normalizer_fn) def test_shape_saved_as_tuple(self): a = sfc.sequence_numeric_column('aaa', shape=[1, 2]) @@ -965,6 +967,10 @@ class SequenceNumericColumnTest(test.TestCase): ValueError, 'dtype must be convertible to float'): sfc.sequence_numeric_column('aaa', dtype=dtypes.string) + def test_normalizer_fn_must_be_callable(self): + with self.assertRaisesRegexp(TypeError, 'must be a callable'): + sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable') + def test_get_sequence_dense_tensor(self): sparse_input = sparse_tensor.SparseTensorValue( # example 0, values [[0.], [1]] @@ -985,6 +991,41 @@ class SequenceNumericColumnTest(test.TestCase): self.assertAllEqual( expected_dense_tensor,
dense_tensor.eval(session=sess)) + def test_get_sequence_dense_tensor_with_normalizer_fn(self): + + def _increment_two(input_sparse_tensor): + return sparse_ops.sparse_add( + input_sparse_tensor, + sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2)) + ) + + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, values [[0.], [1]] + # example 1, [[10.]] + indices=((0, 0), (0, 1), (1, 0)), + values=(0., 1., 10.), + dense_shape=(2, 2)) + + # Before _increment_two: + # [[0.], [1.]], + # [[10.], [0.]], + # After _increment_two: + # [[2.], [1.]], + # [[10.], [2.]], + expected_dense_tensor = [ + [[2.], [1.]], + [[10.], [2.]], + ] + numeric_column = sfc.sequence_numeric_column( + 'aaa', normalizer_fn=_increment_two) + + dense_tensor, _ = numeric_column._get_sequence_dense_tensor( + _LazyBuilder({'aaa': sparse_input})) + + with monitored_session.MonitoredSession() as sess: + self.assertAllEqual( + expected_dense_tensor, dense_tensor.eval(session=sess)) + def test_get_sequence_dense_tensor_with_shape(self): """Tests get_sequence_dense_tensor with shape !=(1,).""" sparse_input = sparse_tensor.SparseTensorValue( diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py index daba965a98..484ffee3e7 100644 --- a/tensorflow/contrib/ffmpeg/__init__.py +++ b/tensorflow/contrib/ffmpeg/__init__.py @@ -28,7 +28,6 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio -from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.python.util.all_util import remove_undocumented diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py index 020b5c99c6..b1b5126d9e 100644 --- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py +++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py @@ -21,7 +21,6 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py -from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.util import loader from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index 10d1ecc738..dc49383c5c 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -119,14 +119,13 @@ from tensorflow.python.framework.smart_cond import smart_cond from tensorflow.python.framework.smart_cond import smart_constant_value from tensorflow.python.framework.tensor_spec import BoundedTensorSpec from tensorflow.python.framework.tensor_spec import TensorSpec -from tensorflow.python.ops.array_ops import broadcast_to from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d from tensorflow.python.util.all_util import remove_undocumented -_allowed_symbols = ['nest', 'broadcast_to'] +_allowed_symbols = ['nest'] _nest_allowed_symbols = [ 'assert_same_structure', 'is_sequence', diff --git 
a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py index 65cb94b5a4..a955e21b72 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py @@ -301,8 +301,8 @@ class FusedConv2DBiasActivationTest(test.TestCase): conv = tensors[i] value = values[i] ref_value = ref_values[i] - print("expected = ", ref_value) - print("actual = ", value) + tf_logging.info("expected = ", ref_value) + tf_logging.info("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -843,7 +843,8 @@ class FusedConvInt8Tests(test.TestCase): vertical_stride, padding_type) output_width = CalculateConvolvedOutputDim(input_width, filter_width, horizontal_stride, padding_type) - print("output_height=", output_height, ", output_width=", output_width) + tf_logging.info("output_height=", output_height, ", output_width=", + output_width) side_input, _, _ = gen_array_ops.quantize_v2( random_ops.random_uniform( @@ -880,8 +881,8 @@ class FusedConvInt8Tests(test.TestCase): with self.test_session( use_gpu=True, config=NoMemoryOptimizationConfig()) as sess: actual_y, expected_y = sess.run([actual, expected]) - print("actual_y = ", actual_y) - print("expected_y = ", expected_y) + tf_logging.info("actual_y = ", actual_y) + tf_logging.info("expected_y = ", expected_y) self.assertTrue(np.array_equal(actual_y, expected_y)) def testFusedConvInt8(self): diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c index 6a5d982dc8..2e5c84704f 100644 --- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c +++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c @@ -19,7 +19,7 @@ limitations under the License. #include "hexagon_controller.h" -#include +#include #include #include "adspmsgd.h" diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh index 436c3e1d4c..840015a7fa 100755 --- a/tensorflow/contrib/lite/download_dependencies.sh +++ b/tensorflow/contrib/lite/download_dependencies.sh @@ -30,9 +30,7 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once -# the archive has been propagated in mirror.bazel.build. 
-GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip" diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc index 106e3b0270..8b0ace96cc 100644 --- a/tensorflow/contrib/lite/examples/minimal/minimal.cc +++ b/tensorflow/contrib/lite/examples/minimal/minimal.cc @@ -38,7 +38,7 @@ using namespace tflite; int main(int argc, char *argv[]) { if(argc != 2) { - fprintf(stderr, "Usage: %s <tflite model>\n"); + fprintf(stderr, "minimal <tflite model>\n"); return 1; } const char* filename = argv[1]; diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md index bb2e615eac..965273f0f0 100644 --- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md +++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md @@ -128,7 +128,6 @@ TensorFlow operations not listed above are likely unsupported. Notably, the following common ops are not supported at the moment: * [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space) -* [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather) * [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear) * [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh) @@ -306,6 +305,19 @@ Options { } ``` +**GATHER** + +``` +Inputs { + 0: params tensor + 1: indices tensor + 2: axis tensor (optional) +} +Outputs { + 0: a tensor with the same type as the params tensor. +} +``` + **GREATER** ``` diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md index 5efa70987e..26349347fa 100644 --- a/tensorflow/contrib/lite/java/ovic/README.md +++ b/tensorflow/contrib/lite/java/ovic/README.md @@ -2,7 +2,7 @@ This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018) -## Pre-requesits +## Prerequisites Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK. @@ -49,7 +49,7 @@ Once you have a submission that follows the instructions from the [competition s You can call the validator binary below to verify that your model fits the format requirements. This often helps you to catch size mismatches (e.g. output should be [1, 1001] instead of [1,1,1,1001]).
Let's say the submission file is located at `/path/to/my_model.lite`, then call: ```sh -bazel build --cxxopt--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all +bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all bazel-bin/tensorflow/contrib/lite/java/ovic/ovic_validator /path/to/my_model.lite ``` diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index a2f192bbc2..1908f7fa6c 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -1934,7 +1934,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // The quantization of the input, output arrays is as follows: // - The input activations are quantized as uint8 on the interval // [-1, 127/128]. -// The rationale for that is that that is the natural interval for output +// The rationale for that is that it is the natural interval for output // activations (see next point) and these need to be concatenated together. // We could accommodate different ranges by re-scaling, but we empirically // found that setting the input activations range to be [-1, 127/128] in the @@ -1999,7 +1999,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // However, for a fixed-point implementation in 16-bit integers, using 5 // integer bits to represent the [-16, 16] range would leave only 11 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive -// representable values. Notice that that is higher than the +// representable values. Notice that this is higher than the // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic. // Using [-8, 8] thus seems like the better compromise overall, enjoying // an increment of 2.4e-4 between representable values and a worst-case diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py index 9400e757b9..fd90823425 100644 --- a/tensorflow/contrib/lite/python/interpreter.py +++ b/tensorflow/contrib/lite/python/interpreter.py @@ -55,7 +55,7 @@ class Interpreter(object): elif model_content and not model_path: self._interpreter = ( _interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer( - model_content, len(model_content))) + model_content)) if not self._interpreter: raise ValueError( 'Failed to create model from {} bytes'.format(len(model_content))) diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc index f705551fcb..b283551c45 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc @@ -397,9 +397,14 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile( } InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer( - const char* data, size_t len) { + PyObject* data) { + char * buf = nullptr; + Py_ssize_t length; + if (PY_TO_CPPSTRING(data, &buf, &length) == -1) { + return nullptr; + } std::unique_ptr<tflite::FlatBufferModel> model = - tflite::FlatBufferModel::BuildFromBuffer(data, len); + tflite::FlatBufferModel::BuildFromBuffer(buf, length); return model ?
new InterpreterWrapper(std::move(model)) : nullptr; } diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h index b0ed7c4559..cbeb53bee7 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h @@ -40,8 +40,7 @@ class InterpreterWrapper { static InterpreterWrapper* CreateWrapperCPPFromFile(const char* model_path); // SWIG caller takes ownership of pointer. - static InterpreterWrapper* CreateWrapperCPPFromBuffer(const char* data, - size_t len); + static InterpreterWrapper* CreateWrapperCPPFromBuffer(PyObject* data); ~InterpreterWrapper(); bool AllocateTensors(); diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py index 0913cd2c5c..88dda7290b 100644 --- a/tensorflow/contrib/lite/python/lite.py +++ b/tensorflow/contrib/lite/python/lite.py @@ -34,6 +34,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from six import PY3 + from google.protobuf import text_format as _text_format from google.protobuf.message import DecodeError from tensorflow.contrib.lite.python import lite_constants as constants @@ -54,6 +56,7 @@ from tensorflow.python.framework.importer import import_graph_def from tensorflow.python.ops.variables import global_variables_initializer from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import tag_constants +# from tensorflow.python.util.all_util import remove_undocumented class TocoConverter(object): @@ -203,6 +206,12 @@ class TocoConverter(object): except (_text_format.ParseError, DecodeError): try: print("Ignore 'tcmalloc: large alloc' warnings.") + + if not isinstance(file_content, str): + if PY3: + file_content = file_content.decode('utf-8') + else: + file_content = file_content.encode('utf-8') _text_format.Merge(file_content, graph_def) except (_text_format.ParseError, DecodeError): raise ValueError( @@ -382,3 +391,5 @@ def _freeze_graph(sess, output_tensors): output_arrays) else: return sess.graph_def + +# remove_undocumented(__name__) diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index e33b430937..5c7fa09891 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -178,7 +178,7 @@ ArrayDataType ConvertDataType(tensorflow::DataType dtype) { else if (dtype == DT_STRING) return ArrayDataType::kString; else - LOG(INFO) << "Unsupported data type in placehoder op: " << dtype; + LOG(INFO) << "Unsupported data type in placeholder op: " << dtype; return ArrayDataType::kNone; } diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc index 1b21c8bc60..de76fd4032 100644 --- a/tensorflow/contrib/lite/toco/toco_port.cc +++ b/tensorflow/contrib/lite/toco/toco_port.cc @@ -20,6 +20,12 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" +#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__) +namespace std { +double round(double x) { return ::round(x); } +} // namespace std +#endif + namespace toco { namespace port { void CopyToBuffer(const string& src, char* dest) { diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h index 5c019cb2bf..17f82b9dd7 100644 --- a/tensorflow/contrib/lite/toco/toco_port.h +++ b/tensorflow/contrib/lite/toco/toco_port.h @@ -34,6 +34,24 @@ limitations under the License. #define TFLITE_PROTO_NS google::protobuf #endif +#ifdef __ANDROID__ +#include +namespace std { + +template +std::string to_string(T value) +{ + std::ostringstream os ; + os << value ; + return os.str() ; +} + +#ifdef __ARM_ARCH_7A__ +double round(double x); +#endif +} +#endif + namespace toco { namespace port { diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh index e8c6edd7ba..a28fc3a87f 100755 --- a/tensorflow/contrib/makefile/compile_nsync.sh +++ b/tensorflow/contrib/makefile/compile_nsync.sh @@ -270,7 +270,7 @@ for arch in $archs; do PLATFORM_LDFLAGS=-pthread MKDEP=${CC} -M -std=c++11 PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \ - ../../platform/c++11/src/per_thread_waiter.cc \ + ../../platform/posix/src/per_thread_waiter.c \ ../../platform/c++11/src/yield.cc \ ../../platform/c++11/src/time_rep_timespec.cc \ ../../platform/c++11/src/nsync_panic.cc diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index eff9081e35..48953e2e38 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -27,9 +27,7 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once -# the archive has been propagated in mirror.bazel.build. -GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 2ed99d50a4..a6be2084aa 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -2503,7 +2503,7 @@ def _compute_recall_at_precision(tp, fp, fn, precision, name): name: An optional variable_scope name. Returns: - The recall at a the given `precision`. + The recall at a given `precision`. 
""" precisions = math_ops.div(tp, tp + fp + _EPSILON) tf_index = math_ops.argmin( diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.h b/tensorflow/contrib/mpi_collectives/kernels/ring.h index 1d56d588bc..c001615d3f 100644 --- a/tensorflow/contrib/mpi_collectives/kernels/ring.h +++ b/tensorflow/contrib/mpi_collectives/kernels/ring.h @@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI(); * has the fully accumulated Segment 1; and so on. The scatter-reduce is * complete. * - * Next, the allgather distributes these fully accumululated chunks across all + * Next, the allgather distributes these fully accumulated chunks across all * nodes. Communication proceeds in the same ring, once again in N-1 steps. At * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i). * For example, at the first iteration, the following transfers will occur: diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py index 21bf3f5313..915e6504e1 100644 --- a/tensorflow/contrib/opt/python/training/adamax_test.py +++ b/tensorflow/contrib/opt/python/training/adamax_test.py @@ -224,8 +224,10 @@ class AdaMaxOptimizerTest(test.TestCase): var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0), + rtol=1e-2) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1), + rtol=1e-2) if use_resource: self.assertEqual("var0_%d/AdaMax:0" % (i,), opt.get_slot(var=var0, name="m").name) diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py index a7c97a1da2..b6b10e500b 100644 --- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py +++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py @@ -62,7 +62,7 @@ class ModelAverageCustomGetter(object): """ def __init__(self, worker_device): - """Create a new `ElasticAverageCustomGetter`. + """Create a new `ModelAverageCustomGetter`. Args: worker_device: String. Name of the `worker` job. 
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD index 6ca7fe8b6e..aad1ca04c5 100644 --- a/tensorflow/contrib/periodic_resample/BUILD +++ b/tensorflow/contrib/periodic_resample/BUILD @@ -6,12 +6,13 @@ exports_files(["LICENSE"]) load( "//tensorflow:tensorflow.bzl", - "py_test", + "tf_cc_test", "tf_gen_op_libs", "tf_custom_op_library", "tf_custom_op_py_library", "tf_gen_op_wrapper_py", ) +load("//tensorflow:tensorflow.bzl", "py_test") cc_library( name = "all_ops", @@ -84,6 +85,23 @@ py_test( ":init_py", "//tensorflow/contrib/util:util_py", "//tensorflow/python:framework_test_lib", + "//tensorflow/python:gradient_checker", + ], +) + +tf_cc_test( + name = "periodic_resample_op_cc_test", + size = "small", + srcs = [ + "ops/array_ops_test.cc", + ], + deps = [ + ":all_ops", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_proto", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", ], ) diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc index e18923c8aa..514689cf45 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc @@ -22,4 +22,9 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU), PeriodicResampleOp); + +REGISTER_KERNEL_BUILDER(Name("PeriodicResampleOpGrad") + .Device(DEVICE_CPU), + PeriodicResampleOpGrad); + } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h index 3ab588c458..42fba81a5c 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h @@ -25,92 +25,202 @@ #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/work_sharder.h" namespace { -template <class IndexVecT, class IndexT> -IndexT compute_input_index( - IndexVecT* target_dimensions, const IndexT& output_index, - const IndexVecT& original_dimensions, const int& adjustable_dimension, - const std::vector<tensorflow::int64>& dimension_ceiling, - const std::vector<tensorflow::int64>& cumulative_dimensions, IndexT* result, - std::vector<tensorflow::int64>* output_indices, const int& rank) { - *result = 0; - output_indices->clear(); +// Computes input tensor index for given output index during forward +// propagation through periodic_resample operation. +class InputIndexer { + public: + InputIndexer(const std::vector<tensorflow::int64>& output_dimensions, + const tensorflow::TensorShape& input_shape, + int adjustable_dimension) + : output_dimensions_(output_dimensions), + adjustable_dimension_(adjustable_dimension), + rank_(input_shape.dims()), + linear_output_index_(0), + linear_input_index_(0), + adjustable_dimension_carriage_sum_(0) { + auto input_dimensions = TensorShapeToVector(input_shape); + // factors by which input_dimensions increases/decreases w.r.t.
+ // output_dimensions + dimension_ceiling_ = + ComputeDimensionCeiling(output_dimensions, input_dimensions); + cumulative_dimensions_ = ComputeCumulativeDimensions(); + + output_indices_.resize(output_dimensions_.size()); + input_indices_.resize(output_dimensions_.size()); + + // Compute index_factors + index_factors_.resize(rank_); + tensorflow::int64 last_index_factor = 1; + for (auto r = rank_ - 1; r >= 0; --r) { + index_factors_[r] = last_index_factor; + last_index_factor *= input_dimensions[r]; + } + } + + tensorflow::int64 linear_input_index() const { return linear_input_index_; } + + void MoveToOutputIndex(tensorflow::int64 output_index); + void IncrementOutputIndex(); + + private: + void RecomputeInputAdjustableDimensionIndex() { + tensorflow::int64 index = adjustable_dimension_carriage_sum_; + index *= output_dimensions_[adjustable_dimension_]; + index += output_indices_[adjustable_dimension_]; + input_indices_[adjustable_dimension_] = index; + } + + std::vector<tensorflow::int64> TensorShapeToVector( + const tensorflow::TensorShape& tensor_shape); + + std::vector<tensorflow::int64> ComputeDimensionCeiling( + const std::vector<tensorflow::int64>& output_dimensions, + const std::vector<tensorflow::int64>& input_dimensions); + + std::vector<tensorflow::int64> ComputeCumulativeDimensions(); + + const std::vector<tensorflow::int64> output_dimensions_; + std::vector<tensorflow::int64> dimension_ceiling_; + std::vector<tensorflow::int64> index_factors_; + std::vector<tensorflow::int64> cumulative_dimensions_; + std::vector<tensorflow::int64> output_indices_; + std::vector<tensorflow::int64> input_indices_; + + const int adjustable_dimension_; + const int rank_; + tensorflow::int64 linear_output_index_; + tensorflow::int64 linear_input_index_; + tensorflow::int64 adjustable_dimension_carriage_sum_; +}; + +void InputIndexer::MoveToOutputIndex(tensorflow::int64 output_index) { + linear_output_index_ = output_index; + linear_input_index_ = 0; // un-rasterize the output index auto last_reduced_i = output_index; - for (auto r = rank - 1; r >= 0; --r) { - (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r]; + for (auto r = rank_ - 1; r >= 0; --r) { + output_indices_[r] = last_reduced_i % output_dimensions_[r]; last_reduced_i = - (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r]; + (last_reduced_i - output_indices_[r]) / output_dimensions_[r]; } + tensorflow::int64 carriage_sum = 0; + for (int qi = 0; qi < rank_; ++qi) { + if (qi == adjustable_dimension_) continue; + carriage_sum += cumulative_dimensions_[qi] * + (output_indices_[qi] % dimension_ceiling_[qi]); + } + adjustable_dimension_carriage_sum_ = carriage_sum; + // rasterize the input index - IndexT last_index_factor = 1; - for (auto r = rank - 1; r >= 0; --r) { - IndexT index = 0; - if (r != adjustable_dimension) - index = (*output_indices)[r] / dimension_ceiling[r]; - else { - for (int qi = 0; qi < rank; ++qi) { - if (qi == adjustable_dimension) continue; - index += cumulative_dimensions[qi] * - ((*output_indices)[qi] % dimension_ceiling[qi]); - } - index *= (*target_dimensions)[adjustable_dimension]; - index += (*output_indices)[r]; + for (auto r = rank_ - 1; r >= 0; --r) { + if (r != adjustable_dimension_) { + input_indices_[r] = output_indices_[r] / dimension_ceiling_[r]; + } else { + RecomputeInputAdjustableDimensionIndex(); } - *result += last_index_factor * index; - last_index_factor *= original_dimensions[r]; } + for (auto r = rank_ - 1; r >= 0; --r) { + linear_input_index_ += index_factors_[r] * input_indices_[r]; + } +} + +void InputIndexer::IncrementOutputIndex() { + linear_output_index_++; + for (auto r = rank_ - 1; r >= 0; --r) { + auto old_carriage_sum_increment = + cumulative_dimensions_[r] *
(output_indices_[r] % dimension_ceiling_[r]); + output_indices_[r] = (output_indices_[r] + 1) % output_dimensions_[r]; + if (r != adjustable_dimension_) { + auto new_input_index = output_indices_[r] / dimension_ceiling_[r]; + linear_input_index_ += + (new_input_index - input_indices_[r]) * index_factors_[r]; + + input_indices_[r] = new_input_index; + + auto new_carriage_sum_increment = + cumulative_dimensions_[r] * + (output_indices_[r] % dimension_ceiling_[r]); - return *result; + adjustable_dimension_carriage_sum_ = adjustable_dimension_carriage_sum_ - + old_carriage_sum_increment + + new_carriage_sum_increment; + } + + if (output_indices_[r] != 0) { + // No more carries to higher indices. + break; + } + } + auto old_adjustable_dimension_input_index = + input_indices_[adjustable_dimension_]; + RecomputeInputAdjustableDimensionIndex(); + linear_input_index_ += (input_indices_[adjustable_dimension_] - + old_adjustable_dimension_input_index) * + index_factors_[adjustable_dimension_]; } -template <class InputDataT, class IndexVecT> // both types are needed here b/c IndexVecT and - // InputDataT are not related - void - fill_periodic_tensor( - tensorflow::OpKernelContext* context, - const IndexVecT& desired_shape, - const tensorflow::Tensor& input_tensor) { - // input is a strided array (last index is fastest, C-ordered) - auto input = input_tensor.flat<InputDataT>(); - const int rank = input_tensor.dims(); - // original and target dimensions - std::vector<tensorflow::int64> original_dimensions(rank), - target_dimensions(rank); - tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1); - // factors by which original_dimensions increases/decreases w.r.t. - // target_dimensions - std::vector<tensorflow::int64> dimension_ceiling(rank), - cumulative_dimensions(rank); - // index of adjustable dimension - int adjustable_dimension; - tensorflow::TensorShape output_shape; +std::vector<tensorflow::int64> InputIndexer::TensorShapeToVector( + const tensorflow::TensorShape& tensor_shape) { + std::vector<tensorflow::int64> result(tensor_shape.dims()); + int count = 0; + for (const auto dim_info : tensor_shape) { + result[count] = dim_info.size; + ++count; + } + return result; +} - // requires that the rank of the input tensor and length of the desired shape - // are equal - OP_REQUIRES(context, rank == desired_shape.size(), - tensorflow::errors::InvalidArgument( - "periodic_resample expects the rank of the input tensor, ", - rank, ", to be the same as the length of the desired shape, ", - desired_shape.size(), ".")); +std::vector<tensorflow::int64> InputIndexer::ComputeDimensionCeiling( + const std::vector<tensorflow::int64>& output_dimensions, + const std::vector<tensorflow::int64>& input_dimensions) { + std::vector<tensorflow::int64> dimension_ceiling(input_dimensions.size()); + for (size_t i = 0; i < input_dimensions.size(); ++i) { + dimension_ceiling[i] = (output_dimensions[i] + input_dimensions[i] - 1) / + input_dimensions[i]; + } + return dimension_ceiling; +} - bool found = false; - const auto& input_tensor_shape = input_tensor.shape(); +std::vector<tensorflow::int64> InputIndexer::ComputeCumulativeDimensions() { + std::vector<tensorflow::int64> cumulative_dimensions(rank_); + int count = 0; + for (int i = 0; i < rank_; ++i) { + if (count == 0) { + cumulative_dimensions[count] = 1; + } else { + cumulative_dimensions[count] = + cumulative_dimensions[count - 1] * dimension_ceiling_[count - 1]; + } + ++count; + } + return cumulative_dimensions; +} +template <class IndexVecT> +void process_desired_shape(tensorflow::OpKernelContext* context, + const tensorflow::TensorShape& input_tensor_shape, + const IndexVecT& desired_shape, + int* adjustable_dimension, + std::vector<tensorflow::int64>* target_dimensions, + tensorflow::int64* output_size) {
tensorflow::int64 new_sliced_size = 1; + bool found = false; + const int rank = input_tensor_shape.dims(); for (int i = 0; i < rank; ++i) { - // if (desired_shape(i) < 1) { if (desired_shape[i] < 1) { // only one index can be adjustable OP_REQUIRES(context, !found, tensorflow::errors::InvalidArgument( "periodic_resample expects only " "one index to be marked as adjustable.")); - adjustable_dimension = i; + *adjustable_dimension = i; found = true; } else { OP_REQUIRES( @@ -122,9 +232,8 @@ template +template <class InputDataT, Mode mode> +void +do_periodic_resample_op(tensorflow::OpKernelContext* context, + const tensorflow::TensorShape& original_shape, + const tensorflow::PartialTensorShape& desired_shape, + const tensorflow::Tensor& source_tensor) { + const int rank = source_tensor.dims(); + + // requires that the rank of the input tensor and length of the desired shape + // are equal + OP_REQUIRES(context, rank == desired_shape.dims(), + tensorflow::errors::InvalidArgument( + "periodic_resample expects the rank of the input tensor, ", + rank, ", to be the same as the length of the desired shape, ", + desired_shape.dims(), ".")); + + std::vector<tensorflow::int64> target_dimensions(rank); + tensorflow::int64 new_size = 0; + // index of adjustable dimension + int adjustable_dimension = 0; + process_desired_shape(context, original_shape, desired_shape.dim_sizes(), + &adjustable_dimension, &target_dimensions, &new_size); // ensure that the new dimension is greater than zero OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0, @@ -160,11 +293,14 @@ template allocate_output(0, output_shape, &output_tensor)); auto output = output_tensor->flat<InputDataT>(); - // memory is allocated for these variables outside the inner loop for - // efficiency (although, I could create a separate class scope for - // this purpose instead) - tensorflow::int64 result = 0; - std::vector<tensorflow::int64> output_indices(target_dimensions.size()); + // input is a strided array (last index is fastest, C-ordered) + auto input = source_tensor.flat<InputDataT>(); // Fill output tensor with periodically resampled input tensor values - for (tensorflow::int64 output_index = 0; output_index < new_size; - ++output_index) { - output(output_index) = input(compute_input_index( - &target_dimensions, output_index, original_dimensions, - adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result, - &output_indices, rank)); - } + InputIndexer input_indexer(target_dimensions, original_shape, + adjustable_dimension); + + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + auto fill_output_tensor = [&input_indexer, &output, &input]( + tensorflow::int64 start, tensorflow::int64 limit) { + InputIndexer local_indexer(input_indexer); + local_indexer.MoveToOutputIndex(start); + for (tensorflow::int64 output_index = start; output_index < limit; + ++output_index) { + if (mode == Mode::kForward) { + output(output_index) = input(local_indexer.linear_input_index()); + } else { + output(local_indexer.linear_input_index()) = input(output_index); + } + local_indexer.IncrementOutputIndex(); + } + }; + ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers, + new_size, costPerFillIndex, fill_output_tensor); } +#define DATA_TYPE_SWITCH(data_type, context, CASE) \ + switch (data_type) { \ + CASE(float) \ + CASE(double) \ + CASE(tensorflow::int32) \ + CASE(tensorflow::int64) \ + default: \ + context->CtxFailure(__FILE__, __LINE__, \ + tensorflow::errors::InvalidArgument( \ + "Unsupported tensor elements type")); \ + break; \ + } + void create_output_tensor( tensorflow::OpKernelContext*
context, const tensorflow::Tensor& input_tensor, const tensorflow::DataType& input_tensor_type, - const tensorflow::PartialTensorShape& desired_shape_tensor) { - auto desired_shape = desired_shape_tensor.dim_sizes(); - - // obligatory type switch - switch (input_tensor_type) { - case tensorflow::DataTypeToEnum<float>::value: - fill_periodic_tensor<float>(context, desired_shape, input_tensor); + const tensorflow::PartialTensorShape& desired_shape) { +#define CASE(type) \ + case tensorflow::DataTypeToEnum<type>::value: \ + do_periodic_resample_op<type, Mode::kForward>( \ + context, input_tensor.shape(), desired_shape, input_tensor); \ break; - case tensorflow::DataTypeToEnum<double>::value: - fill_periodic_tensor<double>(context, desired_shape, input_tensor); - break; - case tensorflow::DataTypeToEnum<tensorflow::int32>::value: - fill_periodic_tensor<tensorflow::int32>(context, desired_shape, - input_tensor); - break; - case tensorflow::DataTypeToEnum<tensorflow::int64>::value: - fill_periodic_tensor<tensorflow::int64>(context, desired_shape, - input_tensor); + + DATA_TYPE_SWITCH(input_tensor_type, context, CASE); +#undef CASE +} + +void create_grad_tensor(tensorflow::OpKernelContext* context, + const tensorflow::Tensor& grad_tensor, + const tensorflow::DataType& grad_tensor_type, + const tensorflow::TensorShape& original_shape, + const tensorflow::PartialTensorShape& desired_shape) { +#define CASE(type) \ + case tensorflow::DataTypeToEnum<type>::value: \ + do_periodic_resample_op<type, Mode::kGradient>( \ + context, original_shape, desired_shape, grad_tensor); \ break; - default:; - } + + DATA_TYPE_SWITCH(grad_tensor_type, context, CASE); +#undef CASE } } // namespace @@ -238,4 +400,25 @@ class PeriodicResampleOp : public tensorflow::OpKernel { tensorflow::PartialTensorShape desired_shape; }; +class PeriodicResampleOpGrad : public tensorflow::OpKernel { + public: + explicit PeriodicResampleOpGrad(tensorflow::OpKernelConstruction* context) + : tensorflow::OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("original_shape", &original_shape)); + OP_REQUIRES_OK(context, context->GetAttr("desired_shape", &desired_shape)); + } + + void Compute(tensorflow::OpKernelContext* context) override { + const tensorflow::Tensor& grad_tensor = context->input(0); + const tensorflow::DataType grad_tensor_type = context->input_dtype(0); + create_grad_tensor(context, grad_tensor, grad_tensor_type, original_shape, + desired_shape); + } + + private: + tensorflow::TensorShape original_shape; + tensorflow::PartialTensorShape desired_shape; +}; + #endif // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_ diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc index 82bd796956..fd38cd09b4 100644 --- a/tensorflow/contrib/periodic_resample/ops/array_ops.cc +++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc @@ -26,7 +26,42 @@ REGISTER_OP("PeriodicResample") .Input("values: T") .Attr("shape: shape") .Output("output: T") - .SetShapeFn(shape_inference::ExplicitShape) + .SetShapeFn([](shape_inference::InferenceContext* c) { + tensorflow::PartialTensorShape desired_shape; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &desired_shape)); + shape_inference::ShapeHandle input_tensor_shape = c->input(0); + shape_inference::DimensionHandle num_input_elements = + c->NumElements(input_tensor_shape); + shape_inference::ShapeHandle result_shape_handle; + if (!shape_inference::InferenceContext::ValueKnown(num_input_elements)) { + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape( + desired_shape, &result_shape_handle)); + } else { + const int rank = c->Rank(input_tensor_shape); + std::vector<tensorflow::int64> target_dimensions(rank);
+ tensorflow::int64 new_sliced_size = 1; + int adjustable_dimension = 0; + for (int i = 0; i < rank; ++i) { + if (desired_shape.dim_size(i) < 1) { + adjustable_dimension = i; + } else { + target_dimensions[i] = desired_shape.dim_size(i); + new_sliced_size *= target_dimensions[i]; + } + } + target_dimensions[adjustable_dimension] = + shape_inference::InferenceContext::Value( + num_input_elements) / new_sliced_size; + tensorflow::TensorShape result_shape; + for (int i = 0; i < rank; ++i) { + result_shape.AddDim(target_dimensions[i]); + } + TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape( + result_shape, &result_shape_handle)); + } + c->set_output(0, result_shape_handle); + return Status::OK(); + }) .Doc(R"doc( Periodically resample elements of a tensor to conform to `shape`. @@ -101,4 +136,20 @@ output: Periodically resampled tensor that has dimensions specified as in )doc"); + +REGISTER_OP("PeriodicResampleOpGrad") + .Attr("T: numbertype") + .Input("grad: T") + .Attr("original_shape: shape") + .Attr("desired_shape: shape") + .Output("grad_values: T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + tensorflow::TensorShape original_shape; + TF_RETURN_IF_ERROR(c->GetAttr("original_shape", &original_shape)); + shape_inference::ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(original_shape, &s)); + c->set_output(0, s); + return Status::OK(); +}); + } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc new file mode 100644 index 0000000000..43b7c1799f --- /dev/null +++ b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc @@ -0,0 +1,41 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/shape_inference_testutil.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +TEST(ArrayOpsTest, PeriodicResample_ShapeFn) { + ShapeInferenceTestOp op("PeriodicResample"); + // Case 1: output shape can be fully inferred. + PartialTensorShape shape({4, 4, -1}); + TensorShapeProto shape_proto; + shape.AsProto(&shape_proto); + + TF_ASSERT_OK(NodeDefBuilder("test", "PeriodicResample") + .Input({"values", 0, DT_INT32}) + .Attr("shape", shape_proto) + .Finalize(&op.node_def)); + INFER_OK(op, "[2,2,4]", "[4,4,1]"); + // Case 2: output shape cannot be inferred - report desired shape.
+ INFER_OK(op, "[2,2,?]", "[4,4,?]"); +} + +} // end namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py index a25de55e18..31a6fe1d94 100644 --- a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py +++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py @@ -21,8 +21,11 @@ from __future__ import print_function import numpy from tensorflow.contrib.periodic_resample import periodic_resample +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -93,7 +96,6 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): def testPeriodicResampleErrors(self): input_tensor = numpy.zeros(shape=[1, 2, 2, 4]) with self.test_session(): - variables.global_variables_initializer().run() with self.assertRaisesWithPredicateMatch( errors_impl.InvalidArgumentError, 'Dimension 3 input tensor has size 4, desired shape has size 1'): @@ -103,6 +105,29 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): '4, to be the same as the length of the desired shape, 3'): periodic_resample(input_tensor, [None, 4, 4]).eval() + def testPeriodicResampleGradient(self): + desired_shape = numpy.array([4, 4, None]) + result_shape = (4, 4, 1) + input_shape = (2, 2, 4) + with self.test_session() as sess: + x = array_ops.placeholder(dtypes.float32, shape=input_shape) + output = periodic_resample(x, desired_shape) + error = gradient_checker.compute_gradient_error( + x, input_shape, output, result_shape) + self.assertLess(error, 1e-4) + + def testPeriodicResampleShapeInference(self): + with self.test_session() as sess: + # Case 1: output shape can be fully inferreed. + x = array_ops.placeholder(dtypes.float32, shape=(2, 2, 4)) + output = periodic_resample(x, [4, 4, None]) + self.assertEqual(output.shape, [4, 4, 1]) + # Case 2: output shape can not be inferred - report desired shape. 
+ x = array_ops.placeholder(dtypes.float32, shape=(2, 2, None)) + output = periodic_resample(x, [4, 4, None]) + self.assertTrue(output.shape.is_compatible_with([4, 4, None])) + self.assertEqual(output.shape[2].value, None) + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py index 348623d8f8..470e300ccb 100644 --- a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py +++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py @@ -21,11 +21,17 @@ from __future__ import print_function # pylint: disable=unused-import from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op -from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample +from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample, periodic_resample_op_grad from tensorflow.contrib.util import loader +from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader # pylint: enable=unused-import _periodic_resample_op = loader.load_op_library( resource_loader.get_path_to_datafile('_periodic_resample_op.so')) + +@ops.RegisterGradient("PeriodicResample") +def _periodic_resample_grad_cc(op, grad): + return periodic_resample_op_grad( + grad, op.inputs[0].shape, op.get_attr('shape')) diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor.py b/tensorflow/contrib/predictor/contrib_estimator_predictor.py index b7a98c68e2..af3b2ad1b5 100644 --- a/tensorflow/contrib/predictor/contrib_estimator_predictor.py +++ b/tensorflow/contrib/predictor/contrib_estimator_predictor.py @@ -34,7 +34,8 @@ class ContribEstimatorPredictor(predictor.Predictor): prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None): + graph=None, + config=None): """Initialize a `ContribEstimatorPredictor`. Args: @@ -48,6 +49,7 @@ class ContribEstimatorPredictor(predictor.Predictor): multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. """ self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -58,6 +60,7 @@ class ContribEstimatorPredictor(predictor.Predictor): checkpoint_path = saver.latest_checkpoint(estimator.model_dir) self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( + config=config, checkpoint_filename_with_path=checkpoint_path)) input_alternative_key = ( diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py index d78d94c269..a725072e72 100644 --- a/tensorflow/contrib/predictor/core_estimator_predictor.py +++ b/tensorflow/contrib/predictor/core_estimator_predictor.py @@ -51,7 +51,8 @@ class CoreEstimatorPredictor(predictor.Predictor): estimator, serving_input_receiver_fn, output_key=None, - graph=None): + graph=None, + config=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -62,6 +63,7 @@ class CoreEstimatorPredictor(predictor.Predictor): `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. 
""" self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -71,6 +73,7 @@ class CoreEstimatorPredictor(predictor.Predictor): checkpoint_dir = estimator.model_dir self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( + config=config, checkpoint_dir=checkpoint_dir)) feed_tensor_info = signature_def.inputs diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py index 6e77e934fe..f275bc15ad 100644 --- a/tensorflow/contrib/predictor/predictor_factories.py +++ b/tensorflow/contrib/predictor/predictor_factories.py @@ -30,7 +30,8 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `tf.contrib.learn.Estimator`. Args: @@ -44,6 +45,7 @@ def from_contrib_estimator(estimator, multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -62,13 +64,15 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=input_alternative_key, output_alternative_key=output_alternative_key, - graph=graph) + graph=graph, + config=config) def from_estimator(estimator, serving_input_receiver_fn, output_key=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `tf.python.estimator.Estimator`. Args: @@ -79,6 +83,7 @@ def from_estimator(estimator, `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -93,14 +98,19 @@ def from_estimator(estimator, 'tf.contrib.learn.Estimator. You likely want to call ' 'from_contrib_estimator.') return core_estimator_predictor.CoreEstimatorPredictor( - estimator, serving_input_receiver_fn, output_key=output_key, graph=graph) + estimator, + serving_input_receiver_fn, + output_key=output_key, + graph=graph, + config=config) def from_saved_model(export_dir, signature_def_key=None, signature_def=None, tags=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `SavedModel` on disk. Args: @@ -115,6 +125,7 @@ def from_saved_model(export_dir, `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. 
@@ -128,4 +139,5 @@ def from_saved_model(export_dir, signature_def_key=signature_def_key, signature_def=signature_def, tags=tags, - graph=graph) + graph=graph, + config=config) diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py index 578d9424b2..a2ef1dc3af 100644 --- a/tensorflow/contrib/predictor/predictor_factories_test.py +++ b/tensorflow/contrib/predictor/predictor_factories_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.contrib.predictor import predictor_factories from tensorflow.contrib.predictor import testing_common +from tensorflow.core.protobuf import config_pb2 from tensorflow.python.platform import test MODEL_DIR_NAME = 'contrib/predictor/test_export_dir' @@ -41,6 +42,11 @@ class PredictorFactoriesTest(test.TestCase): """Test loading from_saved_model with tags.""" predictor_factories.from_saved_model(self._export_dir, tags='serve') + def testFromSavedModelWithSessionConfig(self): + """Test loading from_saved_model with session config.""" + predictor_factories.from_saved_model( + self._export_dir, config=config_pb2.ConfigProto()) + def testFromSavedModelWithBadTags(self): """Test that loading fails for bad tags.""" bad_tags_regex = ('.*? could not be found in SavedModel') @@ -53,6 +59,13 @@ class PredictorFactoriesTest(test.TestCase): predictor_factories.from_contrib_estimator( estimator, input_fn, output_alternative_key='sum') + def testFromContribEstimatorWithSessionConfig(self): + estimator = testing_common.get_arithmetic_estimator(core=False) + input_fn = testing_common.get_arithmetic_input_fn(core=False) + predictor_factories.from_contrib_estimator( + estimator, input_fn, output_alternative_key='sum', + config=config_pb2.ConfigProto()) + def testFromContribEstimatorWithCoreEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=True) input_fn = testing_common.get_arithmetic_input_fn(core=True) @@ -64,6 +77,12 @@ class PredictorFactoriesTest(test.TestCase): input_fn = testing_common.get_arithmetic_input_fn(core=True) predictor_factories.from_estimator(estimator, input_fn) + def testFromCoreEstimatorWithSessionConfig(self): + estimator = testing_common.get_arithmetic_estimator(core=True) + input_fn = testing_common.get_arithmetic_input_fn(core=True) + predictor_factories.from_estimator( + estimator, input_fn, config=config_pb2.ConfigProto()) + def testFromCoreEstimatorWithContribEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=False) input_fn = testing_common.get_arithmetic_input_fn(core=False) diff --git a/tensorflow/contrib/predictor/saved_model_predictor.py b/tensorflow/contrib/predictor/saved_model_predictor.py index 0dbca0f813..95da6d04ed 100644 --- a/tensorflow/contrib/predictor/saved_model_predictor.py +++ b/tensorflow/contrib/predictor/saved_model_predictor.py @@ -121,7 +121,8 @@ class SavedModelPredictor(predictor.Predictor): input_names=None, output_names=None, tags=None, - graph=None): + graph=None, + config=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -142,6 +143,7 @@ class SavedModelPredictor(predictor.Predictor): the correct `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Raises: ValueError: If more than one of signature_def_key OR signature_def OR (input_names AND output_names) is specified. 
@@ -152,7 +154,7 @@ class SavedModelPredictor(predictor.Predictor): self._graph = graph or ops.Graph() with self._graph.as_default(): - self._session = session.Session() + self._session = session.Session(config=config) loader.load(self._session, tags.split(','), export_dir) if input_names is None: diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md index c83623ec94..27a933c0f9 100644 --- a/tensorflow/contrib/quantize/README.md +++ b/tensorflow/contrib/quantize/README.md @@ -6,7 +6,7 @@ inference. The details of the transformation implemented in this package is described here [1]. This is done using the -[fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization). +[fake quantization op](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization). Literature has shown that fixed point networks provide comparable performance to floating point networks [2]. This is achieved by modeling the quantization diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py index 94fc12ca81..3d0308aaf3 100644 --- a/tensorflow/contrib/slim/python/slim/evaluation_test.py +++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py @@ -26,7 +26,6 @@ import time import numpy as np from tensorflow.contrib.framework.python.ops import variables as variables_lib -from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.contrib.slim.python.slim import evaluation from tensorflow.contrib.training.python.training import evaluation as evaluation_lib from tensorflow.core.protobuf import saver_pb2 @@ -37,6 +36,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics from tensorflow.python.ops import variables from tensorflow.python.platform import flags from tensorflow.python.platform import gfile @@ -89,8 +89,8 @@ class EvaluationTest(test.TestCase): self._predictions, self._scale = TestModel(self._inputs) def testFinalOpsOnEvaluationLoop(self): - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) # Create checkpoint and log directories: @@ -136,9 +136,10 @@ class EvaluationTest(test.TestCase): self.assertTrue(obj.hook_was_run) def _create_names_to_metrics(self, predictions, labels): - accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels) - accuracy1, update_op1 = metric_ops.streaming_accuracy(predictions + 1, - labels) + accuracy0, update_op0 = metrics.accuracy( + labels=labels, predictions=predictions) + accuracy1, update_op1 = metrics.accuracy( + labels=labels, predictions=predictions + 1) names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1} names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1} @@ -198,8 +199,8 @@ class EvaluationTest(test.TestCase): predictions_limited = input.limit_epochs(self._predictions, num_epochs=1) labels_limited = input.limit_epochs(self._labels, num_epochs=1) - value_op, update_op = metric_ops.streaming_accuracy( - predictions_limited, labels_limited) + value_op, update_op = metrics.accuracy( + labels=labels_limited, 
predictions=predictions_limited) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) @@ -260,8 +261,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) # Run the evaluation and verify the results: accuracy_value = evaluation.evaluate_once( @@ -276,8 +277,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir') dumping_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False) diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py index 99ced53e11..d22b80ac88 100644 --- a/tensorflow/contrib/summary/summary.py +++ b/tensorflow/contrib/summary/summary.py @@ -21,6 +21,7 @@ from @{tf.summary.merge_all} to @{tf.summary.FileWriter}. To use with eager execution enabled, write your code as follows: +```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -30,9 +31,11 @@ with summary_writer.as_default(), tf.contrib.summary.always_record_summaries(): tf.contrib.summary.scalar("loss", my_loss) # In this case every call to tf.contrib.summary.scalar will generate a record # ... +``` To use it with graph execution, write your code as follows: +```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -53,7 +56,7 @@ with tf.Session(...) as sess: while not_done_training: sess.run([train_op, tf.contrib.summary.all_summary_ops()]) # ... 
- +``` """ from __future__ import absolute_import diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py index e893e1d1c8..d8236a0a6f 100644 --- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py +++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py @@ -21,10 +21,10 @@ import numpy as np from tensorflow.contrib import losses from tensorflow.contrib.learn.python.learn.estimators import prediction_key -from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics from tensorflow.python.ops import nn INFERENCE_PROB_NAME = prediction_key.PredictionKey.PROBABILITIES @@ -38,12 +38,13 @@ def _top_k_generator(k): targets = math_ops.to_int32(targets) if targets.get_shape().ndims > 1: targets = array_ops.squeeze(targets, axis=[1]) - return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k)) + return metrics.mean(nn.in_top_k(probabilities, targets, k)) return _top_k def _accuracy(predictions, targets, weights=None): - return metric_ops.streaming_accuracy(predictions, targets, weights=weights) + return metrics.accuracy( + labels=targets, predictions=predictions, weights=weights) def _r2(probabilities, targets, weights=None): @@ -53,7 +54,7 @@ def _r2(probabilities, targets, weights=None): squares_residuals = math_ops.reduce_sum( math_ops.square(targets - probabilities), 0) score = 1 - math_ops.reduce_sum(squares_residuals / squares_total) - return metric_ops.streaming_mean(score, weights=weights) + return metrics.mean(score, weights=weights) def _squeeze_and_onehot(targets, depth): @@ -62,7 +63,7 @@ def _squeeze_and_onehot(targets, depth): def _sigmoid_entropy(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.sigmoid_cross_entropy(probabilities, _squeeze_and_onehot( targets, @@ -71,7 +72,7 @@ def _sigmoid_entropy(probabilities, targets, weights=None): def _softmax_entropy(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.sparse_softmax_cross_entropy(probabilities, math_ops.to_int32(targets)), weights=weights) @@ -82,7 +83,7 @@ def _predictions(predictions, unused_targets, **unused_kwargs): def _class_log_loss(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.log_loss(probabilities, _squeeze_and_onehot(targets, array_ops.shape(probabilities)[1])), @@ -90,34 +91,36 @@ def _class_log_loss(probabilities, targets, weights=None): def _precision(predictions, targets, weights=None): - return metric_ops.streaming_precision(predictions, targets, weights=weights) + return metrics.precision( + labels=targets, predictions=predictions, weights=weights) def _precision_at_thresholds(predictions, targets, weights=None): - return metric_ops.streaming_precision_at_thresholds( - array_ops.slice(predictions, [0, 1], [-1, 1]), - targets, - np.arange( - 0, 1, 0.01, dtype=np.float32), + return metrics.precision_at_thresholds( + labels=targets, + predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), + thresholds=np.arange(0, 1, 0.01, dtype=np.float32), weights=weights) def _recall(predictions, targets, weights=None): - return metric_ops.streaming_recall(predictions, targets, weights=weights) + return metrics.recall( + labels=targets, predictions=predictions, weights=weights) def _recall_at_thresholds(predictions, targets, 
weights=None): - return metric_ops.streaming_recall_at_thresholds( - array_ops.slice(predictions, [0, 1], [-1, 1]), - targets, - np.arange( - 0, 1, 0.01, dtype=np.float32), + return metrics.recall_at_thresholds( + labels=targets, + predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), + thresholds=np.arange(0, 1, 0.01, dtype=np.float32), weights=weights) def _auc(probs, targets, weights=None): - return metric_ops.streaming_auc(array_ops.slice(probs, [0, 1], [-1, 1]), - targets, weights=weights) + return metrics.auc( + labels=targets, + predictions=array_ops.slice(probs, [0, 1], [-1, 1]), + weights=weights) _EVAL_METRICS = { diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index 7a35a70bbe..6f62cd11a9 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -295,7 +295,7 @@ def get_epoch_variable(): # A simple container to hold the training variables for a single tree. -class TreeTrainingVariables(object): +class TreeVariables(object): """Stores tf.Variables for training a single random tree. Uses tf.get_variable to get tree-specific names so that this can be used @@ -303,7 +303,7 @@ class TreeTrainingVariables(object): then relies on restoring that model to evaluate). """ - def __init__(self, params, tree_num, training): + def __init__(self, params, tree_num, training, tree_config='', tree_stat=''): if (not hasattr(params, 'params_proto') or not isinstance(params.params_proto, _params_proto.TensorForestParams)): @@ -315,27 +315,28 @@ class TreeTrainingVariables(object): # TODO(gilberth): Manually shard this to be able to fit it on # multiple machines. self.stats = stats_ops.fertile_stats_variable( - params, '', self.get_tree_name('stats', tree_num)) + params, tree_stat, self.get_tree_name('stats', tree_num)) self.tree = model_ops.tree_variable( - params, '', self.stats, self.get_tree_name('tree', tree_num)) + params, tree_config, self.stats, self.get_tree_name('tree', tree_num)) def get_tree_name(self, name, num): return '{0}-{1}'.format(name, num) -class ForestTrainingVariables(object): +class ForestVariables(object): """A container for a forests training data, consisting of multiple trees. - Instantiates a TreeTrainingVariables object for each tree. We override the + Instantiates a TreeVariables object for each tree. We override the __getitem__ and __setitem__ function so that usage looks like this: - forest_variables = ForestTrainingVariables(params) + forest_variables = ForestVariables(params) ... forest_variables.tree ... """ def __init__(self, params, device_assigner, training=True, - tree_variables_class=TreeTrainingVariables): + tree_variables_class=TreeVariables, + tree_configs=None, tree_stats=None): self.variables = [] # Set up some scalar variables to run through the device assigner, then # we can use those to colocate everything related to a tree. 
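
Taken together, these hooks let a forest be rebuilt from serialized
`decision_trees` `Model` protos. A minimal sketch, assuming `serialized_tree`
holds one serialized proto and `input_data` is a float matrix (both
hypothetical here; the test in the next hunks builds a real proto with
`ParseDict`):

```python
from tensorflow.contrib.tensor_forest.python import tensor_forest

hparams = tensor_forest.ForestHParams(
    num_classes=2, num_features=2, num_trees=1, max_nodes=1000,
    split_after_samples=25).fill()
# One serialized tree proto per tree in the forest.
graph_builder = tensor_forest.RandomForestGraphs(
    hparams, tree_configs=[serialized_tree])
probs, paths, var = graph_builder.inference_graph(input_data)
```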
@@ -347,7 +348,13 @@ class ForestTrainingVariables(object):
     for i in range(params.num_trees):
       with ops.device(self.device_dummies[i].device):
-        self.variables.append(tree_variables_class(params, i, training))
+        kwargs = {}
+        if tree_configs is not None:
+          kwargs.update(dict(tree_config=tree_configs[i]))
+        if tree_stats is not None:
+          kwargs.update(dict(tree_stat=tree_stats[i]))
+        self.variables.append(tree_variables_class(
+            params, i, training, **kwargs))

   def __setitem__(self, t, val):
     self.variables[t] = val
@@ -361,9 +368,11 @@ class RandomForestGraphs(object):

   def __init__(self,
                params,
+               tree_configs=None,
+               tree_stats=None,
                device_assigner=None,
                variables=None,
-               tree_variables_class=TreeTrainingVariables,
+               tree_variables_class=TreeVariables,
                tree_graphs=None,
                training=True):
     self.params = params
@@ -371,9 +380,10 @@ class RandomForestGraphs(object):
         device_assigner or framework_variables.VariableDeviceChooser())
     logging.info('Constructing forest with params = ')
     logging.info(self.params.__dict__)
-    self.variables = variables or ForestTrainingVariables(
+    self.variables = variables or ForestVariables(
         self.params, device_assigner=self.device_assigner, training=training,
-        tree_variables_class=tree_variables_class)
+        tree_variables_class=tree_variables_class,
+        tree_configs=tree_configs, tree_stats=tree_stats)
     tree_graph_class = tree_graphs or RandomTreeGraphs
     self.trees = [
         tree_graph_class(self.variables[i], self.params, i)
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
index bbe627b157..1c9c81827e 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
@@ -18,10 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from google.protobuf.json_format import ParseDict
+from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2 as _tree_proto
 from tensorflow.contrib.tensor_forest.python import tensor_forest
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest

@@ -110,6 +114,47 @@ class TensorForestTest(test_util.TensorFlowTestCase):
     self.assertTrue(isinstance(paths, ops.Tensor))
     self.assertTrue(isinstance(var, ops.Tensor))

+  def testInferenceFromRestoredModel(self):
+    input_data = [[-1., 0.], [-1., 2.],  # node 1
+                  [1., 0.], [1., -2.]]  # node 2
+    expected_prediction = [[0.0, 1.0], [0.0, 1.0],
+                           [0.0, 1.0], [0.0, 1.0]]
+    hparams = tensor_forest.ForestHParams(
+        num_classes=2,
+        num_features=2,
+        num_trees=1,
+        max_nodes=1000,
+        split_after_samples=25).fill()
+    tree_weight = {'decisionTree':
+                       {'nodes':
+                            [{'binaryNode':
+                                  {'rightChildId': 2,
+                                   'leftChildId': 1,
+                                   'inequalityLeftChildTest':
+                                       {'featureId': {'id': '0'},
+                                        'threshold': {'floatValue': 0}}}},
+                             {'leaf': {'vector':
+                                           {'value': [{'floatValue': 0.0},
+                                                      {'floatValue': 1.0}]}},
+                              'nodeId': 1},
+                             {'leaf': {'vector':
+                                           {'value': [{'floatValue': 0.0},
+                                                      {'floatValue': 1.0}]}},
+                              'nodeId': 2}]}}
+    restored_tree_param = ParseDict(tree_weight,
+                                    _tree_proto.Model()).SerializeToString()
+    graph_builder = tensor_forest.RandomForestGraphs(hparams,
+                                                     [restored_tree_param])
+    probs, paths, var = graph_builder.inference_graph(input_data)
+
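+    # The restored tree splits on feature 0 at threshold 0, but both leaves
+    # carry the distribution [0.0, 1.0], so every example should be assigned
+    # to class 1 regardless of the branch taken.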
self.assertTrue(isinstance(probs, ops.Tensor))
+    self.assertTrue(isinstance(paths, ops.Tensor))
+    self.assertTrue(isinstance(var, ops.Tensor))
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+      self.assertEqual(probs.eval().shape, (4, 2))
+      self.assertEqual(probs.eval().tolist(), expected_prediction)
+
   def testTrainingConstructionClassificationSparse(self):
     input_data = sparse_tensor.SparseTensor(
         indices=[[0, 0], [0, 3], [1, 0], [1, 7], [2, 1], [3, 9]],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index b7b26cfb1c..da4dd5a14c 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -91,8 +91,11 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
       if (!subgraph_node_ids.count(edge->src()->id()) &&
           !edge->src()->IsSource() && !edge->IsControlEdge()) {
         incoming_edges->insert(edge);
+        VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name()
+                << " Y, ";
       } else {
-        VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, ";
+        VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name()
+                << " N, ";
       }
     }
   }
@@ -106,10 +109,12 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
     for (const tensorflow::Edge* edge : node->out_edges()) {
       if (!subgraph_node_ids.count(edge->dst()->id()) &&
           !edge->dst()->IsSink() && !edge->IsControlEdge()) {
-        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, ";
+        VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name()
+                << " Y, ";
         outgoing_edges->insert(edge);
       } else {
-        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, ";
+        VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name()
+                << " N, ";
       }
     }
   }
@@ -181,29 +186,27 @@ struct ConvertGraphParams {
 static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
   GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids,
                            &p->subgraph_incoming_edges);
+
+  std::set<std::pair<int, int>> unique_tensors;
+  // Add only unique input source nodes.
If the output of an outside node is shared
+  // between multiple nodes inside the engine, only one edge should be
+  // created.
   for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) {
-    p->subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
-  }
-  auto output_name_to_index_map = BuildTensorNameMap(p->output_names);
-  std::set<std::pair<int, int>> subgraph_outputs_set;
-  // Collect outputs referenced from output_names
-  for (int node_id : p->subgraph_node_ids) {
-    tensorflow::Node* node = p->graph.FindNodeId(node_id);
-    if (output_name_to_index_map.count(node->name())) {
-      for (int index : output_name_to_index_map.at(node->name())) {
-        subgraph_outputs_set.insert({node_id, index});
-      }
-    }
+    unique_tensors.insert({edge->src()->id(), edge->src_output()});
   }
+  p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(),
+                            unique_tensors.end());
   GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids,
                            &p->subgraph_outgoing_edges);
+  unique_tensors.clear();
+  // Similarly, if multiple outside nodes share the output of an internal
+  // node, only one output port should be created and shared between the
+  // outputs.
   for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) {
-    subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
+    unique_tensors.insert({edge->src()->id(), edge->src_output()});
   }
-  p->subgraph_outputs.reserve(subgraph_outputs_set.size());
+  p->subgraph_outputs.reserve(unique_tensors.size());
   p->subgraph_outputs.insert(p->subgraph_outputs.begin(),
-                             subgraph_outputs_set.begin(),
-                             subgraph_outputs_set.end());
+                             unique_tensors.begin(), unique_tensors.end());
   return tensorflow::Status::OK();
 }
@@ -225,7 +228,6 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
   for (auto in_edge :
        params->subgraph_incoming_edges) {  // loop over incoming edges and
                                            // attach them to calib node
-    // tensorflow::Node* src_node = in_edge->src();
     auto src_output = in_edge->src_output();
     auto dst_node = in_edge->dst();
     auto dst_input = in_edge->dst_input();
@@ -257,19 +259,24 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
   for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) {
     subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i});
   }
+  std::set<std::pair<int, int>> unique_tensors;
   for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
+    if (unique_tensors.count(old_src)) continue;
+    unique_tensors.insert(old_src);
     int new_src_output = subgraph_edge_to_input_map.at(old_src);
     params->graph.AddEdge(edge->src(), edge->src_output(), trt_node,
                           new_src_output);
+    VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output()
+            << " -> " << trt_node->name() << ":" << new_src_output;
     params->graph.RemoveEdge(edge);
   }
-
-  VLOG(2) << "new wiring edges: " << trt_node->in_edges().size();
-  for (const tensorflow::Edge* edge : trt_node->in_edges()) {
-    VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
+  if (VLOG_IS_ON(2)) {
+    VLOG(2) << "new edge count: " << trt_node->in_edges().size();
+    for (const tensorflow::Edge* edge : trt_node->in_edges()) {
+      VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
+    }
   }
-
   TF_RETURN_IF_ERROR(status);

   // Re-map outgoing edges to use the new TRT node instead of the orig subgraph
@@ -283,6 +290,8 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
     TF_RETURN_IF_ERROR(params->graph.UpdateEdge(
         trt_node,
new_src_output, edge->dst(), edge->dst_input())); + VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> " + << edge->dst()->name() << ":" << edge->dst_input(); } // Remove the original subgraph for (int node_id : params->subgraph_node_ids) { @@ -317,9 +326,12 @@ tensorflow::Status ConvertCalibGraphToInferGraph( tensorflow::GraphConstructorOptions(), graph_def, &graph)); // get calib nodes std::vector calib_nodes; - for (auto node : graph.op_nodes()) { + std::vector topo_order; + tensorflow::GetPostOrder(graph, &topo_order); + for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) { + auto node = *rit; if (node->type_string() == "TRTCalibOp") { - VLOG(1) << "Found Calib Node"; + VLOG(1) << "Found Calib Node " << node->name(); calib_nodes.push_back(node); } } diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 96e0700862..4e4d295538 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -362,10 +362,11 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights, break; } case tensorflow::DataType::DT_HALF: { - Reorder2({k, c}, static_cast(iweights.GetValues()), - istrides, static_cast( - const_cast(oweights->GetValues())), - ostrides); + Reorder2( + {k, c}, static_cast(iweights.GetValues()), + istrides, + static_cast(const_cast(oweights->GetValues())), + ostrides); break; } default: @@ -1179,9 +1180,9 @@ tensorflow::Status BinaryTensorOpTensor( CHECK_EQ_TYPE(tensor_r->getType(), dtype); auto op_pair = ops.find(node_def.op()); if (op_pair == ops.end()) - return tensorflow::errors::Unimplemented("binary op: " + node_def.op() + - " not supported at: " + - node_def.name()); + return tensorflow::errors::Unimplemented( + "binary op: " + node_def.op() + + " not supported at: " + node_def.name()); nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise( *const_cast(tensor_l), @@ -2138,9 +2139,7 @@ void Converter::register_op_converters() { } } // namespace -tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) { - return tensorflow::errors::Unimplemented("Not implemented yet"); -} + tensorflow::Status ConvertCalibrationNodeToEngineNode( tensorflow::Graph& graph, tensorflow::Node* c_node) { const auto ndef = c_node->def(); @@ -2164,9 +2163,23 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( for (auto n : graph.op_nodes()) { node_maps.insert({n->name(), n}); } + std::set subgraph_ids; + for (const auto internal_node : segment_nodes) { + subgraph_ids.insert(node_maps.at(internal_node)->id()); + } + if (VLOG_IS_ON(2)) { + string node_names = StrCat(c_node->name(), " segment nodes= "); + + for (const auto& node_name : segment_nodes) { + StrAppend(&node_names, node_name, ", "); + } + VLOG(2) << node_names; + } + VLOG(1) << "Output Nodes:"; std::vector out_types; std::vector out_edges; + for (auto& i : output_nodes) { auto node_port = tensorflow::str_util::Split(i, ":"); VLOG(1) << " " << i << " in graph " << node_maps.count(i); @@ -2186,18 +2199,24 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( out_types.push_back(out_node->output_type(0)); } for (auto out_edge : out_node->out_edges()) { + if (subgraph_ids.count(out_edge->dst()->id())) + continue; // skip internal edges; if (out_edge->src_output() == port) { out_edges.push_back(out_edge); - break; + VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":" + << out_edge->src_output() << " -> " << out_edge->dst()->name() + << 
":" << out_edge->dst_input(); } } } else { LOG(WARNING) << " couldn't find output node " << out_node_name; } } - VLOG(1) << "Input Nodes:"; - for (auto& i : input_names) { - VLOG(1) << " " << i << " in graph " << node_maps.count(i); + if (VLOG_IS_ON(1)) { + VLOG(1) << c_node->name() << " Input Nodes:"; + for (auto& i : input_names) { + VLOG(1) << " Input " << i << " in graph " << node_maps.count(i); + } } auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance(); auto resmgr = trt_rm->getManager("TRTCalibOps"); @@ -2231,14 +2250,24 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( calib_res->builder_ = nullptr; tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp"); std::vector income_edges; + income_edges.resize(c_node->num_inputs()); for (const auto in_edge : c_node->in_edges()) { auto src = in_edge->src(); int dest_port = in_edge->dst_input(); - income_edges.emplace_back(src->name(), in_edge->src_output(), - c_node->input_type(dest_port)); + VLOG(1) << "Incoming connection " << src->name() << ":" + << in_edge->src_output() << " -> " << c_node->name() << ":" + << dest_port; + income_edges.at(dest_port) = {src->name(), in_edge->src_output(), + c_node->input_type(dest_port)}; } tensorflow::gtl::ArraySlice input_list( income_edges); + if (VLOG_IS_ON(2)) { + for (const auto& inp : input_list) { + VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " " + << tensorflow::DataTypeString(inp.data_type); + } + } op_builder.Input(input_list); tensorflow::NodeDef engine_node; const char* engine_plan_data = static_cast(engine_plan->data()); @@ -2255,13 +2284,26 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( } auto trt_engine_node = graph.AddNode(engine_node, &status); TF_RETURN_IF_ERROR(status); - for (size_t i = 0; i < out_edges.size(); i++) { - VLOG(1) << "Connecting trt_engine_node output " << i << " with " - << out_edges.at(i)->dst()->name() << " port " - << out_edges.at(i)->dst_input(); - TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i, - out_edges.at(i)->dst(), - out_edges.at(i)->dst_input())); + std::map port_map; + for (size_t t = 0; t < output_nodes.size(); t++) { + port_map.insert({output_nodes.at(t), t}); + } + for (auto& i : out_edges) { + string s(i->src()->name()); + if (i->src_output()) StrAppend(&s, ":", i->src_output()); + int out_port = port_map.at(s); + VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port + << " -> " << i->dst()->name() << ":" << i->dst_input(); + TF_RETURN_IF_ERROR( + graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input())); + } + for (const auto ed : trt_engine_node->in_edges()) { + VLOG(1) << "In Edge " << ed->src()->name() << ":" << ed->src_output() + << " -> " << ed->dst()->name() << ":" << ed->dst_input(); + } + for (const auto ed : trt_engine_node->out_edges()) { + VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output() + << " -> " << ed->dst()->name() << ":" << ed->dst_input(); } VLOG(1) << "Segment nodes:"; for (auto& i : segment_nodes) { @@ -2332,6 +2374,7 @@ tensorflow::Status ConvertSubgraph( std::vector* output_names, std::vector* output_dtypes, const string& engine_name) { + std::set added_tensors; for (const std::pair& input : s.input_inds) { VLOG(2) << "parsing input. 
Node id= " << input.first; int node_id = input.first; @@ -2374,7 +2417,6 @@ tensorflow::Status ConvertSubgraph( auto op_info = op_info_vec.at(shape_inference_output_idx); tensorflow::DataType tf_dtype = op_info.dtype(); - input_dtypes->push_back(tf_dtype); nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT); auto type_status = ConvertDType(tf_dtype, &dtype); @@ -2410,8 +2452,10 @@ tensorflow::Status ConvertSubgraph( if (output_idx != 0) { input_tensor_name = StrCat(node_name, ":", output_idx); } - + if (added_tensors.count(input_tensor_name)) continue; + added_tensors.insert(input_tensor_name); input_names->push_back(input_tensor_name); + input_dtypes->push_back(tf_dtype); nvinfer1::ITensor* input_tensor = converter.network()->addInput( input_tensor_name.c_str(), dtype, input_dim_pseudo_chw); @@ -2435,6 +2479,7 @@ tensorflow::Status ConvertSubgraph( // Gather output metadata int trt_engine_op_output_idx = 0; + added_tensors.clear(); for (const std::pair& output : s.output_inds) { int node_id = output.first; int output_idx = output.second; @@ -2451,6 +2496,8 @@ tensorflow::Status ConvertSubgraph( if (output_idx != 0) tensorflow::strings::StrAppend(&tensor_name, ":", output_idx); VLOG(2) << "Output tensor name: " << tensor_name; + if (added_tensors.count(tensor_name)) continue; + added_tensors.insert(tensor_name); output_names->push_back(tensor_name); auto tensor_or_weights = converter.get_tensor(tensor_name); if (!tensor_or_weights.is_tensor()) { diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py index 2e472a2805..d879170b68 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets.py @@ -166,11 +166,21 @@ def StreamingFilesDataset(files, return remote_iterator.get_next() def MapFn(unused_input): - return functional_ops.remote_call( + if isinstance(source_dataset.output_types, dtypes.DType): + output_types = [source_dataset.output_types] + elif isinstance(source_dataset.output_types, (list, tuple)): + output_types = source_dataset.output_types + else: + raise ValueError('source dataset has invalid output types') + remote_calls = functional_ops.remote_call( args=[source_handle], - Tout=[dtypes.string], + Tout=output_types, f=LoadingFunc, - target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0] + target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job) + if len(remote_calls) == 1: + return remote_calls[0] + else: + return remote_calls with ops.device('/job:%s' % worker_job): output_dataset = dataset_ops.Dataset.range(2).repeat().map( diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py index 918cf0ed8e..b58d05eac5 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py @@ -26,6 +26,8 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_shape from tensorflow.python.lib.io import python_io from tensorflow.python.platform import test from tensorflow.python.training import server_lib @@ -162,6 +164,30 @@ class DatasetsTest(test.TestCase): self.assertEqual(set(all_contents), set(retrieved_values)) + def testArbitraryReaderFuncFromDatasetGenerator(self): + + def my_generator(): + yield (1, [1] * 10) + + def 
gen_dataset(dummy):
+      return dataset_ops.Dataset.from_generator(
+          my_generator, (dtypes.int64, dtypes.int64),
+          (tensor_shape.TensorShape([]), tensor_shape.TensorShape([10])))
+
+    dataset = datasets.StreamingFilesDataset(
+        dataset_ops.Dataset.range(10), filetype=gen_dataset)
+
+    iterator = dataset.make_initializable_iterator()
+    self._sess.run(iterator.initializer)
+    get_next = iterator.get_next()
+
+    retrieved_values = self._sess.run(get_next)
+
+    self.assertIsInstance(retrieved_values, (list, tuple))
+    self.assertEqual(len(retrieved_values), 2)
+    self.assertEqual(retrieved_values[0], 1)
+    self.assertItemsEqual(retrieved_values[1], [1] * 10)
+
   def testUnexpectedFiletypeString(self):
     with self.assertRaises(ValueError):
       datasets.StreamingFilesDataset(
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index d89633199d..b1c224a345 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -699,7 +699,9 @@ cc_library(
     srcs = ["platform/stacktrace_handler.cc"],
    hdrs = ["platform/stacktrace_handler.h"],
     deps = [
+        ":abi",
         ":lib_platform",
+        ":stacktrace",
     ],
 )
@@ -3089,6 +3091,8 @@ cc_library(
         # we now need at least "str_util".
         ":lib",
         ":lib_platform",
+        ":stacktrace_handler",
+        ":test_lite",
         "//tensorflow/core/platform/default/build_config:test_lite_main",
     ],
     alwayslink = 1,
@@ -3569,7 +3573,10 @@ tf_cc_tests_gpu(
 tf_cc_test_mkl(
     name = "mkl_runtime_tests",
     size = "small",
-    srcs = ["common_runtime/mkl_cpu_allocator_test.cc"],
+    srcs = [
+        "common_runtime/mkl_cpu_allocator_test.cc",
+        "common_runtime/mkl_threadpool_device_test.cc",
+    ],
     linkstatic = 1,
     deps = [
         ":core",
diff --git a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
index cbe76de415..985f09312f 100644
--- a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
@@ -4,6 +4,10 @@ op {
   description: <<END
 if < 0, `scale * features` otherwise.

+To be used together with
+`initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
+For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+
 See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt
new file mode 100644
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "StringSplitV2"
+  attr {
+    name: "maxsplit"
+    description: <<END
+An `int`. If `maxsplit > 0`, limit of the split of the result.
+END
+  }
+  summary: "Split elements of `source` based on `sep` into a `SparseTensor`."
+  description: <<END
+If `sep` is given, consecutive delimiters are not grouped together and are
+deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
+sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
+string, consecutive whitespace is regarded as a single separator, and the
+result will contain no empty strings at the start or end if the string has
+leading or trailing whitespace.
+
+Note that the above-mentioned behavior matches Python's str.split.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt
new file mode 100644
index 0000000000..0e8576fb01
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StringSplitV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 8f2a419756..9cda17867b 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -86,7 +86,7 @@ BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
   return &(chunks_[h]);
 }

-bool BFCAllocator::Extend(size_t rounded_bytes) {
+bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   size_t available_bytes = memory_limit_ - total_region_allocated_bytes_;
   // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
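  // (kMinAllocationSize is 1 << kMinAllocationBits, i.e. 256 bytes; for
  // example, available_bytes = 1000 would round down to 768 here.)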
  available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
@@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t rounded_bytes) {

   // Try allocating.
   size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
-  void* mem_addr = suballocator_->Alloc(32, bytes);
+  void* mem_addr = suballocator_->Alloc(alignment, bytes);
   if (mem_addr == nullptr && !started_backpedal_) {
     // Only backpedal once.
     started_backpedal_ = true;
@@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t rounded_bytes) {
     while (mem_addr == nullptr) {
       bytes = RoundedBytes(bytes * kBackpedalFactor);
       if (bytes < rounded_bytes) break;
-      mem_addr = suballocator_->Alloc(32, bytes);
+      mem_addr = suballocator_->Alloc(alignment, bytes);
     }
   }
@@ -261,7 +261,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
   }

   // Try to extend
-  if (Extend(rounded_bytes)) {
+  if (Extend(unused_alignment, rounded_bytes)) {
     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
     if (ptr != nullptr) {
       return ptr;
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index ba5a3eea3a..52aedb1e9c 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -305,7 +305,8 @@ class BFCAllocator : public VisitableAllocator {
   // Try to add a new memory region that can satisfy an allocation of
   // 'rounded_bytes' bytes.  Returns true on success and false on
   // failure.
-  bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  bool Extend(size_t alignment, size_t rounded_bytes)
+      EXCLUSIVE_LOCKS_REQUIRED(lock_);

   // Returns a pointer to an underlying allocated chunk of size
   // 'rounded_bytes'.
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index c21a1ea9f2..9028e6298c 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -102,9 +102,25 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
       EXPECT_EQ(2, shape.dim(0).size());
       EXPECT_EQ(1, shape.dim(1).size());
       if (node->name() == y->name()) {
+#ifdef INTEL_MKL
+        // If MKL is used, the graph goes through several additional rewrite
+        // passes. In TF, every time a graph pass runs, "constant" nodes are
+        // allocated and deallocated. Each allocation calls FindChunkPtr of
+        // BFCAllocator, which increments the AllocationId. With MKL the
+        // expected ids are therefore 19 and 20 instead of 21 and 22.
+        EXPECT_EQ(19, cm->AllocationId(node, 0));
+#else
         EXPECT_EQ(21, cm->AllocationId(node, 0));
+#endif
       } else {
+#ifdef INTEL_MKL
+        EXPECT_EQ(20, cm->AllocationId(node, 0));
+#else
         EXPECT_EQ(22, cm->AllocationId(node, 0));
+#endif
       }
       EXPECT_LE(0, cm->MaxExecutionTime(node));
diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
new file mode 100644
index 0000000000..5d583a8360
--- /dev/null
+++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+
+#ifdef _OPENMP
+TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) {
+  SessionOptions options;
+  unsetenv("OMP_NUM_THREADS");
+
+  ThreadPoolDevice* tp = new ThreadPoolDevice(
+      options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator());
+
+  const int ht = port::NumHyperthreadsPerCore();
+  EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht);
+}
+
+TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) {
+  SessionOptions options;
+  setenv("OMP_NUM_THREADS", "314", 1);
+
+  ThreadPoolDevice* tp = new ThreadPoolDevice(
+      options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator());
+
+  EXPECT_EQ(omp_get_max_threads(), 314);
+}
+#endif  // _OPENMP
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index 21912236d0..a5d31b75c7 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -16,8 +16,10 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/process_util.h"

 #ifdef INTEL_MKL
+#ifdef _OPENMP
 #include <omp.h>
-#endif
+#endif  // _OPENMP
+#endif  // INTEL_MKL
 #include <string.h>

 #include "tensorflow/core/lib/core/threadpool.h"
@@ -57,7 +59,10 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
   // MKL library executes ops in parallel using OMP threads
   // Set inter_op conservatively to avoid thread oversubscription that could
   // lead to severe perf degradations and OMP resource exhaustion
-  const int mkl_intra_op = omp_get_max_threads();
+  int mkl_intra_op = 1;
+#ifdef _OPENMP
+  mkl_intra_op = omp_get_max_threads();
+#endif  // _OPENMP
   CHECK_GE(mkl_intra_op, 1);
   const int32 mkl_inter_op = std::max(
       (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
@@ -68,7 +73,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
 #else
   // Default to using the number of cores available in the process.
   return port::NumSchedulableCPUs();
-#endif
+#endif  // INTEL_MKL
 }

 thread::ThreadPool* NewThreadPoolFromSessionOptions(
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index f7a07fe503..74a87215e1 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -31,7 +31,11 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"

 #ifdef INTEL_MKL
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #endif

 namespace tensorflow {
@@ -43,7 +47,26 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
     : LocalDevice(options, Device::BuildDeviceAttributes(
                                name, DEVICE_CPU, memory_limit, locality)),
       allocator_(allocator),
-      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {}
+      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {
+#ifdef INTEL_MKL
+#ifdef _OPENMP
+  const char* user_omp_threads = getenv("OMP_NUM_THREADS");
+  if (user_omp_threads == nullptr) {
+    // OMP_NUM_THREADS controls MKL's intra-op parallelization.
+    // Default to the number of available physical cores.
+    const int mkl_intra_op = port::NumSchedulableCPUs();
+    const int ht = port::NumHyperthreadsPerCore();
+    omp_set_num_threads((mkl_intra_op + ht - 1) / ht);
+  } else {
+    uint64 user_val = 0;
+    if (strings::safe_strtou64(user_omp_threads, &user_val)) {
+      // Superfluous but triggers OpenMP loading
+      omp_set_num_threads(user_val);
+    }
+  }
+#endif  // _OPENMP
+#endif  // INTEL_MKL
+}

 ThreadPoolDevice::~ThreadPoolDevice() {}
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
index 1cea1b1462..770a0fcf14 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
@@ -147,7 +147,9 @@ MasterService::Stub::Stub(
 }

 MasterService::AsyncService::AsyncService() {
-  for (int i = 0; i < 10; ++i) {
+  int method_len = sizeof(grpcMasterService_method_names) /
+                   sizeof(grpcMasterService_method_names[0]);
+  for (int i = 0; i < method_len; ++i) {
     AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcMasterService_method_names[i],
         ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
index 89f83f9f24..a8508d2d4f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
@@ -17,6 +17,7 @@ limitations under the License.

 #include "tensorflow/core/distributed_runtime/rpc/grpc_session.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/util/device_name_utils.h"

 namespace tensorflow {
@@ -50,9 +51,14 @@ Status TestCluster::MakeTestCluster(const SessionOptions& options, int n,
   }

   for (int i = 0; i < n; ++i) {
+    string server_file =
+        strings::StrCat(testing::TensorFlowSrcRoot(),
+                        "/core/distributed_runtime/rpc/grpc_testlib_server");
+    if (!options.env->FileExists(server_file).ok()) {
+      return errors::Internal("Could not find grpc_testlib_server");
+    }
     const std::vector<string> argv(
-        {strings::StrCat(testing::TensorFlowSrcRoot(),
-                         "/core/distributed_runtime/rpc/grpc_testlib_server"),
+        {server_file,
          /* see grpc_testlib_server.cc for flags */
          tf_jobs, "--tf_job=localhost", strings::StrCat("--tf_task=", i),
          strings::StrCat("--num_cpus=", num_cpus),
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2c87156dca..2bb4d32d57 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -67,13 +67,8 @@ struct AllocatorStats {
 // Allocator is an abstract interface for allocating and deallocating
 // device memory.
class Allocator { public: -#ifdef EIGEN_VECTORIZE_AVX512 // Align to 64 byte boundary. static constexpr size_t kAllocatorAlignment = 64; -#else - // Align to 32 byte boundary. - static constexpr size_t kAllocatorAlignment = 32; -#endif virtual ~Allocator(); diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc index 3d7920a6e2..4b56d807df 100644 --- a/tensorflow/core/framework/op_gen_lib.cc +++ b/tensorflow/core/framework/op_gen_lib.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/framework/op_gen_lib.h" +#include #include #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/lib/core/errors.h" diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto index eb689ec1e6..10072724d2 100644 --- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto +++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto @@ -5,7 +5,7 @@ option cc_enable_arenas = true; option java_outer_classname = "RemoteFusedGraphExecuteInfoProto"; option java_multiple_files = true; option java_package = "org.tensorflow.framework"; -//add go_package externally +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; import "tensorflow/core/framework/graph.proto"; import "tensorflow/core/framework/tensor_shape.proto"; import "tensorflow/core/framework/types.proto"; diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc index b613effd18..80e168df97 100644 --- a/tensorflow/core/framework/tensor_test.cc +++ b/tensorflow/core/framework/tensor_test.cc @@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) { // On the alignment. // -// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte +// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte // alignment. Tensor::tensor/flat/vec/matrix methods requires the // buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually, -// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure -// its result is aligned if the caller intends to use those methods. -// In this test case, we simply make sure each slice is 32-byte -// aligned: sizeof(float) * 4 * 2 = 32. +// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires +// the caller to ensure its result is aligned if the caller intends +// to use those methods. In this test case, we simply make sure each +// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576. 576 % 64 = 0. TEST(Tensor, Slice_Basic) { Tensor saved; { // General - Tensor x(DT_FLOAT, TensorShape({10, 4, 34})); + Tensor x(DT_FLOAT, TensorShape({10, 4, 36})); // Fills in known values. for (int i = 0; i < 10; ++i) { x.Slice(i, i + 1).flat().setConstant(i * 1.f); } // A simple slice along dim0. 
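    // Each dim0 step covers 4 * 36 floats = 576 bytes, so the slice below
    // starts at byte offset 4 * 576 = 2304, which is 64-byte aligned.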
Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36}))); auto tx = x.tensor<float, 3>(); auto ty = y.tensor<float, 3>(); for (int i = 0; i < 4; ++i) { for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(ty(i, j, k), 4.0 + i); EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k)); } @@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) { auto tz = z.tensor<float, 3>(); EXPECT_EQ(1, z.dim_size(0)); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(tz(0, j, k), 6.0); } } @@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) { EXPECT_EQ(1, saved.dim_size(0)); auto tsaved = saved.tensor<float, 3>(); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(tsaved(0, j, k), 6.0); } } } { // Empty - Tensor x(DT_FLOAT, TensorShape({10, 0, 34})); + Tensor x(DT_FLOAT, TensorShape({10, 0, 36})); x.flat<float>().setRandom(); Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36}))); } { diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 72a13d4da7..b9667998d6 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // If Op has been specifically assigned to a non-CPU device, then No. if (!n->assigned_device_name().empty() && - !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) { + !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) { result = false; reason = "Op has been assigned a runtime device that is not CPU."; } // If user has specifically assigned this op to a non-CPU device, then No. if (!n->def().device().empty() && - !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) { + !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) { result = false; reason = "User has assigned a device that is not CPU."; } @@ -2865,9 +2865,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return false; } - // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized - // path. The unoptimized path is slow. Thus we dont rewrite the node - // and use default Eigen. But for depth_radius=2, MKL DNN optimized + // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized + // path. The unoptimized path is slow. Thus we don't rewrite the node + // and use default Eigen. But for depth_radius=2, MKL DNN optimized // path is taken, i.e., eigen node is rewritten by MKL DNN node. static bool LrnRewrite(const Node* n) { CHECK_NOTNULL(n); @@ -2876,13 +2876,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CHECK_EQ(GetNodeAttr(n->def(), "depth_radius", &depth_radius).ok(), true); // if the depth_radius of LRN is not 2, don't rewrite the node by MKL DNN - // and use eigen node instead + // and use eigen node instead if (depth_radius == 2) { return true; } VLOG(1) << "LrnRewrite: The model sets depth_radius as not 2 which" << "case is not optimized by Intel MKL, thus using Eigen op" - << "for LRN " ; + << "for LRN "; return false; } @@ -3015,6 +3015,35 @@ class MklLayoutRewritePass : public GraphOptimizationPass { std::vector* ws_tensors, bool* are_ws_tensors_added); + // Helper function used by FixMklMetaDataEdges.
Fixes the metadata edge + // pointed to by 'e_metadata' corresponding to the data edge 'e_data' in graph + // 'g'. Returns true if the fixup was done; otherwise, it returns false. + bool FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g, + const Edge* e_data, const Edge* e_metadata); + + // Are the input Mkl metadata edges for node 'n' in graph 'g' correctly + // connected? If not, then fix them. This is needed because a graph may have + // some input Mkl metadata edges incorrectly setup after node merge and + // rewrite passes. This could happen because the GetReversePostOrder function + // may not provide a topologically sorted order if a graph contains cycles. + // The function returns true if at least one Mkl metadata edge for node 'n' + // was fixed. Otherwise, it returns false. + // + // Example: + // + // X = MklConv2D(_, _, _) + // Y = MklConv2DWithBias(_, _, _, _, _, _) + // Z = MklAdd(X, Y, DummyMklTensor, Y:1) + // + // For a graph such as shown above, note that the 3rd argument of MklAdd + // contains DummyMklTensor. Actually, it should be getting the Mkl metadata + // from the MklConv2D op (specifically, X:2). This incorrect plumbing could be + // possible (although rare) if the Mkl NodeMerge + NodeRewrite passes visit Z + // before X (possible if X, Y, Z are part of a loop.) This function fixes the + // Mkl metadata edges only - it does not rewrite nodes nor does it modify the + // Mkl data edges (1st and 2nd arguments of MklAdd). + bool FixMklMetaDataEdges(std::unique_ptr<Graph>* g, Node* n); + // Functions specific to operators to copy attributes // We need an operator-specific function to copy attributes because the // framework does not provide any generic function for it. @@ -4241,6 +4270,92 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { return nullptr; } + /////////////////////////////////////////////////////////////////////////////// +// Post-rewrite Mkl metadata fixup pass +/////////////////////////////////////////////////////////////////////////////// +bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g, + const Edge* e_data, const Edge* e_metadata) { + if (g == nullptr || e_data == nullptr || e_metadata == nullptr) { + return false; + } + + Node* n_data = e_data->src(); + int n_data_op_slot = e_data->src_output(); + int n_metadata_op_slot = GetTensorMetaDataIndex(n_data_op_slot, + n_data->num_outputs()); + + // If the source of the metadata edge is a constant node (producing a dummy + // Mkl metadata tensor), then we need to fix it. + if (IsConstant(e_metadata->src())) { + Node* e_metadata_dst = e_metadata->dst(); + int e_metadata_in_slot = e_metadata->dst_input(); + CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot, + e_metadata_dst, e_metadata_in_slot)); + + (*g)->RemoveEdge(e_metadata); + return true; + } + + return false; +} + +bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr<Graph>* g, + Node* n) { + bool result = false; + + // If the graph node is not an Mkl node, then return. + DataType T = DT_INVALID; + if (!GetNodeAttr(n->def(), "T", &T).ok() || + !mkl_op_registry::IsMklOp(n->type_string(), T)) { + return result; + } + + // If it is an Mkl node, then check if the input edges to this node that + // carry Mkl metadata are linked up correctly with the source node. + + // For Mkl nodes, we generate twice the number of input tensors (n for Mkl + // data tensors + n for Mkl metadata tensors). We need to check for correct + // connection of n metadata tensors only.
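+ // For illustration (assumed layouts, not the exact mkl_util.h definitions):
+ // with 2*n inputs, an interleaved ordering pairs data slot 'i' with
+ // metadata slot 2*i + 1, while a contiguous ordering pairs it with slot
+ // i + n. GetTensorMetaDataIndex below encapsulates whichever ordering is
+ // in use.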
+ int num_data_inputs = n->num_inputs() / 2; + for (int idx = 0; idx < num_data_inputs; idx++) { + // Get the edge connecting input slot with index (idx). + const Edge* e = nullptr; + TF_CHECK_OK(n->input_edge(idx, &e)); + + // If 'e' is a control edge, then skip. + if (e->IsControlEdge()) { + continue; + } + + // Check that the source node for edge 'e' is an Mkl node. If it is not an + // Mkl node, then we don't need to do anything. + Node* e_src = e->src(); + if (GetNodeAttr(e_src->def(), "T", &T).ok() && + mkl_op_registry::IsMklOp(e_src->type_string(), T)) { + // The source node for edge 'e' is an Mkl node. The destination node and + // destination input slot of 'e' are node 'n' and 'idx', respectively. + CHECK_EQ(e->dst(), n); + CHECK_EQ(e->dst_input(), idx); + + // Let's get the edge that carries the Mkl metadata corresponding to the + // Mkl data edge 'e'. For that, let's first get the input slot of 'n' + // where the metadata edge will feed the value. + int e_meta_in_slot = GetTensorMetaDataIndex(e->dst_input(), + n->num_inputs()); + const Edge* e_meta = nullptr; + TF_CHECK_OK(n->input_edge(e_meta_in_slot, &e_meta)); + + // Let's check if we need to fix this metadata edge. + if (FixMklMetaDataEdgeIfNeeded(g, e, e_meta)) { + result = true; + } + } + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// // Run function for the pass /////////////////////////////////////////////////////////////////////////////// @@ -4307,6 +4422,25 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) { DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g); + order.clear(); + GetReversePostOrder(**g, &order); // This will give us a topological sort. + for (Node* n : order) { + // If the node is not an op or it cannot run on a CPU device, then skip.
+ if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) { + continue; + } + if (FixMklMetaDataEdges(g, n)) { + string node_name = n->name(); + string op_name = n->type_string(); + + VLOG(1) << "MklLayoutRewritePass: fixed metadata edges for node " + << node_name << " with op " << op_name; + result = true; + } + } + DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite+Fixup)", + &**g); + return result; } diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 029cdcf94a..7645b4a7f0 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -3518,6 +3518,37 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1"); } +///////////////////////////////////////////////////////////////////// +// Post-rewrite fixup pass test + +TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'M' op: '_MklInput'}" + "node { name: 'N' op: '_MklInput'}" + "node { name: 'C' op: '_MklConv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['A', 'B', 'M', 'N']}" + "node { name: 'D' op: 'Const' " + " attr { key: 'dtype' value { type: DT_UINT8 } }" + " attr { key: 'value' value { " + " tensor { dtype: DT_UINT8 tensor_shape { dim { size: 1 } } " + " int_val: 0 } } } }" + "node { name: 'E' op: '_MklAdd'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['C', 'A', 'D', 'D']}"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);B(Input);C(_MklConv2D);D(Const);E(_MklAdd);" + "M(_MklInput);N(_MklInput)|A->C;A->E:1;B->C:1;C->E;C:2->E:2;" + "D->E:3;M->C:2;N->C:3"); +} + ///////////////////////////////////////////////////////////////////// static void BM_MklLayoutRewritePass(int iters, int op_nodes) { diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 6749a7c571..0c02876ac5 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -610,7 +610,6 @@ class SymbolicShapeRefiner { } }; - // Compute the shape of the tensors outputed by node 'node' at output port // 'port_index' as the union of shape1 and shape2. 
ShapeHandle OutputAsUnion(const NodeDef* node, int port_index, ShapeHandle shape1, ShapeHandle shape2) { diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 1b18087cdf..8ca726df0b 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -679,6 +679,7 @@ cc_library( deps = [ ":constant_folding", ":graph_optimizer", + "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:graph_view", "//tensorflow/core/grappler:grappler_item", @@ -780,7 +781,6 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:scoped_allocator_ops_op_lib", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 4dde7ed1b4..03e36a7b9c 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/optimizers/constant_folding.h" #include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace grappler { @@ -200,8 +201,7 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item, } } if (optimizable) { - VLOG(2) << "Optimizing fused batch norm node " << node.DebugString() - << std::endl; + VLOG(1) << "Optimizing fused batch norm node " << node.DebugString(); AddBatchNormNodes(optimized_graph, node); continue; } diff --git a/tensorflow/core/kernels/as_string_op.cc b/tensorflow/core/kernels/as_string_op.cc index 66c4aff3e3..a7757d1361 100644 --- a/tensorflow/core/kernels/as_string_op.cc +++ b/tensorflow/core/kernels/as_string_op.cc @@ -73,6 +73,7 @@ class AsStringOp : public OpKernel { } switch (dtype) { case DT_INT8: + case DT_INT16: case DT_INT32: strings::Appendf(&format_, "d"); break; @@ -129,6 +130,7 @@ class AsStringOp : public OpKernel { ENCODE_TYPE(DT_FLOAT, float, format_); ENCODE_TYPE(DT_DOUBLE, double, format_); ENCODE_TYPE(DT_INT8, int8, format_); + ENCODE_TYPE(DT_INT16, int16, format_); case (DT_BOOL): { const auto& input_flat = input_tensor->flat(); for (int i = 0; i < input_flat.size(); ++i) { diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc index 14d889e8e3..49b90e855b 100644 --- a/tensorflow/core/kernels/cwise_op_clip.cc +++ b/tensorflow/core/kernels/cwise_op_clip.cc @@ -33,52 +33,41 @@ class ClipOp : public OpKernel { const Tensor& in0 = ctx->input(0); const Tensor& in1 = ctx->input(1); const Tensor& in2 = ctx->input(2); + OP_REQUIRES(ctx, (in0.shape() == in1.shape() || + TensorShapeUtils::IsScalar(in1.shape())) && + (in0.shape() == in2.shape() || + TensorShapeUtils::IsScalar(in2.shape())), + errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. 
", + "input shape: ", in0.shape().DebugString(), + "clip_value_min shape: ", in1.shape().DebugString(), + "clip_value_max shape: ", in2.shape().DebugString())); + + Tensor* out = nullptr; + OP_REQUIRES_OK( + ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); + if (out->NumElements() == 0) return; // Nothing to do for empty output auto in0_flat = in0.flat(); auto in1_flat = in1.flat(); auto in2_flat = in2.flat(); + auto out_flat = out->flat(); const Device& d = ctx->eigen_device(); - Tensor* out = nullptr; - OP_REQUIRES_OK( - ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); - auto out_flat = out->flat(); if (in1.shape() == in2.shape()) { if (in0.shape() == in1.shape()) { functor::TernaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. ", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::UnaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } } else { if (in0.shape() == in1.shape()) { - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. ", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryLeftClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { - OP_REQUIRES(ctx, - (in0.shape() == in2.shape() && - TensorShapeUtils::IsScalar(in1.shape())), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. 
", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryRightClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc index 9a3b2303a3..17a85d9773 100644 --- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc @@ -57,6 +57,7 @@ struct DenseUpdate { template struct functor::DenseUpdate; \ template struct functor::DenseUpdate; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); +TF_CALL_int32(DEFINE_GPU_KERNELS); TF_CALL_int64(DEFINE_GPU_KERNELS); #undef DEFINE_GPU_KERNELS diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc index e6fefe643b..5cd8e04927 100644 --- a/tensorflow/core/kernels/gather_functor.cc +++ b/tensorflow/core/kernels/gather_functor.cc @@ -37,6 +37,7 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc index 39b6924d74..4563fc6353 100644 --- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc @@ -31,6 +31,7 @@ typedef Eigen::GpuDevice GPUDevice; DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); +TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc index 7e5a9e1ec5..4e53291b7f 100644 --- a/tensorflow/core/kernels/gather_nd_op.cc +++ b/tensorflow/core/kernels/gather_nd_op.cc @@ -228,6 +228,8 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); +TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); @@ -239,6 +241,8 @@ TF_CALL_complex128(DECLARE_GPU_SPECS); // Registration of the GPU implementations. 
#define REGISTER_GATHER_ND_GPU(type) REGISTER_GATHER_ND_ALL_INDICES(GPU, type) +TF_CALL_int32(REGISTER_GATHER_ND_GPU); +TF_CALL_int64(REGISTER_GATHER_ND_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_ND_GPU); TF_CALL_complex64(REGISTER_GATHER_ND_GPU); TF_CALL_complex128(REGISTER_GATHER_ND_GPU); diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc index b03efc684f..da8d2e9e3c 100644 --- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc @@ -119,6 +119,8 @@ struct GatherNdSlice { DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); +TF_CALL_int32(DEFINE_GPU_SPECS); +TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index ef332ebee3..094504d6b9 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -153,6 +153,7 @@ TF_CALL_uint64(REGISTER_GATHER_CPU); // Registration of the GPU implementations. #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type) +TF_CALL_int64(REGISTER_GATHER_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU); TF_CALL_complex64(REGISTER_GATHER_GPU); TF_CALL_complex128(REGISTER_GATHER_GPU); diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index 5eeb23d810..31d1b949ef 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -14,6 +14,7 @@ limitations under the License. #include #include +#include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" @@ -590,8 +591,8 @@ class MklConcatOp : public OpKernel { const int N = input_tensors.size(); // Get Tensor shapes. - std::vector input_shapes(N); - GetMklShapeList(context, "values", &input_shapes); + std::vector mkl_input_shapes(N); + GetMklShapeList(context, "values", &mkl_input_shapes); const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM) ? MklGetInput(context, 0) @@ -610,19 +611,14 @@ class MklConcatOp : public OpKernel { int i = 0; bool invoke_eigen = false; bool are_all_mkl_inputs = true, are_all_tf_inputs = true; - const TensorShape expected_shape = input_shapes[0].IsMklTensor() - ? input_shapes[0].GetTfShape() - : input_tensors[0].shape(); + const TensorShape expected_shape = mkl_input_shapes[0].IsMklTensor() + ? mkl_input_shapes[0].GetTfShape() + : input_tensors[0].shape(); size_t expected_dims = expected_shape.dims(); if (concat_dim < 0) concat_dim = expected_dims + concat_dim; - for (auto& s : input_shapes) { - if (s == expected_shape) { - ++i; - continue; - } - + for (auto& s : mkl_input_shapes) { TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); size_t s_dims = s_shape.dims(); @@ -665,21 +661,14 @@ class MklConcatOp : public OpKernel { // Call Eigen library if (invoke_eigen) { - TensorShapeList tf_input_shapes; - i = 0; - for (auto& s : input_shapes) { - TensorShape s_shape = - s.IsMklTensor() ? 
s.GetTfShape() : input_tensors[i].shape(); - tf_input_shapes.push_back(s_shape); - ++i; - } - CallEigenVersion(context, input_tensors, tf_input_shapes); + CallEigenVersion(context, input_tensors, mkl_input_shapes); return; } memory::dims dst_dims; + if (are_all_mkl_inputs) - dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape()); + dst_dims = TFShapeToMklDnnDims(mkl_input_shapes[0].GetTfShape()); else // When all the inputs are in Tensorflow format, we don't know // what is the input data format. In that case, we just use @@ -689,26 +678,61 @@ std::vector<memory::primitive_desc> srcs_pd; std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine)); int64 dst_concat_dim_size = 0; - for (int k = 0; k < N; k++) { - bool is_mkl_tensor = input_shapes[k].IsMklTensor(); - memory::dims src_dims; - - // Same comment as dst_dims for src_dims. - src_dims = (is_mkl_tensor) - ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) - : TFShapeToMklDnnDims(input_tensors[k].shape()); - - dst_concat_dim_size += src_dims[concat_dim]; - auto src_md = - is_mkl_tensor ? input_shapes[k].GetMklLayout() : - // It does not matter what data format we use here - // (NHWC or NCHW). We just need to ensure that output - // of Concat uses same data format as input. - memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw); - - srcs[k].SetUsrMem(src_md, &input_tensors[k]); - auto src_mpd = srcs[k].GetUsrMemPrimDesc(); - srcs_pd.push_back(src_mpd); + + bool isMklReorderNeeded = false; + memory::format mkl_common_format = memory::format::any; + if (are_all_mkl_inputs) { + mkl_common_format = + FindMklCommonFormat(mkl_input_shapes, concat_dim, + &isMklReorderNeeded, &dst_concat_dim_size); + + if (!isMklReorderNeeded) { + // All MKL tensors have the same format. Reorder is not needed. + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + auto src_md = mkl_input_shapes[k].GetMklLayout(); + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + auto src_mpd = srcs[k].GetUsrMemPrimDesc(); + srcs_pd.push_back(src_mpd); + } + } else { + // MKL tensors have different formats. + // Reorder them to the most common format. + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + auto src_dims = TFShapeToMklDnnDims( + mkl_input_shapes[k].GetTfShape()); + auto src_md = mkl_input_shapes[k].GetMklLayout(); + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + + if (src_md.data.format != mkl_common_format) + src_md = memory::desc(src_dims, MklDnnType<T>(), + mkl_common_format); + + srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine)); + } + } + } else { // All TF inputs + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + memory::dims src_dims = TFShapeToMklDnnDims(input_tensors[k].shape()); + dst_concat_dim_size += src_dims[concat_dim]; + + // It does not matter what data format is used (NHWC versus NCHW). + // We just need to ensure that the output uses the same data format + // as the inputs. + auto src_md = + memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw); + + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + auto src_mpd = srcs[k].GetUsrMemPrimDesc(); + srcs_pd.push_back(src_mpd); + } } dst_dims[concat_dim] = dst_concat_dim_size; @@ -718,25 +742,33 @@ if (are_all_mkl_inputs) { // Since we are passing a specific format for destination, // we need to have dst_dims in MklDnn order (NCHW).
- auto orig_tf_format = input_shapes[0].GetTfDataFormat(); + auto orig_tf_format = mkl_input_shapes[0].GetTfDataFormat(); dst_dims_in_nchw = MklDnnDimsInNCHW( dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format)); - // We will set the output in the same format as input to avoid layout - // conversions. - // Currently we are setting dst format same as input format. - // See if we can make this choice in a better way. + // Set the output format to the most common format of the inputs + // to avoid layout conversions. dst_md = memory::desc( - dst_dims_in_nchw, MklDnnType<T>(), - (memory::format)input_shapes[0].GetMklLayout().data.format); + dst_dims_in_nchw, MklDnnType<T>(), mkl_common_format); } else { - // Again, format does not matter here. We just need to make it same as - // input format. + // All inputs are TF tensors. + // Set the output format to the input format (nchw). dst_md = memory::desc(dst_dims, MklDnnType<T>(), memory::format::nchw); } std::vector<primitive::at> inputs; - for (int k = 0; k < input_tensors.size(); k++) - inputs.push_back(srcs[k].GetOpMem()); + std::vector<primitive> net; + if (isMklReorderNeeded) { + for (int k = 0; k < input_tensors.size(); k++) { + if (input_tensors[k].NumElements() > 0) { + srcs[k].CheckReorderToOpMem(srcs_pd[k], &net); + } + } + } + for (int k = 0; k < input_tensors.size(); k++) { + if (input_tensors[k].NumElements() > 0) { + inputs.push_back(srcs[k].GetOpMem()); + } + } // If all inputs are in MKL format, then the meaning of concat_dim needs to // change. The value of concat_dim is tied to the input Tensorflow data // format (NHWC or NCHW), while MklDnn dimensions are always in NCHW order. // If input tensors are in NHWC order, the semantics therefore need to // change. E.g., if we are concatenating over Channel (dimension 3 for // NHWC), then since the MklDnn order is NCHW, concat_dim needs to be 1.
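+ // For illustration: with NHWC inputs, TF dimension indices map to MklDnn's
+ // NCHW positions as N:0->0, H:1->2, W:2->3, C:3->1, which is how a concat
+ // over Channel (3) becomes 1; the TfDimIdx call below applies this mapping.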
- if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim); + if (are_all_mkl_inputs) + concat_dim = mkl_input_shapes[0].TfDimIdx(concat_dim); auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd); @@ -758,7 +791,7 @@ dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType<T>()); dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw, - input_shapes[0].GetTfDataFormat()); + mkl_input_shapes[0].GetTfDataFormat()); tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T))); } else { dnn_shape_dst.SetMklTensor(false); @@ -773,7 +806,6 @@ dst.SetUsrMem(dst_md, dst_tensor); auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); - std::vector<primitive> net; net.push_back(concat_op); stream(stream::kind::eager).submit(net).wait(); } catch (mkldnn::error& e) { @@ -787,15 +819,27 @@ } void CallEigenVersion(OpKernelContext* context, const OpInputList& values, - const TensorShapeList& input_shapes) { - CHECK_EQ(values.size(), input_shapes.size()); + const MklDnnShapeList& mkl_input_shapes) { + CHECK_EQ(values.size(), mkl_input_shapes.size()); std::vector<Tensor> converted_values; - for (int i = 0; i < input_shapes.size(); i++) - converted_values.push_back(values[i]); + TensorShapeList tf_input_shapes; + for (int i = 0; i < mkl_input_shapes.size(); i++) { + if (mkl_input_shapes[i].IsMklTensor()) { + // Do the conversion from an MKL tensor to a TF tensor. + Tensor tmp_tensor = + ConvertMklToTF<T>(context, values[i], mkl_input_shapes[i]); + converted_values.push_back(tmp_tensor); + tf_input_shapes.push_back(mkl_input_shapes[i].GetTfShape()); + } else { + // No conversion needed since it is already a TF tensor. + converted_values.push_back(values[i]); + tf_input_shapes.push_back(values[i].shape()); + } + } // Call Eigen concat. - eigen_concat_op_.Compute(context, converted_values, input_shapes); + eigen_concat_op_.Compute(context, converted_values, tf_input_shapes); // Set output Mkl tensor for this op. MklDnnShape dnn_shape_output; @@ -812,6 +856,55 @@ class MklConcatOp : public OpKernel { output_tensor->flat<uint8>().data(), output_tensor->flat<uint8>().size() * sizeof(uint8)); } + + // This method finds the most common format across all MKL inputs. + // Inputs: + // 1. input_shapes: shapes of input (MKL) tensors. + // 2. concat_dim: concat dimension. + // Outputs: + // 1. is_reorder_needed is set to true if the inputs have different formats. + // It is set to false otherwise. + // 2. concat_dim_size is the size of concat_dim. + // Return: + // The common MKL format. + memory::format FindMklCommonFormat(const MklDnnShapeList& input_shapes, + int concat_dim, bool* is_reorder_needed, int64* concat_dim_size) { + *is_reorder_needed = false; + *concat_dim_size = 0; + std::unordered_map<int, int> occurrence_map; + if (input_shapes.size() == 0) + return memory::format::any; + + // Compute the occurrences of each format over all inputs. + for (int k = 0; k < input_shapes.size(); k++) { + int fmt = static_cast<int>( + input_shapes[k].GetMklLayout().data.format); + occurrence_map[fmt] += 1; + } + + if (occurrence_map.size() == 1) { + // This means that all inputs have the same format; + // return it with is_reorder_needed set to false. + return static_cast<memory::format>( + input_shapes[0].GetMklLayout().data.format); + } + + // Input tensors have different formats. Thus, a reorder is needed. + // We pick the most common format to minimize the total + // number of input reorders.
+ memory::format commonest_format = memory::format::any; + int max_occurrence = 0; + *is_reorder_needed = true; + for (auto item : occurrence_map) { + if (item.second > max_occurrence) { + commonest_format = static_cast<memory::format>(item.first); + max_occurrence = item.second; + } + } + return commonest_format; + } }; #endif diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc index c1da0ded1d..f857be6c32 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc @@ -18,6 +18,7 @@ limitations under the License. // bias. #ifdef INTEL_MKL +#ifdef INTEL_MKL_ML #define USE_EIGEN_TENSOR #define EIGEN_USE_THREADS @@ -264,4 +265,5 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel { TF_CALL_float(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS } /* namespace tensorflow */ +#endif /* INTEL_MKL_ML */ #endif /* INTEL_MKL */ diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index 279167aba2..c0dfed7d7d 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -199,13 +199,15 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> { CHECK_NOTNULL(pool_params); CHECK_NOTNULL(dnn_data_input); TensorShape input_tensor_shape = input_tensor.shape(); - memory::desc input_md = + if (input_tensor.NumElements() != 0) { + memory::desc input_md = input_mkl_shape.IsMklTensor() ? input_mkl_shape.GetMklLayout() : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, this->data_format_tf_), MklDnnType<T>(), this->data_format_mkldnn_); - dnn_data_input->SetUsrMem(input_md, &input_tensor); + dnn_data_input->SetUsrMem(input_md, &input_tensor); + } this->InitMklPoolParameters(context, pool_params, input_mkl_shape, input_tensor_shape); } diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index 43c5b29509..e1fc2ea128 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -292,6 +292,7 @@ TF_CALL_string(REGISTER_SCATTER_ND_CPU); REGISTER_SCATTER_ND_UPDATE_GPU(type); \ REGISTER_SCATTER_ND_GPU(type); +TF_CALL_int32(REGISTER_SCATTER_ND_ALL_GPU); // TODO(b/66916790): Support half types in ScatterNd. TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ALL_GPU); TF_CALL_complex64(REGISTER_SCATTER_ND_ALL_GPU); @@ -306,6 +307,8 @@ TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU); #define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \ REGISTER_SCATTER_ND_UPDATE(type, SYCL); +TF_CALL_int32(REGISTER_SCATTER_ND_ADD_SUB_SYCL); +TF_CALL_int32(REGISTER_SCATTER_ND_UPDATE_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL); #undef REGISTER_SCATTER_ND_ADD_SUB_SYCL @@ -576,6 +579,7 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); // TODO(b/66916790): Support half types in ScatterNd.
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc index a3c21edc15..08b657f4c3 100644 --- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc @@ -170,6 +170,7 @@ struct ScatterNdFunctor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc index bb0129fa6f..634f9ba887 100644 --- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc +++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc @@ -216,8 +216,13 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) { } TEST_F(ScopedAllocatorConcatOpTest, Reshape) { - MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2); - ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}}); + MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2); + + // The elements of the third parameter to ExecOp must be multiples of + // Allocator::kAllocatorAlignment in size. If they are not, the backing + // tensor allocated by PrepOp will have too many elements and reshaping + // will fail. + ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}}); } TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) { diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index 7796bf3587..d65692a552 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -16,6 +16,14 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ + +// This file requires the following include because it uses CudaAtomicMax: +// #include "tensorflow/core/util/cuda_kernel_helper.h" + +// Unfortunately we can't add the #include, since it breaks compilation for +// non-GPU targets. This only breaks in clang, because it's more strict for +// template code and CudaAtomicMax is used in template context. + // This file requires the following include because it uses CudaAtomicMax: // #include "tensorflow/core/util/cuda_kernel_helper.h" @@ -130,4 +138,4 @@ struct Highest { } // namespace functor } // namespace tensorflow -#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ +#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index a1f9667b78..866c5dcd52 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -1490,7 +1490,7 @@ inline void LibxsmmSparseMatMul::Compute( #endif // TENSORFLOW_USE_LIBXSMM -// Here is a an overview of the SparseMatMul code. Note that we assume that the +// Here is an overview of the SparseMatMul code. Note that we assume that the // left matrix is sparse. // // The matrix "left" is divided into a grid with blocksize of (M, KL). Each diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc index 4c2b312c34..26ab72f12e 100644 --- a/tensorflow/core/kernels/string_split_op.cc +++ b/tensorflow/core/kernels/string_split_op.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { @@ -43,6 +44,63 @@ std::vector Split(const string& str, const string& delimiter, return char_vector; } +std::vector SplitV2(const string& str, StringPiece sep, int maxsplit) { + // This SplitV2 method matches the behavior of python's str.split: + // If sep is given, consecutive delimiters are not grouped together + // and are deemed to delimit empty strings (for example, '1,,2'.split(',') + // returns ['1', '', '2']). The sep argument may consist of multiple + // characters (for example, '1<>2<>3'.split('<>') returns ['1', '2', '3']). + // Splitting an empty string with a specified separator returns ['']. + // + // If sep is not specified or is None, a different splitting algorithm is + // applied: runs of consecutive whitespace are regarded as a single + // separator, and the result will contain no empty strings at the start or + // end if the string has leading or trailing whitespace. Consequently, + // splitting an empty string or a string consisting of just whitespace + // with a None separator returns []. + + std::vector result; + + StringPiece text(str); + if (maxsplit == 0) { + result.emplace_back(std::string(text)); + return result; + } + + if (sep.empty()) { + StringPiece token; + // Remove leading whitespaces. + str_util::RemoveLeadingWhitespace(&text); + int split = 0; + while (str_util::ConsumeNonWhitespace(&text, &token)) { + result.emplace_back(std::string(token)); + str_util::RemoveLeadingWhitespace(&text); + ++split; + if (maxsplit > 0 && split == maxsplit) { + result.emplace_back(std::string(text)); + return result; + } + } + return result; + } + auto p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); + int split = 0; + while (p != text.end()) { + StringPiece token = text.substr(0, p - text.begin()); + result.emplace_back(std::string(token)); + text.remove_prefix(token.size()); + text.remove_prefix(sep.size()); + ++split; + if (maxsplit > 0 && split == maxsplit) { + result.emplace_back(std::string(text)); + return result; + } + p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); + } + result.emplace_back(std::string(text)); + return result; +} + } // namespace class StringSplitOp : public OpKernel { @@ -122,6 +180,78 @@ class StringSplitOp : public OpKernel { bool skip_empty_; }; +class StringSplitV2Op : public OpKernel { + public: + explicit StringSplitV2Op(OpKernelConstruction* context) + : OpKernel(context), maxsplit_(-1) { + OP_REQUIRES_OK(context, context->GetAttr("maxsplit", &maxsplit_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* input_tensor; + OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor)); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input_tensor->shape()), + errors::InvalidArgument("input must be a vector, got shape: ", + input_tensor->shape().DebugString())); + + const auto input_vec = input_tensor->vec(); + const int64 batch_size = input_vec.dimension(0); + + const Tensor* sep_tensor; + OP_REQUIRES_OK(ctx, ctx->input("sep", &sep_tensor)); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sep_tensor->shape()), + errors::InvalidArgument("sep must be a scalar, got shape: ", + sep_tensor->shape().DebugString())); + const auto sep_vec = sep_tensor->flat(); + StringPiece sep(sep_vec(0)); + std::vector tokens; + // Guess that we'll be unpacking a 
handful of tokens per example. + static constexpr int kReserveSize = 4; + tokens.reserve(batch_size * kReserveSize); + + int64 output_size = 0; + int64 max_num_entries = 0; + std::vector<int64> num_indices(batch_size); + for (int64 i = 0; i < batch_size; ++i) { + std::vector<string> parts = SplitV2(input_vec(i), sep, maxsplit_); + int64 n_entries = parts.size(); + num_indices[i] = n_entries; + output_size += n_entries; + max_num_entries = std::max(max_num_entries, n_entries); + tokens.insert(tokens.end(), parts.begin(), parts.end()); + } + + Tensor* sp_indices_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({output_size, 2}), + &sp_indices_t)); + Tensor* sp_tokens_t; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(1, TensorShape({output_size}), &sp_tokens_t)); + Tensor* sp_shape_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({2}), &sp_shape_t)); + + auto sp_indices = sp_indices_t->matrix<int64>(); + auto sp_tokens = sp_tokens_t->vec<string>(); + auto sp_shape = sp_shape_t->vec<int64>(); + sp_shape(0) = batch_size; + sp_shape(1) = max_num_entries; + size_t c = 0; + for (size_t i = 0; i < batch_size; ++i) { + for (size_t j = 0; j < num_indices[i]; ++j) { + sp_indices(c, 0) = i; + sp_indices(c, 1) = j; + sp_tokens(c) = tokens[c]; + ++c; + } + } + } + + private: + int maxsplit_; +}; + REGISTER_KERNEL_BUILDER(Name("StringSplit").Device(DEVICE_CPU), StringSplitOp); +REGISTER_KERNEL_BUILDER(Name("StringSplitV2").Device(DEVICE_CPU), + StringSplitV2Op); } // namespace tensorflow diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc index 6e4d100b04..6e589c8d1c 100644 --- a/tensorflow/core/ops/candidate_sampling_ops.cc +++ b/tensorflow/core/ops/candidate_sampling_ops.cc @@ -145,12 +145,15 @@ REGISTER_OP("ComputeAccidentalHits") int64 num_true; TF_RETURN_IF_ERROR(c->GetAttr("num_true", &num_true)); - // Validate true_classes. + // Validate true_classes: it must be a matrix. ShapeHandle true_classes; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &true_classes)); DimensionHandle unused; TF_RETURN_IF_ERROR( c->WithValue(c->Dim(true_classes, 1), num_true, &unused)); + // Validate sampled_candidates: it must be a vector. + ShapeHandle sampled_candidates; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sampled_candidates)); // All three outputs are the same shape. ShapeHandle v = c->Vector(InferenceContext::kUnknownDim); diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 15e0ca8af9..9dca5f53ce 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -218,7 +218,17 @@ REGISTER_OP("MapAndBatchDataset") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + // Use indices from the end to retrieve the input shapes, + // so as to avoid guessing the length of "other_arguments". + // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars.
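+ // For example, the op's inputs are (input_dataset, other_arguments...,
+ // batch_size, num_parallel_batches, drop_remainder), so c->num_inputs() - 3,
+ // - 2, and - 1 always address the three trailing scalars regardless of how
+ // many other_arguments were supplied.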
+ shape_inference::ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); + + return shape_inference::ScalarShape(c); + }); REGISTER_OP("MapAndBatchDatasetV2") .Input("input_dataset: variant") @@ -231,7 +241,17 @@ .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + // Use indices from the end to retrieve the input shapes, + // so as to avoid guessing the length of "other_arguments". + // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars. + shape_inference::ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); + + return shape_inference::ScalarShape(c); + }); REGISTER_OP("PrefetchDataset") .Input("input_dataset: variant") diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index d949e70c66..87f4991134 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -454,7 +454,9 @@ REGISTER_OP("DrawBoundingBoxes") DimensionHandle unused; TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused)); - return shape_inference::UnchangedShapeWithRankAtLeast(c, 3); + // The rank of the input image (rank = 4) has already been restricted + // above, and the output is of the same shape as the input.
+ return shape_inference::UnchangedShape(c); }); // -------------------------------------------------------------------------- diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 1740fa152c..b3487122e2 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1084,7 +1084,7 @@ REGISTER_OP("UnsortedSegmentProd") .Input("segment_ids: Tindices") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: realnumbertype") + .Attr("T: numbertype") .Attr("Tindices: {int32,int64}") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .SetShapeFn(UnsortedSegmentReductionShapeFn); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index fc60e807b9..41efa49ce3 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1453,6 +1453,7 @@ REGISTER_OP("QuantizedReluX") ShapeHandle unused; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); c->set_output(1, c->Scalar()); c->set_output(2, c->Scalar()); return Status::OK(); diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 1d5c743a56..4423062362 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -78,7 +78,7 @@ REGISTER_OP("ReduceJoin") REGISTER_OP("AsString") .Input("input: T") .Output("output: string") - .Attr("T: {int32, int64, complex64, float, double, bool, int8}") + .Attr("T: {int8, int16, int32, int64, complex64, float, double, bool}") .Attr("precision: int = -1") .Attr("scientific: bool = false") .Attr("shortest: bool = false") @@ -134,6 +134,24 @@ REGISTER_OP("StringSplit") return Status::OK(); }); +REGISTER_OP("StringSplitV2") + .Input("input: string") + .Input("sep: string") + .Output("indices: int64") + .Output("values: string") + .Output("shape: int64") + .Attr("maxsplit: int = -1") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + + c->set_output(0, c->Matrix(InferenceContext::kUnknownDim, 2)); + c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); + c->set_output(2, c->Vector(2)); + return Status::OK(); + }); + REGISTER_OP("StringStrip") .Input("input: string") .Output("output: string") diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc index 99de364042..e9da3d8e32 100644 --- a/tensorflow/core/platform/cpu_info.cc +++ b/tensorflow/core/platform/cpu_info.cc @@ -344,5 +344,28 @@ int CPUModelNum() { #endif } +int CPUIDNumSMT() { +#ifdef PLATFORM_IS_X86 + // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration + // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A) + // Section: Detecting Hardware Multi-threads Support and Topology + // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures + // Other cases not supported + uint32 eax, ebx, ecx, edx; + // Check if system supports Leaf 11 + GETCPUID(eax, ebx, ecx, edx, 0, 0); + if (eax >= 11) { + // 1) Leaf 11 available? 
CPUID.(EAX=11, ECX=0):EBX != 0 + // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11, + // ECX=0):ECX[15:8] is 1 + GETCPUID(eax, ebx, ecx, edx, 11, 0); + if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) { + return 1 << (eax & 0x1f); // 2 ^ SMT_Mask_Width + } + } +#endif // PLATFORM_IS_X86 + return 0; +} + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h index b5be7e8b54..175c9ae8b1 100644 --- a/tensorflow/core/platform/cpu_info.h +++ b/tensorflow/core/platform/cpu_info.h @@ -35,6 +35,10 @@ namespace port { // software can change it dynamically. int NumSchedulableCPUs(); +// Returns an estimate of the number of hyperthreads per physical core +// on the CPU. +int NumHyperthreadsPerCore(); + // Mostly ISA related features that we care about enum CPUFeature { // Do not change numeric assignments. @@ -107,6 +111,9 @@ int CPUModelNum(); // Returns nominal core processor cycles per second of each processor. double NominalCPUFrequency(); +// Returns the number of hyperthreads per physical core. +int CPUIDNumSMT(); + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index ae81f9b5b3..a319ccbdbe 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -71,6 +71,8 @@ def pyx_library( name = filename + "_cython_translation", srcs = [filename], outs = [filename.split(".")[0] + ".cpp"], + # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3 + # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH. cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)", tools = ["@cython//:cython_binary"] + pxd_srcs, ) diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc index 72c12318ca..ff4b4436bb 100644 --- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc +++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc @@ -115,18 +115,17 @@ class LibHDFS { const char* kLibHdfsDso = "libhdfs.so"; #endif char* hdfs_home = getenv("HADOOP_HDFS_HOME"); - if (hdfs_home == nullptr) { - status_ = errors::FailedPrecondition( - "Environment variable HADOOP_HDFS_HOME not set"); - return; - } - string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); - status_ = TryLoadAndBind(path.c_str(), &handle_); - if (!status_.ok()) { - // try load libhdfs.so using dynamic loader's search path in case - // libhdfs.so is installed in non-standard location - status_ = TryLoadAndBind(kLibHdfsDso, &handle_); + if (hdfs_home != nullptr) { + string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); + status_ = TryLoadAndBind(path.c_str(), &handle_); + if (status_.ok()) { + return; + } } + + // Try to load the library dynamically in case it has been installed + // to a non-standard location. + status_ = TryLoadAndBind(kLibHdfsDso, &handle_); } Status status_; diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc index 8e316472fe..708f32ba80 100644 --- a/tensorflow/core/platform/posix/port.cc +++ b/tensorflow/core/platform/posix/port.cc @@ -74,6 +74,11 @@ int NumSchedulableCPUs() { return kDefaultCores; } +int NumHyperthreadsPerCore() { + static const int ht_per_core = tensorflow::port::CPUIDNumSMT(); + return (ht_per_core > 0) ?
ht_per_core : 1; +} + void* AlignedMalloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) return memalign(minimum_alignment, size); diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 522a9d84fd..cb1fd09dbb 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -19,12 +19,12 @@ limitations under the License. // TensorFlow uses semantic versioning, see http://semver.org/. #define TF_MAJOR_VERSION 1 -#define TF_MINOR_VERSION 8 +#define TF_MINOR_VERSION 9 #define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "" +#define TF_VERSION_SUFFIX "-rc0" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index dffc965b14..90b6533690 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -42,6 +42,7 @@ limitations under the License. #ifndef INTEL_MKL_ML #include "mkldnn.hpp" +#include "tensorflow/core/lib/core/stringpiece.h" using mkldnn::engine; using mkldnn::memory; @@ -712,15 +713,48 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, return output_tensor; } #else +using mkldnn::stream; +template <typename T> class MklDnnData; + template <typename T> inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, const MklDnnShape& mkl_shape) { Tensor output_tensor; - TensorShape output_shape; - - TF_CHECK_OK( - Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function")); - + try { + if (!mkl_shape.IsMklTensor()) + return mkl_tensor; // Return the input since it is already a TF tensor. + + TensorShape output_shape = mkl_shape.GetTfShape(); + + // Allocate the output tensor. + context->allocate_temp(DataTypeToEnum<T>::v(), + output_shape, &output_tensor); + + auto cpu_engine = engine(engine::cpu, 0); + MklDnnData<T> input(&cpu_engine); + + // Get the Mkl layout of the input tensor. + auto input_mkl_md = mkl_shape.GetMklLayout(); + auto output_tf_md = mkl_shape.GetTfLayout(); + auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine); + input.SetUsrMem(input_mkl_md, &mkl_tensor); + + // Reorder if the layouts differ. + if (input.IsReorderNeeded(output_tf_pd)) { + std::vector<primitive> net; + CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net), + true); + stream(stream::kind::eager).submit(net).wait(); + } else { + // If not, just forward the input tensor to the output tensor.
+ CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape)); + } + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + LOG(FATAL) << "Operation received an exception: " << error_msg; + } return output_tensor; } #endif @@ -1843,7 +1877,7 @@ class FactoryKeyCreator { template void AddAsKey(const T data) { auto buffer = reinterpret_cast(&data); - Append(absl::string_view(buffer, sizeof(T))); + Append(StringPiece(buffer, sizeof(T))); } std::string GetKey() { @@ -1854,8 +1888,8 @@ class FactoryKeyCreator { string key_; const char delimiter = 'x'; const int kMaxKeyLength = 256; - void Append(absl::string_view s) { - key_.append(string(s)); + void Append(StringPiece s) { + key_.append(s.ToString()); key_.append(1, delimiter); } }; diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md index d92f5775fa..0b07d413da 100644 --- a/tensorflow/docs_src/community/groups.md +++ b/tensorflow/docs_src/community/groups.md @@ -1,17 +1,38 @@ # User Groups -TensorFlow has communities around the world. +TensorFlow has communities around the world. [Submit your community!](https://docs.google.com/forms/d/e/1FAIpQLSc_RQIUYtVgLLihzATaO_WUXkEyBDE_OoRoOXYDPmBEvHuEBA/viewform) ## Asia -* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_ -* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_ -* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_ +* [TensorFlow China community](https://www.tensorflowers.cn) +* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) +* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) +* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) * [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/) +* [TensorFlow Philippines Community](https://www.facebook.com/groups/TensorFlowPH/) +* [TensorFlow and Deep Learning Singapore](https://www.meetup.com/TensorFlow-and-Deep-Learning-Singapore/) +* [TensorFlow India](https://www.facebook.com/tensorflowindia) ## Europe * [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/) * [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/) +* [Tensorflow Belgium](https://www.meetup.com/TensorFlow-Belgium) +* [TensorFlow x Rome Meetup](https://www.meetup.com/it-IT/TensorFlow-x-Rome-Meetup) +* [TensorFlow London](https://www.meetup.com/TensorFlow-London/) +* [TensorFlow Edinburgh](https://www.meetup.com/tensorflow-edinburgh/) + +## America + +* [TensorFlow Buenos Aires](https://www.meetup.com/TensorFlow-Buenos-Aires/) + + +## Oceania +* [Melbourne TensorFlow Meetup](https://www.meetup.com/Melbourne-TensorFlow-Meetup) + + +## Africa + +* [TensorFlow Tunis Meetup](https://www.meetup.com/fr-FR/TensorFlow-Tunis-Meetup/) diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md index f08ac74425..bbb25e20c6 100644 --- a/tensorflow/docs_src/get_started/eager.md +++ b/tensorflow/docs_src/get_started/eager.md @@ -1,3 +1,3 @@ # Get Started with Eager Execution -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.8.0/samples/core/get_started/eager.ipynb) +[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.9.0/samples/core/get_started/eager.ipynb) diff --git 
a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md index 55579d52fb..232d2f1547 100644 --- a/tensorflow/docs_src/get_started/index.md +++ b/tensorflow/docs_src/get_started/index.md
@@ -10,9 +10,9 @@ course prior to diving into TensorFlow documentation:
 TensorFlow is a tool for machine learning. While it contains a wide range of functionality, TensorFlow is mainly designed for deep neural network models.
-The easiest way to get started with TensorFlow is using Eager Execution.
+The easiest way to get started with TensorFlow is by using Eager Execution.
- * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow.
+ * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow.
 TensorFlow provides many APIs. The remainder of this section focuses on the Estimator API which provide scalable, high-performance models. See the
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md index 1abd840ab3..2901848745 100644 --- a/tensorflow/docs_src/install/install_c.md +++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
 OS="linux" # Change to "darwin" for macOS
 TARGET_DIRECTORY="/usr/local"
 curl -L \
-  "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" |
+  "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" |
 sudo tar -C $TARGET_DIRECTORY -xz
 The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md index 52a2a3f8a6..55bc0f64e7 100644 --- a/tensorflow/docs_src/install/install_go.md +++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
 TF_TYPE="cpu" # Change to "gpu" for GPU support
 TARGET_DIRECTORY='/usr/local'
 curl -L \
-  "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0.tar.gz" |
+  "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.9.0-rc0.tar.gz" |
 sudo tar -C $TARGET_DIRECTORY -xz
 The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index 1256fb99c4..637231da12 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.8.0</version>
+  <version>1.9.0-rc0</version>
 </dependency>
 ```
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.8.0</version>
+  <version>1.9.0-rc0</version>
 </dependency>
@@ -124,12 +124,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.8.0</version>
+  <version>1.9.0-rc0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.8.0</version>
+  <version>1.9.0-rc0</version>
 </dependency>
 ```
@@ -148,7 +148,7 @@ refer to the simpler instructions above instead. Take the following steps to install TensorFlow for Java on Linux or macOS:
 1. Download
-   [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar),
+   [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar),
    which is the TensorFlow Java Archive (JAR).
 2.
Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 OS=$(uname -s | tr '[:upper:]' '[:lower:]')
 mkdir -p ./jni
 curl -L \
-  "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" |
+  "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" |
 tar -xz -C ./jni
 ### Install on Windows
@@ -175,13 +175,13 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 1. Download
-   [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar),
+   [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar),
    which is the TensorFlow Java Archive (JAR).
 2. Download the following Java Native Interface (JNI) file appropriate for
-   [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip).
+   [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.9.0-rc0.zip).
 3. Extract this .zip file.
-
+__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package.
 ### Validate the installation
@@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the downloaded `.jar` in your `classpath` by using the `-cp` compilation flag as follows:
-javac -cp libtensorflow-1.8.0.jar HelloTF.java
+javac -cp libtensorflow-1.9.0-rc0.jar HelloTF.java
### Running
@@ -241,11 +241,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux and macOS X:
-java -cp libtensorflow-1.8.0.jar:. -Djava.library.path=./jni HelloTF
+java -cp libtensorflow-1.9.0-rc0.jar:. -Djava.library.path=./jni HelloTF
 And the following command line executes the `HelloTF` program on Windows:
-java -cp libtensorflow-1.8.0.jar;. -Djava.library.path=jni HelloTF
+java -cp libtensorflow-1.9.0-rc0.jar;. -Djava.library.path=jni HelloTF
If the program prints Hello from version, you've successfully installed TensorFlow for Java and are ready to use the API. If the program diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index 0ed8160027..c8d706cf3c 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -339,9 +339,7 @@ Docker will download the TensorFlow binary image the first time you launch it. #### GPU support -Prior to installing TensorFlow with GPU support, ensure that your system meets all -[NVIDIA software requirements](#NVIDIARequirements). To launch a Docker container -with NVidia GPU support, enter a command of the following format: +To launch a Docker container with NVidia GPU support, enter a command of the following format (this [does not require any local CUDA installation](https://github.com/nvidia/nvidia-docker/wiki/CUDA#requirements)):
 $ nvidia-docker run -it -p hostPort:containerPort TensorFlowGPUImage
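Once such a container is running, a quick in-container check confirms that TensorFlow actually sees the GPU. This is an illustrative sketch, not part of the patch; it assumes the TF 1.x test helpers `tf.test.is_gpu_available` and `tf.test.gpu_device_name`:

```python
# Hypothetical check, run inside the nvidia-docker container.
import tensorflow as tf

print(tf.test.is_gpu_available())  # True when a CUDA device is visible
print(tf.test.gpu_device_name())   # e.g. '/device:GPU:0'
```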
@@ -438,7 +436,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      
      (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
## Validate your installation @@ -517,7 +515,7 @@ on your system: from source. To use the TensorFlow binaries, version 3.5 or higher is required. See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a list of supported GPU cards. -* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA +* [GPU drivers](http://nvidia.com/drivers) that support your version of the CUDA Toolkit. * The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This library provides advanced profiling support. To install this library, @@ -684,14 +682,14 @@ This section documents the relevant values for Linux installations. CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp27-none-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp27-none-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -703,14 +701,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -722,14 +720,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
 
@@ -741,14 +739,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md index 29a867a9e3..9d01271c5a 100644 --- a/tensorflow/docs_src/install/install_mac.md +++ b/tensorflow/docs_src/install/install_mac.md @@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv: TensorFlow in the active Virtualenv is as follows:
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl If you encounter installation problems, see [Common Installation Problems](#common-installation-problems). @@ -242,7 +242,7 @@ take the following steps: issue the following command:
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl 
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl If the preceding command fails, see [installation problems](#common-installation-problems). @@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: TensorFlow for Python 2.7:
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl @@ -522,7 +522,7 @@ The value you specify depends on your Python version.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
 
@@ -530,5 +530,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index 5ba522b436..dc6c1e36fc 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -81,7 +81,7 @@ or [macOS](#PrepareMac) - + ## Prepare environment for Linux Before building TensorFlow on Linux, install the following build @@ -328,10 +328,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl` file depends on your platform. For example, the following command will install the pip package -for TensorFlow 1.8.0 on Linux: +for TensorFlow 1.9.0rc0 on Linux:
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-py2-none-any.whl
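Before the validation section below, a quick smoke test of the freshly installed wheel can save a rebuild. This is a sketch under TF 1.x conventions (the graph-mode `tf.Session` API and the `tf.VERSION` constant), not something prescribed by this patch:

```python
# Hypothetical post-install check for the 1.9.0-rc0 wheel.
import tensorflow as tf

print(tf.VERSION)  # expect '1.9.0-rc0' for this release candidate
hello = tf.constant('Hello, TensorFlow!')
with tf.Session() as sess:
    print(sess.run(hello))  # b'Hello, TensorFlow!'
```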
 
## Validate your installation
@@ -373,9 +373,9 @@ The build and installation problems you encounter typically depend on the operating system. See the "Common installation problems" section of one of the following guides:
- * @{$install_linux#CommonInstallationProblems$Installing TensorFlow on Linux}
- * @{$install_mac#CommonInstallationProblems$Installing TensorFlow on Mac OS}
- * @{$install_windows#CommonInstallationProblems$Installing TensorFlow on Windows}
+ * @{$install_linux#common_installation_problems$Installing TensorFlow on Linux}
+ * @{$install_mac#common_installation_problems$Installing TensorFlow on Mac OS}
+ * @{$install_windows#common_installation_problems$Installing TensorFlow on Windows}
 Beyond the errors documented in those two guides, the following table notes additional errors specific to building TensorFlow. Note that we
@@ -433,6 +433,8 @@ Stack Overflow and specify the `tensorflow` tag.
**Linux**
 Version:             | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
+tensorflow-1.9.0     | CPU      | 2.7, 3.3-3.6    | GCC 4.8   | Bazel 0.11.0 | N/A    | N/A
+tensorflow_gpu-1.9.0 | GPU      | 2.7, 3.3-3.6    | GCC 4.8   | Bazel 0.11.0 | 7      | 9
 tensorflow-1.8.0     | CPU      | 2.7, 3.3-3.6    | GCC 4.8   | Bazel 0.10.0 | N/A    | N/A
 tensorflow_gpu-1.8.0 | GPU      | 2.7, 3.3-3.6    | GCC 4.8   | Bazel 0.9.0  | 7      | 9
 tensorflow-1.7.0     | CPU      | 2.7, 3.3-3.6    | GCC 4.8   | Bazel 0.10.0 | N/A    | N/A
@@ -456,6 +458,7 @@ Stack Overflow and specify the `tensorflow` tag.
**Mac**
 Version:         | CPU/GPU: | Python Version: | Compiler:        | Build Tools: | cuDNN: | CUDA:
+tensorflow-1.9.0 | CPU      | 2.7, 3.3-3.6    | Clang from xcode | Bazel 0.11.0 | N/A    | N/A
 tensorflow-1.8.0 | CPU      | 2.7, 3.3-3.6    | Clang from xcode | Bazel 0.10.1 | N/A    | N/A
 tensorflow-1.7.0 | CPU      | 2.7, 3.3-3.6    | Clang from xcode | Bazel 0.10.1 | N/A    | N/A
 tensorflow-1.6.0 | CPU      | 2.7, 3.3-3.6    | Clang from xcode | Bazel 0.8.1  | N/A    | N/A
@@ -472,6 +475,8 @@ Stack Overflow and specify the `tensorflow` tag.
**Windows**
 Version:             | CPU/GPU: | Python Version: | Compiler:          | Build Tools: | cuDNN: | CUDA:
+tensorflow-1.9.0     | CPU      | 3.5-3.6         | MSVC 2015 update 3 | Cmake v3.6.3 | N/A    | N/A
+tensorflow_gpu-1.9.0 | GPU      | 3.5-3.6         | MSVC 2015 update 3 | Cmake v3.6.3 | 7      | 9
 tensorflow-1.8.0     | CPU      | 3.5-3.6         | MSVC 2015 update 3 | Cmake v3.6.3 | N/A    | N/A
 tensorflow_gpu-1.8.0 | GPU      | 3.5-3.6         | MSVC 2015 update 3 | Cmake v3.6.3 | 7      | 9
 tensorflow-1.7.0     | CPU      | 3.5-3.6         | MSVC 2015 update 3 | Cmake v3.6.3 | N/A    | N/A
diff --git a/tensorflow/docs_src/mobile/linking_libs.md b/tensorflow/docs_src/mobile/linking_libs.md index cf0db59021..efef5dd0da 100644 --- a/tensorflow/docs_src/mobile/linking_libs.md +++ b/tensorflow/docs_src/mobile/linking_libs.md
@@ -27,7 +27,7 @@ called `libandroid_tensorflow_inference_java.jar`. There are three ways to include this functionality in your program:
 1. Include the jcenter AAR which contains it, as in this
-   [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/build.gradle#L59-L65)
+   [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/tfmobile/build.gradle#L59-L65)
 2. Download the nightly precompiled version from [ci.tensorflow.org](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/).
diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md index 8b22c04d87..2b84dbb973 100644 --- a/tensorflow/docs_src/mobile/prepare_models.md +++ b/tensorflow/docs_src/mobile/prepare_models.md
@@ -105,8 +105,8 @@ inline constants so everything’s in one file. To handle the conversion, you need the `freeze_graph.py` script, that’s held in [`tensorflow/python/tools/freeze_graph.py`](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py). You’ll run it like this:
-    bazel build tensorflow/tools:freeze_graph
-    bazel-bin/tensorflow/tools/freeze_graph \
+    bazel build tensorflow/python/tools:freeze_graph
+    bazel-bin/tensorflow/python/tools/freeze_graph \
     --input_graph=/tmp/model/my_graph.pb \
     --input_checkpoint=/tmp/model/model.ckpt-1000 \
     --output_graph=/tmp/frozen_graph.pb \
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md index 2fea02d861..c97f74139c 100644 --- a/tensorflow/docs_src/performance/quantization.md +++ b/tensorflow/docs_src/performance/quantization.md
@@ -227,8 +227,8 @@ of 30.0f, and an 8-bit array, the quantized values represent the following:
 Quantized | Float
 0         | -10.0
-255       | 30.0
 128       | 10.0
+255       | 30.0
Table 2: Example quantized value range
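The row move above only restores ascending order; the mapping itself is unchanged. As a worked check of Table 2 (this formula is implied by the quoted doc text rather than stated in the patch): a linear quantization of the float range $[-10.0, 30.0]$ onto 8-bit values $q \in [0, 255]$ gives

$$x(q) = -10.0 + q \cdot \frac{30.0 - (-10.0)}{255} = -10.0 + q \cdot \frac{40.0}{255},$$

so $x(0) = -10.0$, $x(128) \approx 10.08 \approx 10.0$, and $x(255) = 30.0$, matching the three rows.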
diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md index c4aae1d9d6..b13b47184d 100644 --- a/tensorflow/docs_src/programmers_guide/estimators.md +++ b/tensorflow/docs_src/programmers_guide/estimators.md
@@ -21,18 +21,17 @@ Note: TensorFlow also includes a deprecated `Estimator` class at
 Estimators provide the following benefits:
-* You can run Estimators-based models on a local host or on a
+* You can run Estimator-based models on a local host or on a
   distributed multi-server environment without changing your model.
-  Furthermore, you can run Estimators-based models on CPUs, GPUs,
+  Furthermore, you can run Estimator-based models on CPUs, GPUs,
   or TPUs without recoding your model.
 * Estimators simplify sharing implementations between model developers.
-* You can develop a state of the art model with high-level intuitive code,
+* You can develop a state of the art model with high-level intuitive code.
   In short, it is generally much easier to create models with Estimators
   than with the low-level TensorFlow APIs.
-* Estimators are themselves built on tf.layers, which
+* Estimators are themselves built on @{tf.layers}, which
   simplifies customization.
-* Estimators build the graph for you. In other words, you don't have to
-  build the graph.
+* Estimators build the graph for you.
 * Estimators provide a safe distributed training loop that controls how and when to:
   * build the graph
@@ -57,7 +56,7 @@ the "plumbing" for you. That is, pre-made Estimators create and manage
 pre-made Estimators let you experiment with different model architectures by making only minimal code changes. @{tf.estimator.DNNClassifier$`DNNClassifier`}, for example, is a pre-made Estimator class that trains classification models
-through dense, feed-forward neural networks.
+based on dense, feed-forward neural networks.
 ### Structure of a pre-made Estimators program
@@ -79,7 +78,7 @@ of the following four steps:
 an input function:
     def input_fn(dataset):
-       ...  # manipulate dataset, extracting feature names and the label
+       ...  # manipulate dataset, extracting the feature dict and the label
        return feature_dict, label
 (See @{$programmers_guide/datasets} for full details.)
@@ -96,13 +95,13 @@ of the following four steps:
     population = tf.feature_column.numeric_column('population')
     crime_rate = tf.feature_column.numeric_column('crime_rate')
     median_education = tf.feature_column.numeric_column('median_education',
-                        normalizer_fn='lambda x: x - global_education_mean')
+                        normalizer_fn=lambda x: x - global_education_mean)
 3. **Instantiate the relevant pre-made Estimator.** For example, here's a sample instantiation of a pre-made Estimator named `LinearClassifier`:
     # Instantiate an estimator, passing the feature columns.
-    estimator = tf.estimator.Estimator.LinearClassifier(
+    estimator = tf.estimator.LinearClassifier(
       feature_columns=[population, crime_rate, median_education],
       )
diff --git a/tensorflow/docs_src/programmers_guide/feature_columns.md b/tensorflow/docs_src/programmers_guide/feature_columns.md index 845194fe0e..90f5c53a17 100644 --- a/tensorflow/docs_src/programmers_guide/feature_columns.md +++ b/tensorflow/docs_src/programmers_guide/feature_columns.md
@@ -528,10 +528,10 @@ suggested by the following snippet:
 categorical_column = ... # Create any categorical column
 # Represent the categorical column as an embedding column.
-# This means creating a one-hot vector with one element for each category.
+# This means creating an embedding vector lookup table with one element for each category. embedding_column = tf.feature_column.embedding_column( categorical_column=categorical_column, - dimension=dimension_of_embedding_vector) + dimension=embedding_dimensions) ``` @{$programmers_guide/embedding$Embeddings} is a significant topic within machine diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py index 03e60972aa..86f5204ec3 100644 --- a/tensorflow/examples/learn/iris.py +++ b/tensorflow/examples/learn/iris.py @@ -21,7 +21,8 @@ from __future__ import division from __future__ import print_function import os -import urllib + +from six.moves.urllib.request import urlretrieve import tensorflow as tf @@ -38,9 +39,7 @@ FEATURE_KEYS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] def maybe_download_iris_data(file_name, download_url): """Downloads the file and returns the number of data.""" if not os.path.exists(file_name): - raw = urllib.urlopen(download_url).read() - with open(file_name, 'w') as f: - f.write(raw) + urlretrieve(download_url, file_name) # The first line is a comma-separated string. The first one is the number of # total data in the file. diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 5602775b62..a5224fbda0 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -10955,7 +10955,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value. // // value: The cropped area of the image must contain a fraction of the -// supplied image within in this range. +// supplied image within this range. // If not specified, defaults to func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { @@ -18098,9 +18098,10 @@ func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_val } // Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)` -// // if < 0, `scale * features` otherwise. // +// Assumes weights to have zero mean and variance 1.0 / fan_in. +// // See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) func Selu(scope *Scope, features tf.Output) (activations tf.Output) { if scope.Err() != nil { @@ -21625,7 +21626,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { // generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. // // The `bad_color` argument is the color to use in the generated images for -// non-finite input values. It is a `unit8` 1-D tensor of length `channels`. +// non-finite input values. It is a `uint8` 1-D tensor of length `channels`. // Each element must be in the range `[0, 255]` (It represents the value of a // pixel in the output image). Non-finite values in the input tensor are // replaced by this tensor in the output image. The default value is the color @@ -24018,7 +24019,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value. // // value: The cropped area of the image must contain a fraction of the -// supplied image within in this range. +// supplied image within this range. 
// If not specified, defaults to func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { @@ -24714,8 +24715,7 @@ type DecodeProtoV2Attr func(optionalAttr) // If not specified, defaults to "local://" func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr { return func(m optionalAttr) { - m["descriptor_source"] = value - } + m["descriptor_source"] = value } } // DecodeProtoV2MessageFormat sets the optional message_format attribute to value. diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc index debd95fc62..9b171f66ec 100644 --- a/tensorflow/java/src/gen/cc/op_generator.cc +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -376,9 +376,6 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, } } // op annotations - op_class.add_annotation( - Annotation::Create("Generated", "javax.annotation") - .attributes("value = \"TensorFlow Java Op Generator\"")); if (endpoint.deprecated()) { op_class.add_annotation(Annotation::Create("Deprecated")); string explanation; @@ -415,8 +412,12 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, SourceFileWriter writer(op_file.get()); std::list dependencies; CollectOpDependencies(op, mode, &dependencies); - writer.Write(kLicense).EndLine().BeginType(op_class, PUBLIC | FINAL, - &dependencies, &op_javadoc); + writer.Write(kLicense) + .EndLine() + .Write("// This class has been generated, DO NOT EDIT!") + .EndLine() + .EndLine() + .BeginType(op_class, PUBLIC | FINAL, &dependencies, &op_javadoc); if (!op.optional_attributes().empty()) { RenderOptionsClass(op, op_class, &writer); } diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc index 181fd4c5e3..941ab2699c 100644 --- a/tensorflow/java/src/gen/cc/op_specs.cc +++ b/tensorflow/java/src/gen/cc/op_specs.cc @@ -96,6 +96,7 @@ Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) { *iterable_out = true; visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int())); } + Type type = Type::Wildcard(); if (arg_def.type() != DataType::DT_INVALID) { // resolve type from DataType diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index b2e6c60021..bd97b181ff 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -196,11 +196,11 @@ def implicit_val_and_grad(f): # TODO(cais): Remove calls to tf.constant() once the gradients functions # accept lists and np.ndarrays. 
- def grad_fn(*args): + def grad_fn(*args, **kwds): """Computes the gradient of the wrapped function.""" this_tape = tape.push_new_tape() try: - end_node = f(*args) + end_node = f(*args, **kwds) if end_node is None: raise ValueError("Cannot differentiate a function that returns None; " "did you forget to return a value from {}?".format( diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD index 9cd17e0407..20522098b0 100644 --- a/tensorflow/python/estimator/BUILD +++ b/tensorflow/python/estimator/BUILD @@ -978,7 +978,10 @@ py_test( size = "large", srcs = ["keras_test.py"], srcs_version = "PY2AND3", - tags = ["notsan"], + tags = [ + "no_windows", + "notsan", + ], deps = [ ":keras", "//tensorflow/core:protos_all_py", diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py index 7cdf840c97..b18212cfcd 100644 --- a/tensorflow/python/estimator/exporter.py +++ b/tensorflow/python/estimator/exporter.py @@ -156,7 +156,7 @@ def _loss_smaller(best_eval_result, current_eval_result): return best_eval_result[default_key] > current_eval_result[default_key] -def _verify_compre_fn_args(compare_fn): +def _verify_compare_fn_args(compare_fn): """Verifies compare_fn arguments.""" args = set(util.fn_args(compare_fn)) if 'best_eval_result' not in args: @@ -265,7 +265,7 @@ class BestExporter(Exporter): self._compare_fn = compare_fn if self._compare_fn is None: raise ValueError('`compare_fn` must not be None.') - _verify_compre_fn_args(self._compare_fn) + _verify_compare_fn_args(self._compare_fn) self._saved_model_exporter = _SavedModelExporter( name, serving_input_receiver_fn, assets_extra, as_text) diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py index 035c7c148c..a6cefdece2 100644 --- a/tensorflow/python/estimator/inputs/numpy_io.py +++ b/tensorflow/python/estimator/inputs/numpy_io.py @@ -136,11 +136,13 @@ def numpy_input_fn(x, values in `x` have same shape). ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict. ValueError: if x or y is an empty dict. - TypeError: `x` is not a dict or array, or if `shuffle` is not bool. + TypeError: `x` is not a dict or array. + ValueError: if 'shuffle' is not provided or a bool. """ if not isinstance(shuffle, bool): - raise TypeError('shuffle must be explicitly set as boolean; ' - 'got {}'.format(shuffle)) + raise ValueError('shuffle must be provided and explicitly set as boolean ' + '(it is recommended to set it as True for training); ' + 'got {}'.format(shuffle)) def input_fn(): """Numpy input function.""" diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py index 92d057e25d..81b201cc5c 100644 --- a/tensorflow/python/estimator/inputs/numpy_io_test.py +++ b/tensorflow/python/estimator/inputs/numpy_io_test.py @@ -286,8 +286,9 @@ class NumpyIoTest(test.TestCase): x = np.arange(32, 36) y = np.arange(4) with self.test_session(): - with self.assertRaisesRegexp(TypeError, - 'shuffle must be explicitly set as boolean'): + with self.assertRaisesRegexp(ValueError, + 'shuffle must be provided and explicitly ' + 'set as boolean'): # Default shuffle is None. 
numpy_io.numpy_input_fn(x, y) diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py index 938e244fb3..57f8e5fd6a 100644 --- a/tensorflow/python/estimator/inputs/pandas_io.py +++ b/tensorflow/python/estimator/inputs/pandas_io.py @@ -68,15 +68,16 @@ def pandas_input_fn(x, Raises: ValueError: if `x` already contains a column with the same name as `y`, or if the indexes of `x` and `y` don't match. - TypeError: `shuffle` is not bool. + ValueError: if 'shuffle' is not provided or a bool. """ if not HAS_PANDAS: raise TypeError( 'pandas_input_fn should not be called without pandas installed') if not isinstance(shuffle, bool): - raise TypeError('shuffle must be explicitly set as boolean; ' - 'got {}'.format(shuffle)) + raise ValueError('shuffle must be provided and explicitly set as boolean ' + '(it is recommended to set it as True for training); ' + 'got {}'.format(shuffle)) x = x.copy() if y is not None: diff --git a/tensorflow/python/estimator/inputs/pandas_io_test.py b/tensorflow/python/estimator/inputs/pandas_io_test.py index e5912a3b28..dcecf6dd61 100644 --- a/tensorflow/python/estimator/inputs/pandas_io_test.py +++ b/tensorflow/python/estimator/inputs/pandas_io_test.py @@ -70,8 +70,9 @@ class PandasIoTest(test.TestCase): return x, _ = self.makeTestDataFrame() y_noindex = pd.Series(np.arange(-32, -28)) - with self.assertRaisesRegexp(TypeError, - 'shuffle must be explicitly set as boolean'): + with self.assertRaisesRegexp(ValueError, + 'shuffle must be provided and explicitly ' + 'set as boolean'): # Default shuffle is None pandas_io.pandas_input_fn(x, y_noindex) diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py index 8e2ec83020..51a61adb21 100644 --- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py +++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py @@ -250,7 +250,7 @@ class _PandasFeedFn(object): num_epochs=None): if len(placeholders) != len(dataframe.columns) + 1: raise ValueError("Expected {} placeholders; got {}.".format( - len(dataframe.columns), len(placeholders))) + len(dataframe.columns) + 1, len(placeholders))) self._index_placeholder = placeholders[0] self._col_placeholders = placeholders[1:] self._dataframe = dataframe diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py index c80af08fba..2f439f765e 100644 --- a/tensorflow/python/estimator/keras.py +++ b/tensorflow/python/estimator/keras.py @@ -70,7 +70,7 @@ def _convert_tensor(x): return x -def _any_variable_initalized(): +def _any_variable_initialized(): """Check if any variable has been initialized in the Keras model. Returns: @@ -511,7 +511,7 @@ def model_to_estimator(keras_model=None, keras_model_fn, model_dir=model_dir, config=config) # Check if we need to call get_weights: - if _any_variable_initalized(): + if _any_variable_initialized(): keras_weights = keras_model.get_weights() # Warn if config passed to estimator tries to update GPUOptions. 
If a # session has already been created, the GPUOptions passed to the first diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py index 6688a84130..5e094ae92b 100644 --- a/tensorflow/python/estimator/keras_test.py +++ b/tensorflow/python/estimator/keras_test.py @@ -31,10 +31,10 @@ from tensorflow.python.estimator import run_config as run_config_lib from tensorflow.python.estimator.inputs import numpy_io from tensorflow.python.framework import ops from tensorflow.python.framework import test_util -from tensorflow.python.keras import backend as K from tensorflow.python.keras import testing_utils from tensorflow.python.keras.applications import mobilenet from tensorflow.python.keras.optimizers import SGD +from tensorflow.python.ops.parsing_ops import gen_parsing_ops from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache @@ -146,13 +146,13 @@ def randomize_io_type(array, name): def multi_inputs_multi_outputs_model(): a = keras.layers.Input(shape=(16,), name='input_a') b = keras.layers.Input(shape=(16,), name='input_b') - m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m') + m = keras.layers.Input(shape=(8,), dtype='string', name='input_m') dense = keras.layers.Dense(8, name='dense_1') a_2 = dense(a) - # Apply a mask - s_2 = keras.layers.Lambda(lambda k: - K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2]) + # Read m + m_2 = keras.layers.Lambda(gen_parsing_ops.string_to_number)(m) + s_2 = keras.layers.Lambda(lambda k: k[0] * k[1])([m_2, a_2]) b_2 = dense(b) merged = keras.layers.concatenate([s_2, b_2], name='merge') c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged) @@ -372,13 +372,13 @@ class TestKerasEstimator(test_util.TensorFlowTestCase): def train_input_fn(): input_dict = {'input_a': a_train, 'input_b': b_train, - 'input_m': input_m_train > 0} + 'input_m': input_m_train.astype(np.str)} output_dict = {'dense_2': c_train, 'dense_3': d_train} return input_dict, output_dict def eval_input_fn(): input_dict = {'input_a': a_test, 'input_b': b_test, - 'input_m': input_m_test > 0} + 'input_m': input_m_test.astype(np.str)} output_dict = {'dense_2': c_test, 'dense_3': d_test} return input_dict, output_dict diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py index 2d6925d1a8..af5d709f7e 100644 --- a/tensorflow/python/grappler/layout_optimizer_test.py +++ b/tensorflow/python/grappler/layout_optimizer_test.py @@ -1389,7 +1389,7 @@ class LayoutOptimizerTest(test.TestCase): expected_num_transposes = 3 self.assertEqual(expected_num_transposes, num_transposes) self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes) - self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes) + self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes) self.assertAllClose(output_val_ref, output_val, atol=1e-3) def testLoopWithVecAnd4D(self): @@ -1413,7 +1413,7 @@ class LayoutOptimizerTest(test.TestCase): expected_num_transposes = 2 self.assertEqual(expected_num_transposes, num_transposes) self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes) - self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes) + self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes) self.assertAllClose(output_val_ref, output_val, atol=1e-3) def testBinaryOpSecondPort(self): diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 
e487f583be..f608dea430 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -93,6 +93,8 @@ def selu(x): - To be used together with the initialization "lecun_normal". - To be used together with the dropout variant "AlphaDropout". + References: + - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) """ alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 70b6a8431a..9f91368e5b 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -724,15 +724,6 @@ class TensorBoard(Callback): for weight in layer.weights: mapped_weight_name = weight.name.replace(':', '_') tf_summary.histogram(mapped_weight_name, weight) - if self.write_grads: - grads = model.optimizer.get_gradients(model.total_loss, weight) - - def is_indexed_slices(grad): - return type(grad).__name__ == 'IndexedSlices' - - grads = [grad.values if is_indexed_slices(grad) else grad - for grad in grads] - tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) if self.write_images: w_img = array_ops.squeeze(weight) shape = K.int_shape(w_img) @@ -759,6 +750,18 @@ class TensorBoard(Callback): assert len(shape) == 4 and shape[-1] in [1, 3, 4] tf_summary.image(mapped_weight_name, w_img) + if self.write_grads: + for weight in layer.trainable_weights: + mapped_weight_name = weight.name.replace(':', '_') + grads = model.optimizer.get_gradients(model.total_loss, weight) + + def is_indexed_slices(grad): + return type(grad).__name__ == 'IndexedSlices' + + grads = [grad.values if is_indexed_slices(grad) else grad + for grad in grads] + tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) + if hasattr(layer, 'output'): tf_summary.histogram('{}_out'.format(layer.name), layer.output) self.merged = tf_summary.merge_all() diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index b355f4a269..5062a26580 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -653,6 +653,8 @@ class KerasCallbacksTest(test.TestCase): model.add( keras.layers.Dense( NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu')) + # non_trainable_weights: moving_variance, moving_mean + model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax')) model.compile( loss='categorical_crossentropy', diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index a4cd017d60..1c9135982e 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -123,7 +123,7 @@ class Network(base_layer.Layer): # Entries are unique. Includes input and output layers. 
self._layers = [] - # Used in symbolic mode only, only in conjonction with graph-networks + # Used in symbolic mode only, only in conjunction with graph-networks self._outbound_nodes = [] self._inbound_nodes = [] diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py index 6a94986b9c..7e82db028b 100644 --- a/tensorflow/python/keras/engine/saving_test.py +++ b/tensorflow/python/keras/engine/saving_test.py @@ -482,7 +482,7 @@ class TestWholeModelSaving(test.TestCase): with h5py.File(fname, 'r') as h5file: num_names_arrays = len([attr for attr in h5file['model_weights'].attrs if attr.startswith('layer_names')]) - # The chunking of layer names array should have happend. + # The chunking of layer names array should have happened. self.assertGreater(num_names_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) @@ -527,7 +527,7 @@ class TestWholeModelSaving(test.TestCase): num_weight_arrays = len( [attr for attr in h5file['model_weights']['nested_model'].attrs if attr.startswith('weight_names')]) - # The chunking of layer names array should have happend. + # The chunking of layer names array should have happened. self.assertGreater(num_weight_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 89c1f1a40f..fce6cbdb7a 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util @@ -409,11 +410,13 @@ class Model(Network): else: if sample_weight_mode == 'temporal': sample_weights.append(array_ops.placeholder_with_default( - [[1.]], shape=[None, None], name=name + '_sample_weights')) + constant_op.constant([[1.]], dtype=K.floatx()), + shape=[None, None], name=name + '_sample_weights')) sample_weight_modes.append('temporal') else: sample_weights.append(array_ops.placeholder_with_default( - [1.], shape=[None], name=name + '_sample_weights')) + constant_op.constant([1.], dtype=K.floatx()), + shape=[None], name=name + '_sample_weights')) sample_weight_modes.append(None) self.sample_weight_modes = sample_weight_modes self._feed_sample_weight_modes = [] diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index 2ecbff3a1c..e8838cd3bc 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -732,7 +732,7 @@ def slice_arrays(arrays, indices, contiguous=True): """Slices batches out of provided arrays (workaround for eager tensors). Unfortunately eager tensors don't have the same slicing behavior as - Numpy arrays (they folow the same slicing behavior as symbolic TF tensors), + Numpy arrays (they follow the same slicing behavior as symbolic TF tensors), hence we cannot use `generic_utils.slice_arrays` directly and we have to implement this workaround based on `concat`. This has a performance cost. 
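The `concat`-based workaround that this docstring describes can be illustrated in a few lines. The sketch below mirrors only the idea (plain slicing plus `tf.concat`), not the library's actual implementation, and assumes eager execution is enabled first:

```python
# Illustrative only: batch selection without NumPy-style fancy indexing.
import numpy as np
import tensorflow as tf

tf.enable_eager_execution()

x = tf.constant(np.arange(12.0).reshape(6, 2))
indices = [0, 3, 5]  # a non-contiguous batch

# Eager tensors reject x[indices]; gather unit slices and concatenate them.
batch = tf.concat([x[i:i + 1] for i in indices], axis=0)
print(batch.numpy())  # rows 0, 3 and 5 of x
```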
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py index a54d6da839..c519e194bd 100644 --- a/tensorflow/python/keras/initializers_test.py +++ b/tensorflow/python/keras/initializers_test.py @@ -71,7 +71,7 @@ class KerasInitializersTest(test.TestCase): stddev=1, seed=126), tensor_shape, - target_mean=0., target_std=None, target_max=2) + target_mean=0., target_max=2, target_min=-2) def test_constant(self): tensor_shape = (5, 6, 4) @@ -83,49 +83,49 @@ class KerasInitializersTest(test.TestCase): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(3. / fan_in) + std = np.sqrt(1. / fan_in) self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_glorot_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(6. / (fan_in + fan_out)) + std = np.sqrt(2. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_he_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(6. / fan_in) + std = np.sqrt(2. / fan_in) self._runner(keras.initializers.he_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_lecun_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(1. / fan_in) + std = np.sqrt(1. / fan_in) self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_glorot_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(2. / (fan_in + fan_out)) + std = np.sqrt(2. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_he_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(2. / fan_in) + std = np.sqrt(2. 
/ fan_in) self._runner(keras.initializers.he_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_orthogonal(self): tensor_shape = (20, 20) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index 5061825d38..f60064ed63 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -19,7 +19,9 @@ from __future__ import division from __future__ import print_function import copy +import sys import types as python_types +import warnings import numpy as np @@ -714,6 +716,7 @@ class Lambda(Layer): return self.mask def get_config(self): + module = self.function.__module__ if isinstance(self.function, python_types.LambdaType): function = generic_utils.func_dump(self.function) function_type = 'lambda' @@ -721,21 +724,26 @@ class Lambda(Layer): function = self.function.__name__ function_type = 'function' + output_shape_module = None if isinstance(self._output_shape, python_types.LambdaType): output_shape = generic_utils.func_dump(self._output_shape) output_shape_type = 'lambda' + output_shape_module = self._output_shape.__module__ elif callable(self._output_shape): output_shape = self._output_shape.__name__ output_shape_type = 'function' + output_shape_module = self._output_shape.__module__ else: output_shape = self._output_shape output_shape_type = 'raw' config = { 'function': function, + 'module': module, 'function_type': function_type, 'output_shape': output_shape, 'output_shape_type': output_shape_type, + 'output_shape_module': output_shape_module, 'arguments': self.arguments } base_config = super(Lambda, self).get_config() @@ -745,8 +753,16 @@ class Lambda(Layer): def from_config(cls, config, custom_objects=None): config = config.copy() globs = globals() + module = config.pop('module', None) + if module in sys.modules: + globs.update(sys.modules[module].__dict__) + elif module is not None: + # Note: we don't know the name of the function if it's a lambda. + warnings.warn('{} is not loaded, but a Lambda layer uses it. ' + 'It may cause errors.'.format(module) + , UserWarning) if custom_objects: - globs = dict(list(globs.items()) + list(custom_objects.items())) + globs.update(custom_objects) function_type = config.pop('function_type') if function_type == 'function': # Simple lookup in custom objects @@ -760,6 +776,14 @@ class Lambda(Layer): else: raise TypeError('Unknown function type:', function_type) + output_shape_module = config.pop('output_shape_module', None) + if output_shape_module in sys.modules: + globs.update(sys.modules[output_shape_module].__dict__) + elif output_shape_module is not None: + # Note: we don't know the name of the function if it's a lambda. + warnings.warn('{} is not loaded, but a Lambda layer uses it. 
' + 'It may cause errors.'.format(output_shape_module) + , UserWarning) output_shape_type = config.pop('output_shape_type') if output_shape_type == 'function': # Simple lookup in custom objects diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py index c616d8f24f..e6e45902a8 100644 --- a/tensorflow/python/keras/models_test.py +++ b/tensorflow/python/keras/models_test.py @@ -144,5 +144,19 @@ class CheckpointingTests(test.TestCase): model.load_weights(save_prefix) self.assertEqual(12., self.evaluate(beta1_power)) +class TestModelBackend(test.TestCase): + + def test_model_backend_float64_use_cases(self): + # Test case for GitHub issue 19318 + floatx = keras.backend.floatx() + keras.backend.set_floatx('float64') + + x = keras.Input((5,)) + y = keras.layers.Dense(1)(x) + model = keras.models.Model(x, y) + model.compile('rmsprop', 'mse') + + keras.backend.set_floatx(floatx) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py index 9d54add264..94ed8ebd31 100644 --- a/tensorflow/python/kernel_tests/as_string_op_test.py +++ b/tensorflow/python/kernel_tests/as_string_op_test.py @@ -130,6 +130,16 @@ class AsStringOpTest(test.TestCase): result = output.eval(feed_dict={input_: int_inputs_}) self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) + def testHalfInt(self): + s = lambda strs: [x.decode("ascii") for x in strs] + + with self.test_session(): + input_ = array_ops.placeholder(dtypes.int16) + int_inputs_ = [np.iinfo(np.int16).min, np.iinfo(np.int16).max] + output = string_ops.as_string(input_) + result = output.eval(feed_dict={input_: int_inputs_}) + self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) + def testBool(self): bool_inputs_ = [False, True] s = lambda strs: [x.decode("ascii") for x in strs] diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py index 08b03f8518..16fdedac41 100644 --- a/tensorflow/python/kernel_tests/betainc_op_test.py +++ b/tensorflow/python/kernel_tests/betainc_op_test.py @@ -172,7 +172,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [gx_s.shape], tf_gout_t, gx_s.shape) - print("betainc gradient err = %g " % err) + tf_logging.info("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) # Test broadcast gradient @@ -181,7 +181,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [()], tf_gout_t, ga_s.shape) - print("betainc gradient err = %g " % err) + tf_logging.info("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py index e08123b041..fb52d10475 100644 --- a/tensorflow/python/kernel_tests/clip_ops_test.py +++ b/tensorflow/python/kernel_tests/clip_ops_test.py @@ -18,9 +18,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np + from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gradient_checker from 
tensorflow.python.platform import test @@ -414,6 +417,16 @@ class ClipTest(test.TestCase): self.assertAllClose(np_ans, tf_ans) + def testClipByValueEmptyTensor(self): + # Test case for GitHub issue 19337 + zero = array_ops.placeholder(dtype=dtypes.float32, shape=None) + x = clip_ops.clip_by_value(zero, zero, zero) + y = clip_ops.clip_by_value(zero, 1.0, 1.0) + z = clip_ops.clip_by_value(zero, zero, 1.0) + w = clip_ops.clip_by_value(zero, 1.0, zero) + with self.test_session(use_gpu=True) as sess: + sess.run([x, y, z, w], feed_dict={zero: np.zeros((7, 0))}) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 8699fd5b25..80ba7dafc9 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -312,8 +312,8 @@ class Conv2DTest(test.TestCase): expected_values = self.evaluate(expected_results) computed_values = self.evaluate(computed_results) for e_value, c_value in zip(expected_values, computed_values): - print("expected = ", e_value) - print("actual = ", c_value) + tf_logging.info("expected = ", e_value) + tf_logging.info("actual = ", c_value) self.assertAllClose( e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4) @@ -337,8 +337,8 @@ class Conv2DTest(test.TestCase): for i in range(len(tensors)): conv = tensors[i] value = values[i] - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -547,8 +547,8 @@ class Conv2DTest(test.TestCase): # "values" consists of two tensors for two backprops value = self.evaluate(conv) self.assertShapeEqual(value, conv) - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) self.assertArrayNear(expected, value.flatten(), err) def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes, @@ -723,8 +723,8 @@ class Conv2DTest(test.TestCase): data_format=data_format) value = self.evaluate(conv) self.assertShapeEqual(value, conv) - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) self.assertArrayNear(expected, value.flatten(), 1e-5) def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes, @@ -912,8 +912,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - print("expected = ", value_2) - print("actual = ", value) + tf_logging.info("expected = ", value_2) + tf_logging.info("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) # Testing for backprops @@ -965,8 +965,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - print("expected = ", value_2) - print("actual = ", value) + tf_logging.info("expected = ", value_2) + tf_logging.info("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self): @@ -1178,7 +1178,7 @@ class Conv2DTest(test.TestCase): # since fp16 numerical gradients are too imprecise. 
err = np.fabs(jacob_t - reference_jacob_t).max() - print("conv_2d gradient error = ", err) + tf_logging.info("conv_2d gradient error = ", err) self.assertLess(err, 0.002) def testInputGradientValidPaddingStrideOne(self): @@ -1546,7 +1546,7 @@ class DepthwiseConv2DTest(test.TestCase): conv = nn_impl.depthwise_conv2d( t1, t2, strides=[1, stride, stride, 1], padding=padding) value = sess.run(conv) - print("value = ", value) + tf_logging.info("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1668,7 +1668,7 @@ class SeparableConv2DTest(test.TestCase): conv = array_ops.transpose(conv, [0, 2, 3, 1]) value = sess.run(conv) - print("value = ", value) + tf_logging.info("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1826,7 +1826,7 @@ class Conv2DBenchmark(test.Benchmark): wall_time = time.time() - start self.report_benchmark( name="conv_stack_iter_%d" % iter_index, wall_time=wall_time) - print("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) + tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) def GetInceptionFwdTest(input_size, filter_size, stride, padding, diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py index 91ebe8de99..58e2a8ac2a 100644 --- a/tensorflow/python/kernel_tests/gather_nd_op_test.py +++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py @@ -197,7 +197,21 @@ class GatherNdTest(test.TestCase): self.assertEqual(None, shape.ndims) self.assertEqual(None, shape[0].value) - def testBadIndices(self): + def testBadIndicesCPU(self): + with self.test_session(use_gpu=False): + params = [0, 1, 2] + indices = [[[0], [7]]] # Make this one higher rank + gather_nd = array_ops.gather_nd(params, indices) + with self.assertRaisesOpError( + r"flat indices\[1, :\] = \[7\] does not index into param " + r"\(shape: \[3\]\)"): + gather_nd.eval() + + def _disabledTestBadIndicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [0, 1, 2] indices = [[[0], [7]]] # Make this one higher rank @@ -207,7 +221,21 @@ class GatherNdTest(test.TestCase): r"\(shape: \[3\]\)"): gather_nd.eval() - def testBadIndicesWithSlices(self): + def testBadIndicesWithSlicesCPU(self): + with self.test_session(use_gpu=False): + params = [[0, 1, 2]] + indices = [[[0], [0], [1]]] # Make this one higher rank + gather_nd = array_ops.gather_nd(params, indices) + with self.assertRaisesOpError( + r"flat indices\[2, :\] = \[1\] does not index into param " + r"\(shape: \[1,3\]\)"): + gather_nd.eval() + + def _disabledTestBadIndicesWithSlicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [[0, 1, 2]] indices = [[[0], [0], [1]]] # Make this one higher rank diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py index a2fcd751df..033fa95935 100644 --- a/tensorflow/python/kernel_tests/gather_op_test.py +++ b/tensorflow/python/kernel_tests/gather_op_test.py @@ -27,7 +27,8 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.platform import test -_TEST_TYPES = 
(dtypes.float32, dtypes.complex64, dtypes.complex128) +_TEST_TYPES = (dtypes.int64, dtypes.float32, + dtypes.complex64, dtypes.complex128) class GatherTest(test.TestCase): @@ -122,6 +123,9 @@ class GatherTest(test.TestCase): gather, [tf_params, tf_indices, tf_axis], gather_grad) self.assertEqual(indices_grad, None) self.assertEqual(axis_grad, None) + if dtype.is_integer: + self.assertEqual(params_grad, None) + continue # For axis 0, we are able to create an efficient IndexedSlices for # the gradient. if axis == 0: @@ -177,7 +181,19 @@ class GatherTest(test.TestCase): gather_t = array_ops.gather(params, indices, axis=axis) self.assertEqual(None, gather_t.shape) - def testBadIndices(self): + def testBadIndicesCPU(self): + with self.test_session(use_gpu=False): + params = [[0, 1, 2], [3, 4, 5]] + with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): + array_ops.gather(params, [[7]], axis=0).eval() + with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"): + array_ops.gather(params, [[7]], axis=1).eval() + + def _disabledTestBadIndicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [[0, 1, 2], [3, 4, 5]] with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py index a9b55854f1..795aa67248 100644 --- a/tensorflow/python/kernel_tests/init_ops_test.py +++ b/tensorflow/python/kernel_tests/init_ops_test.py @@ -362,6 +362,33 @@ class UniformUnitScalingInitializationTest(test.TestCase): dtype=dtypes.string) +class VarianceScalingInitializationTest(test.TestCase): + + def testNormalDistribution(self): + shape = [100, 100] + expect_mean = 0. + expect_var = 1. / shape[0] + init = init_ops.variance_scaling_initializer(distribution='normal') + + with self.test_session(use_gpu=True): + x = init(shape).eval() + + self.assertNear(np.mean(x), expect_mean, err=1e-2) + self.assertNear(np.var(x), expect_var, err=1e-2) + + def testUniformDistribution(self): + shape = [100, 100] + expect_mean = 0. + expect_var = 1. / shape[0] + init = init_ops.variance_scaling_initializer(distribution='uniform') + + with self.test_session(use_gpu=True): + x = init(shape).eval() + + self.assertNear(np.mean(x), expect_mean, err=1e-2) + self.assertNear(np.var(x), expect_var, err=1e-2) + + # TODO(vrv): move to sequence_ops_test? 
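The two VarianceScalingInitializationTest cases above assert only the first and second sample moments. For a [100, 100] shape with the default fan_in scaling, the expected variance is scale / fan_in = 1/100. A numpy sketch of the same check, using an untruncated normal for simplicity (the initializer itself truncates; see the init_ops.py change later in this patch):

    import numpy as np

    shape = (100, 100)
    fan_in = shape[0]
    expect_mean, expect_var = 0.0, 1.0 / fan_in

    # Draw fan_in * fan_out samples with the target variance and verify
    # the sample moments land within the tests' 1e-2 tolerance.
    x = np.random.normal(expect_mean, np.sqrt(expect_var), size=shape)
    assert abs(np.mean(x) - expect_mean) < 1e-2
    assert abs(np.var(x) - expect_var) < 1e-2
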
class RangeTest(test.TestCase): diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index a0c372db7d..e95c729715 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -947,7 +947,7 @@ class PoolingTest(test.TestCase): output_sizes, x_init_value=x_init_value, delta=1e-2) - print("%s gradient error = " % func_name, err) + tf_logging.info("%s gradient error = " % func_name, err) self.assertLess(err, err_tolerance) def _ConstructAndTestSecondGradient(self, @@ -1024,7 +1024,7 @@ class PoolingTest(test.TestCase): input_sizes, x_init_value=x_init_value, delta=1e-2) - print("%s second-order gradient error = " % func_name, err) + tf_logging.info("%s second-order gradient error = " % func_name, err) self.assertLess(err, err_tolerance) def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu): diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py index 677253946e..253e43920b 100644 --- a/tensorflow/python/kernel_tests/py_func_test.py +++ b/tensorflow/python/kernel_tests/py_func_test.py @@ -19,6 +19,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import gc import re import numpy as np @@ -434,13 +435,29 @@ class PyFuncTest(test.TestCase): # ----- Tests shared by py_func and eager_py_func ----- def testCleanup(self): - for _ in xrange(1000): - g = ops.Graph() - with g.as_default(): - c = constant_op.constant([1.], dtypes.float32) - _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32]) - _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32]) - self.assertLess(script_ops._py_funcs.size(), 100) + # Delete everything created by previous tests to avoid side effects. + ops.reset_default_graph() + gc.collect() + initial_size = script_ops._py_funcs.size() + # Encapsulate the graph generation, so locals can be deleted. + def make_graphs(): + for _ in xrange(1000): + g = ops.Graph() + with g.as_default(): + c = constant_op.constant([1.], dtypes.float32) + _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32]) + _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32]) + # These ops have a reference to 'c' which has a reference to the graph. + # Checks if the functions are being deleted though the graph is referenced from them. + # (see #18292) + _ = script_ops.py_func(lambda x: x + c.shape[0], [c], [dtypes.float32]) + _ = script_ops.eager_py_func(lambda x: x + c.shape[0], [c], [dtypes.float32]) + + # Call garbage collector to enforce deletion. 
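The rewritten testCleanup below leans on two CPython behaviors that this patch's script_ops.py changes set up: the registry keeps only weak references (the graph owns the strong one), and a py_func lambda that captures `c` closes over an object that points back at its graph, forming a reference cycle that refcounting alone never frees, which is why the test calls gc.collect() explicitly. A minimal sketch of both, with a hypothetical Graph stand-in:

    import gc
    import weakref

    class Graph(object):  # hypothetical stand-in for ops.Graph
        pass

    registry = weakref.WeakValueDictionary()  # like script_ops._py_funcs

    def make_graph():
        g = Graph()
        g.funcs = [lambda: g]              # closure over g: a reference cycle
        registry["pyfunc_0"] = g.funcs[0]  # registry holds only a weakref
        return weakref.ref(g)

    probe = make_graph()
    assert probe() is not None               # the cycle keeps the graph alive
    assert registry.get("pyfunc_0") is not None

    gc.collect()                             # break the cycle...
    assert probe() is None                   # ...freeing the graph
    assert registry.get("pyfunc_0") is None  # ...and emptying the registry
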
+ make_graphs() + ops.reset_default_graph() + gc.collect() + self.assertEqual(initial_size, script_ops._py_funcs.size()) # ----- Tests for eager_py_func ----- @test_util.run_in_graph_and_eager_modes() diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py index 79fe927b8a..faa4b49a8d 100644 --- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py @@ -144,7 +144,9 @@ class StatefulScatterNdTest(test.TestCase): self.assertAllClose(new, ref_var.eval()) def _VariableRankTests(self, np_scatter, tf_scatter): - for vtype in (np.float32, np.float64, np.complex64, np.complex128): + for vtype in (np.int32, + np.float32, np.float64, + np.complex64, np.complex128): for itype in (np.int32, np.int64): self._VariableRankTest(np_scatter, tf_scatter, vtype, itype) @@ -221,7 +223,7 @@ class StatefulScatterNdTest(test.TestCase): # self._VariableRankTests(_NumpyDiv, state_ops.scatter_nd_div) def _ScatterRepeatIndicesTest(self, np_scatter, tf_scatter): - for vtype in (np.float32, np.float64): + for vtype in (np.int32, np.float32, np.float64): for itype in (np.int32, np.int64): self._VariableRankTest( np_scatter, tf_scatter, vtype, itype, repeat_indices=True) diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py index c70a4ffce7..1a0fa744ae 100644 --- a/tensorflow/python/kernel_tests/scatter_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_ops_test.py @@ -159,7 +159,13 @@ class ScatterTest(test.TestCase): # Clips small values to avoid division by zero. def clip_small_values(x): - return 1e-4 * np.sign(x) if np.abs(x) < 1e-4 else x + threshold = 1e-4 + sign = np.sign(x) + + if isinstance(x, np.int32): + threshold = 1 + sign = np.random.choice([-1, 1]) + return threshold * sign if np.abs(x) < threshold else x updates = np.vectorize(clip_small_values)(updates) old = _AsType(np.random.randn(*((first_dim,) + extra_shape)), vtype) @@ -181,7 +187,11 @@ class ScatterTest(test.TestCase): tf_scatter, repeat_indices=False, updates_are_scalar=False): - for vtype in (np.float32, np.float64): + vtypes = [np.float32, np.float64] + if tf_scatter != state_ops.scatter_div: + vtypes.append(np.int32) + + for vtype in vtypes: for itype in (np.int32, np.int64): self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices, updates_are_scalar) diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py index 794be096b7..a82855dfeb 100644 --- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py @@ -264,7 +264,9 @@ class UnsortedSegmentTest(SegmentReductionHelper): # A subset of ops has been enabled for complex numbers self.complex_ops_list = [(np.add, None, - math_ops.unsorted_segment_sum, lambda t: 0)] + math_ops.unsorted_segment_sum, lambda t: 0), + (np.ndarray.__mul__, None, + math_ops.unsorted_segment_prod, lambda t: 1)] self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64] self.all_dtypes = (self.differentiable_dtypes + diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py index a5bd1b6ee0..e20daccb28 100644 --- a/tensorflow/python/kernel_tests/string_split_op_test.py +++ b/tensorflow/python/kernel_tests/string_split_op_test.py @@ -146,5 +146,101 @@ class 
StringSplitOpTest(test.TestCase): self.assertAllEqual(shape, [3, 1]) +class StringSplitV2OpTest(test.TestCase): + + def testSplitV2(self): + strings = ["pigs on the wing", "animals"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]]) + self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"]) + self.assertAllEqual(shape, [2, 4]) + + def testSplitV2MultiCharSeparator(self): + # Match Python behavior: + # >>> '1<>2<>3'.split('<>') + # ['1', '2', '3'] + # >>> "<><>4<>5<><>6<>".split("<>") + # ['', '', '4', '5', '', '6', ''] + strings = ["1<>2<>3", "<><>4<>5<><>6<>"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep="<>") + indices, values, shape = sess.run(tokens) + self.assertAllEqual( + indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]]) + self.assertAllEqual(values, [b"1", b"2", b"3", + b"", b"", b"4", b"5", b"", b"6", b""]) + self.assertAllEqual(shape, [2, 7]) + + def testSplitV2SimpleSeparator(self): + # Match Python behavior: + # >>> '1,2,3'.split(',') + # ['1', '2', '3'] + # >>> '1,2,,3,'.split(',') + # ['1', '2', '', '3', ''] + strings = ["1,2,3", "4,5,,6,"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep=',') + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]]) + self.assertAllEqual(values, [b"1", b"2", b"3", + b"4", b"5", b"", b"6", b""]) + self.assertAllEqual(shape, [2, 5]) + + def testSplitV2EmptySeparator(self): + # Match Python behavior: + # >>> '1 2 3'.split() + # ['1', '2', '3'] + #>>> ' 1 2 3 '.split() + #['1', '2', '3'] + strings = ["1 2 3", " 4 5 6 "] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2]]) + self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"]) + self.assertAllEqual(shape, [2, 3]) + + def testSplitV2SimpleSeparatorMaxSplit(self): + # Match Python behavior: + # >>> '1,2,3'.split(',', maxsplit=1) + # ['1', '2,3'] + # >>> '4,5,,6,'.split(',', maxsplit=1) + # ['4', '5,,6,'] + strings = ["1,2,3", "4,5,,6,"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], + [1, 0], [1, 1]]) + self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"]) + self.assertAllEqual(shape, [2, 2]) + + def testSplitV2EmptySeparatorMaxSplit(self): + # Match Python behavior: + # '1 2 3'.split(maxsplit=1) + # ['1', '2 3'] + # >>> " 4 5 6 ".split(maxsplit=1) + # ['4', '5 6 '] + strings = ["1 2 3", " 4 5 6 "] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, maxsplit=1) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], + [1, 0], [1, 1]]) + self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5 6 "]) + self.assertAllEqual(shape, [2, 2]) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 8129334703..fae63b1132 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -2619,6 +2619,10 @@ reverse.__doc__ = 
gen_array_ops.reverse_v2.__doc__ # pylint: disable=redefined-builtin @tf_export("reverse_sequence") +@deprecation.deprecated_args( + None, "seq_dim is deprecated, use seq_axis instead", "seq_dim") +@deprecation.deprecated_args( + None, "batch_dim is deprecated, use batch_axis instead", "batch_dim") def reverse_sequence(input, seq_lengths, seq_axis=None, diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py index 12afcd0b51..94c8d79335 100644 --- a/tensorflow/python/ops/gradient_checker.py +++ b/tensorflow/python/ops/gradient_checker.py @@ -283,10 +283,10 @@ def compute_gradient(x, numbers. For example, if `x` is complex with shape `[m]` and `y` is complex with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with - J[:m, :n] = d(Re y)/d(Re x) - J[:m, n:] = d(Im y)/d(Re x) - J[m:, :n] = d(Re y)/d(Im x) - J[m:, n:] = d(Im y)/d(Im x) + J[::2, ::2] = d(Re y)/d(Re x) + J[::2, 1::2] = d(Im y)/d(Re x) + J[1::2, ::2] = d(Re y)/d(Im x) + J[1::2, 1::2] = d(Im y)/d(Im x) Args: x: a tensor or list of tensors diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index bdcf420980..f27d9224c1 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_image_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops @@ -258,14 +259,14 @@ def random_flip_up_down(image, seed=None): dimension, which is `height`. Otherwise output the image as-is. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A 3-D tensor of the same type and shape as `image`. - + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. """ @@ -280,13 +281,14 @@ def random_flip_left_right(image, seed=None): second dimension, which is `width`. Otherwise output the image as-is. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A 3-D tensor of the same type and shape as `image`. + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. @@ -297,7 +299,8 @@ def random_flip_left_right(image, seed=None): def _random_flip(image, flip_index, seed, scope_name): """Randomly (50% chance) flip an image along axis `flip_index`. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. flip_index: The dimension along which to flip the image. Vertical: 0, Horizontal: 1 seed: A Python integer. Used to create a random seed. See @@ -306,22 +309,37 @@ def _random_flip(image, flip_index, seed, scope_name): scope_name: Name of the scope in which the ops are added. 
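The gradient_checker.py docstring fix above reflects that compute_gradient interleaves real and imaginary components rather than stacking them in halves: even rows and columns carry real parts, odd ones imaginary parts. A numpy sketch of assembling a Jacobian in that layout, with made-up block values:

    import numpy as np

    m, n = 2, 3  # sizes of complex x and complex y
    d_re_y_re_x = np.full((m, n), 1.0)
    d_im_y_re_x = np.full((m, n), 2.0)
    d_re_y_im_x = np.full((m, n), 3.0)
    d_im_y_im_x = np.full((m, n), 4.0)

    J = np.zeros((2 * m, 2 * n))
    J[::2, ::2] = d_re_y_re_x    # d(Re y)/d(Re x)
    J[::2, 1::2] = d_im_y_re_x   # d(Im y)/d(Re x)
    J[1::2, ::2] = d_re_y_im_x   # d(Re y)/d(Im x)
    J[1::2, 1::2] = d_im_y_im_x  # d(Im y)/d(Im x)
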
Returns: - A 3-D tensor of the same type and shape as `image`. + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. """ with ops.name_scope(None, scope_name, [image]) as scope: image = ops.convert_to_tensor(image, name='image') - image = _Assert3DImage(image) - uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) - mirror_cond = math_ops.less(uniform_random, .5) - result = control_flow_ops.cond( - mirror_cond, - lambda: array_ops.reverse(image, [flip_index]), - lambda: image, - name=scope) - return fix_image_flip_shape(image, result) + image = _AssertAtLeast3DImage(image) + shape = image.get_shape() + if shape.ndims == 3 or shape.ndims is None: + uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) + mirror_cond = math_ops.less(uniform_random, .5) + result = control_flow_ops.cond( + mirror_cond, + lambda: array_ops.reverse(image, [flip_index]), + lambda: image, + name=scope + ) + return fix_image_flip_shape(image, result) + elif shape.ndims == 4: + uniform_random = random_ops.random_uniform( + [array_ops.shape(image)[0]], 0, 1.0, seed=seed + ) + mirror_cond = math_ops.less(uniform_random, .5) + return array_ops.where( + mirror_cond, + image, + functional_ops.map_fn(lambda x: array_ops.reverse(x, [flip_index]), image, dtype=image.dtype) + ) + else: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') @tf_export('image.flip_left_right') @@ -1634,13 +1652,13 @@ def is_jpeg(contents, name=None): @tf_export('image.decode_image') -def decode_image(contents, channels=None, name=None): +def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`, and `decode_png`. Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the - appropriate operation to convert the input bytes `string` into a `Tensor` of - type `uint8`. + appropriate operation to convert the input bytes `string` into a `Tensor` + of type `dtype`. Note: `decode_gif` returns a 4-D array `[num_frames, height, width, 3]`, as opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D @@ -1652,10 +1670,11 @@ def decode_image(contents, channels=None, name=None): contents: 0-D `string`. The encoded image bytes. channels: An optional `int`. Defaults to `0`. Number of color channels for the decoded image. + dtype: The desired DType of the returned `Tensor`. name: A name for the operation (optional) Returns: - `Tensor` with type `uint8` with shape `[height, width, num_channels]` for + `Tensor` with type `dtype` and shape `[height, width, num_channels]` for BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for GIF images. 
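The new 4-D branch of _random_flip above draws one coin per batch element and selects, per image, either the original or a flipped copy. The same selection pattern in numpy (a sketch of the idea, not the TF implementation), flipping the width axis of an NHWC batch with hypothetical shapes:

    import numpy as np

    batch = np.arange(2 * 2 * 3 * 1).reshape(2, 2, 3, 1)  # NHWC
    flipped = batch[:, :, ::-1, :]  # left-right flip of every image

    coins = np.random.uniform(size=batch.shape[0]) < 0.5
    # Broadcast the per-image condition over H, W, C; each image keeps
    # one of the two orientations (the choice is symmetric at p = 0.5).
    out = np.where(coins[:, None, None, None], batch, flipped)
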
@@ -1679,7 +1698,7 @@ def decode_image(contents, channels=None, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding BMP images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_decode, assert_channels]): - return gen_image_ops.decode_bmp(contents) + return convert_image_dtype(gen_image_ops.decode_bmp(contents), dtype) def _gif(): # Create assert to make sure that channels is not set to 1 @@ -1692,7 +1711,7 @@ def decode_image(contents, channels=None, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return gen_image_ops.decode_gif(contents) + return convert_image_dtype(gen_image_ops.decode_gif(contents), dtype) def check_gif(): # Create assert op to check that bytes are GIF decodable @@ -1701,7 +1720,11 @@ def decode_image(contents, channels=None, name=None): def _png(): """Decodes a PNG image.""" - return gen_image_ops.decode_png(contents, channels) + return convert_image_dtype( + gen_image_ops.decode_png(contents, channels, + dtype=dtypes.uint8 + if dtype == dtypes.uint8 + else dtypes.uint16), dtype) def check_png(): """Checks if an image is PNG.""" @@ -1717,7 +1740,8 @@ def decode_image(contents, channels=None, name=None): 'images') assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return gen_image_ops.decode_jpeg(contents, channels) + return convert_image_dtype( + gen_image_ops.decode_jpeg(contents, channels), dtype) # Decode normal JPEG images (start with \xff\xd8\xff\xe0) # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1). @@ -1878,7 +1902,7 @@ def sample_distorted_bounding_box(image_size, width / height within this range. area_range: An optional list of `floats`. Defaults to `[0.05, 1]`. The cropped area of the image must contain a fraction of the - supplied image within in this range. + supplied image within this range. max_attempts: An optional `int`. Defaults to `100`. Number of attempts at generating a cropped region of the image of the specified constraints. 
After `max_attempts` failures, return the diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 45499dcce0..2a6ab26e96 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -533,6 +533,37 @@ class FlipImageBenchmark(test.Benchmark): iters=benchmark_rounds, wall_time=step_time) + def _benchmarkBatchedRandomFlipLeftRight(self, device, cpu_count): + image_shape = [16, 299, 299, 3] + warmup_rounds = 100 + benchmark_rounds = 1000 + config = config_pb2.ConfigProto() + if cpu_count is not None: + config.inter_op_parallelism_threads = 1 + config.intra_op_parallelism_threads = cpu_count + with session.Session("", graph=ops.Graph(), config=config) as sess: + with ops.device(device): + inputs = variables.Variable( + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, + trainable=False, + dtype=dtypes.float32) + run_op = image_ops.random_flip_left_right(inputs) + sess.run(variables.global_variables_initializer()) + for i in xrange(warmup_rounds + benchmark_rounds): + if i == warmup_rounds: + start = time.time() + sess.run(run_op) + end = time.time() + step_time = (end - start) / benchmark_rounds + tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all") + print("benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s step_time: " + "%.2f us" % + (tag, step_time * 1e6)) + self.report_benchmark( + name="benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s" % (tag), + iters=benchmark_rounds, + wall_time=step_time) + def benchmarkFlipLeftRightCpu1(self): self._benchmarkFlipLeftRight("/cpu:0", 1) @@ -551,6 +582,15 @@ class FlipImageBenchmark(test.Benchmark): def benchmarkRandomFlipLeftRightGpu(self): self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None) + def benchmarkBatchedRandomFlipLeftRightCpu1(self): + self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", 1) + + def benchmarkBatchedRandomFlipLeftRightCpuAll(self): + self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", None) + + def benchmarkBatchedRandomFlipLeftRightGpu(self): + self._benchmarkBatchedRandomFlipLeftRight(test.gpu_device_name(), None) + class AdjustHueBenchmark(test.Benchmark): @@ -987,7 +1027,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf) + y = image_ops.random_flip_left_right(x_tf, seed=seed) self.assertTrue(y.op.name.startswith("random_flip_left_right")) count_flipped = 0 @@ -1008,6 +1048,50 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) + def testRandomFlipLeftRightWithBatch(self): + batch_size = 16 + seed = 42 + + # create single item of test data + x_np_raw = np.array( + [[1, 2, 3], [1, 2, 3]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + y_np_raw = np.array( + [[3, 2, 1], [3, 2, 1]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + + # create batched test data + x_np = np.vstack([x_np_raw for _ in range(batch_size)]) + y_np = np.vstack([y_np_raw for _ in range(batch_size)]) + + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_left_right(x_tf, seed=seed) + self.assertTrue(y.op.name.startswith("random_flip_left_right")) + + count_flipped = 0 + count_unflipped = 0 + for _ in range(100): + y_tf = y.eval() + + # check every element of the batch + for i in range(batch_size): + if y_tf[i][0][0] 
== 1: + self.assertAllEqual(y_tf[i], x_np[i]) + count_unflipped += 1 + else: + self.assertAllEqual(y_tf[i], y_np[i]) + count_flipped += 1 + + # 100 trials, each containing batch_size elements + # Mean: 50 * batch_size + # Std Dev: ~5 * sqrt(batch_size) + # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) + # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 + six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) + self.assertGreaterEqual(count_flipped, six_sigma) + self.assertGreaterEqual(count_unflipped, six_sigma) + def testInvolutionUpDown(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1057,9 +1141,11 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1]) + seed = 42 + with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf, seed=42) + y = image_ops.random_flip_up_down(x_tf, seed=seed) self.assertTrue(y.op.name.startswith("random_flip_up_down")) count_flipped = 0 count_unflipped = 0 @@ -1079,6 +1165,50 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) + def testRandomFlipUpDownWithBatch(self): + batch_size = 16 + seed = 42 + + # create single item of test data + x_np_raw = np.array( + [[1, 2, 3], [4, 5, 6]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + y_np_raw = np.array( + [[4, 5, 6], [1, 2, 3]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + + # create batched test data + x_np = np.vstack([x_np_raw for _ in range(batch_size)]) + y_np = np.vstack([y_np_raw for _ in range(batch_size)]) + + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_up_down(x_tf, seed=seed) + self.assertTrue(y.op.name.startswith("random_flip_up_down")) + + count_flipped = 0 + count_unflipped = 0 + for _ in range(100): + y_tf = y.eval() + + # check every element of the batch + for i in range(batch_size): + if y_tf[i][0][0] == 1: + self.assertAllEqual(y_tf[i], x_np[i]) + count_unflipped += 1 + else: + self.assertAllEqual(y_tf[i], y_np[i]) + count_flipped += 1 + + # 100 trials, each containing batch_size elements + # Mean: 50 * batch_size + # Std Dev: ~5 * sqrt(batch_size) + # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) + # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 + six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) + self.assertGreaterEqual(count_flipped, six_sigma) + self.assertGreaterEqual(count_unflipped, six_sigma) + def testInvolutionTranspose(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1156,6 +1286,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): #Ops that support 4D input for op in [ image_ops.flip_left_right, image_ops.flip_up_down, + image_ops.random_flip_left_right, image_ops.random_flip_up_down, image_ops.transpose_image, image_ops.rot90 ]: transformed_unknown_dims_4 = op(p_unknown_dims_4) @@ -1166,14 +1297,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): "must be at least three-dimensional"): op(p_wrong_rank) - for op in [ - image_ops.random_flip_left_right, - image_ops.random_flip_up_down, - ]: - with self.assertRaisesRegexp(ValueError, "must be three-dimensional"): - op(p_wrong_rank) - - def testRot90GroupOrder(self): image = 
np.arange(24, dtype=np.uint8).reshape([2, 4, 3]) with self.test_session(use_gpu=True): @@ -1208,41 +1331,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): y_np = np.rot90(image, k=k, axes=(1, 2)) self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k})) -class RandomFlipTest(test_util.TensorFlowTestCase): - - def testRandomLeftRight(self): - x_np = np.array([0, 1], dtype=np.uint8).reshape([1, 2, 1]) - num_iterations = 500 - - hist = [0, 0] - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf) - for _ in xrange(num_iterations): - y_np = y.eval().flatten()[0] - hist[y_np] += 1 - - # Ensure that each entry is observed within 4 standard deviations. - four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) - self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) - - def testRandomUpDown(self): - x_np = np.array([0, 1], dtype=np.uint8).reshape([2, 1, 1]) - num_iterations = 500 - - hist = [0, 0] - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf) - for _ in xrange(num_iterations): - y_np = y.eval().flatten()[0] - hist[y_np] += 1 - - # Ensure that each entry is observed within 4 standard deviations. - four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) - self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) - - class AdjustContrastTest(test_util.TensorFlowTestCase): def _testContrast(self, x_np, y_np, contrast_factor): @@ -3880,5 +3968,88 @@ class SobelEdgesTest(test_util.TensorFlowTestCase): self.assertAllClose(expected_batch, actual_sobel) +class DecodeImageTest(test_util.TensorFlowTestCase): + + def testJpegUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/jpeg/testdata" + jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) + image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testPngUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/png/testdata" + png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) + image0 = image_ops.decode_image(png0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype( + image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testGifUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/gif/testdata" + gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) + image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testBmpUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/bmp/testdata" + bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) + image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testJpegFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/jpeg/testdata" + jpeg0 = 
io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) + image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testPngFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/png/testdata" + png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) + image0 = image_ops.decode_image(png0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype( + image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testGifFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/gif/testdata" + gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) + image0 = image_ops.decode_image(gif0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testBmpFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/bmp/testdata" + bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) + image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 2df230d470..724fcc39cd 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -467,7 +467,8 @@ class VarianceScaling(Initializer): else: scale /= max(1., (fan_in + fan_out) / 2.) if self.distribution == "normal": - stddev = math.sqrt(scale) + # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) + stddev = math.sqrt(scale) / .87962566103423978 return random_ops.truncated_normal( shape, 0.0, stddev, dtype, seed=self.seed) else: diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 222b8ebc9d..8276047cb6 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -35,8 +35,9 @@ from tensorflow.python.util.tf_export import tf_export # Assert and Print are special symbols in python, so we must -# use an upper-case version of them. -@tf_export("Print") +# have an upper-case version of them. For users with Python 3 or Python 2.7 +# with `from __future__ import print_function`, we also allow lowercase. +@tf_export("Print", "print") def Print(input_, data, message=None, first_n=None, summarize=None, name=None): """Prints a list of tensors. diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index e40481f3a7..466d0dadc8 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -125,8 +125,8 @@ def abs(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` or `SparseTensor` of type `float32`, `float64`, `int32`, - `int64`, `complex64` or `complex128`. + x: A `Tensor` or `SparseTensor` of type `float16`, `float32`, `float64`, + `int32`, `int64`, `complex64` or `complex128`. name: A name for the operation (optional). 
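The magic constant introduced in init_ops.py above is the standard deviation of a unit normal truncated to [-2, 2]; dividing by it makes the truncated draw come out with the requested variance. It can be reproduced with scipy (a verification sketch, not part of the patch):

    import math
    from scipy.stats import truncnorm

    c = truncnorm.std(a=-2, b=2, loc=0., scale=1.)
    assert abs(c - 0.87962566103423978) < 1e-9

    scale = 1.0 / 100              # e.g. fan_in scaling for a [100, 100] shape
    stddev = math.sqrt(scale) / c  # pre-truncation stddev to request
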
Returns: @@ -430,10 +430,10 @@ def pow(x, y, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, - or `complex128`. - y: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, - or `complex128`. + x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, + `complex64`, or `complex128`. + y: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, + `complex64`, or `complex128`. name: A name for the operation (optional). Returns: @@ -600,7 +600,7 @@ def round(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float32` or `float64`. + x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, or `int64`. name: A name for the operation (optional). Returns: @@ -1257,7 +1257,7 @@ def reduce_sum(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1397,7 +1397,7 @@ def reduce_mean(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1469,7 +1469,7 @@ def reduce_prod(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1519,7 +1519,7 @@ def reduce_min(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1568,7 +1568,7 @@ def reduce_max(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1617,7 +1617,7 @@ def reduce_all(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1675,7 +1675,7 @@ def reduce_any(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 783d485892..f47f38e29e 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -621,7 +621,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): """Calculate the mean and variance of based on the sufficient statistics. Args: - counts: A `Tensor` containing a the total count of the data (one value). + counts: A `Tensor` containing the total count of the data (one value). 
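The recurring reduce_* docstring fix below ("If `axis` is None" rather than "has no entries") pins the reduce-everything behavior to the None default, presumably to distinguish it from passing an explicit axis list. A usage sketch of the documented cases:

    import tensorflow as tf

    x = tf.constant([[1, 1, 1], [1, 1, 1]])
    with tf.Session() as sess:
        print(sess.run(tf.reduce_sum(x)))               # axis=None: all dims -> 6
        print(sess.run(tf.reduce_sum(x, axis=0)))       # -> [2 2 2]
        print(sess.run(tf.reduce_sum(x, axis=[0, 1])))  # explicit list -> 6
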
mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly shifted) sum of the elements to average over. variance_ss: A `Tensor` containing the variance sufficient statistics: the @@ -689,6 +689,9 @@ def moments( # Compute true mean while keeping the dims for proper broadcasting. mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean") # sample variance, not unbiased variance + # Note: stop_gradient does not change the gradient that gets + # backpropagated to the mean from the variance calculation, + # because that gradient is zero. variance = math_ops.reduce_mean( math_ops.squared_difference(y, array_ops.stop_gradient(mean)), axes, diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index a0b55eb077..0c2f5b06c4 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1596,12 +1596,12 @@ def leaky_relu(features, alpha=0.2, name=None): Returns: The activation value. """ - with ops.name_scope(name, "LeakyRelu", [features, alpha]): + with ops.name_scope(name, "LeakyRelu", [features, alpha]) as name: features = ops.convert_to_tensor(features, name="features") if features.dtype.is_integer: features = math_ops.to_float(features) alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha") - return math_ops.maximum(alpha * features, features) + return math_ops.maximum(alpha * features, features, name=name) def _flatten_outer_dims(logits): diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 46a5f4fae6..035b4735af 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -962,6 +962,16 @@ class LeakyReluTest(test_lib.TestCase): self.assertAllClose( outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol) + def testName(self): + np_values = np.array([-2, -1, 0, 1, 2], dtype=np.float64) + outputs_with_name_set = nn_ops.leaky_relu( + constant_op.constant(np_values), + name='test_relu_op') + self.assertEqual(outputs_with_name_set.name, 'test_relu_op:0') + outputs_without_name_set = nn_ops.leaky_relu( + constant_op.constant(np_values)) + self.assertEqual(outputs_without_name_set.name, 'LeakyRelu:0') + class SwishTest(test_lib.TestCase): diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index f8676ccb5f..219562de5d 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -23,6 +23,7 @@ import threading # Used by py_util.cc to get tracebacks. import traceback # pylint: disable=unused-import +import weakref import numpy as np import six @@ -129,11 +130,14 @@ class FuncRegistry(object): def __init__(self): self._lock = threading.Lock() self._unique_id = 0 # GUARDED_BY(self._lock) - self._funcs = {} + # Only store weakrefs to the functions. The strong reference is stored in + # the graph. + self._funcs = weakref.WeakValueDictionary() def insert(self, func): """Registers `func` and returns a unique token for this entry.""" token = self._next_unique_token() + # Store a weakref to the function. self._funcs[token] = func return token @@ -186,7 +190,7 @@ class FuncRegistry(object): Raises: ValueError: if no function is registered for `token`.
""" - func = self._funcs[token] + func = self._funcs.get(token, None) if func is None: raise ValueError("callback %s is not found" % token) if isinstance(func, EagerFunc): @@ -228,19 +232,6 @@ _py_funcs = FuncRegistry() pywrap_tensorflow.InitializePyTrampoline(_py_funcs) -class CleanupFunc(object): - """A helper class to remove a registered function from _py_funcs.""" - - def __init__(self, token): - self._token = token - - def __del__(self): - if _py_funcs is not None: - # If _py_funcs is None, the program is most likely in shutdown, and the - # _py_funcs object has been destroyed already. - _py_funcs.remove(self._token) - - def _internal_py_func(func, inp, Tout, @@ -270,17 +261,15 @@ def _internal_py_func(func, # bound to that of the outer graph instead. graph = graph._outer_graph - cleanup = CleanupFunc(token) - # TODO(zhifengc): Consider adding a Graph method to collect # `cleanup` objects in one of its member. - if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"): - graph._cleanup_py_funcs_used_in_graph = [] + if not hasattr(graph, "_py_funcs_used_in_graph"): + graph._py_funcs_used_in_graph = [] - # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph - # will be destroyed and their __del__ will remove the 'token' from - # the funcs registry. - graph._cleanup_py_funcs_used_in_graph.append(cleanup) + # Store a reference to the function in the graph to ensure it stays alive + # as long as the graph lives. When the graph is destroyed, the function + # is left to the garbage collector for destruction as well. + graph._py_funcs_used_in_graph.append(func) # pylint: enable=protected-access if eager: diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 0130233746..c3b16a7bd5 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -84,6 +84,8 @@ def _convert_to_sparse_tensors(sp_inputs): # pylint: disable=protected-access @tf_export("sparse_concat") +@deprecation.deprecated_args( + None, "concat_dim is deprecated, use axis instead", "concat_dim") def sparse_concat(axis, sp_inputs, name=None, @@ -597,6 +599,8 @@ class KeywordRequired(object): @tf_export("sparse_split") +@deprecation.deprecated_args( + None, "split_dim is deprecated, use axis instead", "split_dim") def sparse_split(keyword_required=KeywordRequired(), sp_input=None, num_split=None, diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index ae79c01949..0280c89c10 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -91,6 +91,59 @@ def string_split(source, delimiter=" ", skip_empty=True): # pylint: disable=inv shape.set_shape([2]) return sparse_tensor.SparseTensor(indices, values, shape) +@tf_export("strings.split") +def string_split_v2(source, sep=None, maxsplit=-1): + """Split elements of `source` based on `sep` into a `SparseTensor`. + + Let N be the size of source (typically N will be the batch size). Split each + element of `source` based on `sep` and return a `SparseTensor` + containing the split tokens. Empty tokens are ignored. + + For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c', + then the output will be + + st.indices = [0, 0; + 0, 1; + 1, 0; + 1, 1; + 1, 2] + st.shape = [2, 3] + st.values = ['hello', 'world', 'a', 'b', 'c'] + + If `sep` is given, consecutive delimiters are not grouped together and are + deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and + sep of `"<>"` returns `["1", "2", "", "3"]`. 
If `sep` is None or an empty + string, consecutive whitespace is regarded as a single separator, and the + result will contain no empty strings at the start or end if the string has + leading or trailing whitespace. + + Note that the above-mentioned behavior matches Python's str.split. + + Args: + source: `1-D` string `Tensor`, the strings to split. + sep: `0-D` string `Tensor`, the delimiter character. + maxsplit: An `int`. If `maxsplit > 0`, at most `maxsplit` splits are + performed, and the remainder of the string is returned as the final + element. + + Raises: + ValueError: If sep is not a string. + + Returns: + A `SparseTensor` of rank `2`, the strings split according to the delimiter. + The first column of the indices corresponds to the row in `source` and the + second column corresponds to the index of the split component in this row. + """ + if sep is None: + sep = '' + sep = ops.convert_to_tensor(sep, dtype=dtypes.string) + source = ops.convert_to_tensor(source, dtype=dtypes.string) + + indices, values, shape = gen_string_ops.string_split_v2( + source, sep=sep, maxsplit=maxsplit) + indices.set_shape([None, 2]) + values.set_shape([None]) + shape.set_shape([2]) + return sparse_tensor.SparseTensor(indices, values, shape) + def _reduce_join_reduction_dims(x, axis, reduction_indices): """Returns range(rank(x) - 1, 0, -1) if reduction_indices is None.""" diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index f49e2d314d..47414c28af 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -1786,6 +1786,23 @@ class variable_scope(object): assert v.name == "foo/bar/v:0" ``` + Simple example of how to re-enter a premade variable scope safely: + + ```python + with tf.variable_scope("foo") as vs: + pass + + # Re-enter the variable scope. + with tf.variable_scope(vs, + auxiliary_name_scope=False) as vs1: + # Restore the original name_scope. + with tf.name_scope(vs1.original_name_scope): + v = tf.get_variable("v", [1]) + assert v.name == "foo/v:0" + c = tf.constant([1], name="c") + assert c.name == "foo/c:0" + ``` + Basic example of sharing a variable AUTO_REUSE: ```python @@ -1924,7 +1941,9 @@ class variable_scope(object): (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. auxiliary_name_scope: If `True`, we create an auxiliary name scope with - the scope. If `False`, we don't touch name scope. + the scope. If `False`, we don't create it. Note that the argument is + not inherited, and it only takes effect once, when the scope is created. + You should only use it for re-entering a premade variable scope. Returns: A scope that can be captured and reused.
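Since string_split_v2 above is documented to match Python's str.split, the corner cases called out in its docstring can be sanity-checked directly in plain Python:

    # Consecutive delimiters produce empty tokens when sep is given...
    assert "1<>2<><>3".split("<>") == ["1", "2", "", "3"]
    # ...but runs of whitespace collapse when sep is omitted:
    assert " 1 2 3 ".split() == ["1", "2", "3"]
    # maxsplit bounds the number of splits, keeping the tail intact:
    assert "1,2,3".split(",", 1) == ["1", "2,3"]
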
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py old mode 100755 new mode 100644 diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 522965990b..b59f8e1f98 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -1719,7 +1719,7 @@ def tf_py_build_info_genrule(): name="py_build_info_gen", outs=["platform/build_info.py"], cmd= - "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), + "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), local=1, tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],) diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py index bca9fa49eb..671b7e387e 100644 --- a/tensorflow/tools/api/generator/create_python_api.py +++ b/tensorflow/tools/api/generator/create_python_api.py @@ -41,7 +41,11 @@ _GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit. # Generated by: tensorflow/tools/api/generator/create_python_api.py script. \"\"\"%s \"\"\" + +from __future__ import print_function + """ +_GENERATED_FILE_FOOTER = "\n\ndel print_function\n" class SymbolExposedTwiceError(Exception): @@ -149,6 +153,7 @@ class _ModuleInitCodeBuilder(object): _names_with_underscore = [%s] __all__ = [_s for _s in dir() if not _s.startswith('_')] __all__.extend([_s for _s in _names_with_underscore]) +__all__.remove('print_function') ''' % underscore_names_str return module_text_map @@ -333,7 +338,8 @@ def create_api_files( if module or not root_init_template: contents = ( _GENERATED_FILE_HEADER % - get_module_docstring(module, package, api_name) + text) + get_module_docstring(module, package, api_name) + + text + _GENERATED_FILE_FOOTER) else: # Read base init file with open(root_init_template, 'r') as root_init_template_file: diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt index 5bb3b3c444..10171b3d60 100644 --- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt @@ -58,7 +58,7 @@ tf_module { } member_method { name: "decode_image" - argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"\", \'None\'], " } member_method { name: "decode_jpeg" diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index dc2bd40096..3051c4437e 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -1532,6 +1532,10 @@ tf_module { name: "pow" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "print" + argspec: "args=[\'input_\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } member_method { name: "py_func" argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt index a3fbe95bba..b641c39feb 100644 
--- a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt @@ -4,4 +4,8 @@ tf_module { name: "regex_full_match" argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "split" + argspec: "args=[\'source\', \'sep\', \'maxsplit\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], " + } } diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index 5fa75e1d61..883bb93647 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -322,6 +322,10 @@ create_activate_virtualenv_and_install_tensorflow() { pip install -v ${PIP_FLAGS} ${WHL_PATH} || \ die "pip install (forcing to reinstall tensorflow) FAILED" echo "Successfully installed pip package ${TF_WHEEL_PATH}" + + # Force downgrade setuptools. + pip install --upgrade setuptools==39.1.0 + } ################################################################################ diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user index d4bf546d40..b216e3549f 100755 --- a/tensorflow/tools/ci_build/builds/with_the_same_user +++ b/tensorflow/tools/ci_build/builds/with_the_same_user @@ -40,7 +40,7 @@ if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then ADDUSER_OPTS="--force-badname" fi -getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" +getent group "${CI_BUILD_GID}" || addgroup ${ADDUSER_OPTS} --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \ --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \ --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \ diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh index 072dd6ab99..1f0fd0387a 100755 --- a/tensorflow/tools/ci_build/ci_build.sh +++ b/tensorflow/tools/ci_build/ci_build.sh @@ -134,6 +134,12 @@ if [[ $? != "0" ]]; then die "ERROR: docker build failed. Dockerfile is at ${DOCKERFILE_PATH}" fi +# If caller wants the with_the_same_user script to allow bad usernames, +# pass the var to the docker environment +if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then + CI_BUILD_USER_FORCE_BADNAME_ENV="-e CI_BUILD_USER_FORCE_BADNAME=yes" +fi + # Run the command inside the container. echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..." 
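The golden-file entry for "print" recorded above corresponds to the lowercase alias registered by @tf_export("Print", "print") earlier in this patch. On Python 2 the name is only usable after `from __future__ import print_function` (hence the create_python_api.py changes). A usage sketch against this patched build, assuming TF 1.x Print semantics (an identity op that logs `data` as a side effect when evaluated):

    from __future__ import print_function

    import tensorflow as tf

    x = tf.constant([1.0, 2.0])
    # tf.print aliases tf.Print here: pass x through, logging [x] to
    # stderr each time the op runs.
    y = tf.print(x, [x], message="x = ")
    with tf.Session() as sess:
        print(sess.run(y))  # the op logs "x = [1 2]", then returns x
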
mkdir -p ${WORKSPACE}/bazel-ci_build-cache @@ -148,6 +154,7 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_BUILD_GROUP=$(id -g -n)" \ -e "CI_BUILD_GID=$(id -g)" \ -e "CI_TENSORFLOW_SUBMODULE_PATH=${CI_TENSORFLOW_SUBMODULE_PATH}" \ + ${CI_BUILD_USER_FORCE_BADNAME_ENV} \ -v ${WORKSPACE}:/workspace \ -w /workspace \ ${GPU_EXTRA_PARAMS} \ diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py index 420d390d2b..148526492d 100755 --- a/tensorflow/tools/ci_build/copy_binary.py +++ b/tensorflow/tools/ci_build/copy_binary.py @@ -32,7 +32,8 @@ import shutil import tempfile import zipfile -TF_NIGHTLY_REGEX = r"(.+)tf_nightly(|_gpu)-(\d\.\d\.\d.dev[\d]{0,8})-(.+)\.whl" +TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}" + "\.\d.dev[\d]{0,8})-(.+)\.whl") BINARY_STRING_TEMPLATE = "%s-%s-%s.whl" diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 60290df833..88f1d04193 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -115,3 +115,7 @@ pip2 install keras_applications==1.0.2 pip3 install keras_applications==1.0.2 pip2 install keras_preprocessing==1.0.1 pip3 install keras_preprocessing==1.0.1 + +# Install last working version of setuptools. +pip2 install --upgrade setuptools==39.1.0 +pip3 install --upgrade setuptools==39.1.0 diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index edb9d4b929..acd69ef346 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -39,7 +39,6 @@ if [[ -z $pip35_version ]]; then fi set -e -pip3.5 install --upgrade setuptools pip3.5 install --upgrade pip pip3.5 install --upgrade virtualenv @@ -86,4 +85,7 @@ pip3.5 install --upgrade termcolor pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 +# Install last working version of setuptools. +pip3.5 install --upgrade setuptools==39.1.0 + # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh index 5635977731..323b30f48e 100755 --- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh @@ -49,7 +49,6 @@ cd Python-3.6.1 make altinstall ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3 -pip3 install --upgrade setuptools pip3 install --upgrade pip pip3 install --upgrade virtualenv @@ -101,4 +100,8 @@ pip3 install --upgrade termcolor # Keras pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 + +# Install last working version of setuptools. +pip3 install --upgrade setuptools==39.1.0 + # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh new file mode 100755 index 0000000000..10a09a415a --- /dev/null +++ b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Usage: basic_mkl_test.sh + +# Helper function to traverse directories up until given file is found. +function upsearch () { + test / == "$PWD" && return || \ + test -e "$1" && echo "$PWD" && return || \ + cd .. && upsearch "$1" +} + +# Set up WORKSPACE. +WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}" + +BUILD_TAG=mkl-ci-test CI_BUILD_USER_FORCE_BADNAME=yes ${WORKSPACE}/tensorflow/tools/ci_build/ci_build.sh cpu tensorflow/tools/ci_build/linux/cpu/run_mkl.sh diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh index 1bd1852ffc..b8bce57c87 100755 --- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh +++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh @@ -79,6 +79,7 @@ if [[ $1 == "PI_ONE" ]]; then --linkopt=-L${OPENBLAS_INSTALL_PATH}/lib/ --linkopt=-l:libopenblas.a" echo "Building for the Pi One/Zero, with no NEON support" + WHEEL_ARCH=linux_armv6l else PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4 --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR @@ -86,6 +87,7 @@ else --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8' + WHEEL_ARCH=linux_armv7l echo "Building for the Pi Two/Three, with NEON acceleration" fi @@ -100,6 +102,8 @@ bazel build -c opt ${PI_COPTS} \ --copt=-fomit-frame-pointer --cpu=armeabi \ --crosstool_top=@local_config_arm_compiler//:toolchain \ --verbose_failures \ + //tensorflow:libtensorflow.so \ + //tensorflow:libtensorflow_framework.so \ //tensorflow/tools/benchmark:benchmark_model \ //tensorflow/tools/pip_package:build_pip_package @@ -112,10 +116,12 @@ BDIST_OPTS="--universal" \ bazel-bin/tensorflow/tools/pip_package/build_pip_package "${OUTDIR}" OLD_FN=$(ls "${OUTDIR}" | grep -m 1 \.whl) -SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-any.whl/; print' +SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-'${WHEEL_ARCH}'.whl/; print' NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}") mv "${OUTDIR}/${OLD_FN}" "${OUTDIR}/${NEW_FN}" cp bazel-bin/tensorflow/tools/benchmark/benchmark_model "${OUTDIR}" +cp bazel-bin/tensorflow/libtensorflow.so "${OUTDIR}" +cp bazel-bin/tensorflow/libtensorflow_framework.so "${OUTDIR}" echo "Output can be found here:" find "${OUTDIR}" diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl index 47539b2423..f8f63e276c 100644 --- a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl +++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl @@ -31,7 +31,11 @@ def _def_file_filter_configure_impl(repository_ctx): vc_path = find_vc_path(repository_ctx) if vc_path == "visual-studio-not-found": auto_configure_fail("Visual C++ build tools not found on your machine") - undname_bin_path = find_msvc_tool(repository_ctx, vc_path, 
"undname.exe").replace("\\", "\\\\") + + undname = find_msvc_tool(repository_ctx, vc_path, "undname.exe") + if undname == None: + auto_configure_fail("Couldn't find undname.exe under %s, please check your VC installation and set BAZEL_VC environment variable correctly." % vc_path) + undname_bin_path = undname.replace("\\", "\\\\") repository_ctx.template( "def_file_filter.py", diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh index 06c2b997cb..b0114721bd 100755 --- a/tensorflow/tools/dist_test/local_test.sh +++ b/tensorflow/tools/dist_test/local_test.sh @@ -64,9 +64,6 @@ die() { # Configurations DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster" -# Use TensorFlow v1.5.0 for Python 2.7 and CPU only as we set num_gpus to 0 in the below -DEFAULT_WHL_FILE_LOCATION="https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl" - # Parse input arguments LEAVE_CONTAINER_RUNNING=0 MODEL_NAME="" @@ -77,8 +74,7 @@ SYNC_REPLICAS_FLAG="" WHL_FILE_LOCATION=${1} if [[ -z "${WHL_FILE_LOCATION}" ]]; then - WHL_FILE_LOCATION=${DEFAULT_WHL_FILE_LOCATION} - echo "use default whl file location" + echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." fi while true; do @@ -131,7 +127,11 @@ echo "Building in temporary directory: ${BUILD_DIR}" cp -r ${DIR}/* "${BUILD_DIR}"/ || \ die "Failed to copy files to ${BUILD_DIR}" -if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then +# Download whl file into the build context directory. +if [[ -z "${WHL_FILE_LOCATION}" ]]; then + pip2 download --no-deps tf-nightly + cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl +elif [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then # Download whl file into the build context directory. wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \ die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}" diff --git a/tensorflow/tools/dist_test/remote_test.sh b/tensorflow/tools/dist_test/remote_test.sh index 935535312d..e188c88c8f 100755 --- a/tensorflow/tools/dist_test/remote_test.sh +++ b/tensorflow/tools/dist_test/remote_test.sh @@ -108,7 +108,7 @@ fi # Parse command-line arguments. WHL_URL=${1} if [[ -z "${WHL_URL}" ]]; then - die "whl URL is not specified" + echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." fi # Create docker build context directory. @@ -121,8 +121,13 @@ cp -r ${DIR}/* ${BUILD_DIR}/ || \ die "Failed to copy files to ${BUILD_DIR}" # Download whl file into the build context directory. -wget -P "${BUILD_DIR}" ${WHL_URL} || \ - die "Failed to download tensorflow whl file from URL: ${WHL_URL}" +if [[ -z "${WHL_URL}" ]]; then + pip2 download --no-deps tf-nightly + cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl +else + wget -P "${BUILD_DIR}" ${WHL_URL} || \ + die "Failed to download tensorflow whl file from URL: ${WHL_URL}" +fi # Build docker image for test. docker build ${NO_CACHE_FLAG} \ diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 406d134699..57a491255e 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -76,7 +76,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . 
# TODO(craigcitro): Don't install the pip package, since it makes it # more difficult to experiment with local changes. Instead, just add diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl index a6cd44ced1..6796ad70e5 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl +++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl @@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel LABEL maintainer="Clayne Robison" # These arguments are parameterized. Use --build-args to override. -ARG TF_BRANCH=r1.8 +ARG TF_BRANCH=r1.9 ARG WHL_DIR=/whl RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 2fe47f3356..204b5b4dba 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -13,8 +13,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusparse-dev-9-0 \ curl \ git \ - libcudnn7=7.0.5.15-1+cuda9.0 \ - libcudnn7-dev=7.0.5.15-1+cuda9.0 \ + libcudnn7=7.1.4.18-1+cuda9.0 \ + libcudnn7-dev=7.1.4.18-1+cuda9.0 \ libcurl3-dev \ libfreetype6-dev \ libhdf5-serial-dev \ @@ -85,7 +85,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . # Configure the build for our CUDA configuration. ENV CI_BUILD_PYTHON python diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu index bff4a20392..9197651ff4 100644 --- a/tensorflow/tools/docker/Dockerfile.gpu +++ b/tensorflow/tools/docker/Dockerfile.gpu @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusolver-9-0 \ cuda-cusparse-9-0 \ curl \ - libcudnn7=7.0.5.15-1+cuda9.0 \ + libcudnn7=7.1.4.18-1+cuda9.0 \ libfreetype6-dev \ libhdf5-serial-dev \ libpng12-dev \ diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 5910f0625e..620fef9363 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -61,6 +61,7 @@ COMMON_PIP_DEPS = [ "//tensorflow/contrib/autograph/core:core", "//tensorflow/contrib/autograph/impl:impl", "//tensorflow/contrib/autograph/lang:lang", + "//tensorflow/contrib/autograph/operators:operators", "//tensorflow/contrib/autograph/pyct:pyct", "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis", "//tensorflow/contrib/boosted_trees:boosted_trees_pip", diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh index 0c4065bc77..f7e42ce536 100755 --- a/tensorflow/tools/pip_package/build_pip_package.sh +++ b/tensorflow/tools/pip_package/build_pip_package.sh @@ -41,51 +41,15 @@ function is_windows() { fi } -function main() { +function prepare_src() { if [ $# -lt 1 ] ; then echo "No destination dir provided" exit 1 fi - DEST=$(real_path $1) - TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX) - - PKG_NAME_FLAG="" - GPU_BUILD=0 - NIGHTLY_BUILD=0 - PROJECT_NAME="" - while true; do - if [[ "$1" == "--nightly_flag" ]]; then - NIGHTLY_BUILD=1 - elif [[ "$1" == "--gpu" ]]; then - GPU_BUILD=1 - elif [[ "$1" == "--gpudirect" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpudirect" - elif [[ "$1" == "--project_name" ]]; then - shift - if [[ -z "$1" ]]; then - break - fi - PROJECT_NAME="$1" - fi - shift 
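The build_pip_package.sh refactoring that begins above splits the old
monolithic main() into prepare_src (stage sources into a directory) and
build_wheel (run setup.py from a staged tree), so each phase can be invoked
on its own via --src/--dst. Reduced to a Python sketch with placeholder
bodies (names mirror the shell helpers; this is an outline, not the script):

    import shutil
    import tempfile

    def prepare_src(srcdir):
        print("preparing sources in", srcdir)

    def build_wheel(srcdir, dstdir, pkg_name_flag=""):
        print("building wheel from", srcdir, "into", dstdir, pkg_name_flag)

    def main(srcdir=None, dstdir=None):
        cleansrc = srcdir is None      # only delete directories we created
        srcdir = srcdir or tempfile.mkdtemp()
        prepare_src(srcdir)
        if dstdir is None:
            return                     # --src only: just prepare sources
        build_wheel(srcdir, dstdir)
        if cleansrc:
            shutil.rmtree(srcdir)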
- - if [[ -z "$1" ]]; then - break - fi - done - - if [[ -n ${PROJECT_NAME} ]]; then - PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" - elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly_gpu" - elif [[ ${NIGHTLY_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly" - elif [[ ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpu" - fi - - echo $(date) : "=== Using tmpdir: ${TMPDIR}" + TMPDIR="$1" + mkdir -p "$TMPDIR" + echo $(date) : "=== Preparing sources in dir: ${TMPDIR}" if [ ! -d bazel-bin/tensorflow ]; then echo "Could not find bazel-bin. Did you run from the root of the build tree?" @@ -155,17 +119,28 @@ function main() { # over so user defined ops can be compiled. mkdir -p ${TMPDIR}/google mkdir -p ${TMPDIR}/third_party - pushd ${RUNFILES%org_tensorflow} + pushd ${RUNFILES%org_tensorflow} > /dev/null for header in $(find protobuf_archive -name \*.h); do mkdir -p "${TMPDIR}/google/$(dirname ${header})" cp "$header" "${TMPDIR}/google/$(dirname ${header})/" done - popd + popd > /dev/null cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR} cp tensorflow/tools/pip_package/README ${TMPDIR} cp tensorflow/tools/pip_package/setup.py ${TMPDIR} +} + +function build_wheel() { + if [ $# -lt 2 ] ; then + echo "No src and dest dir provided" + exit 1 + fi + + TMPDIR="$1" + DEST="$2" + PKG_NAME_FLAG="$3" # Before we leave the top-level directory, make sure we know how to # call python. @@ -173,15 +148,110 @@ function main() { source tools/python_bin_path.sh fi - pushd ${TMPDIR} + pushd ${TMPDIR} > /dev/null rm -f MANIFEST echo $(date) : "=== Building wheel" "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel ${PKG_NAME_FLAG} >/dev/null mkdir -p ${DEST} cp dist/* ${DEST} - popd - rm -rf ${TMPDIR} + popd > /dev/null echo $(date) : "=== Output wheel file is in: ${DEST}" } +function usage() { + echo "Usage:" + echo "$0 [--src srcdir] [--dst dstdir] [options]" + echo "$0 dstdir [options]" + echo "" + echo " --src prepare sources in srcdir" + echo " will use temporary dir if not specified" + echo "" + echo " --dst build wheel in dstdir" + echo " if dstdir is not set do not build, only prepare sources" + echo "" + echo " Options:" + echo " --project_name set project name to name" + echo " --gpu build tensorflow_gpu" + echo " --gpudirect build tensorflow_gpudirect" + echo " --nightly_flag build tensorflow nightly" + echo "" + exit 1 +} + +function main() { + PKG_NAME_FLAG="" + PROJECT_NAME="" + GPU_BUILD=0 + NIGHTLY_BUILD=0 + SRCDIR="" + DSTDIR="" + CLEANSRC=1 + while true; do + if [[ "$1" == "--help" ]]; then + usage + exit 1 + elif [[ "$1" == "--nightly_flag" ]]; then + NIGHTLY_BUILD=1 + elif [[ "$1" == "--gpu" ]]; then + GPU_BUILD=1 + elif [[ "$1" == "--gpudirect" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpudirect" + elif [[ "$1" == "--project_name" ]]; then + shift + if [[ -z "$1" ]]; then + break + fi + PROJECT_NAME="$1" + elif [[ "$1" == "--src" ]]; then + shift + SRCDIR="$(real_path $1)" + CLEANSRC=0 + elif [[ "$1" == "--dst" ]]; then + shift + DSTDIR="$(real_path $1)" + else + DSTDIR="$(real_path $1)" + fi + shift + + if [[ -z "$1" ]]; then + break + fi + done + + if [[ -z "$DSTDIR" ]] && [[ -z "$SRCDIR" ]]; then + echo "No destination dir provided" + usage + exit 1 + fi + + if [[ -z "$SRCDIR" ]]; then + # make temp srcdir if none set + SRCDIR="$(mktemp -d -t tmp.XXXXXXXXXX)" + fi + + prepare_src "$SRCDIR" + + if [[ -z "$DSTDIR" ]]; then + # 
only want to prepare sources + exit + fi + + if [[ -n ${PROJECT_NAME} ]]; then + PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" + elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly_gpu" + elif [[ ${NIGHTLY_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly" + elif [[ ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpu" + fi + + build_wheel "$SRCDIR" "$DSTDIR" "$PKG_NAME_FLAG" + + if [[ $CLEANSRC -ne 0 ]]; then + rm -rf "${TMPDIR}" + fi +} + main "$@" diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index d25a9e77b1..97f625e7e9 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n') # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.8.0' +_VERSION = '1.9.0-rc0' REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', @@ -54,6 +54,7 @@ REQUIRED_PACKAGES = [ 'numpy >= 1.13.3', 'six >= 1.10.0', 'protobuf >= 3.4.0', + 'setuptools <= 39.1.0', 'tensorboard >= 1.8.0, < 1.9.0', 'termcolor >= 1.1.0', ] diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc index 29add6d5ea..15d7c70281 100644 --- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc +++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc @@ -814,6 +814,9 @@ void Generator::Generate(const FileDescriptor& fd) { // Add header to cc file. SetOutput(&cc_); Print("// GENERATED FILE - DO NOT MODIFY"); + Print(); + Print("#include <algorithm>"); // for `std::stable_sort()` + Print(); headers = {GetProtoTextHeaderName(fd, true /* impl */)}; AddHeadersToCurrentSection(headers); Print(); diff --git a/tensorflow/tools/quantization/quantize_graph_test.py b/tensorflow/tools/quantization/quantize_graph_test.py index df71840b64..92bb5127da 100644 --- a/tensorflow/tools/quantization/quantize_graph_test.py +++ b/tensorflow/tools/quantization/quantize_graph_test.py @@ -119,8 +119,8 @@ def are_tensors_near(a, b, tolerance): flat_a = a.flatten() flat_b = b.flatten() if len(flat_a) != len(flat_b): - print("Tensors are different sizes: " + str(len(flat_a)) + " vs " + str( - len(flat_b))) + tf_logging.info("Tensors are different sizes: " + str(len(flat_a)) + " vs " + + str(len(flat_b))) return False value_count = len(flat_a) how_many_different = 0 @@ -140,10 +140,10 @@ def are_tensors_near(a, b, tolerance): if how_many_different == 0: return True else: - print("Tensors have {0} different values ({1}%), with mean difference" - " {2} and mean absolute difference {3}".format( - how_many_different, proportion_different * 100, mean_difference, - mean_abs_difference)) + tf_logging.info("Tensors have {0} different values ({1}%), with mean" + " difference {2} and mean absolute difference {3}".format( + how_many_different, proportion_different * 100, + mean_difference, mean_abs_difference)) return False diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py index 9c45359ee1..c030575109 100644 --- a/tensorflow/tools/test/upload_test_benchmarks.py +++ b/tensorflow/tools/test/upload_test_benchmarks.py @@ -89,7 +89,6 @@ import shutil from six import text_type from google.cloud import datastore -from six import text_type def is_real_file(dirpath, fname): diff --git a/tensorflow/workspace.bzl
b/tensorflow/workspace.bzl index dbec66216a..4f3df570a5 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""): mkl_repository( name = "mkl_linux", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz" ], - sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146", - strip_prefix = "mklml_lnx_2018.0.2.20180127", + sha256 = "d2305244fdc9b87db7426ed4496e87a4b3977ad3374d73b8000e8b7a5b7aa725", + strip_prefix = "mklml_lnx_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_windows", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip" ], - sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded", - strip_prefix = "mklml_win_2018.0.2.20180127", + sha256 = "a584a5bf1c8d2ad70b90d12b52652030e9a338217719064fdb84b7ad0d693694", + strip_prefix = "mklml_win_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_darwin", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz" ], - sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943", - strip_prefix = "mklml_mac_2018.0.2.20180127", + sha256 = "094e3dfd61c816136dc8d12a45cc611ce26c5f4828176a3644cd0b0efa15a25b", + strip_prefix = "mklml_mac_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) @@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "mkl_dnn", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz", - "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.14.tar.gz", + "https://github.com/intel/mkl-dnn/archive/v0.14.tar.gz", ], - sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f", - strip_prefix = "mkl-dnn-0.13", + sha256 = "efebc53882856afec86457a2da644693f5d59c68772d41d640d6b60a8efc4eb0", + strip_prefix = "mkl-dnn-0.14", build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"), ) @@ -187,11 +187,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "highwayhash", urls = [ - "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", - "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", + 
"http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", + "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", ], - sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9", - strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b", + sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37", + strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968", build_file = clean_dep("//third_party:highwayhash.BUILD"), ) diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD index 07bb6645eb..e54c1a4501 100644 --- a/third_party/eigen.BUILD +++ b/third_party/eigen.BUILD @@ -64,6 +64,7 @@ cc_library( # This define (mostly) guarantees we don't link any problematic # code. We use it, but we do not rely on it, as evidenced above. "EIGEN_MPL2_ONLY", + "EIGEN_MAX_ALIGN_BYTES=64", ], includes = ["."], visibility = ["//visibility:public"], diff --git a/third_party/highwayhash.BUILD b/third_party/highwayhash.BUILD index 1b8e40765e..08cb84ea2c 100644 --- a/third_party/highwayhash.BUILD +++ b/third_party/highwayhash.BUILD @@ -10,6 +10,7 @@ cc_library( srcs = ["highwayhash/sip_hash.cc"], hdrs = [ "highwayhash/sip_hash.h", + "highwayhash/endianess.h", "highwayhash/state_helpers.h", ], visibility = ["//visibility:public"], diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD index 4418ac32fc..663a218733 100644 --- a/third_party/jpeg/jpeg.BUILD +++ b/third_party/jpeg/jpeg.BUILD @@ -291,8 +291,10 @@ cc_library( "jchuff.h", "jconfig.h", "jdct.h", + "jerror.h", "jinclude.h", "jmorecfg.h", + "jpegint.h", "jpeglib.h", "jsimd.h", "jsimddct.h", diff --git a/third_party/png.BUILD b/third_party/png.BUILD index 76ab32d69c..17c5449cc0 100644 --- a/third_party/png.BUILD +++ b/third_party/png.BUILD @@ -28,7 +28,14 @@ cc_library( "pngwrite.c", "pngwtran.c", "pngwutil.c", - ], + ] + select({ + "@org_tensorflow//tensorflow:linux_ppc64le": [ + "powerpc/powerpc_init.c", + "powerpc/filter_vsx_intrinsics.c", + ], + "//conditions:default": [ + ], + }), hdrs = [ "png.h", "pngconf.h", diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index 954f21f5f8..3c7e5c8469 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -6,6 +6,7 @@ * `PYTHON_LIB_PATH`: Location of python libraries. 
""" +_BAZEL_SH = "BAZEL_SH" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" _PYTHON_LIB_PATH = "PYTHON_LIB_PATH" _TF_PYTHON_CONFIG_REPO = "TF_PYTHON_CONFIG_REPO" @@ -152,6 +153,22 @@ def _get_python_bin(repository_ctx): _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", ""))) +def _get_bash_bin(repository_ctx): + """Gets the bash bin path.""" + bash_bin = repository_ctx.os.environ.get(_BAZEL_SH) + if bash_bin != None: + return bash_bin + else: + bash_bin_path = repository_ctx.which("bash") + if bash_bin_path != None: + return str(bash_bin_path) + else: + _fail("Cannot find bash in PATH, please make sure " + + "bash is installed and add its directory in PATH, or --define " + + "%s='/path/to/bash'.\nPATH=%s" % ( + _BAZEL_SH, repository_ctx.os.environ.get("PATH", ""))) + + def _get_python_lib(repository_ctx, python_bin): """Gets the python lib path.""" python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH) @@ -184,14 +201,14 @@ def _get_python_lib(repository_ctx, python_bin): " print(paths[0])\n" + "END") cmd = '%s - %s' % (python_bin, print_lib) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) return result.stdout.strip('\n') def _check_python_lib(repository_ctx, python_lib): """Checks the python lib path.""" cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) if result.return_code == 1: _fail("Invalid python library path: %s" % python_lib) @@ -199,7 +216,7 @@ def _check_python_lib(repository_ctx, python_lib): def _check_python_bin(repository_ctx, python_bin): """Checks the python bin path.""" cmd = '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) if result.return_code == 1: _fail("--define %s='%s' is not executable. Is it the python binary?" % ( _PYTHON_BIN_PATH, python_bin)) @@ -294,6 +311,7 @@ def _python_autoconf_impl(repository_ctx): python_configure = repository_rule( implementation = _python_autoconf_impl, environ = [ + _BAZEL_SH, _PYTHON_BIN_PATH, _PYTHON_LIB_PATH, _TF_PYTHON_CONFIG_REPO, diff --git a/third_party/repo.bzl b/third_party/repo.bzl index 36f5aa5bde..cb67d3e961 100644 --- a/third_party/repo.bzl +++ b/third_party/repo.bzl @@ -17,7 +17,6 @@ _SINGLE_URL_WHITELIST = depset([ "arm_compiler", "ortools_archive", - "gemmlowp", ]) def _is_windows(ctx): @@ -88,7 +87,9 @@ def _tf_http_archive(ctx): if ctx.attr.patch_file != None: _apply_patch(ctx, ctx.attr.patch_file) if ctx.attr.build_file != None: - ctx.template("BUILD", ctx.attr.build_file, { + # Use BUILD.bazel to avoid conflict with third party projects with + # BUILD or build (directory) underneath. + ctx.template("BUILD.bazel", ctx.attr.build_file, { "%prefix%": ".." 
if _repos_are_siblings() else "external", }, False) -- cgit v1.2.3 From 148b4381fd0259cae441e459ec8ebe2c5d557722 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 18 Jun 2018 11:48:36 -0700 Subject: Automated g4 rollback of changelist 201011811 PiperOrigin-RevId: 201033171 --- CONTRIBUTING.md | 2 +- README.md | 1 - RELEASE.md | 67 +--- configure.py | 5 - tensorflow/BUILD | 4 +- tensorflow/c/generate-pc.sh | 11 +- tensorflow/cc/gradients/math_grad.cc | 1 - tensorflow/cc/gradients/nn_grad.cc | 47 --- tensorflow/cc/gradients/nn_grad_test.cc | 84 +---- tensorflow/compiler/aot/codegen_test_h.golden | 4 +- .../compiler/aot/embedded_protocol_buffers.h | 2 +- tensorflow/compiler/aot/runtime.h | 4 +- tensorflow/compiler/aot/runtime_test.cc | 16 +- tensorflow/compiler/xla/service/cpu/BUILD | 18 +- tensorflow/compiler/xla/service/cpu/cpu_runtime.cc | 2 - tensorflow/compiler/xla/service/cpu/cpu_runtime.h | 1 - tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 8 +- .../compiler/xla/service/cpu/runtime_fft_impl.h | 20 +- .../xla/service/cpu/runtime_single_threaded_fft.cc | 32 -- .../xla/service/cpu/runtime_single_threaded_fft.h | 31 -- .../compiler/xla/service/cpu/simple_orc_jit.cc | 2 - tensorflow/compiler/xla/service/pattern_matcher.h | 2 +- .../compiler/xla/service/tuple_simplifier.cc | 7 - tensorflow/compiler/xla/service/tuple_simplifier.h | 9 +- .../compiler/xla/service/tuple_simplifier_test.cc | 77 ---- tensorflow/contrib/autograph/__init__.py | 3 - tensorflow/contrib/cmake/tf_c.cmake | 22 +- tensorflow/contrib/cmake/tf_cc_ops.cmake | 2 +- tensorflow/contrib/cmake/tf_python.cmake | 3 +- tensorflow/contrib/cmake/tools/create_def_file.py | 9 +- .../bijectors/sinh_arcsinh_bijector_test.py | 28 +- tensorflow/contrib/eager/python/datasets.py | 3 +- .../python/examples/notebooks/4_high_level.ipynb | 4 +- .../feature_column/sequence_feature_column.py | 22 +- .../feature_column/sequence_feature_column_test.py | 41 -- tensorflow/contrib/ffmpeg/__init__.py | 1 + tensorflow/contrib/ffmpeg/ffmpeg_ops.py | 1 + tensorflow/contrib/framework/__init__.py | 3 +- .../ops/fused_conv2d_bias_activation_op_test.py | 11 +- .../src_impl/hexagon_controller.c | 2 +- tensorflow/contrib/lite/download_dependencies.sh | 4 +- .../contrib/lite/examples/minimal/minimal.cc | 2 +- .../contrib/lite/g3doc/tf_ops_compatibility.md | 14 +- tensorflow/contrib/lite/java/ovic/README.md | 4 +- .../kernels/internal/reference/reference_ops.h | 4 +- tensorflow/contrib/lite/python/interpreter.py | 2 +- .../interpreter_wrapper/interpreter_wrapper.cc | 9 +- .../interpreter_wrapper/interpreter_wrapper.h | 3 +- tensorflow/contrib/lite/python/lite.py | 11 - tensorflow/contrib/lite/toco/import_tensorflow.cc | 2 +- tensorflow/contrib/lite/toco/toco_port.cc | 6 - tensorflow/contrib/lite/toco/toco_port.h | 18 - tensorflow/contrib/makefile/compile_nsync.sh | 2 +- .../contrib/makefile/download_dependencies.sh | 4 +- .../contrib/metrics/python/ops/metric_ops.py | 2 +- tensorflow/contrib/mpi_collectives/kernels/ring.h | 2 +- .../contrib/opt/python/training/adamax_test.py | 6 +- .../opt/python/training/model_average_optimizer.py | 2 +- tensorflow/contrib/periodic_resample/BUILD | 20 +- .../kernels/periodic_resample_op.cc | 5 - .../kernels/periodic_resample_op.h | 415 ++++++--------------- .../contrib/periodic_resample/ops/array_ops.cc | 53 +-- .../periodic_resample/ops/array_ops_test.cc | 41 -- .../kernel_tests/periodic_resample_op_test.py | 27 +- .../python/ops/periodic_resample_op.py | 8 +- .../predictor/contrib_estimator_predictor.py | 5 +- 
.../contrib/predictor/core_estimator_predictor.py | 5 +- .../contrib/predictor/predictor_factories.py | 24 +- .../contrib/predictor/predictor_factories_test.py | 19 - .../contrib/predictor/saved_model_predictor.py | 6 +- tensorflow/contrib/quantize/README.md | 2 +- .../contrib/slim/python/slim/evaluation_test.py | 25 +- tensorflow/contrib/summary/summary.py | 5 +- .../contrib/tensor_forest/client/eval_metrics.py | 45 ++- .../contrib/tensor_forest/python/tensor_forest.py | 34 +- .../tensor_forest/python/tensor_forest_test.py | 45 --- .../contrib/tensorrt/convert/convert_graph.cc | 66 ++-- .../contrib/tensorrt/convert/convert_nodes.cc | 97 ++--- tensorflow/contrib/tpu/python/tpu/datasets.py | 16 +- tensorflow/contrib/tpu/python/tpu/datasets_test.py | 26 -- tensorflow/core/BUILD | 9 +- .../core/api_def/base_api/api_def_Selu.pbtxt | 4 - .../api_def/base_api/api_def_StringSplitV2.pbtxt | 48 --- .../api_def/python_api/api_def_StringSplitV2.pbtxt | 4 - tensorflow/core/common_runtime/bfc_allocator.cc | 8 +- tensorflow/core/common_runtime/bfc_allocator.h | 3 +- .../direct_session_with_tracking_alloc_test.cc | 16 - .../common_runtime/mkl_threadpool_device_test.cc | 53 --- tensorflow/core/common_runtime/process_util.cc | 11 +- .../core/common_runtime/threadpool_device.cc | 25 +- .../rpc/grpc_master_service_impl.cc | 4 +- .../core/distributed_runtime/rpc/grpc_testlib.cc | 10 +- tensorflow/core/framework/allocator.h | 5 + tensorflow/core/framework/op_gen_lib.cc | 1 - .../remote_fused_graph_execute_info.proto | 2 +- tensorflow/core/framework/tensor_test.cc | 24 +- tensorflow/core/graph/mkl_layout_pass.cc | 148 +------- tensorflow/core/graph/mkl_layout_pass_test.cc | 31 -- tensorflow/core/grappler/costs/graph_properties.cc | 1 + tensorflow/core/grappler/optimizers/BUILD | 2 +- tensorflow/core/grappler/optimizers/remapper.cc | 4 +- tensorflow/core/kernels/as_string_op.cc | 2 - tensorflow/core/kernels/cwise_op_clip.cc | 43 ++- .../core/kernels/dense_update_functor_gpu.cu.cc | 1 - tensorflow/core/kernels/gather_functor.cc | 1 - tensorflow/core/kernels/gather_functor_gpu.cu.cc | 1 - tensorflow/core/kernels/gather_nd_op.cc | 4 - tensorflow/core/kernels/gather_nd_op_gpu.cu.cc | 2 - tensorflow/core/kernels/gather_op.cc | 1 - tensorflow/core/kernels/mkl_concat_op.cc | 213 +++-------- tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc | 2 - tensorflow/core/kernels/mkl_pooling_ops_common.h | 6 +- tensorflow/core/kernels/scatter_nd_op.cc | 4 - tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc | 1 - .../core/kernels/scoped_allocator_ops_test.cc | 9 +- tensorflow/core/kernels/segment_reduction_ops.h | 10 +- tensorflow/core/kernels/sparse_matmul_op.cc | 2 +- tensorflow/core/kernels/string_split_op.cc | 130 ------- tensorflow/core/ops/candidate_sampling_ops.cc | 5 +- tensorflow/core/ops/dataset_ops.cc | 24 +- tensorflow/core/ops/image_ops.cc | 4 +- tensorflow/core/ops/math_ops.cc | 2 +- tensorflow/core/ops/nn_ops.cc | 1 - tensorflow/core/ops/string_ops.cc | 20 +- tensorflow/core/platform/cpu_info.cc | 23 -- tensorflow/core/platform/cpu_info.h | 7 - tensorflow/core/platform/default/build_config.bzl | 2 - .../core/platform/hadoop/hadoop_file_system.cc | 21 +- tensorflow/core/platform/posix/port.cc | 5 - tensorflow/core/public/version.h | 4 +- tensorflow/core/util/mkl_util.h | 50 +-- tensorflow/docs_src/community/groups.md | 29 +- tensorflow/docs_src/get_started/eager.md | 2 +- tensorflow/docs_src/get_started/index.md | 4 +- tensorflow/docs_src/install/install_c.md | 2 +- tensorflow/docs_src/install/install_go.md | 2 +- 
tensorflow/docs_src/install/install_java.md | 24 +- tensorflow/docs_src/install/install_linux.md | 24 +- tensorflow/docs_src/install/install_mac.md | 10 +- tensorflow/docs_src/install/install_sources.md | 17 +- tensorflow/docs_src/mobile/linking_libs.md | 2 +- tensorflow/docs_src/mobile/prepare_models.md | 4 +- tensorflow/docs_src/performance/quantization.md | 2 +- .../docs_src/programmers_guide/estimators.md | 19 +- .../docs_src/programmers_guide/feature_columns.md | 4 +- tensorflow/examples/learn/iris.py | 7 +- tensorflow/java/src/gen/cc/op_generator.cc | 11 +- tensorflow/java/src/gen/cc/op_specs.cc | 1 - tensorflow/python/eager/backprop.py | 4 +- tensorflow/python/estimator/BUILD | 5 +- tensorflow/python/estimator/exporter.py | 4 +- tensorflow/python/estimator/inputs/numpy_io.py | 8 +- .../python/estimator/inputs/numpy_io_test.py | 5 +- tensorflow/python/estimator/inputs/pandas_io.py | 7 +- .../python/estimator/inputs/pandas_io_test.py | 5 +- .../estimator/inputs/queues/feeding_functions.py | 2 +- tensorflow/python/estimator/keras.py | 4 +- tensorflow/python/estimator/keras_test.py | 14 +- .../python/grappler/layout_optimizer_test.py | 4 +- tensorflow/python/keras/activations.py | 2 - tensorflow/python/keras/callbacks.py | 21 +- tensorflow/python/keras/callbacks_test.py | 2 - tensorflow/python/keras/engine/network.py | 2 +- tensorflow/python/keras/engine/saving_test.py | 4 +- tensorflow/python/keras/engine/training.py | 7 +- tensorflow/python/keras/engine/training_eager.py | 2 +- tensorflow/python/keras/initializers_test.py | 26 +- tensorflow/python/keras/layers/core.py | 26 +- tensorflow/python/keras/models_test.py | 14 - .../python/kernel_tests/as_string_op_test.py | 10 - tensorflow/python/kernel_tests/betainc_op_test.py | 4 +- tensorflow/python/kernel_tests/clip_ops_test.py | 13 - tensorflow/python/kernel_tests/conv_ops_test.py | 32 +- .../python/kernel_tests/gather_nd_op_test.py | 32 +- tensorflow/python/kernel_tests/gather_op_test.py | 20 +- tensorflow/python/kernel_tests/init_ops_test.py | 27 -- tensorflow/python/kernel_tests/pooling_ops_test.py | 4 +- tensorflow/python/kernel_tests/py_func_test.py | 31 +- .../python/kernel_tests/scatter_nd_ops_test.py | 6 +- tensorflow/python/kernel_tests/scatter_ops_test.py | 14 +- .../kernel_tests/segment_reduction_ops_test.py | 4 +- .../python/kernel_tests/string_split_op_test.py | 96 ----- tensorflow/python/ops/array_ops.py | 4 - tensorflow/python/ops/gradient_checker.py | 8 +- tensorflow/python/ops/image_ops_impl.py | 74 ++-- tensorflow/python/ops/image_ops_test.py | 261 +++---------- tensorflow/python/ops/init_ops.py | 3 +- tensorflow/python/ops/logging_ops.py | 5 +- tensorflow/python/ops/math_ops.py | 28 +- tensorflow/python/ops/nn_impl.py | 5 +- tensorflow/python/ops/nn_ops.py | 4 +- tensorflow/python/ops/nn_test.py | 10 - tensorflow/python/ops/script_ops.py | 35 +- tensorflow/python/ops/sparse_ops.py | 4 - tensorflow/python/ops/string_ops.py | 53 --- tensorflow/python/ops/variable_scope.py | 21 +- .../python/tools/import_pb_to_tensorboard.py | 0 tensorflow/tensorflow.bzl | 2 +- .../tools/api/generator/create_python_api.py | 8 +- tensorflow/tools/api/golden/tensorflow.image.pbtxt | 2 +- tensorflow/tools/api/golden/tensorflow.pbtxt | 4 - .../tools/api/golden/tensorflow.strings.pbtxt | 4 - tensorflow/tools/ci_build/builds/pip.sh | 4 - .../tools/ci_build/builds/with_the_same_user | 2 +- tensorflow/tools/ci_build/ci_build.sh | 7 - tensorflow/tools/ci_build/copy_binary.py | 3 +- .../tools/ci_build/install/install_pip_packages.sh | 4 - 
.../install/install_python3.5_pip_packages.sh | 4 +- .../install/install_python3.6_pip_packages.sh | 5 +- .../tools/ci_build/linux/mkl/basic-mkl-test.sh | 29 -- tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 8 +- .../def_file_filter/def_file_filter_configure.bzl | 6 +- tensorflow/tools/dist_test/local_test.sh | 12 +- tensorflow/tools/dist_test/remote_test.sh | 11 +- tensorflow/tools/docker/Dockerfile.devel | 2 +- tensorflow/tools/docker/Dockerfile.devel-cpu-mkl | 2 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 6 +- tensorflow/tools/docker/Dockerfile.gpu | 2 +- tensorflow/tools/pip_package/BUILD | 1 - tensorflow/tools/pip_package/build_pip_package.sh | 160 +++----- tensorflow/tools/pip_package/setup.py | 3 +- .../proto_text/gen_proto_text_functions_lib.cc | 3 - .../tools/quantization/quantize_graph_test.py | 12 +- tensorflow/tools/test/upload_test_benchmarks.py | 1 + tensorflow/workspace.bzl | 40 +- third_party/eigen.BUILD | 1 - third_party/highwayhash.BUILD | 1 - third_party/jpeg/jpeg.BUILD | 2 - third_party/png.BUILD | 9 +- third_party/py/python_configure.bzl | 24 +- third_party/repo.bzl | 5 +- 231 files changed, 903 insertions(+), 3337 deletions(-) delete mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc delete mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h delete mode 100644 tensorflow/contrib/periodic_resample/ops/array_ops_test.cc delete mode 100644 tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt delete mode 100644 tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt delete mode 100644 tensorflow/core/common_runtime/mkl_threadpool_device_test.cc mode change 100644 => 100755 tensorflow/python/tools/import_pb_to_tensorboard.py delete mode 100755 tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh (limited to 'tensorflow/cc') diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index db4b1581ae..8669c25c45 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,7 +90,7 @@ Bazel BUILD files also need to include a license section, e.g., Changes to TensorFlow C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). -Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on ubuntu:16.04, do: +Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do: ```bash apt-get install -y clang-tidy diff --git a/README.md b/README.md index 63853137cf..6fb4486d0d 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,6 @@ $ python 42 >>> sess.close() ``` -Learn more examples about how to do specific tasks in TensorFlow at the [tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/). ## Contribution guidelines diff --git a/RELEASE.md b/RELEASE.md index e09e9c6190..84d9d52868 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,62 +1,3 @@ -# Release 1.9.0 - -## Major Features And Improvements -* Update tf.keras to the Keras 2.1.6 API. -* `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`. -* Adding support of core feature columns and losses to gradient boosted trees estimators. -* The distributions.Bijector API supports broadcasting for Bijectors with new API changes. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/distributions/bijectors/Bijector) for more details. -* Layered variable names have changed in the following conditions: - * Using `tf.keras.layers` with custom variable scopes. - * Using `tf.layers` in a subclassed `tf.keras.Model` class. 
See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers) for more details - -## Breaking Chances - * If you're opening empty variable scopes; replace `variable_scope`('', ...) by `variable_scope`(`tf.get_variable_scope()`, ...). - -## Bug Fixes and Other Changes -* `tf.data`: - * The `DatasetBase::DebugString()` method is now `const`. - * Added the `tf.contrib.data.sample_from_datasets()` API for randomly sampling from multiple datasets. -* Eager Execution: -* `tf.keras`: - * Move Keras code out of _impl folder and remove API files. - * `tf.keras.Model.save_weights` now saves in TensorFlow format by default. - * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods. -* Accelerated Linear Algebra (XLA): -* TensorFlow Debugger (tfdbg): fix an issue in which the TensorBoard Debugger Plugin could not handle total source file size exceeding gRPC message size limit (4 MB). -* `tf.contrib`: - * Add `tf.contrib.data.choose_from_datasets()`. - * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings. Two arguments were removed from `make_csv_dataset`. - * `tf.contrib.framework.zero_initializer` supports ResourceVariable. - * Adding "constrained_optimization" to tensorflow/contrib. -* Other: - * Add GCS Configuration Ops. - * Changing signature of `MakeIterator` to enable propagating error status. - * KL divergence for two Dirichlet distributions. - * More consistent GcsFileSystem behavior for certain reads past EOF. - * Update benchmark for tf.scan to match ranges across eager and graph modes. - * Fixed bug in `tf.reduce_prod gradient` for complex dtypes. - * Add optional `args` argument to `Dataset.from_generator()`. - * Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"), which would previously raise an error. This will correspond to an attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only be accessed indirectly (e.g. through getattr and setattr). To set this up the user will first need to explicitly add the variable to the hparam object (e.g. "hparams.add_hparam(name='a.b', value=0.0)"). - * Benchmark for tf.scan in graph and eager modes. - * Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D. - * Making ids unique in `nn.embedding_lookup_sparse`. This helps to reduce RPC calls for looking up the embeddings when there are repeated ids in the batch. - * Support indicator column in boosted trees. - * Prevent `tf.gradients()` from backpropagating through integer tensors. - * LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`. - * Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now supports arbitrary. - * Added `tf.train.Checkpoint` for reading/writing object-based checkpoints. - * `Dataset.list_files()` now produces determinstic results when `shuffle=False` or a `seed` is passed. - * Added LinearOperatorKronecker, a dense-free implementation of the Kronecker Product. - * Allow LinearOperator to broadcast. - * SavedModelBuilder will now deduplicate asset names that point to files with the same basename and the same contents. Note that this may result in new asset files included in SavedModels in cases where assets with the same name but different contents were previously overwriting each other. 
- - -## Thanks to our Contributors - -This release contains contributions from many people at Google, as well as: - -Abdullah Alrasheed, Achal Shah, Ad-530, ADiegoCAlonso, Aditya Yogi, Ag Ramesh, akindyakov, Andy Kernahan, Anya Petrova, Aurelien Geron, Ben, Ben Barsdell, Bhavani-Subramanian, braincodercn, Brett Koonce, Brian Nemsick, Brian Zier, Bryan Heden, candy.dc, cclauss, Clayne Robison, ctiijima, Dalmo Cirne, David Norman, David T.H. Kao, DosLin, ekelsen, Elson Rodriguez, Erik Smistad, Felix Abecassis, Fergal Cotter, fo40225, foo0x29a, Freedom" Koan-Sin Tan, FréDéRic Branchaud-Charron, gdh1995, Geoffrey Irving, Giuseppe, gracehoney, Guido Zuidhof, Guillaume Klein, Guozhong Zhuang, Haggai, Harald Husum, imsheridan, Ivan Zhang, Jan Zikes, Jayaram Bobba, Jesse Benson, Jesse Gumz, Jiajia Li, Jie, jinghuangintel, Jingwen, jjsjann123, Joe Yearsley, Joel Hestness, Joel Shor, josephyearsley, Junpeng Lao, Karol M. Langner, Kb Sriram, krantideep95, Krish Ravindranath, Letian Feng, Loo Rong Jie, Lukas Geiger, Maciej, Mahmoud Abuzaina, ManHyuk, Mark Ryan, mbhuiyan, Michal Turek, Mostafa Alaa, Myungsung Kwak, Nand Dalal, Nehal J Wani, Neil Tenenholtz, ngc92, Nicholas Nadeau, P.Eng., Avs, Niranjan Hasabnis, P-Hidringer, Paul Van Eck, Peng Yu, Qing Zhao, Qingying Chen, Quanlong, Rajendra Arora, Rholais Lii, rmanyari, Robin Richtsfeld, Russell Klopfer, Sagi, Sam Sendelbach, Sandeep N Gupta, Sandip Giri, Sarah Edkins, Scott Tseng, Sdalbsoo, Sergii Khomenko, Seungwoo Choi (Biggie), Seyed Majid Azimi, Shaoning Zeng, shengfuintel, Siu Kei, Muk, Smit Shilu, soonson, Stefan Schweter, Sukhwan Kim, Sunitha Kambhampati, Taehoon Lee, tamimaddari82, Tang, Wenyi, Ted Chang, u2takey, Utkarsh Upadhyay, Vadim Markovtsev, voegtlel, Wai Hon Law, wangsiyu, Wenhao Hu, wenhao.hu, William D. Irons, Yan Facai (颜发才), Yanbo Liang, Yihong Wang, Yilei (Dolee) Yang, Yong Tang, Yuan (Terry) Tang - # Release 1.8.0 ## Major Features And Improvements @@ -463,6 +404,14 @@ answered questions, and were part of inspiring discussions. # Release 1.4.0 +## Major Features And Improvements +* `tf.keras` is now part of the core TensorFlow API. +* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of + the core TensorFlow API. + * The API is now subject to backwards compatibility guarantees. + +# Release 1.4.0 + ## Major Features And Improvements * `tf.keras` is now part of the core TensorFlow API. * [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of diff --git a/configure.py b/configure.py index ada342a50a..bde7af8c0e 100644 --- a/configure.py +++ b/configure.py @@ -1397,10 +1397,6 @@ def set_grpc_build_flags(): write_to_bazelrc('build --define grpc_no_ares=true') -def set_build_strip_flag(): - write_to_bazelrc('build --strip=always') - - def set_windows_build_flags(): if is_windows(): # The non-monolithic build is not supported yet @@ -1523,7 +1519,6 @@ def main(): set_grpc_build_flags() set_cc_opt_flags(environ_cp) - set_build_strip_flag() set_windows_build_flags() if get_var( diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 6d134dbb80..a73c4ca3aa 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -475,7 +475,7 @@ tf_cc_shared_object( # excludes all but a subset of function names. # On MacOS, the linker does not support version_script, but has an # an "-exported_symbols_list" command. -z defs disallows undefined -# symbols in object files. +# symbols in object files and -s strips the output. 
tf_cc_shared_object( name = "libtensorflow.so", @@ -489,6 +489,7 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", + "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow/c:version_script.lds)", ], @@ -514,6 +515,7 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", + "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow:tf_version_script.lds)", ], diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh index 7184ad68fb..02a6a58b61 100755 --- a/tensorflow/c/generate-pc.sh +++ b/tensorflow/c/generate-pc.sh @@ -15,12 +15,10 @@ # ============================================================================== TF_PREFIX='/usr/local' -LIBDIR='lib' usage() { echo "Usage: $0 OPTIONS" echo -e "-p, --prefix\tset installation prefix (default: /usr/local)" - echo -e "-l, --libdir\tset lib directory (default: lib)" echo -e "-v, --version\tset TensorFlow version" echo -e "-h, --help\tdisplay this message" } @@ -28,7 +26,7 @@ usage() { [ $# == 0 ] && usage && exit 0 # read the options -ARGS=$(getopt -o p:l:v:h --long prefix:,libdir:,version:,help -n $0 -- "$@") +ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@") eval set -- "$ARGS" # extract options and their arguments into variables. @@ -40,11 +38,6 @@ while true ; do "") shift 2 ;; *) TF_PREFIX=$2 ; shift 2 ;; esac ;; - -l|--libdir) - case "$2" in - "") shift 2 ;; - *) LIBDIR=$2 ; shift 2 ;; - esac ;; -v|--version) case "$2" in "") shift 2 ;; @@ -62,7 +55,7 @@ echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX" cat << EOF > tensorflow.pc prefix=${TF_PREFIX} exec_prefix=\${prefix} -libdir=\${exec_prefix}/${LIBDIR} +libdir=\${exec_prefix}/lib includedir=\${prefix}/include Name: TensorFlow diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 35a01e0341..52c177212a 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -38,7 +38,6 @@ REGISTER_NO_GRADIENT_OP("NotEqual"); REGISTER_NO_GRADIENT_OP("LogicalAnd"); REGISTER_NO_GRADIENT_OP("LogicalOr"); REGISTER_NO_GRADIENT_OP("LogicalNot"); -REGISTER_NO_GRADIENT_OP("Floor"); // Conjugate helper function returns the conjugate of an Output if it // is complex valued. 
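For context on the REGISTER_NO_GRADIENT_OP("Floor") registration that the
math_grad.cc hunk above removes: floor() is piecewise constant, so its
derivative is zero almost everywhere and undefined at the integer jump
points, which is why it was registered as having no useful gradient. A
small numeric illustration:

    import math

    def num_grad(f, x, eps=1e-3):
        # Central difference; valid away from the jump points.
        return (f(x + eps) - f(x - eps)) / (2 * eps)

    print(num_grad(math.floor, 0.4))   # 0.0
    print(num_grad(math.floor, 12.7))  # 0.0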
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index c73482d5f4..0cb3132e94 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -255,53 +255,6 @@ Status LRNGradHelper(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("LRN", LRNGradHelper); -Status SoftplusGradHelper(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - auto dx = internal::SoftplusGrad(scope, grad_inputs[0], op.input(0)); - grad_outputs->push_back(dx); - return scope.status(); -} -REGISTER_GRADIENT_OP("Softplus", SoftplusGradHelper); - -Status SoftsignGradHelper(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - auto dx = internal::SoftsignGrad(scope, grad_inputs[0], op.input(0)); - grad_outputs->push_back(dx); - return scope.status(); -} -REGISTER_GRADIENT_OP("Softsign", SoftsignGradHelper); - -Status FractionalAvgPoolGradHelper(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - bool overlapping; - TF_RETURN_IF_ERROR( - GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); - auto dx = internal::FractionalAvgPoolGrad( - scope, Shape(scope, op.input(0), Shape::OutType(DT_INT64)), - grad_inputs[0], op.output(1), op.output(2), - internal::FractionalAvgPoolGrad::Overlapping(overlapping)); - grad_outputs->push_back(dx); - return scope.status(); -} -REGISTER_GRADIENT_OP("FractionalAvgPool", FractionalAvgPoolGradHelper); - -Status FractionalMaxPoolGradHelper(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - bool overlapping; - TF_RETURN_IF_ERROR( - GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); - auto dx = internal::FractionalMaxPoolGrad( - scope, op.input(0), op.output(0), grad_inputs[0], op.output(1), - op.output(2), internal::FractionalMaxPoolGrad::Overlapping(overlapping)); - grad_outputs->push_back(dx); - return scope.status(); -} -REGISTER_GRADIENT_OP("FractionalMaxPool", FractionalMaxPoolGradHelper); - } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index b4d457a9d1..c4eba7ecb0 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -28,8 +28,6 @@ namespace { using ops::BiasAdd; using ops::Conv2D; using ops::Elu; -using ops::FractionalAvgPool; -using ops::FractionalMaxPool; using ops::L2Loss; using ops::LogSoftmax; using ops::LRN; @@ -43,8 +41,6 @@ using ops::Relu; using ops::Relu6; using ops::Selu; using ops::Softmax; -using ops::Softplus; -using ops::Softsign; class NNGradTest : public ::testing::Test { protected: @@ -75,30 +71,22 @@ class NNGradTest : public ::testing::Test { EXPECT_LT(max_error, 1e-3); } - // Sets tensor with random values, ensuring that every pair of elements are at - // least a reasonable amount apart. - // This is an issue for max pooling operations, in which perturbations by the - // numeric gradient computation in the gradient checker can change the max - // value if a pool has values that are too close together. + // Sets tensor with random values, ensuring that the max value is largest by + // a reasonable amount. 
+ // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which + // perturbations by the numeric gradient computation in the gradient checker + // can change the max value if values are too close together. template - void SetRandomValuesForMaxPooling(Tensor* tensor) { + void SetRandomValuesWithBumpedMax(Tensor* tensor) { auto tensor_flat = tensor->flat(); - // First set the array to an increasing sequence of values spaced - // a reasonable amount apart - T cur = 0; - for (size_t i = 0; i < tensor->NumElements(); i++) { - tensor_flat(i) = cur; - cur += 5e-2; - } - // Fischer-Yates shuffle the array - for (size_t i = tensor->NumElements() - 1; i >= 1; i--) { - // j <- random integer 0 <= j <= i - size_t j = random::New64() % (i + 1); - // swap values at i, j - T tmp = tensor_flat(i); - tensor_flat(i) = tensor_flat(j); - tensor_flat(j) = tmp; + tensor_flat.setRandom(); + int32 max_index = 0; + for (size_t i = 1; i < tensor->NumElements(); i++) { + if (tensor_flat(i) > tensor_flat(max_index)) { + max_index = i; + } } + tensor_flat(max_index) += 1e-2; } Scope scope_; @@ -201,7 +189,7 @@ TEST_F(NNGradTest, MaxPoolGradHelper) { const std::vector strides{1, 2, 2, 1}; auto y = MaxPool(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesForMaxPooling(&x_init_value); + SetRandomValuesWithBumpedMax(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -214,7 +202,7 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) { Tensor strides = test::AsTensor({1, 2, 2, 1}, {4}); auto y = MaxPoolV2(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesForMaxPooling(&x_init_value); + SetRandomValuesWithBumpedMax(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -227,7 +215,7 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) { const std::vector strides{1, 3, 3, 3, 1}; auto y = MaxPool3D(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesForMaxPooling(&x_init_value); + SetRandomValuesWithBumpedMax(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -260,45 +248,5 @@ TEST_F(NNGradTest, LRN){ RunTest(x, x_shape, y, x_shape); } -TEST_F(NNGradTest, SoftplusGrad) { - TensorShape shape({3, 7}); - auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); - auto y = Softplus(scope_, x); - RunTest(x, shape, y, shape); -} - -TEST_F(NNGradTest, SoftsignGrad) { - TensorShape shape({3, 7}); - auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); - auto y = Softsign(scope_, x); - RunTest(x, shape, y, shape); -} - -TEST_F(NNGradTest, FractionalAvgPoolGradHelper) { - TensorShape x_shape({1, 3, 7, 1}); - auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); - // Force consistent pooling regions for unit testing. - auto y = FractionalAvgPool( - scope_, x, {1, 1.2, 1.9, 1}, - FractionalAvgPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( - 2)); - TensorShape y_shape({1, 2, 3, 1}); - RunTest(x, x_shape, y.output, y_shape); -} - -TEST_F(NNGradTest, FractionalMaxPoolGradHelper) { - TensorShape x_shape({1, 3, 7, 1}); - auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); - // Force consistent pooling regions for unit testing. 
- auto y = FractionalMaxPool( - scope_, x, {1, 1.2, 1.9, 1}, - FractionalMaxPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( - 2)); - Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesForMaxPooling(&x_init_value); - TensorShape y_shape({1, 2, 3, 1}); - RunTest(x, x_init_value, y.output, y_shape); -} - } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 6641d45e83..6e050cf564 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -56,9 +56,9 @@ namespace bar { // // Memory stats: // arg bytes total: 104 -// arg bytes aligned: 192 +// arg bytes aligned: 128 // temp bytes total: 126 -// temp bytes aligned: 320 +// temp bytes aligned: 224 class MyClass : public tensorflow::XlaCompiledCpuFunction { public: // Number of input arguments for the compiled computation. diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h index 4e194a6aba..ebfe4806c2 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.h +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -71,7 +71,7 @@ struct ProtobufToEmbed { const ::tensorflow::protobuf::MessageLite* message; }; -// Embeds a sequence of protocol buffers into an object file. +// Embeds a a sequence of protocol buffers into an object file. // // `target_triple` is the target triple for the target architecture for the // generated object file. diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h index d1a669ceb1..d085864f00 100644 --- a/tensorflow/compiler/aot/runtime.h +++ b/tensorflow/compiler/aot/runtime.h @@ -25,8 +25,8 @@ namespace tensorflow { namespace tfcompile { namespace runtime { -// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. -static constexpr size_t kAlign = 64; +// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. +static constexpr size_t kAlign = 32; // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1 // values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc index 06ec623eb2..6d603a02eb 100644 --- a/tensorflow/compiler/aot/runtime_test.cc +++ b/tensorflow/compiler/aot/runtime_test.cc @@ -24,7 +24,7 @@ namespace runtime { namespace { TEST(Runtime, AlignmentValue) { - // We've chosen 64 byte alignment for the tfcompile runtime to mimic the + // We've chosen 32 byte alignment for the tfcompile runtime to mimic the // regular tensorflow allocator, which was chosen to play nicely with Eigen. // The tfcompile runtime also has a requirement that comes from the xla // generated code, on the relation: buffer_size >= 16 ? 
2 * sizeof(void*) : 8 @@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) { EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0); static constexpr intptr_t sizesB[1] = {3}; - EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64); + EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32); static constexpr intptr_t sizesC[1] = {32}; - EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64); + EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32); static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320); + EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192); } void* add_ptr(void* base, uintptr_t delta) { @@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) { EXPECT_NE(base, nullptr); EXPECT_EQ(bufD[0], add_ptr(base, 0)); EXPECT_EQ(bufD[1], nullptr); - EXPECT_EQ(bufD[2], add_ptr(base, 64)); + EXPECT_EQ(bufD[2], add_ptr(base, 32)); EXPECT_EQ(bufD[3], nullptr); - EXPECT_EQ(bufD[4], add_ptr(base, 128)); - EXPECT_EQ(bufD[5], add_ptr(base, 192)); - EXPECT_EQ(bufD[6], add_ptr(base, 256)); + EXPECT_EQ(bufD[4], add_ptr(base, 64)); + EXPECT_EQ(bufD[5], add_ptr(base, 128)); + EXPECT_EQ(bufD[6], add_ptr(base, 160)); for (int i = 0; i < 7; ++i) { const intptr_t size = sizesD[i]; if (size != -1) { diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 1067b38f93..d82922a359 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -178,7 +178,6 @@ cc_library( ":runtime_matmul", ":runtime_matmul_mkl", ":runtime_single_threaded_conv2d", - ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", "@llvm//:execution_engine", "@llvm//:core", @@ -517,6 +516,7 @@ cc_library( deps = [ "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//third_party/eigen3", ], @@ -578,22 +578,6 @@ cc_library( ], ) -cc_library( - name = "runtime_single_threaded_fft", - srcs = [ - "runtime_fft_impl.h", - "runtime_single_threaded_fft.cc", - ], - hdrs = ["runtime_single_threaded_fft.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:framework_lite", - "//third_party/eigen3", - ], -) - cc_library( name = "runtime_single_threaded_matmul", srcs = ["runtime_single_threaded_matmul.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 54c52bc08f..215405f680 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -51,8 +51,6 @@ extern const char* const kEigenConvF16SymbolName = extern const char* const kEigenConvF32SymbolName = "__xla_cpu_runtime_EigenConvF32"; extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft"; -extern const char* const kEigenSingleThreadedFftSymbolName = - "__xla_cpu_runtime_EigenSingleThreadedFft"; extern const char* const kEigenSingleThreadedMatMulF16SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF16"; extern const char* const kEigenSingleThreadedMatMulF32SymbolName = diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index aa0e967123..1dce6efa5c 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -52,7 +52,6 @@ extern const char* const 
kMKLSingleThreadedMatMulF64SymbolName; extern const char* const kEigenConvF16SymbolName; extern const char* const kEigenConvF32SymbolName; extern const char* const kEigenFftSymbolName; -extern const char* const kEigenSingleThreadedFftSymbolName; extern const char* const kEigenSingleThreadedMatMulF16SymbolName; extern const char* const kEigenSingleThreadedMatMulF32SymbolName; extern const char* const kEigenSingleThreadedMatMulF64SymbolName; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 758b8c62b4..2c20be155f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1172,13 +1172,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); - - bool multi_threaded_eigen = - hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen(); - const char* fn_name = multi_threaded_eigen - ? runtime::kEigenFftSymbolName - : runtime::kEigenSingleThreadedFftSymbolName; - + const char* fn_name = runtime::kEigenFftSymbolName; llvm::Function* fft_func = llvm::cast( module_->getOrInsertFunction(fn_name, fft_type)); fft_func->setCallingConv(llvm::CallingConv::C); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h index 0bf693edd0..984cb0616e 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h @@ -21,6 +21,8 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/types.h" // 'tensorflow' namespace is used so that int64 and other types don't require @@ -69,9 +71,11 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; + TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = fft_shape[i]; out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; + temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -84,8 +88,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank); // Compute the full FFT using a temporary tensor. - Eigen::Tensor full_fft(in_dims); - + Tensor temp(DataTypeToEnum::v(), temp_shape); + auto full_fft = temp.flat_inner_dims(); const Eigen::DSizes zero_start_indices; full_fft.device(device) = input.template fft(axes); @@ -108,9 +112,11 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; + TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; out_dims[i + 1] = fft_shape[i]; + temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -123,7 +129,8 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, // region we will slice from input given fft_shape. 
We slice input to // fft_shape on its inner-most dimensions, except the last (which we // slice to fft_shape[-1] / 2 + 1). - Eigen::Tensor full_fft(out_dims); + Tensor temp(DataTypeToEnum::v(), temp_shape); + auto full_fft = temp.flat_inner_dims(); // Calculate the starting point and range of the source of // negative frequency part. @@ -172,6 +179,7 @@ template void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, int32 fft_type, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { + CHECK(::xla::FftType_IsValid(fft_type)) << fft_type; switch (fft_type) { case ::xla::FftType::FFT: EigenFftC2C( @@ -196,8 +204,7 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, input_batch, fft_length0, fft_length1, fft_length2); break; default: - // Unsupported FFT type - abort(); + LOG(FATAL) << "Unsupported FFT type: " << fft_type; } } @@ -223,8 +230,7 @@ void EigenFftImpl(const EigenDevice& device, void* out, void* operand, fft_length1, fft_length2); break; default: - // Unsupported FFT rank - abort(); + LOG(FATAL) << "Unsupported FFT rank " << fft_rank; } } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc deleted file mode 100644 index 2613ddb127..0000000000 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" - -#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h" -#include "tensorflow/core/platform/dynamic_annotations.h" -#include "tensorflow/core/platform/types.h" - -using tensorflow::int32; -using tensorflow::int64; - -TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft( - const void* run_options_ptr, void* out, void* operand, int32 fft_type, - int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, - int64 fft_length2) { - tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, fft_type, - fft_rank, input_batch, fft_length0, fft_length1, - fft_length2); -} diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h deleted file mode 100644 index dcd133d012..0000000000 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ - -#include "tensorflow/core/platform/types.h" - -extern "C" { - -extern void __xla_cpu_runtime_EigenSingleThreadedFft( - const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, - void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, - tensorflow::int64 input_batch, tensorflow::int64 fft_length0, - tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); - -} // extern "C" - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index c4c90515ac..8d8c5e4c44 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -38,7 +38,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h" -#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h" #include "tensorflow/compiler/xla/types.h" @@ -203,7 +202,6 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32); - REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedFft); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64); diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index 2515222cf2..d3bc47e61e 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -204,7 +204,7 @@ class LayoutPattern { // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. constexpr LayoutPattern> EqualTo( - const ::xla::Layout* layout) const { + const Layout* layout) const { return LayoutPattern>( LayoutPatternEqualImpl(impl_, layout), matched_layout_); } diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc index 77bdcc9de0..e536c8afbf 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc @@ -30,17 +30,10 @@ limitations under the License. 
namespace xla { -TupleSimplifier::TupleSimplifier(bool exclude_entry_computation) : - exclude_entry_computation_(exclude_entry_computation) {} - StatusOr TupleSimplifier::Run(HloModule* module) { // Initially add all GTE and Tuple instructions to the worklist. std::queue worklist; for (auto* computation : module->computations()) { - if (exclude_entry_computation_ && - computation == module->entry_computation()) { - continue; - } for (auto* instruction : computation->instructions()) { if (instruction->opcode() == HloOpcode::kTuple || instruction->opcode() == HloOpcode::kGetTupleElement) { diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.h b/tensorflow/compiler/xla/service/tuple_simplifier.h index 7509501883..e5e9b10b5b 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.h +++ b/tensorflow/compiler/xla/service/tuple_simplifier.h @@ -27,20 +27,13 @@ namespace xla { // the module. class TupleSimplifier : public HloPassInterface { public: - TupleSimplifier() : TupleSimplifier(/*exclude_entry_computation=*/false) {} - explicit TupleSimplifier(bool exclude_entry_computation); + TupleSimplifier() {} ~TupleSimplifier() override {} tensorflow::StringPiece name() const override { return "tuple-simplifier"; } // Run tuple simplification on the given computation. Returns whether the // computation was changed. StatusOr Run(HloModule* module) override; - - private: - // When set, this pipeline stage will perform optimization of all computations - // apart from the module's entry computation. This is used by Graphcore's - // backend. - bool exclude_entry_computation_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc index d3635eae81..ca9ae91281 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc @@ -42,12 +42,6 @@ class TupleSimplifierTest : public HloTestBase { TF_ASSERT_OK(changed_status.status()); EXPECT_EQ(change_expected, changed_status.ValueOrDie()); } - void Run(HloModule* module, bool change_expected, bool exclude_entry) { - TupleSimplifier simplifier(exclude_entry); - auto changed_status = simplifier.Run(module); - TF_ASSERT_OK(changed_status.status()); - EXPECT_EQ(change_expected, changed_status.ValueOrDie()); - } const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); const Shape tuple_shape_ = ShapeUtil::MakeTupleShape( @@ -217,76 +211,5 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) { EXPECT_THAT(computation->root_instruction(), tuple); } -TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) { - // Verify that the root computation can be excluded - auto module = CreateNewModule(); - - HloInstruction* p0; - HloInstruction* p1; - HloComputation* c0; - HloComputation* c1; - HloComputation* entry; - - { - HloComputation::Builder builder(TestName() + "_1"); - p0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, tuple_shape_, "param")); - HloInstruction* gte0 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 0)); - HloInstruction* gte1 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 1)); - HloInstruction* gte2 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 2)); - - builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); - - c0 = module->AddEmbeddedComputation(builder.Build()); - } - { - HloComputation::Builder builder(TestName() + "_2"); - p1 = 
builder.AddInstruction( - HloInstruction::CreateParameter(0, tuple_shape_, "param")); - HloInstruction* gte0 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 0)); - HloInstruction* gte1 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 1)); - HloInstruction* gte2 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 2)); - - builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); - - c1 = module->AddEmbeddedComputation(builder.Build()); - } - { - HloComputation::Builder builder(TestName() + "_Entry"); - HloInstruction* tuple_param = builder.AddInstruction( - HloInstruction::CreateParameter(0, tuple_shape_, "param")); - HloInstruction* call0 = builder.AddInstruction( - HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c0)); - HloInstruction* call1 = builder.AddInstruction( - HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c1)); - HloInstruction* gte0 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, call0, 0)); - HloInstruction* gte1 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, call1, 1)); - HloInstruction* tuple0 = - builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1})); - HloInstruction* gte2 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 0)); - HloInstruction* gte3 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 1)); - - builder.AddInstruction(HloInstruction::CreateTuple({gte2, gte3})); - - entry = module->AddEntryComputation(builder.Build()); - } - - Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/ true); - - EXPECT_THAT(c0->root_instruction(), p0); - EXPECT_THAT(c1->root_instruction(), p1); - EXPECT_THAT(entry->instruction_count(), 9); -} - } // namespace } // namespace xla diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py index dbdbad8f4c..637e49c082 100644 --- a/tensorflow/contrib/autograph/__init__.py +++ b/tensorflow/contrib/autograph/__init__.py @@ -23,7 +23,6 @@ from __future__ import print_function # TODO(mdan): Bring only the relevant symbols to the top level. 
from tensorflow.contrib.autograph import utils -from tensorflow.contrib.autograph import operators from tensorflow.contrib.autograph.impl.api import convert from tensorflow.contrib.autograph.impl.api import converted_call from tensorflow.contrib.autograph.impl.api import do_not_convert @@ -44,8 +43,6 @@ _allowed_symbols = [ 'do_not_convert', 'to_code', 'to_graph', - # Overloaded operators - 'operators', # Special functions and directives 'set_element_type', 'set_loop_options', diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake index 2e0a2fcef4..bda5e26f43 100644 --- a/tensorflow/contrib/cmake/tf_c.cmake +++ b/tensorflow/contrib/cmake/tf_c.cmake @@ -37,15 +37,13 @@ add_dependencies( tf_core_lib tf_protos_cc) -if(tensorflow_BUILD_PYTHON_BINDINGS) - add_library(tf_c_python_api OBJECT - "${tensorflow_source_dir}/tensorflow/c/python_api.cc" - "${tensorflow_source_dir}/tensorflow/c/python_api.h" - ) - add_dependencies( - tf_c_python_api - tf_c - tf_core_lib - tf_core_framework - tf_protos_cc) -endif() +add_library(tf_c_python_api OBJECT + "${tensorflow_source_dir}/tensorflow/c/python_api.cc" + "${tensorflow_source_dir}/tensorflow/c/python_api.h" +) +add_dependencies( + tf_c_python_api + tf_c + tf_core_lib + tf_core_framework + tf_protos_cc) diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake index 6c90cf398c..f73da0b8ab 100644 --- a/tensorflow/contrib/cmake/tf_cc_ops.cmake +++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake @@ -155,7 +155,7 @@ if (WIN32) set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib") endif() else (WIN32) - set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}") + set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so") endif (WIN32) add_custom_target(tf_extension_ops) diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index 9244604489..a0c3ddd28b 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -715,7 +715,7 @@ if(WIN32) endif() else() add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX} + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so) endif() @@ -832,6 +832,7 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/) + add_custom_command(TARGET tf_python_build_pip_package POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README ${CMAKE_CURRENT_BINARY_DIR}/tf_python/) diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py index 4f957f1e0b..cffe069aa3 100644 --- a/tensorflow/contrib/cmake/tools/create_def_file.py +++ b/tensorflow/contrib/cmake/tools/create_def_file.py @@ -44,8 +44,7 @@ UNDNAME = "undname.exe" DUMPBIN = "dumpbin.exe" # Exclude if matched -EXCLUDE_RE = re.compile(r"RTTI|deleting 
destructor|::internal::|Internal|" - r"python_op_gen_internal|grappler") +EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::") # Include if matched before exclude INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" @@ -57,10 +56,6 @@ INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" r"tensorflow::ops::internal::Enter|" r"tensorflow::strings::internal::AppendPieces|" r"tensorflow::strings::internal::CatPieces|" - r"tensorflow::errors::Internal|" - r"tensorflow::Tensor::CopyFromInternal|" - r"tensorflow::kernel_factory::" - r"OpKernelRegistrar::InitInternal|" r"tensorflow::io::internal::JoinPathImpl") # Include if matched after exclude @@ -69,7 +64,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|" r"tensorflow::|" r"functor::|" r"\?nsync_|" - r"stream_executor::") + r"perftools::gputools") # We want to identify data members explicitly in the DEF file, so that no one # can implicitly link against the DLL if they use one of the variables exported diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py index 795f1993ba..45760a29ee 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py @@ -151,24 +151,16 @@ class SinhArcsinhBijectorTest(test.TestCase): self.assertAllClose(y, bijector.forward(x).eval(), rtol=1e-4, atol=0.) self.assertAllClose(x, bijector.inverse(y).eval(), rtol=1e-4, atol=0.) - # On IBM PPC systems, longdouble (np.float128) is same as double except that it can have more precision. - # Type double being of 8 bytes, can't hold square of max of float64 (which is also 8 bytes) and - # below test fails due to overflow error giving inf. So this check avoids that error by skipping square - # calculation and corresponding assert. - - if np.amax(y) <= np.sqrt(np.finfo(np.float128).max) and \ - np.fabs(np.amin(y)) <= np.sqrt(np.fabs(np.finfo(np.float128).min)): - - # Do the numpy calculation in float128 to avoid inf/nan. - y_float128 = np.float128(y) - self.assertAllClose( - np.log(np.cosh( - np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( - y_float128**2 + 1)) - - np.log(tailweight), - bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), - rtol=1e-4, - atol=0.) + # Do the numpy calculation in float128 to avoid inf/nan. + y_float128 = np.float128(y) + self.assertAllClose( + np.log(np.cosh( + np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( + y_float128**2 + 1)) - + np.log(tailweight), + bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), + rtol=1e-4, + atol=0.) 
    self.assertAllClose(
        -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
        bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index adf92c27ea..d7909dd5a2 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -106,8 +106,7 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
         target_device=target,
         buffer_size=10,
         container="",
-        shared_name=_generate_shared_name(
-            "contrib_eager_iterator_function_buffer_resource"))
+        shared_name=_generate_shared_name("function_buffer_resource"))
     self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
         handle=self._buffer_resource_handle,
         handle_device=self._device)
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
index 5749f22ac5..4fe3a0e3f3 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
@@ -68,7 +68,7 @@
    "# simply construct the object. Most layers take as a first argument the number\n",
    "# of output dimensions / channels.\n",
    "layer = tf.keras.layers.Dense(100)\n",
-   "# The number of input dimensions is often unnecessary, as it can be inferred\n",
+   "# The number of input dimensions is often unnecessary, as it can be inferred\n",
    "# the first time the layer is used, but it can be provided if you want to \n",
    "# specify it manually, which is useful in some complex models.\n",
    "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))"
@@ -267,7 +267,7 @@
    "  * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n",
    "  * `call`, where you do the forward computation\n",
    "\n",
-   "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified."
+   "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified."
   ]
  },
 {
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index 05bcdac2ca..84a413c791 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -346,8 +346,7 @@ def sequence_numeric_column(
     key,
     shape=(1,),
     default_value=0.,
-    dtype=dtypes.float32,
-    normalizer_fn=None):
+    dtype=dtypes.float32):
  """Returns a feature column that represents sequences of numeric data.
Example: @@ -371,12 +370,6 @@ def sequence_numeric_column( default_value: A single value compatible with `dtype` that is used for padding the sparse data into a dense `Tensor`. dtype: The type of values. - normalizer_fn: If not `None`, a function that can be used to normalize the - value of the tensor after `default_value` is applied for parsing. - Normalizer function takes the input `Tensor` as its argument, and returns - the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that - even though the most common use case of this function is normalization, it - can be used for any kind of Tensorflow transformations. Returns: A `_SequenceNumericColumn`. @@ -390,16 +383,12 @@ def sequence_numeric_column( if not (dtype.is_integer or dtype.is_floating): raise ValueError('dtype must be convertible to float. ' 'dtype: {}, key: {}'.format(dtype, key)) - if normalizer_fn is not None and not callable(normalizer_fn): - raise TypeError( - 'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn)) return _SequenceNumericColumn( key, shape=shape, default_value=default_value, - dtype=dtype, - normalizer_fn=normalizer_fn) + dtype=dtype) def _assert_all_equal_and_return(tensors, name=None): @@ -418,7 +407,7 @@ class _SequenceNumericColumn( fc._SequenceDenseColumn, collections.namedtuple( '_SequenceNumericColumn', - ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])): + ['key', 'shape', 'default_value', 'dtype'])): """Represents sequences of numeric data.""" @property @@ -430,10 +419,7 @@ class _SequenceNumericColumn( return {self.key: parsing_ops.VarLenFeature(self.dtype)} def _transform_feature(self, inputs): - input_tensor = inputs.get(self.key) - if self.normalizer_fn is not None: - input_tensor = self.normalizer_fn(input_tensor) - return input_tensor + return inputs.get(self.key) @property def _variable_shape(self): diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py index 45d7b74046..ee74cf56dc 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py @@ -28,7 +28,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor -from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test from tensorflow.python.training import monitored_session @@ -948,7 +947,6 @@ class SequenceNumericColumnTest(test.TestCase): self.assertEqual((1,), a.shape) self.assertEqual(0., a.default_value) self.assertEqual(dtypes.float32, a.dtype) - self.assertIsNone(a.normalizer_fn) def test_shape_saved_as_tuple(self): a = sfc.sequence_numeric_column('aaa', shape=[1, 2]) @@ -967,10 +965,6 @@ class SequenceNumericColumnTest(test.TestCase): ValueError, 'dtype must be convertible to float'): sfc.sequence_numeric_column('aaa', dtype=dtypes.string) - def test_normalizer_fn_must_be_callable(self): - with self.assertRaisesRegexp(TypeError, 'must be a callable'): - sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable') - def test_get_sequence_dense_tensor(self): sparse_input = sparse_tensor.SparseTensorValue( # example 0, values [[0.], [1]] @@ -991,41 +985,6 @@ class SequenceNumericColumnTest(test.TestCase): self.assertAllEqual( expected_dense_tensor, 
dense_tensor.eval(session=sess)) - def test_get_sequence_dense_tensor_with_normalizer_fn(self): - - def _increment_two(input_sparse_tensor): - return sparse_ops.sparse_add( - input_sparse_tensor, - sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2)) - ) - - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, values [[0.], [1]] - # example 1, [[10.]] - indices=((0, 0), (0, 1), (1, 0)), - values=(0., 1., 10.), - dense_shape=(2, 2)) - - # Before _increment_two: - # [[0.], [1.]], - # [[10.], [0.]], - # After _increment_two: - # [[2.], [1.]], - # [[10.], [2.]], - expected_dense_tensor = [ - [[2.], [1.]], - [[10.], [2.]], - ] - numeric_column = sfc.sequence_numeric_column( - 'aaa', normalizer_fn=_increment_two) - - dense_tensor, _ = numeric_column._get_sequence_dense_tensor( - _LazyBuilder({'aaa': sparse_input})) - - with monitored_session.MonitoredSession() as sess: - self.assertAllEqual( - expected_dense_tensor, dense_tensor.eval(session=sess)) - def test_get_sequence_dense_tensor_with_shape(self): """Tests get_sequence_dense_tensor with shape !=(1,).""" sparse_input = sparse_tensor.SparseTensorValue( diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py index 484ffee3e7..daba965a98 100644 --- a/tensorflow/contrib/ffmpeg/__init__.py +++ b/tensorflow/contrib/ffmpeg/__init__.py @@ -28,6 +28,7 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio +from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.python.util.all_util import remove_undocumented diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py index b1b5126d9e..020b5c99c6 100644 --- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py +++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py @@ -21,6 +21,7 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py +from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.util import loader from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index dc49383c5c..10d1ecc738 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -119,13 +119,14 @@ from tensorflow.python.framework.smart_cond import smart_cond from tensorflow.python.framework.smart_cond import smart_constant_value from tensorflow.python.framework.tensor_spec import BoundedTensorSpec from tensorflow.python.framework.tensor_spec import TensorSpec +from tensorflow.python.ops.array_ops import broadcast_to from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d from tensorflow.python.util.all_util import remove_undocumented -_allowed_symbols = ['nest'] +_allowed_symbols = ['nest', 'broadcast_to'] _nest_allowed_symbols = [ 'assert_same_structure', 'is_sequence', diff --git 
a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py index a955e21b72..65cb94b5a4 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py @@ -301,8 +301,8 @@ class FusedConv2DBiasActivationTest(test.TestCase): conv = tensors[i] value = values[i] ref_value = ref_values[i] - tf_logging.info("expected = ", ref_value) - tf_logging.info("actual = ", value) + print("expected = ", ref_value) + print("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -843,8 +843,7 @@ class FusedConvInt8Tests(test.TestCase): vertical_stride, padding_type) output_width = CalculateConvolvedOutputDim(input_width, filter_width, horizontal_stride, padding_type) - tf_logging.info("output_height=", output_height, ", output_width=", - output_width) + print("output_height=", output_height, ", output_width=", output_width) side_input, _, _ = gen_array_ops.quantize_v2( random_ops.random_uniform( @@ -881,8 +880,8 @@ class FusedConvInt8Tests(test.TestCase): with self.test_session( use_gpu=True, config=NoMemoryOptimizationConfig()) as sess: actual_y, expected_y = sess.run([actual, expected]) - tf_logging.info("actual_y = ", actual_y) - tf_logging.info("expected_y = ", expected_y) + print("actual_y = ", actual_y) + print("expected_y = ", expected_y) self.assertTrue(np.array_equal(actual_y, expected_y)) def testFusedConvInt8(self): diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c index 2e5c84704f..6a5d982dc8 100644 --- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c +++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c @@ -19,7 +19,7 @@ limitations under the License. #include "hexagon_controller.h" -#include +#include #include #include "adspmsgd.h" diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh index 840015a7fa..436c3e1d4c 100755 --- a/tensorflow/contrib/lite/download_dependencies.sh +++ b/tensorflow/contrib/lite/download_dependencies.sh @@ -30,7 +30,9 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once +# the archive has been propagated in mirror.bazel.build. 
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc
index 8b0ace96cc..106e3b0270 100644
--- a/tensorflow/contrib/lite/examples/minimal/minimal.cc
+++ b/tensorflow/contrib/lite/examples/minimal/minimal.cc
@@ -38,7 +38,7 @@ using namespace tflite;
 
 int main(int argc, char *argv[]) {
   if(argc != 2) {
-    fprintf(stderr, "minimal \n");
+    fprintf(stderr, "Usage: %s \n", argv[0]);
     return 1;
   }
   const char* filename = argv[1];
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 965273f0f0..bb2e615eac 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -128,6 +128,7 @@ TensorFlow operation not listed above are likely unsupported. Notably, the
 following common ops are not supported at the moment:
 
 *   [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space)
+*   [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather)
 *   [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear)
 *   [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh)
 
@@ -305,19 +306,6 @@ Options {
 }
 ```
 
-**GATHER**
-
-```
-Inputs {
-  0: params tensor
-  1: indices tensor
-  2: axis tensor (optional)
-}
-Outputs {
-  0: a tensor with same type as the params tensor.
-}
-```
-
 **GREATER**
 
 ```
diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md
index 26349347fa..5efa70987e 100644
--- a/tensorflow/contrib/lite/java/ovic/README.md
+++ b/tensorflow/contrib/lite/java/ovic/README.md
@@ -2,7 +2,7 @@
 
 This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018)
 
-## Pre-requisite
+## Prerequisites
 
 Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK.
 
@@ -49,7 +49,7 @@ Once you have a submission that follows the instructions from the [competition s
 
 You can call the validator binary below to verify that your model fits the format requirements. This often helps you to catch size mismatches (e.g. output should be [1, 1001] instead of [1,1,1,1001]).
 Let's say the submission file is located at `/path/to/my_model.lite`, then call:
 
 ```sh
-bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all
+bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all
 bazel-bin/tensorflow/contrib/lite/java/ovic/ovic_validator /path/to/my_model.lite
 ```
 
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 1908f7fa6c..a2f192bbc2 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1934,7 +1934,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
 // The quantization of the input, output arrays is as follows:
 // - The input activations are quantized as uint8 on the interval
 //   [-1, 127/128].
-//   The rationale for that is that is the natural interval for output
+//   The rationale for that is that that is the natural interval for output
 //   activations (see next point) and these need to be concatenated together.
 //   We could accommodate different ranges by re-scaling, but we empirically
 //   found that setting the input activations range to be [-1, 127/128] in the
@@ -1999,7 +1999,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
 // However, for a fixed-point implementation in 16-bit integers, using 5
 // integer bits to represent the [-16, 16] range would leave only 11
 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
-// representable values. Notice that is higher than the
+// representable values. Notice that that is higher than the
 // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
 // Using [-8, 8] thus seems like the better compromise overall, enjoying
 // an increment of 2.4e-4 between representable values and a worst-case
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index fd90823425..9400e757b9 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -55,7 +55,7 @@ class Interpreter(object):
     elif model_content and not model_path:
       self._interpreter = (
           _interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer(
-              model_content))
+              model_content, len(model_content)))
     if not self._interpreter:
       raise ValueError(
           'Failed to create model from {} bytes'.format(len(model_content)))
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index b283551c45..f705551fcb 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -397,14 +397,9 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile(
 }
 
 InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
-    PyObject* data) {
-  char * buf = nullptr;
-  Py_ssize_t length;
-  if (PY_TO_CPPSTRING(data, &buf, &length) == -1) {
-    return nullptr;
-  }
+    const char* data, size_t len) {
   std::unique_ptr model =
-      tflite::FlatBufferModel::BuildFromBuffer(buf, length);
+      tflite::FlatBufferModel::BuildFromBuffer(data, len);
   return model ?
new InterpreterWrapper(std::move(model)) : nullptr;
 }
 
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
index cbeb53bee7..b0ed7c4559 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -40,7 +40,8 @@ class InterpreterWrapper {
   static InterpreterWrapper* CreateWrapperCPPFromFile(const char* model_path);
 
   // SWIG caller takes ownership of pointer.
-  static InterpreterWrapper* CreateWrapperCPPFromBuffer(PyObject* data);
+  static InterpreterWrapper* CreateWrapperCPPFromBuffer(const char* data,
+                                                        size_t len);
 
   ~InterpreterWrapper();
   bool AllocateTensors();
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 88dda7290b..0913cd2c5c 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -34,8 +34,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from six import PY3
-
 from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
 from tensorflow.contrib.lite.python import lite_constants as constants
@@ -56,7 +54,6 @@ from tensorflow.python.framework.importer import import_graph_def
 from tensorflow.python.ops.variables import global_variables_initializer
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
-# from tensorflow.python.util.all_util import remove_undocumented
 
 
 class TocoConverter(object):
@@ -206,12 +203,6 @@ class TocoConverter(object):
     except (_text_format.ParseError, DecodeError):
       try:
         print("Ignore 'tcmalloc: large alloc' warnings.")
-
-        if not isinstance(file_content, str):
-          if PY3:
-            file_content = file_content.decode('utf-8')
-          else:
-            file_content = file_content.encode('utf-8')
         _text_format.Merge(file_content, graph_def)
       except (_text_format.ParseError, DecodeError):
         raise ValueError(
@@ -391,5 +382,3 @@ def _freeze_graph(sess, output_tensors):
                                output_arrays)
   else:
     return sess.graph_def
-
-# remove_undocumented(__name__)
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 5c7fa09891..e33b430937 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -178,7 +178,7 @@ ArrayDataType ConvertDataType(tensorflow::DataType dtype) {
   else if (dtype == DT_STRING)
     return ArrayDataType::kString;
   else
-    LOG(INFO) << "Unsupported data type in placeholder op: " << dtype;
+    LOG(INFO) << "Unsupported data type in placeholder op: " << dtype;
   return ArrayDataType::kNone;
 }
 
diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc
index de76fd4032..1b21c8bc60 100644
--- a/tensorflow/contrib/lite/toco/toco_port.cc
+++ b/tensorflow/contrib/lite/toco/toco_port.cc
@@ -20,12 +20,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
-namespace std {
-double round(double x) { return ::round(x); }
-}  // namespace std
-#endif
-
 namespace toco {
 namespace port {
 void CopyToBuffer(const string& src, char* dest) {
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
index 17f82b9dd7..5c019cb2bf 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -34,24 +34,6 @@ limitations under the License.
 #define TFLITE_PROTO_NS google::protobuf
 #endif
 
-#ifdef __ANDROID__
-#include <sstream>
-namespace std {
-
-template <typename T>
-std::string to_string(T value)
-{
-  std::ostringstream os ;
-  os << value ;
-  return os.str() ;
-}
-
-#ifdef __ARM_ARCH_7A__
-double round(double x);
-#endif
-}
-#endif
-
 namespace toco {
 namespace port {
 
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index a28fc3a87f..e8c6edd7ba 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -270,7 +270,7 @@ for arch in $archs; do
                 PLATFORM_LDFLAGS=-pthread
                 MKDEP=${CC} -M -std=c++11
                 PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \
-                           ../../platform/posix/src/per_thread_waiter.c \
+                           ../../platform/c++11/src/per_thread_waiter.cc \
                            ../../platform/c++11/src/yield.cc \
                            ../../platform/c++11/src/time_rep_timespec.cc \
                            ../../platform/c++11/src/nsync_panic.cc
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 48953e2e38..eff9081e35 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -27,7 +27,9 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index a6be2084aa..2ed99d50a4 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -2503,7 +2503,7 @@ def _compute_recall_at_precision(tp, fp, fn, precision, name):
     name: An optional variable_scope name.
 
   Returns:
-    The recall at a given `precision`.
+    The recall at the given `precision`.
  """
  precisions = math_ops.div(tp, tp + fp + _EPSILON)
  tf_index = math_ops.argmin(
diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.h b/tensorflow/contrib/mpi_collectives/kernels/ring.h
index c001615d3f..1d56d588bc 100644
--- a/tensorflow/contrib/mpi_collectives/kernels/ring.h
+++ b/tensorflow/contrib/mpi_collectives/kernels/ring.h
@@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI();
 * has the fully accumulated Segment 1; and so on. The scatter-reduce is
 * complete.
 *
- * Next, the allgather distributes these fully accumulated chunks across all
+ * Next, the allgather distributes these fully accumulated chunks across all
 * nodes. Communication proceeds in the same ring, once again in N-1 steps. At
 * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i).
 * For example, at the first iteration, the following transfers will occur:
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index 915e6504e1..21bf3f5313 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -224,10 +224,8 @@ class AdaMaxOptimizerTest(test.TestCase):
         var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
         # Validate updated params
-        self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0),
-                                           rtol=1e-2)
-        self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1),
-                                           rtol=1e-2)
+        self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+        self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
         if use_resource:
           self.assertEqual("var0_%d/AdaMax:0" % (i,),
                            opt.get_slot(var=var0, name="m").name)
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
index b6b10e500b..a7c97a1da2 100644
--- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
@@ -62,7 +62,7 @@ class ModelAverageCustomGetter(object):
   """
 
   def __init__(self, worker_device):
-    """Create a new `ModelAverageCustomGetter`.
+    """Create a new `ModelAverageCustomGetter`.
 
     Args:
       worker_device: String.  Name of the `worker` job.
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD index aad1ca04c5..6ca7fe8b6e 100644 --- a/tensorflow/contrib/periodic_resample/BUILD +++ b/tensorflow/contrib/periodic_resample/BUILD @@ -6,13 +6,12 @@ exports_files(["LICENSE"]) load( "//tensorflow:tensorflow.bzl", - "tf_cc_test", + "py_test", "tf_gen_op_libs", "tf_custom_op_library", "tf_custom_op_py_library", "tf_gen_op_wrapper_py", ) -load("//tensorflow:tensorflow.bzl", "py_test") cc_library( name = "all_ops", @@ -85,23 +84,6 @@ py_test( ":init_py", "//tensorflow/contrib/util:util_py", "//tensorflow/python:framework_test_lib", - "//tensorflow/python:gradient_checker", - ], -) - -tf_cc_test( - name = "periodic_resample_op_cc_test", - size = "small", - srcs = [ - "ops/array_ops_test.cc", - ], - deps = [ - ":all_ops", - "//tensorflow/core:framework", - "//tensorflow/core:protos_all_proto", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", ], ) diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc index 514689cf45..e18923c8aa 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc @@ -22,9 +22,4 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU), PeriodicResampleOp); - -REGISTER_KERNEL_BUILDER(Name("PeriodicResampleOpGrad") - .Device(DEVICE_CPU), - PeriodicResampleOpGrad); - } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h index 42fba81a5c..3ab588c458 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h @@ -25,202 +25,92 @@ #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/util/work_sharder.h" namespace { -// Computes input tensor index for given output index during forward -// propagation through periodic_resample operation. -class InputIndexer { - public: - InputIndexer(const std::vector& output_dimensions, - const tensorflow::TensorShape& input_shape, - int adjustable_dimension) - : output_dimensions_(output_dimensions), - adjustable_dimension_(adjustable_dimension), - rank_(input_shape.dims()), - linear_output_index_(0), - linear_input_index_(0), - adjustable_dimension_carriage_sum_(0) { - auto input_dimensions = TensorShapeToVector(input_shape); - // factors by which input_dimensions increases/decreases w.r.t. 
-  // output_dimensions
-  dimension_ceiling_ =
-      ComputeDimensionCeiling(output_dimensions, input_dimensions);
-  cumulative_dimensions_ = ComputeCumulativeDimensions();
-
-  output_indices_.resize(output_dimensions_.size());
-  input_indices_.resize(output_dimensions_.size());
-
-  // Compute index_factors
-  index_factors_.resize(rank_);
-  tensorflow::int64 last_index_factor = 1;
-  for (auto r = rank_ - 1; r >= 0; --r) {
-    index_factors_[r] = last_index_factor;
-    last_index_factor *= input_dimensions[r];
-  }
-  }
-
-  tensorflow::int64 linear_input_index() const { return linear_input_index_; }
-
-  void MoveToOutputIndex(tensorflow::int64 output_index);
-  void IncrementOutputIndex();
-
- private:
-  void RecomputeInputAdjustableDimensionIndex() {
-    tensorflow::int64 index = adjustable_dimension_carriage_sum_;
-    index *= output_dimensions_[adjustable_dimension_];
-    index += output_indices_[adjustable_dimension_];
-    input_indices_[adjustable_dimension_] = index;
-  }
-
-  std::vector<tensorflow::int64> TensorShapeToVector(
-      const tensorflow::TensorShape& tensor_shape);
-
-  std::vector<tensorflow::int64> ComputeDimensionCeiling(
-      const std::vector<tensorflow::int64>& output_dimensions,
-      const std::vector<tensorflow::int64>& input_dimensions);
-
-  std::vector<tensorflow::int64> ComputeCumulativeDimensions();
-
-  const std::vector<tensorflow::int64> output_dimensions_;
-  std::vector<tensorflow::int64> dimension_ceiling_;
-  std::vector<tensorflow::int64> index_factors_;
-  std::vector<tensorflow::int64> cumulative_dimensions_;
-  std::vector<tensorflow::int64> output_indices_;
-  std::vector<tensorflow::int64> input_indices_;
-
-  const int adjustable_dimension_;
-  const int rank_;
-  tensorflow::int64 linear_output_index_;
-  tensorflow::int64 linear_input_index_;
-  tensorflow::int64 adjustable_dimension_carriage_sum_;
-};
-
-void InputIndexer::MoveToOutputIndex(tensorflow::int64 output_index) {
-  linear_output_index_ = output_index;
-  linear_input_index_ = 0;
+template <class IndexVecT, class IndexT>
+IndexT compute_input_index(
+    IndexVecT* target_dimensions, const IndexT& output_index,
+    const IndexVecT& original_dimensions, const int& adjustable_dimension,
+    const std::vector<tensorflow::int64>& dimension_ceiling,
+    const std::vector<tensorflow::int64>& cumulative_dimensions, IndexT* result,
+    std::vector<IndexT>* output_indices, const int& rank) {
+  *result = 0;
+  output_indices->clear();
   // un-rasterize the output index
   auto last_reduced_i = output_index;
-  for (auto r = rank_ - 1; r >= 0; --r) {
-    output_indices_[r] = last_reduced_i % output_dimensions_[r];
+  for (auto r = rank - 1; r >= 0; --r) {
+    (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r];
     last_reduced_i =
-        (last_reduced_i - output_indices_[r]) / output_dimensions_[r];
+        (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r];
   }
-  tensorflow::int64 carriage_sum = 0;
-  for (int qi = 0; qi < rank_; ++qi) {
-    if (qi == adjustable_dimension_) continue;
-    carriage_sum += cumulative_dimensions_[qi] *
-                    (output_indices_[qi] % dimension_ceiling_[qi]);
-  }
-  adjustable_dimension_carriage_sum_ = carriage_sum;
-  // rasterize the input index
-  for (auto r = rank_ - 1; r >= 0; --r) {
-    if (r != adjustable_dimension_) {
-      input_indices_[r] = output_indices_[r] / dimension_ceiling_[r];
-    } else {
-      RecomputeInputAdjustableDimensionIndex();
-    }
-  }
-  for (auto r = rank_ - 1; r >= 0; --r) {
-    linear_input_index_ += index_factors_[r] * input_indices_[r];
-  }
-}
-
-void InputIndexer::IncrementOutputIndex() {
-  linear_output_index_++;
-  for (auto r = rank_ - 1; r >= 0; --r) {
-    auto old_carriage_sum_increment =
-        cumulative_dimensions_[r] *
-        (output_indices_[r] % dimension_ceiling_[r]);
-    output_indices_[r] = (output_indices_[r] + 1) % output_dimensions_[r];
-    if (r != adjustable_dimension_) {
-      auto new_input_index = output_indices_[r] / dimension_ceiling_[r];
-      linear_input_index_ +=
-          (new_input_index - input_indices_[r]) * index_factors_[r];
-
-      input_indices_[r] = new_input_index;
-
-      auto new_carriage_sum_increment =
-          cumulative_dimensions_[r] *
-          (output_indices_[r] % dimension_ceiling_[r]);
-
-      adjustable_dimension_carriage_sum_ = adjustable_dimension_carriage_sum_ -
-                                           old_carriage_sum_increment +
-                                           new_carriage_sum_increment;
-    }
-
-    if (output_indices_[r] != 0) {
-      // No more carries to higher indices.
-      break;
+  IndexT last_index_factor = 1;
+  for (auto r = rank - 1; r >= 0; --r) {
+    IndexT index = 0;
+    if (r != adjustable_dimension)
+      index = (*output_indices)[r] / dimension_ceiling[r];
+    else {
+      for (int qi = 0; qi < rank; ++qi) {
+        if (qi == adjustable_dimension) continue;
+        index += cumulative_dimensions[qi] *
+                 ((*output_indices)[qi] % dimension_ceiling[qi]);
+      }
+      index *= (*target_dimensions)[adjustable_dimension];
+      index += (*output_indices)[r];
     }
+    *result += last_index_factor * index;
+    last_index_factor *= original_dimensions[r];
   }
-  auto old_adjustable_dimension_input_index =
-      input_indices_[adjustable_dimension_];
-  RecomputeInputAdjustableDimensionIndex();
-  linear_input_index_ += (input_indices_[adjustable_dimension_] -
-                          old_adjustable_dimension_input_index) *
-                         index_factors_[adjustable_dimension_];
-}
-std::vector<tensorflow::int64> InputIndexer::TensorShapeToVector(
-    const tensorflow::TensorShape& tensor_shape) {
-  std::vector<tensorflow::int64> result(tensor_shape.dims());
-  int count = 0;
-  for (const auto dim_info : tensor_shape) {
-    result[count] = dim_info.size;
-    ++count;
-  }
-  return result;
+  return *result;
 }

-std::vector<tensorflow::int64> InputIndexer::ComputeDimensionCeiling(
-    const std::vector<tensorflow::int64>& output_dimensions,
-    const std::vector<tensorflow::int64>& input_dimensions) {
-  std::vector<tensorflow::int64> dimension_ceiling(input_dimensions.size());
-  for (size_t i = 0; i < input_dimensions.size(); ++i) {
-    dimension_ceiling[i] = (output_dimensions[i] + input_dimensions[i] - 1) /
-                           input_dimensions[i];
-  }
-  return dimension_ceiling;
-}
+template <class InputDataT,
+          class IndexVecT>  // both types are needed here b/c IndexVecT and
+                            // InputDataT are not related
+    void
+    fill_periodic_tensor(
+        tensorflow::OpKernelContext* context,
+        const IndexVecT& desired_shape,
+        const tensorflow::Tensor& input_tensor) {
+  // input is a strided array (last index is fastest, C-ordered)
+  auto input = input_tensor.flat<InputDataT>();
+  const int rank = input_tensor.dims();
+  // original and target dimensions
+  std::vector<tensorflow::int64> original_dimensions(rank),
+      target_dimensions(rank);
+  tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1);
+  // factors by which original_dimensions increases/decreases w.r.t.
+  // target_dimensions
+  std::vector<tensorflow::int64> dimension_ceiling(rank),
+      cumulative_dimensions(rank);
+  // index of adjustable dimension
+  int adjustable_dimension;
+  tensorflow::TensorShape output_shape;

-std::vector<tensorflow::int64> InputIndexer::ComputeCumulativeDimensions() {
-  std::vector<tensorflow::int64> cumulative_dimensions(rank_);
-  int count = 0;
-  for (int i = 0; i < rank_; ++i) {
-    if (count == 0) {
-      cumulative_dimensions[count] = 1;
-    } else {
-      cumulative_dimensions[count] =
-          cumulative_dimensions[count - 1] * dimension_ceiling_[count - 1];
-    }
-    ++count;
-  }
-  return cumulative_dimensions;
-}
+  // requires that the rank of the input tensor and length of the desired shape
+  // are equal
+  OP_REQUIRES(context, rank == desired_shape.size(),
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample expects the rank of the input tensor, ",
+                  rank, ", to be the same as the length of the desired shape, ",
+                  desired_shape.size(), "."));

-template <class IndexVecT>
-void process_desired_shape(tensorflow::OpKernelContext* context,
-                           const tensorflow::TensorShape& input_tensor_shape,
-                           const IndexVecT& desired_shape,
-                           int* adjustable_dimension,
-                           std::vector<tensorflow::int64>* target_dimensions,
-                           tensorflow::int64* output_size) {
-  tensorflow::int64 new_sliced_size = 1;
   bool found = false;
-  const int rank = input_tensor_shape.dims();
+  const auto& input_tensor_shape = input_tensor.shape();
+
+  for (int i = 0; i < rank; ++i) {
+    // if (desired_shape(i) < 1) {
     if (desired_shape[i] < 1) {
       // only one index can be adjustable
       OP_REQUIRES(context, !found,
                   tensorflow::errors::InvalidArgument(
                       "periodic_resample expects only "
                       "one index to be marked as adjustable."));
-      *adjustable_dimension = i;
+      adjustable_dimension = i;
       found = true;
     } else {
       OP_REQUIRES(
@@ -232,8 +122,9 @@ void process_desired_shape(tensorflow::OpKernelContext* context,
              i, " input tensor has size ", input_tensor_shape.dim_size(i),
              ", desired shape has size ", desired_shape[i], "."));
-      (*target_dimensions)[i] = desired_shape[i];
-      new_sliced_size *= (*target_dimensions)[i];
+      // target_dimensions[i] = desired_shape(i);
+      target_dimensions[i] = desired_shape[i];
+      new_sliced_size *= target_dimensions[i];
     }
   }
   // at least one index needs to be adjustable
@@ -241,50 +132,26 @@ void process_desired_shape(tensorflow::OpKernelContext* context,
              tensorflow::errors::InvalidArgument(
                  "periodic_resample expects at least "
                  "one index to be marked as adjustable."));
-  (*target_dimensions)[*adjustable_dimension] =
-      input_tensor_shape.num_elements() / new_sliced_size;
-
-  *output_size = new_sliced_size * (*target_dimensions)[*adjustable_dimension];
-}
-
-// Heuristic number based on measurements on
-// Intel(R) Core(TM) i7-4930K CPU @ 3.40GHz
-const tensorflow::int64 costPerFillIndex = 35;
-enum class Mode {
-  kForward,
-  kGradient
-};
-
-// Computes either periodic_resample operation output or gradients for it,
-// depending on |mode|.
-// |original_shape| is always shape of input to periodic_resample operation.
-// |source_tensor| is either source for periodic_resample (for forward mode)
-// or gradients tensor.
-// |desired_shape| is always shape, provided by user, to which forward
-// propagation attempts resample input tensor.
-template <class InputDataT, Mode mode>
-void
-do_periodic_resample_op(tensorflow::OpKernelContext* context,
-                        const tensorflow::TensorShape& original_shape,
-                        const tensorflow::PartialTensorShape& desired_shape,
-                        const tensorflow::Tensor& source_tensor) {
-  const int rank = source_tensor.dims();
+  int count = 0;
+  for (const auto dim_info : input_tensor.shape()) {
+    original_dimensions[count] = dim_info.size;
+    ++count;
+  }

-  // requires that the rank of the input tensor and length of the desired shape
-  // are equal
-  OP_REQUIRES(context, rank == desired_shape.dims(),
-              tensorflow::errors::InvalidArgument(
-                  "periodic_resample expects the rank of the input tensor, ",
-                  rank, ", to be the same as the length of the desired shape, ",
-                  desired_shape.dims(), "."));
+  target_dimensions[adjustable_dimension] = total_size / new_sliced_size;

-  std::vector<tensorflow::int64> target_dimensions(rank);
-  tensorflow::int64 new_size = 0;
-  // index of adjustable dimension
-  int adjustable_dimension = 0;
-  process_desired_shape(context, original_shape, desired_shape.dim_sizes(),
-                        &adjustable_dimension, &target_dimensions, &new_size);
+  count = 0;
+  for (int i = 0; i < input_tensor.shape().dims(); ++i) {
+    dimension_ceiling[count] = tensorflow::int64(std::ceil(
+        float(target_dimensions[count]) / float(original_dimensions[count])));
+    if (count == 0)
+      cumulative_dimensions[count] = 1;
+    else
+      cumulative_dimensions[count] =
+          cumulative_dimensions[count - 1] * dimension_ceiling[count - 1];
+    ++count;
+  }

   // ensure that the new dimension is greater than zero
   OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0,
@@ -293,14 +160,11 @@ do_periodic_resample_op(tensorflow::OpKernelContext* context,
                  "adjustable dimension, ", adjustable_dimension,
                  ", isn't greater than zero, ",
                  target_dimensions[adjustable_dimension], "."));
-  tensorflow::TensorShape output_shape;
-
-  if (mode == Mode::kForward) {
-    for (int i = 0; i < rank; ++i) {
-      output_shape.AddDim(target_dimensions[i]);
-    }
-  } else {
-    output_shape = original_shape;
+  for (int i = 0; i < rank; ++i) {
+    output_shape.AddDim(target_dimensions[i]);
   }
+  const auto new_size =
+      new_sliced_size * target_dimensions[adjustable_dimension];

   // Create an output tensor and attach it to the current context
   tensorflow::Tensor* output_tensor = nullptr;
@@ -308,73 +172,47 @@ do_periodic_resample_op(tensorflow::OpKernelContext* context,
                  context->allocate_output(0, output_shape, &output_tensor));
   auto output = output_tensor->flat<InputDataT>();

-  // input is a strided array (last index is fastest, C-ordered)
-  auto input = source_tensor.flat<InputDataT>();
+  // memory is allocated for these variables outside the inner loop for
+  // efficiency (although, I could create a separate class scope for
+  // this purpose instead)
+  tensorflow::int64 result = 0;
+  std::vector<tensorflow::int64> output_indices(target_dimensions.size());

   // Fill output tensor with periodically resampled input tensor values
-  InputIndexer input_indexer(target_dimensions, original_shape,
-                             adjustable_dimension);
-
-  auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-  auto fill_output_tensor = [&input_indexer, &output, &input](
-      tensorflow::int64 start, tensorflow::int64 limit) {
-    InputIndexer local_indexer(input_indexer);
-    local_indexer.MoveToOutputIndex(start);
-    for (tensorflow::int64 output_index = start; output_index < limit;
-         ++output_index) {
-      if (mode == Mode::kForward) {
-        output(output_index) = input(local_indexer.linear_input_index());
-      } else {
-        output(local_indexer.linear_input_index()) = input(output_index);
-      }
-      local_indexer.IncrementOutputIndex();
-    }
-  };
-  ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
-                      new_size, costPerFillIndex, fill_output_tensor);
-}
-
-#define DATA_TYPE_SWITCH(data_type, context, CASE)                 \
-  switch (data_type) {                                             \
-    CASE(float)                                                    \
-    CASE(double)                                                   \
-    CASE(tensorflow::int32)                                        \
-    CASE(tensorflow::int64)                                        \
-    default:                                                       \
-      context->CtxFailure(__FILE__, __LINE__,                      \
-                          tensorflow::errors::InvalidArgument(     \
-                              "Unsuppored tensor elements type")); \
-      break;                                                       \
+  for (tensorflow::int64 output_index = 0; output_index < new_size;
+       ++output_index) {
+    output(output_index) = input(compute_input_index(
+        &target_dimensions, output_index, original_dimensions,
+        adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result,
+        &output_indices, rank));
   }
+}

 void create_output_tensor(
     tensorflow::OpKernelContext* context,
     const tensorflow::Tensor& input_tensor,
     const tensorflow::DataType& input_tensor_type,
-    const tensorflow::PartialTensorShape& desired_shape) {
-#define CASE(type)                                                   \
-  case tensorflow::DataTypeToEnum<type>::value:                      \
-    do_periodic_resample_op<type, Mode::kForward>(                   \
-        context, input_tensor.shape(), desired_shape, input_tensor); \
-    break;
+    const tensorflow::PartialTensorShape& desired_shape_tensor) {
+  auto desired_shape = desired_shape_tensor.dim_sizes();

-  DATA_TYPE_SWITCH(input_tensor_type, context, CASE);
-#undef CASE
-}
-
-void create_grad_tensor(tensorflow::OpKernelContext* context,
-                        const tensorflow::Tensor& grad_tensor,
-                        const tensorflow::DataType& grad_tensor_type,
-                        const tensorflow::TensorShape& original_shape,
-                        const tensorflow::PartialTensorShape& desired_shape) {
-#define CASE(type)                                                  \
-  case tensorflow::DataTypeToEnum<type>::value:                     \
-    do_periodic_resample_op<type, Mode::kGradient>(                 \
-        context, original_shape, desired_shape, grad_tensor);       \
+  // obligatory type switch
+  switch (input_tensor_type) {
+    case tensorflow::DataTypeToEnum<float>::value:
+      fill_periodic_tensor<float>(context, desired_shape, input_tensor);
       break;
-
-  DATA_TYPE_SWITCH(grad_tensor_type, context, CASE);
-#undef CASE
+    case tensorflow::DataTypeToEnum<double>::value:
+      fill_periodic_tensor<double>(context, desired_shape, input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<tensorflow::int32>::value:
+      fill_periodic_tensor<tensorflow::int32>(context, desired_shape,
+                                              input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<tensorflow::int64>::value:
+      fill_periodic_tensor<tensorflow::int64>(context, desired_shape,
+                                              input_tensor);
+      break;
+    default:;
+  }
 }

 }  // namespace

@@ -400,25 +238,4 @@ class PeriodicResampleOp : public tensorflow::OpKernel {
   tensorflow::PartialTensorShape desired_shape;
 };

-class PeriodicResampleOpGrad : public tensorflow::OpKernel {
- public:
-  explicit PeriodicResampleOpGrad(tensorflow::OpKernelConstruction* context)
-      : tensorflow::OpKernel(context) {
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("original_shape", &original_shape));
-    OP_REQUIRES_OK(context, context->GetAttr("desired_shape", &desired_shape));
-  }
-
-  void Compute(tensorflow::OpKernelContext* context) override {
-    const tensorflow::Tensor& grad_tensor = context->input(0);
-    const tensorflow::DataType grad_tensor_type = context->input_dtype(0);
-    create_grad_tensor(context, grad_tensor, grad_tensor_type, original_shape,
-                       desired_shape);
-  }
-
- private:
-  tensorflow::TensorShape original_shape;
-  tensorflow::PartialTensorShape desired_shape;
-};
-
 #endif  // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
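For readers tracing the restored kernel: fill_periodic_tensor walks every linear output index and asks compute_input_index which input element it samples. The following is a minimal, illustrative Python sketch of that arithmetic; it is not part of the patch, and the shapes are borrowed from the existing unit test (input `[2, 2, 4]` resampled to desired shape `[4, 4, None]`):

```python
import math

def compute_input_index(output_index, target, original, adjustable,
                        dim_ceiling, cum_dims):
    """Maps a linear output index to the linear input index it samples
    (C order, last index fastest), mirroring the C++ helper above."""
    rank = len(target)
    # un-rasterize the output index over the target shape
    out = [0] * rank
    rem = output_index
    for r in range(rank - 1, -1, -1):
        out[r] = rem % target[r]
        rem //= target[r]
    # re-rasterize into the original (input) shape
    result, factor = 0, 1
    for r in range(rank - 1, -1, -1):
        if r != adjustable:
            index = out[r] // dim_ceiling[r]
        else:
            index = sum(cum_dims[q] * (out[q] % dim_ceiling[q])
                        for q in range(rank) if q != adjustable)
            index = index * target[adjustable] + out[r]
        result += factor * index
        factor *= original[r]
    return result

original = [2, 2, 4]                  # input tensor shape
target = [4, 4, 1]                    # [4, 4, None] with dimension 2 inferred
adjustable = 2
dim_ceiling = [math.ceil(t / o) for t, o in zip(target, original)]  # [2, 2, 1]
cum_dims = [1, 2, 4]                  # running product of dim_ceiling
print([compute_input_index(i, target, original, adjustable,
                           dim_ceiling, cum_dims) for i in range(4)])
# -> [0, 2, 4, 6]: the first output row gathers every second input element.
```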
diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
index fd38cd09b4..82bd796956 100644
--- a/tensorflow/contrib/periodic_resample/ops/array_ops.cc
+++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
@@ -26,42 +26,7 @@ REGISTER_OP("PeriodicResample")
     .Input("values: T")
     .Attr("shape: shape")
     .Output("output: T")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      tensorflow::PartialTensorShape desired_shape;
-      TF_RETURN_IF_ERROR(c->GetAttr("shape", &desired_shape));
-      shape_inference::ShapeHandle input_tensor_shape = c->input(0);
-      shape_inference::DimensionHandle num_input_elements =
-          c->NumElements(input_tensor_shape);
-      shape_inference::ShapeHandle result_shape_handle;
-      if (!shape_inference::InferenceContext::ValueKnown(num_input_elements)) {
-        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
-            desired_shape, &result_shape_handle));
-      } else {
-        const int rank = c->Rank(input_tensor_shape);
-        std::vector<tensorflow::int64> target_dimensions(rank);
-        tensorflow::int64 new_sliced_size = 1;
-        int adjustable_dimension = 0;
-        for (int i = 0; i < rank; ++i) {
-          if (desired_shape.dim_size(i) < 1) {
-            adjustable_dimension = i;
-          } else {
-            target_dimensions[i] = desired_shape.dim_size(i);
-            new_sliced_size *= target_dimensions[i];
-          }
-        }
-        target_dimensions[adjustable_dimension] =
-            shape_inference::InferenceContext::Value(
-                num_input_elements) / new_sliced_size;
-        tensorflow::TensorShape result_shape;
-        for (int i = 0; i < rank; ++i) {
-          result_shape.AddDim(target_dimensions[i]);
-        }
-        TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(
-            result_shape, &result_shape_handle));
-      }
-      c->set_output(0, result_shape_handle);
-      return Status::OK();
-    })
+    .SetShapeFn(shape_inference::ExplicitShape)
     .Doc(R"doc(
 Periodically resample elements of a tensor to conform to `shape`.
@@ -136,20 +101,4 @@ output: Periodically resampled tensor that has dimensions specified as in
 )doc");
-
-REGISTER_OP("PeriodicResampleOpGrad")
-    .Attr("T: numbertype")
-    .Input("grad: T")
-    .Attr("original_shape: shape")
-    .Attr("desired_shape: shape")
-    .Output("grad_values: T")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      tensorflow::TensorShape original_shape;
-      TF_RETURN_IF_ERROR(c->GetAttr("original_shape", &original_shape));
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(original_shape, &s));
-      c->set_output(0, s);
-      return Status::OK();
-});
-
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc
deleted file mode 100644
index 43b7c1799f..0000000000
--- a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/ - -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/shape_inference_testutil.h" -#include "tensorflow/core/framework/tensor_shape.pb.h" -#include "tensorflow/core/framework/tensor_testutil.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { - -TEST(ArrayOpsTest, PeriodicResample_ShapeFn) { - ShapeInferenceTestOp op("PeriodicResample"); - // Case 1: output shape can be fully inferreed. - PartialTensorShape shape({4, 4, -1}); - TensorShapeProto shape_proto; - shape.AsProto(&shape_proto); - - TF_ASSERT_OK(NodeDefBuilder("test", "PeriodicResample") - .Input({"values", 0, DT_INT32}) - .Attr("shape", shape_proto) - .Finalize(&op.node_def)); - INFER_OK(op, "[2,2,4]", "[4,4,1]"); - // Case 2: output shape can not be inferred - report desired shape. - INFER_OK(op, "[2,2,?]", "[4,4,?]"); -} - -} // end namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py index 31a6fe1d94..a25de55e18 100644 --- a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py +++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py @@ -21,11 +21,8 @@ from __future__ import print_function import numpy from tensorflow.contrib.periodic_resample import periodic_resample -from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import test_util -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -96,6 +93,7 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): def testPeriodicResampleErrors(self): input_tensor = numpy.zeros(shape=[1, 2, 2, 4]) with self.test_session(): + variables.global_variables_initializer().run() with self.assertRaisesWithPredicateMatch( errors_impl.InvalidArgumentError, 'Dimension 3 input tensor has size 4, desired shape has size 1'): @@ -105,29 +103,6 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): '4, to be the same as the length of the desired shape, 3'): periodic_resample(input_tensor, [None, 4, 4]).eval() - def testPeriodicResampleGradient(self): - desired_shape = numpy.array([4, 4, None]) - result_shape = (4, 4, 1) - input_shape = (2, 2, 4) - with self.test_session() as sess: - x = array_ops.placeholder(dtypes.float32, shape=input_shape) - output = periodic_resample(x, desired_shape) - error = gradient_checker.compute_gradient_error( - x, input_shape, output, result_shape) - self.assertLess(error, 1e-4) - - def testPeriodicResampleShapeInference(self): - with self.test_session() as sess: - # Case 1: output shape can be fully inferreed. - x = array_ops.placeholder(dtypes.float32, shape=(2, 2, 4)) - output = periodic_resample(x, [4, 4, None]) - self.assertEqual(output.shape, [4, 4, 1]) - # Case 2: output shape can not be inferred - report desired shape. 
- x = array_ops.placeholder(dtypes.float32, shape=(2, 2, None)) - output = periodic_resample(x, [4, 4, None]) - self.assertTrue(output.shape.is_compatible_with([4, 4, None])) - self.assertEqual(output.shape[2].value, None) - if __name__ == '__main__': googletest.main() diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py index 470e300ccb..348623d8f8 100644 --- a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py +++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py @@ -21,17 +21,11 @@ from __future__ import print_function # pylint: disable=unused-import from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op -from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample, periodic_resample_op_grad +from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample from tensorflow.contrib.util import loader -from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader # pylint: enable=unused-import _periodic_resample_op = loader.load_op_library( resource_loader.get_path_to_datafile('_periodic_resample_op.so')) - -@ops.RegisterGradient("PeriodicResample") -def _periodic_resample_grad_cc(op, grad): - return periodic_resample_op_grad( - grad, op.inputs[0].shape, op.get_attr('shape')) diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor.py b/tensorflow/contrib/predictor/contrib_estimator_predictor.py index af3b2ad1b5..b7a98c68e2 100644 --- a/tensorflow/contrib/predictor/contrib_estimator_predictor.py +++ b/tensorflow/contrib/predictor/contrib_estimator_predictor.py @@ -34,8 +34,7 @@ class ContribEstimatorPredictor(predictor.Predictor): prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None, - config=None): + graph=None): """Initialize a `ContribEstimatorPredictor`. Args: @@ -49,7 +48,6 @@ class ContribEstimatorPredictor(predictor.Predictor): multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. """ self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -60,7 +58,6 @@ class ContribEstimatorPredictor(predictor.Predictor): checkpoint_path = saver.latest_checkpoint(estimator.model_dir) self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( - config=config, checkpoint_filename_with_path=checkpoint_path)) input_alternative_key = ( diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py index a725072e72..d78d94c269 100644 --- a/tensorflow/contrib/predictor/core_estimator_predictor.py +++ b/tensorflow/contrib/predictor/core_estimator_predictor.py @@ -51,8 +51,7 @@ class CoreEstimatorPredictor(predictor.Predictor): estimator, serving_input_receiver_fn, output_key=None, - graph=None, - config=None): + graph=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -63,7 +62,6 @@ class CoreEstimatorPredictor(predictor.Predictor): `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. 
""" self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -73,7 +71,6 @@ class CoreEstimatorPredictor(predictor.Predictor): checkpoint_dir = estimator.model_dir self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( - config=config, checkpoint_dir=checkpoint_dir)) feed_tensor_info = signature_def.inputs diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py index f275bc15ad..6e77e934fe 100644 --- a/tensorflow/contrib/predictor/predictor_factories.py +++ b/tensorflow/contrib/predictor/predictor_factories.py @@ -30,8 +30,7 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None, - config=None): + graph=None): """Constructs a `Predictor` from a `tf.contrib.learn.Estimator`. Args: @@ -45,7 +44,6 @@ def from_contrib_estimator(estimator, multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -64,15 +62,13 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=input_alternative_key, output_alternative_key=output_alternative_key, - graph=graph, - config=config) + graph=graph) def from_estimator(estimator, serving_input_receiver_fn, output_key=None, - graph=None, - config=None): + graph=None): """Constructs a `Predictor` from a `tf.python.estimator.Estimator`. Args: @@ -83,7 +79,6 @@ def from_estimator(estimator, `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -98,19 +93,14 @@ def from_estimator(estimator, 'tf.contrib.learn.Estimator. You likely want to call ' 'from_contrib_estimator.') return core_estimator_predictor.CoreEstimatorPredictor( - estimator, - serving_input_receiver_fn, - output_key=output_key, - graph=graph, - config=config) + estimator, serving_input_receiver_fn, output_key=output_key, graph=graph) def from_saved_model(export_dir, signature_def_key=None, signature_def=None, tags=None, - graph=None, - config=None): + graph=None): """Constructs a `Predictor` from a `SavedModel` on disk. Args: @@ -125,7 +115,6 @@ def from_saved_model(export_dir, `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. 
@@ -139,5 +128,4 @@ def from_saved_model(export_dir, signature_def_key=signature_def_key, signature_def=signature_def, tags=tags, - graph=graph, - config=config) + graph=graph) diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py index a2ef1dc3af..578d9424b2 100644 --- a/tensorflow/contrib/predictor/predictor_factories_test.py +++ b/tensorflow/contrib/predictor/predictor_factories_test.py @@ -20,7 +20,6 @@ from __future__ import print_function from tensorflow.contrib.predictor import predictor_factories from tensorflow.contrib.predictor import testing_common -from tensorflow.core.protobuf import config_pb2 from tensorflow.python.platform import test MODEL_DIR_NAME = 'contrib/predictor/test_export_dir' @@ -42,11 +41,6 @@ class PredictorFactoriesTest(test.TestCase): """Test loading from_saved_model with tags.""" predictor_factories.from_saved_model(self._export_dir, tags='serve') - def testFromSavedModelWithSessionConfig(self): - """Test loading from_saved_model with session config.""" - predictor_factories.from_saved_model( - self._export_dir, config=config_pb2.ConfigProto()) - def testFromSavedModelWithBadTags(self): """Test that loading fails for bad tags.""" bad_tags_regex = ('.*? could not be found in SavedModel') @@ -59,13 +53,6 @@ class PredictorFactoriesTest(test.TestCase): predictor_factories.from_contrib_estimator( estimator, input_fn, output_alternative_key='sum') - def testFromContribEstimatorWithSessionConfig(self): - estimator = testing_common.get_arithmetic_estimator(core=False) - input_fn = testing_common.get_arithmetic_input_fn(core=False) - predictor_factories.from_contrib_estimator( - estimator, input_fn, output_alternative_key='sum', - config=config_pb2.ConfigProto()) - def testFromContribEstimatorWithCoreEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=True) input_fn = testing_common.get_arithmetic_input_fn(core=True) @@ -77,12 +64,6 @@ class PredictorFactoriesTest(test.TestCase): input_fn = testing_common.get_arithmetic_input_fn(core=True) predictor_factories.from_estimator(estimator, input_fn) - def testFromCoreEstimatorWithSessionConfig(self): - estimator = testing_common.get_arithmetic_estimator(core=True) - input_fn = testing_common.get_arithmetic_input_fn(core=True) - predictor_factories.from_estimator( - estimator, input_fn, config=config_pb2.ConfigProto()) - def testFromCoreEstimatorWithContribEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=False) input_fn = testing_common.get_arithmetic_input_fn(core=False) diff --git a/tensorflow/contrib/predictor/saved_model_predictor.py b/tensorflow/contrib/predictor/saved_model_predictor.py index 95da6d04ed..0dbca0f813 100644 --- a/tensorflow/contrib/predictor/saved_model_predictor.py +++ b/tensorflow/contrib/predictor/saved_model_predictor.py @@ -121,8 +121,7 @@ class SavedModelPredictor(predictor.Predictor): input_names=None, output_names=None, tags=None, - graph=None, - config=None): + graph=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -143,7 +142,6 @@ class SavedModelPredictor(predictor.Predictor): the correct `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. - config: `ConfigProto` proto used to configure the session. Raises: ValueError: If more than one of signature_def_key OR signature_def OR (input_names AND output_names) is specified. 
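For context, a minimal usage sketch of the factory after this change; the export directory and feed keys are illustrative, and any session configuration now has to happen outside these factories:

```python
from tensorflow.contrib.predictor import predictor_factories

# `config=` is no longer accepted by the factory functions.
predictor = predictor_factories.from_saved_model('/tmp/export_dir',
                                                 tags='serve')
predictions = predictor({'x': [[1.0, 2.0]]})  # keys follow the SignatureDef
```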
@@ -154,7 +152,7 @@ class SavedModelPredictor(predictor.Predictor): self._graph = graph or ops.Graph() with self._graph.as_default(): - self._session = session.Session(config=config) + self._session = session.Session() loader.load(self._session, tags.split(','), export_dir) if input_names is None: diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md index 27a933c0f9..c83623ec94 100644 --- a/tensorflow/contrib/quantize/README.md +++ b/tensorflow/contrib/quantize/README.md @@ -6,7 +6,7 @@ inference. The details of the transformation implemented in this package is described here [1]. This is done using the -[fake quantization op](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization). +[fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization). Literature has shown that fixed point networks provide comparable performance to floating point networks [2]. This is achieved by modeling the quantization diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py index 3d0308aaf3..94fc12ca81 100644 --- a/tensorflow/contrib/slim/python/slim/evaluation_test.py +++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py @@ -26,6 +26,7 @@ import time import numpy as np from tensorflow.contrib.framework.python.ops import variables as variables_lib +from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.contrib.slim.python.slim import evaluation from tensorflow.contrib.training.python.training import evaluation as evaluation_lib from tensorflow.core.protobuf import saver_pb2 @@ -36,7 +37,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import metrics from tensorflow.python.ops import variables from tensorflow.python.platform import flags from tensorflow.python.platform import gfile @@ -89,8 +89,8 @@ class EvaluationTest(test.TestCase): self._predictions, self._scale = TestModel(self._inputs) def testFinalOpsOnEvaluationLoop(self): - value_op, update_op = metrics.accuracy( - labels=self._labels, predictions=self._predictions) + value_op, update_op = metric_ops.streaming_accuracy(self._predictions, + self._labels) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) # Create checkpoint and log directories: @@ -136,10 +136,9 @@ class EvaluationTest(test.TestCase): self.assertTrue(obj.hook_was_run) def _create_names_to_metrics(self, predictions, labels): - accuracy0, update_op0 = metrics.accuracy( - labels=labels, predictions=predictions) - accuracy1, update_op1 = metrics.accuracy( - labels=labels, predictions=predictions + 1) + accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels) + accuracy1, update_op1 = metric_ops.streaming_accuracy(predictions + 1, + labels) names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1} names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1} @@ -199,8 +198,8 @@ class EvaluationTest(test.TestCase): predictions_limited = input.limit_epochs(self._predictions, num_epochs=1) labels_limited = input.limit_epochs(self._labels, num_epochs=1) - value_op, update_op = metrics.accuracy( - labels=labels_limited, predictions=predictions_limited) + value_op, update_op = metric_ops.streaming_accuracy( + 
predictions_limited, labels_limited) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) @@ -261,8 +260,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metrics.accuracy( - labels=self._labels, predictions=self._predictions) + value_op, update_op = metric_ops.streaming_accuracy(self._predictions, + self._labels) # Run the evaluation and verify the results: accuracy_value = evaluation.evaluate_once( @@ -277,8 +276,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metrics.accuracy( - labels=self._labels, predictions=self._predictions) + value_op, update_op = metric_ops.streaming_accuracy(self._predictions, + self._labels) dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir') dumping_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False) diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py index d22b80ac88..99ced53e11 100644 --- a/tensorflow/contrib/summary/summary.py +++ b/tensorflow/contrib/summary/summary.py @@ -21,7 +21,6 @@ from @{tf.summary.merge_all} to @{tf.summary.FileWriter}. To use with eager execution enabled, write your code as follows: -```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -31,11 +30,9 @@ with summary_writer.as_default(), tf.contrib.summary.always_record_summaries(): tf.contrib.summary.scalar("loss", my_loss) # In this case every call to tf.contrib.summary.scalar will generate a record # ... -``` To use it with graph execution, write your code as follows: -```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -56,7 +53,7 @@ with tf.Session(...) as sess: while not_done_training: sess.run([train_op, tf.contrib.summary.all_summary_ops()]) # ... 
-``` + """ from __future__ import absolute_import diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py index d8236a0a6f..e893e1d1c8 100644 --- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py +++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py @@ -21,10 +21,10 @@ import numpy as np from tensorflow.contrib import losses from tensorflow.contrib.learn.python.learn.estimators import prediction_key +from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import metrics from tensorflow.python.ops import nn INFERENCE_PROB_NAME = prediction_key.PredictionKey.PROBABILITIES @@ -38,13 +38,12 @@ def _top_k_generator(k): targets = math_ops.to_int32(targets) if targets.get_shape().ndims > 1: targets = array_ops.squeeze(targets, axis=[1]) - return metrics.mean(nn.in_top_k(probabilities, targets, k)) + return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k)) return _top_k def _accuracy(predictions, targets, weights=None): - return metrics.accuracy( - labels=targets, predictions=predictions, weights=weights) + return metric_ops.streaming_accuracy(predictions, targets, weights=weights) def _r2(probabilities, targets, weights=None): @@ -54,7 +53,7 @@ def _r2(probabilities, targets, weights=None): squares_residuals = math_ops.reduce_sum( math_ops.square(targets - probabilities), 0) score = 1 - math_ops.reduce_sum(squares_residuals / squares_total) - return metrics.mean(score, weights=weights) + return metric_ops.streaming_mean(score, weights=weights) def _squeeze_and_onehot(targets, depth): @@ -63,7 +62,7 @@ def _squeeze_and_onehot(targets, depth): def _sigmoid_entropy(probabilities, targets, weights=None): - return metrics.mean( + return metric_ops.streaming_mean( losses.sigmoid_cross_entropy(probabilities, _squeeze_and_onehot( targets, @@ -72,7 +71,7 @@ def _sigmoid_entropy(probabilities, targets, weights=None): def _softmax_entropy(probabilities, targets, weights=None): - return metrics.mean( + return metric_ops.streaming_mean( losses.sparse_softmax_cross_entropy(probabilities, math_ops.to_int32(targets)), weights=weights) @@ -83,7 +82,7 @@ def _predictions(predictions, unused_targets, **unused_kwargs): def _class_log_loss(probabilities, targets, weights=None): - return metrics.mean( + return metric_ops.streaming_mean( losses.log_loss(probabilities, _squeeze_and_onehot(targets, array_ops.shape(probabilities)[1])), @@ -91,36 +90,34 @@ def _class_log_loss(probabilities, targets, weights=None): def _precision(predictions, targets, weights=None): - return metrics.precision( - labels=targets, predictions=predictions, weights=weights) + return metric_ops.streaming_precision(predictions, targets, weights=weights) def _precision_at_thresholds(predictions, targets, weights=None): - return metrics.precision_at_thresholds( - labels=targets, - predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), - thresholds=np.arange(0, 1, 0.01, dtype=np.float32), + return metric_ops.streaming_precision_at_thresholds( + array_ops.slice(predictions, [0, 1], [-1, 1]), + targets, + np.arange( + 0, 1, 0.01, dtype=np.float32), weights=weights) def _recall(predictions, targets, weights=None): - return metrics.recall( - labels=targets, predictions=predictions, weights=weights) + return metric_ops.streaming_recall(predictions, targets, weights=weights) def _recall_at_thresholds(predictions, targets, 
weights=None): - return metrics.recall_at_thresholds( - labels=targets, - predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), - thresholds=np.arange(0, 1, 0.01, dtype=np.float32), + return metric_ops.streaming_recall_at_thresholds( + array_ops.slice(predictions, [0, 1], [-1, 1]), + targets, + np.arange( + 0, 1, 0.01, dtype=np.float32), weights=weights) def _auc(probs, targets, weights=None): - return metrics.auc( - labels=targets, - predictions=array_ops.slice(probs, [0, 1], [-1, 1]), - weights=weights) + return metric_ops.streaming_auc(array_ops.slice(probs, [0, 1], [-1, 1]), + targets, weights=weights) _EVAL_METRICS = { diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index 6f62cd11a9..7a35a70bbe 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -295,7 +295,7 @@ def get_epoch_variable(): # A simple container to hold the training variables for a single tree. -class TreeVariables(object): +class TreeTrainingVariables(object): """Stores tf.Variables for training a single random tree. Uses tf.get_variable to get tree-specific names so that this can be used @@ -303,7 +303,7 @@ class TreeVariables(object): then relies on restoring that model to evaluate). """ - def __init__(self, params, tree_num, training, tree_config='', tree_stat=''): + def __init__(self, params, tree_num, training): if (not hasattr(params, 'params_proto') or not isinstance(params.params_proto, _params_proto.TensorForestParams)): @@ -315,28 +315,27 @@ class TreeVariables(object): # TODO(gilberth): Manually shard this to be able to fit it on # multiple machines. self.stats = stats_ops.fertile_stats_variable( - params, tree_stat, self.get_tree_name('stats', tree_num)) + params, '', self.get_tree_name('stats', tree_num)) self.tree = model_ops.tree_variable( - params, tree_config, self.stats, self.get_tree_name('tree', tree_num)) + params, '', self.stats, self.get_tree_name('tree', tree_num)) def get_tree_name(self, name, num): return '{0}-{1}'.format(name, num) -class ForestVariables(object): +class ForestTrainingVariables(object): """A container for a forests training data, consisting of multiple trees. - Instantiates a TreeVariables object for each tree. We override the + Instantiates a TreeTrainingVariables object for each tree. We override the __getitem__ and __setitem__ function so that usage looks like this: - forest_variables = ForestVariables(params) + forest_variables = ForestTrainingVariables(params) ... forest_variables.tree ... """ def __init__(self, params, device_assigner, training=True, - tree_variables_class=TreeVariables, - tree_configs=None, tree_stats=None): + tree_variables_class=TreeTrainingVariables): self.variables = [] # Set up some scalar variables to run through the device assigner, then # we can use those to colocate everything related to a tree. 
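The docstring above spells out the indexing contract; a short illustrative sketch of the same container pattern (simplified names, not the class itself) shows why `forest_variables[i]` is all that callers such as RandomForestGraphs need:

```python
class PerTreeVariables(object):
  """Illustrative stand-in: one variable bundle per tree, indexable."""

  def __init__(self, params, tree_variables_class):
    self.variables = [
        tree_variables_class(params, i, training=True)
        for i in range(params.num_trees)
    ]

  def __setitem__(self, t, val):
    self.variables[t] = val

  def __getitem__(self, t):
    return self.variables[t]
```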
@@ -348,13 +347,7 @@ class ForestVariables(object): for i in range(params.num_trees): with ops.device(self.device_dummies[i].device): - kwargs = {} - if tree_configs is not None: - kwargs.update(dict(tree_config=tree_configs[i])) - if tree_stats is not None: - kwargs.update(dict(tree_stat=tree_stats[i])) - self.variables.append(tree_variables_class( - params, i, training, **kwargs)) + self.variables.append(tree_variables_class(params, i, training)) def __setitem__(self, t, val): self.variables[t] = val @@ -368,11 +361,9 @@ class RandomForestGraphs(object): def __init__(self, params, - tree_configs=None, - tree_stats=None, device_assigner=None, variables=None, - tree_variables_class=TreeVariables, + tree_variables_class=TreeTrainingVariables, tree_graphs=None, training=True): self.params = params @@ -380,10 +371,9 @@ class RandomForestGraphs(object): device_assigner or framework_variables.VariableDeviceChooser()) logging.info('Constructing forest with params = ') logging.info(self.params.__dict__) - self.variables = variables or ForestVariables( + self.variables = variables or ForestTrainingVariables( self.params, device_assigner=self.device_assigner, training=training, - tree_variables_class=tree_variables_class, - tree_configs=tree_configs, tree_stats=tree_stats) + tree_variables_class=tree_variables_class) tree_graph_class = tree_graphs or RandomTreeGraphs self.trees = [ tree_graph_class(self.variables[i], self.params, i) diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py index 1c9c81827e..bbe627b157 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py @@ -18,14 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from google.protobuf.json_format import ParseDict -from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2 as _tree_proto from tensorflow.contrib.tensor_forest.python import tensor_forest from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util -from tensorflow.python.ops import resources -from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -114,47 +110,6 @@ class TensorForestTest(test_util.TensorFlowTestCase): self.assertTrue(isinstance(paths, ops.Tensor)) self.assertTrue(isinstance(var, ops.Tensor)) - def testInfrenceFromRestoredModel(self): - input_data = [[-1., 0.], [-1., 2.], # node 1 - [1., 0.], [1., -2.]] # node 2 - expected_prediction = [[0.0, 1.0], [0.0, 1.0], - [0.0, 1.0], [0.0, 1.0]] - hparams = tensor_forest.ForestHParams( - num_classes=2, - num_features=2, - num_trees=1, - max_nodes=1000, - split_after_samples=25).fill() - tree_weight = {'decisionTree': - {'nodes': - [{'binaryNode': - {'rightChildId': 2, - 'leftChildId': 1, - 'inequalityLeftChildTest': - {'featureId': {'id': '0'}, - 'threshold': {'floatValue': 0}}}}, - {'leaf': {'vector': - {'value': [{'floatValue': 0.0}, - {'floatValue': 1.0}]}}, - 'nodeId': 1}, - {'leaf': {'vector': - {'value': [{'floatValue': 0.0}, - {'floatValue': 1.0}]}}, - 'nodeId': 2}]}} - restored_tree_param = ParseDict(tree_weight, - _tree_proto.Model()).SerializeToString() - graph_builder = tensor_forest.RandomForestGraphs(hparams, - [restored_tree_param]) - probs, paths, var = graph_builder.inference_graph(input_data) - 
self.assertTrue(isinstance(probs, ops.Tensor))
-    self.assertTrue(isinstance(paths, ops.Tensor))
-    self.assertTrue(isinstance(var, ops.Tensor))
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      resources.initialize_resources(resources.shared_resources()).run()
-      self.assertEquals(probs.eval().shape, (4, 2))
-      self.assertEquals(probs.eval().tolist(), expected_prediction)
-
   def testTrainingConstructionClassificationSparse(self):
     input_data = sparse_tensor.SparseTensor(
         indices=[[0, 0], [0, 3], [1, 0], [1, 7], [2, 1], [3, 9]],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index da4dd5a14c..b7b26cfb1c 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -91,11 +91,8 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
     if (!subgraph_node_ids.count(edge->src()->id()) &&
         !edge->src()->IsSource() && !edge->IsControlEdge()) {
       incoming_edges->insert(edge);
-      VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name()
-              << " Y, ";
     } else {
-      VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name()
-              << " N, ";
+      VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, ";
     }
   }
 }
@@ -109,12 +106,10 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
   for (const tensorflow::Edge* edge : node->out_edges()) {
     if (!subgraph_node_ids.count(edge->dst()->id()) &&
         !edge->dst()->IsSink() && !edge->IsControlEdge()) {
-      VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name()
-              << " Y, ";
+      VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, ";
       outgoing_edges->insert(edge);
     } else {
-      VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name()
-              << " N, ";
+      VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, ";
     }
   }
 }
@@ -186,27 +181,29 @@ struct ConvertGraphParams {
 static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
   GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids,
                            &p->subgraph_incoming_edges);
-
-  std::set<std::pair<int, int>> unique_tensors;
-  // Add only unique input source nodes. If output of an outside node is shared
-  // between multiple nodes inside the engine, only one edge should be created
   for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) {
-    unique_tensors.insert({edge->src()->id(), edge->src_output()});
+    p->subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
+  }
+  auto output_name_to_index_map = BuildTensorNameMap(p->output_names);
+  std::set<std::pair<int, int>> subgraph_outputs_set;
+  // Collect outputs referenced from output_names
+  for (int node_id : p->subgraph_node_ids) {
+    tensorflow::Node* node = p->graph.FindNodeId(node_id);
+    if (output_name_to_index_map.count(node->name())) {
+      for (int index : output_name_to_index_map.at(node->name())) {
+        subgraph_outputs_set.insert({node_id, index});
+      }
+    }
   }
-  p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(),
-                            unique_tensors.end());
   GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids,
                            &p->subgraph_outgoing_edges);
-  unique_tensors.clear();
-  // Similar to above, if multiple ouside nodes are sharing the output of an
-  // internal node only one output port should be created and shared between
-  // outputs
   for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) {
-    unique_tensors.insert({edge->src()->id(), edge->src_output()});
+    subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
   }
-  p->subgraph_outputs.reserve(unique_tensors.size());
+  p->subgraph_outputs.reserve(subgraph_outputs_set.size());
   p->subgraph_outputs.insert(p->subgraph_outputs.begin(),
-                             unique_tensors.begin(), unique_tensors.end());
+                             subgraph_outputs_set.begin(),
+                             subgraph_outputs_set.end());
   return tensorflow::Status::OK();
 }
@@ -228,6 +225,7 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
   for (auto in_edge :
        params->subgraph_incoming_edges) {  // loop over incoming edges and
                                            // attach them to calib node
+    // tensorflow::Node* src_node = in_edge->src();
     auto src_output = in_edge->src_output();
     auto dst_node = in_edge->dst();
     auto dst_input = in_edge->dst_input();
@@ -259,24 +257,19 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
   for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) {
     subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i});
   }
-  std::set<std::pair<int, int>> unique_tensors;
   for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
-    if (unique_tensors.count(old_src)) continue;
-    unique_tensors.insert(old_src);
     int new_src_output = subgraph_edge_to_input_map.at(old_src);
     params->graph.AddEdge(edge->src(), edge->src_output(), trt_node,
                           new_src_output);
-    VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output()
-            << " -> " << trt_node->name() << ":" << new_src_output;
     params->graph.RemoveEdge(edge);
   }
-  if (VLOG_IS_ON(2)) {
-    VLOG(2) << "new edge count: " << trt_node->in_edges().size();
-    for (const tensorflow::Edge* edge : trt_node->in_edges()) {
-      VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
-    }
+
+  VLOG(2) << "new wiring edges: " << trt_node->in_edges().size();
+  for (const tensorflow::Edge* edge : trt_node->in_edges()) {
+    VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
   }
+
   TF_RETURN_IF_ERROR(status);

   // Re-map outgoing edges to use the new TRT node instead of the orig subgraph
@@ -290,8 +283,6 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
     TF_RETURN_IF_ERROR(params->graph.UpdateEdge(
         trt_node, new_src_output, edge->dst(), edge->dst_input()));
-    VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> "
-            << edge->dst()->name() << ":" << edge->dst_input();
   }
   // Remove the original subgraph
   for (int node_id : params->subgraph_node_ids) {
@@ -326,12 +317,9 @@ tensorflow::Status ConvertCalibGraphToInferGraph(
       tensorflow::GraphConstructorOptions(), graph_def, &graph));
   // get calib nodes
   std::vector<tensorflow::Node*> calib_nodes;
-  std::vector<tensorflow::Node*> topo_order;
-  tensorflow::GetPostOrder(graph, &topo_order);
-  for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) {
-    auto node = *rit;
+  for (auto node : graph.op_nodes()) {
     if (node->type_string() == "TRTCalibOp") {
-      VLOG(1) << "Found Calib Node " << node->name();
+      VLOG(1) << "Found Calib Node";
       calib_nodes.push_back(node);
     }
   }
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 4e4d295538..96e0700862 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -362,11 +362,10 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
       break;
     }
     case tensorflow::DataType::DT_HALF: {
-      Reorder2(
-          {k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
-          istrides,
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
-          ostrides);
+      Reorder2({k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
+               istrides, static_cast<Eigen::half*>(
+                             const_cast<void*>(oweights->GetValues())),
+               ostrides);
       break;
     }
     default:
@@ -1180,9 +1179,9 @@ tensorflow::Status BinaryTensorOpTensor(
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented(
-        "binary op: " + node_def.op() +
-        " not supported at: " + node_def.name());
+    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+                                             " not supported at: " +
+                                             node_def.name());

   nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
       *const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -2139,7 +2138,9 @@ void Converter::register_op_converters() {
 }

 }  // namespace
-
+tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) {
+  return tensorflow::errors::Unimplemented("Not implemented yet");
+}
 tensorflow::Status ConvertCalibrationNodeToEngineNode(
     tensorflow::Graph& graph, tensorflow::Node* c_node) {
   const auto ndef = c_node->def();
@@ -2163,23 +2164,9 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   for (auto n : graph.op_nodes()) {
     node_maps.insert({n->name(), n});
   }
-  std::set<int> subgraph_ids;
-  for (const auto internal_node : segment_nodes) {
-    subgraph_ids.insert(node_maps.at(internal_node)->id());
-  }
-  if (VLOG_IS_ON(2)) {
-    string node_names = StrCat(c_node->name(), " segment nodes= ");
-
-    for (const auto& node_name : segment_nodes) {
-      StrAppend(&node_names, node_name, ", ");
-    }
-    VLOG(2) << node_names;
-  }
-
   VLOG(1) << "Output Nodes:";
   std::vector<tensorflow::DataType> out_types;
   std::vector<const tensorflow::Edge*> out_edges;
-
   for (auto& i : output_nodes) {
     auto node_port = tensorflow::str_util::Split(i, ":");
     VLOG(1) << " " << i << " in graph " << node_maps.count(i);
@@ -2199,24 +2186,18 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
         out_types.push_back(out_node->output_type(0));
       }
       for (auto out_edge : out_node->out_edges()) {
-        if (subgraph_ids.count(out_edge->dst()->id()))
-          continue;  // skip internal edges
         if (out_edge->src_output() == port) {
           out_edges.push_back(out_edge);
-          VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":"
-                  << out_edge->src_output() << " -> " << out_edge->dst()->name()
-                  << ":" << out_edge->dst_input();
+          break;
         }
       }
     } else {
       LOG(WARNING) << " couldn't find output node " << out_node_name;
     }
   }
-  if (VLOG_IS_ON(1)) {
-    VLOG(1) << c_node->name() << " Input Nodes:";
-    for (auto& i : input_names) {
-      VLOG(1) << " Input " << i << " in graph " << node_maps.count(i);
-    }
+  VLOG(1) << "Input Nodes:";
+  for (auto& i : input_names) {
+    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
   }
   auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance();
   auto resmgr = trt_rm->getManager("TRTCalibOps");
@@ -2250,24 +2231,14 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   calib_res->builder_ = nullptr;
   tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
   std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  income_edges.resize(c_node->num_inputs());
   for (const auto in_edge : c_node->in_edges()) {
     auto src = in_edge->src();
     int dest_port = in_edge->dst_input();
-    VLOG(1) << "Incoming connection " << src->name() << ":"
-            << in_edge->src_output() << " -> " << c_node->name() << ":"
-            << dest_port;
-    income_edges.at(dest_port) = {src->name(), in_edge->src_output(),
-                                  c_node->input_type(dest_port)};
+    income_edges.emplace_back(src->name(), in_edge->src_output(),
+                              c_node->input_type(dest_port));
   }
   tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
       income_edges);
-  if (VLOG_IS_ON(2)) {
-    for (const auto& inp : input_list) {
-      VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " "
-              << tensorflow::DataTypeString(inp.data_type);
-    }
-  }
   op_builder.Input(input_list);
   tensorflow::NodeDef engine_node;
   const char* engine_plan_data = static_cast<const char*>(engine_plan->data());
@@ -2284,26 +2255,13 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   }
   auto trt_engine_node = graph.AddNode(engine_node, &status);
   TF_RETURN_IF_ERROR(status);
-  std::map<string, int> port_map;
-  for (size_t t = 0; t < output_nodes.size(); t++) {
-    port_map.insert({output_nodes.at(t), t});
-  }
-  for (auto& i : out_edges) {
-    string s(i->src()->name());
-    if (i->src_output()) StrAppend(&s, ":", i->src_output());
-    int out_port = port_map.at(s);
-    VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port
-            << " -> " << i->dst()->name() << ":" << i->dst_input();
-    TF_RETURN_IF_ERROR(
-        graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input()));
-  }
-  for (const auto ed : trt_engine_node->in_edges()) {
-    VLOG(1) << "In Edge " << ed->src()->name() << ":" << ed->src_output()
-            << " -> " << ed->dst()->name() << ":" << ed->dst_input();
-  }
-  for (const auto ed : trt_engine_node->out_edges()) {
-    VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output()
-            << " -> " << ed->dst()->name() << ":" << ed->dst_input();
+  for (size_t i = 0; i < out_edges.size(); i++) {
+    VLOG(1) << "Connecting trt_engine_node output " << i << " with "
+            << out_edges.at(i)->dst()->name() << " port "
+            << out_edges.at(i)->dst_input();
+    TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i,
+                                        out_edges.at(i)->dst(),
+                                        out_edges.at(i)->dst_input()));
   }
   VLOG(1) << "Segment nodes:";
   for (auto& i : segment_nodes) {
@@ -2374,7 +2332,6 @@ tensorflow::Status ConvertSubgraph(
     std::vector<string>* output_names,
     std::vector<tensorflow::DataType>* output_dtypes,
     const string& engine_name) {
-  std::set<string> added_tensors;
   for (const std::pair<int, int>& input : s.input_inds) {
     VLOG(2) << "parsing input. Node id= " << input.first;
     int node_id = input.first;
@@ -2417,6 +2374,7 @@ tensorflow::Status ConvertSubgraph(
     auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
+    input_dtypes->push_back(tf_dtype);

     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
     auto type_status = ConvertDType(tf_dtype, &dtype);
@@ -2452,10 +2410,8 @@ tensorflow::Status ConvertSubgraph(
     if (output_idx != 0) {
       input_tensor_name = StrCat(node_name, ":", output_idx);
     }
-    if (added_tensors.count(input_tensor_name)) continue;
-    added_tensors.insert(input_tensor_name);
+
     input_names->push_back(input_tensor_name);
-    input_dtypes->push_back(tf_dtype);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
         input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
@@ -2479,7 +2435,6 @@ tensorflow::Status ConvertSubgraph(
   // Gather output metadata
   int trt_engine_op_output_idx = 0;
-  added_tensors.clear();
   for (const std::pair<int, int>& output : s.output_inds) {
     int node_id = output.first;
     int output_idx = output.second;
@@ -2496,8 +2451,6 @@ tensorflow::Status ConvertSubgraph(
     if (output_idx != 0)
       tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
     VLOG(2) << "Output tensor name: " << tensor_name;
-    if (added_tensors.count(tensor_name)) continue;
-    added_tensors.insert(tensor_name);
     output_names->push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index d879170b68..2e472a2805 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -166,21 +166,11 @@ def StreamingFilesDataset(files,
     return remote_iterator.get_next()

   def MapFn(unused_input):
-    if isinstance(source_dataset.output_types, dtypes.DType):
-      output_types = [source_dataset.output_types]
-    elif isinstance(source_dataset.output_types, (list, tuple)):
-      output_types = source_dataset.output_types
-    else:
-      raise ValueError('source dataset has invalid output types')
-    remote_calls = functional_ops.remote_call(
+    return functional_ops.remote_call(
         args=[source_handle],
-        Tout=output_types,
+        Tout=[dtypes.string],
         f=LoadingFunc,
-        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
-    if len(remote_calls) == 1:
-      return remote_calls[0]
-    else:
-      return remote_calls
+        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0]

   with ops.device('/job:%s' % worker_job):
     output_dataset = dataset_ops.Dataset.range(2).repeat().map(
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
index b58d05eac5..918cf0ed8e 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
@@ -26,8 +26,6 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
@@ -164,30 +162,6 @@ class DatasetsTest(test.TestCase):

     self.assertEqual(set(all_contents), set(retrieved_values))

-  def testArbitraryReaderFuncFromDatasetGenerator(self):
-
-    def my_generator():
-      yield (1, [1] * 10)
-
-    def
gen_dataset(dummy): - return dataset_ops.Dataset.from_generator( - my_generator, (dtypes.int64, dtypes.int64), - (tensor_shape.TensorShape([]), tensor_shape.TensorShape([10]))) - - dataset = datasets.StreamingFilesDataset( - dataset_ops.Dataset.range(10), filetype=gen_dataset) - - iterator = dataset.make_initializable_iterator() - self._sess.run(iterator.initializer) - get_next = iterator.get_next() - - retrieved_values = self._sess.run(get_next) - - self.assertIsInstance(retrieved_values, (list, tuple)) - self.assertEqual(len(retrieved_values), 2) - self.assertEqual(retrieved_values[0], 1) - self.assertItemsEqual(retrieved_values[1], [1] * 10) - def testUnexpectedFiletypeString(self): with self.assertRaises(ValueError): datasets.StreamingFilesDataset( diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b1c224a345..d89633199d 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -699,9 +699,7 @@ cc_library( srcs = ["platform/stacktrace_handler.cc"], hdrs = ["platform/stacktrace_handler.h"], deps = [ - ":abi", ":lib_platform", - ":stacktrace", ], ) @@ -3091,8 +3089,6 @@ cc_library( # we now need at least "str_util". ":lib", ":lib_platform", - ":stacktrace_handler", - ":test_lite", "//tensorflow/core/platform/default/build_config:test_lite_main", ], alwayslink = 1, @@ -3573,10 +3569,7 @@ tf_cc_tests_gpu( tf_cc_test_mkl( name = "mkl_runtime_tests", size = "small", - srcs = [ - "common_runtime/mkl_cpu_allocator_test.cc", - "common_runtime/mkl_threadpool_device_test.cc", - ], + srcs = ["common_runtime/mkl_cpu_allocator_test.cc"], linkstatic = 1, deps = [ ":core", diff --git a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt index 985f09312f..cbe76de415 100644 --- a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt @@ -4,10 +4,6 @@ op { description: < 0`, limit of the split of the result. -END - } - summary: "Split elements of `source` based on `sep` into a `SparseTensor`." - description: <2<><>3"` and -sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty -string, consecutive whitespace are regarded as a single separator, and the -result will contain no empty strings at the startor end if the string has -leading or trailing whitespace. - -Note that the above mentioned behavior matches python's str.split. -END -} diff --git a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt deleted file mode 100644 index 0e8576fb01..0000000000 --- a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt +++ /dev/null @@ -1,4 +0,0 @@ -op { - graph_op_name: "StringSplitV2" - visibility: HIDDEN -} diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 9cda17867b..8f2a419756 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -86,7 +86,7 @@ BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) { return &(chunks_[h]); } -bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { +bool BFCAllocator::Extend(size_t rounded_bytes) { size_t available_bytes = memory_limit_ - total_region_allocated_bytes_; // Rounds available_bytes down to the nearest multiple of kMinAllocationSize. 
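// (A minimal stand-alone sketch of the flooring done on the next line; the
// constant is assumed, matching BFC-style allocators, and is not part of
// this hunk:
//
//   constexpr size_t kMinAllocationSize = 256;
//   size_t FloorToMultiple(size_t bytes) {
//     return (bytes / kMinAllocationSize) * kMinAllocationSize;
//   }
//
// Integer division truncates, so FloorToMultiple(1000) == 768 while
// FloorToMultiple(512) == 512.)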
available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize; @@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { // Try allocating. size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes); - void* mem_addr = suballocator_->Alloc(alignment, bytes); + void* mem_addr = suballocator_->Alloc(32, bytes); if (mem_addr == nullptr && !started_backpedal_) { // Only backpedal once. started_backpedal_ = true; @@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { while (mem_addr == nullptr) { bytes = RoundedBytes(bytes * kBackpedalFactor); if (bytes < rounded_bytes) break; - mem_addr = suballocator_->Alloc(alignment, bytes); + mem_addr = suballocator_->Alloc(32, bytes); } } @@ -261,7 +261,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, } // Try to extend - if (Extend(unused_alignment, rounded_bytes)) { + if (Extend(rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes); if (ptr != nullptr) { return ptr; diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index 52aedb1e9c..ba5a3eea3a 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -305,8 +305,7 @@ class BFCAllocator : public VisitableAllocator { // Try to add a new memory region that can satisfy an allocation of // 'rounded_bytes' bytes. Returns true on success and false on // failure. - bool Extend(size_t alignment, size_t rounded_bytes) - EXCLUSIVE_LOCKS_REQUIRED(lock_); + bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); // Returns a pointer to an underlying allocated chunk of size // 'rounded_bytes'. diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index 9028e6298c..c21a1ea9f2 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -102,25 +102,9 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) { EXPECT_EQ(2, shape.dim(0).size()); EXPECT_EQ(1, shape.dim(1).size()); if (node->name() == y->name()) { -#ifdef INTEL_MKL - // if MKL is used, it goes through various additional - // graph rewrite pass. In TF, everytime a graph pass - // happens, "constant" nodes are allocated - // and deallocated. Each allocation calls the - // (FindChunkPtr of BFCAllocator), - // which increments the value of AllocationId. - // Thus AllocationId becomes more than 3 and 4 if - // MKL is used. Now they are 9 and 10 for MKL. - EXPECT_EQ(19, cm->AllocationId(node, 0)); -#else EXPECT_EQ(21, cm->AllocationId(node, 0)); -#endif } else { -#ifdef INTEL_MKL - EXPECT_EQ(20, cm->AllocationId(node, 0)); -#else EXPECT_EQ(22, cm->AllocationId(node, 0)); -#endif } } EXPECT_LE(0, cm->MaxExecutionTime(node)); diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc deleted file mode 100644 index 5d583a8360..0000000000 --- a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifdef INTEL_MKL - -#include "tensorflow/core/common_runtime/threadpool_device.h" - -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/cpu_info.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/public/session_options.h" - -namespace tensorflow { - -#ifdef _OPENMP -TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) { - SessionOptions options; - unsetenv("OMP_NUM_THREADS"); - - ThreadPoolDevice* tp = new ThreadPoolDevice( - options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); - - const int ht = port::NumHyperthreadsPerCore(); - EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht); -} - -TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) { - SessionOptions options; - setenv("OMP_NUM_THREADS", "314", 1); - - ThreadPoolDevice* tp = new ThreadPoolDevice( - options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); - - EXPECT_EQ(omp_get_max_threads(), 314); -} -#endif // _OPENMP - -} // namespace tensorflow - -#endif // INTEL_MKL diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc index a5d31b75c7..21912236d0 100644 --- a/tensorflow/core/common_runtime/process_util.cc +++ b/tensorflow/core/common_runtime/process_util.cc @@ -16,10 +16,8 @@ limitations under the License. #include "tensorflow/core/common_runtime/process_util.h" #ifdef INTEL_MKL -#ifdef _OPENMP #include -#endif // _OPENMP -#endif // INTEL_MKL +#endif #include #include "tensorflow/core/lib/core/threadpool.h" @@ -59,10 +57,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { // MKL library executes ops in parallel using OMP threads // Set inter_op conservatively to avoid thread oversubscription that could // lead to severe perf degradations and OMP resource exhaustion - int mkl_intra_op = 1; -#ifdef _OPENMP - mkl_intra_op = omp_get_max_threads(); -#endif // _OPENMP + const int mkl_intra_op = omp_get_max_threads(); CHECK_GE(mkl_intra_op, 1); const int32 mkl_inter_op = std::max( (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2); @@ -73,7 +68,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { #else // Default to using the number of cores available in the process. return port::NumSchedulableCPUs(); -#endif // INTEL_MKL +#endif } thread::ThreadPool* NewThreadPoolFromSessionOptions( diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index 74a87215e1..f7a07fe503 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -31,11 +31,7 @@ limitations under the License. 
#include "tensorflow/core/public/session_options.h" #ifdef INTEL_MKL -#ifdef _OPENMP -#include -#endif #include "tensorflow/core/common_runtime/mkl_cpu_allocator.h" -#include "tensorflow/core/platform/cpu_info.h" #endif namespace tensorflow { @@ -47,26 +43,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, : LocalDevice(options, Device::BuildDeviceAttributes( name, DEVICE_CPU, memory_limit, locality)), allocator_(allocator), - scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) { -#ifdef INTEL_MKL -#ifdef _OPENMP - const char* user_omp_threads = getenv("OMP_NUM_THREADS"); - if (user_omp_threads == nullptr) { - // OMP_NUM_THREADS controls MKL's intra-op parallelization - // Default to available physical cores - const int mkl_intra_op = port::NumSchedulableCPUs(); - const int ht = port::NumHyperthreadsPerCore(); - omp_set_num_threads((mkl_intra_op + ht - 1) / ht); - } else { - uint64 user_val = 0; - if (strings::safe_strtou64(user_omp_threads, &user_val)) { - // Superflous but triggers OpenMP loading - omp_set_num_threads(user_val); - } - } -#endif // _OPENMP -#endif // INTEL_MKL -} + scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {} ThreadPoolDevice::~ThreadPoolDevice() {} diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc index 770a0fcf14..1cea1b1462 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc @@ -147,9 +147,7 @@ MasterService::Stub::Stub( } MasterService::AsyncService::AsyncService() { - int method_len = sizeof(grpcMasterService_method_names) / - sizeof(grpcMasterService_method_names[0]); - for (int i = 0; i < method_len; ++i) { + for (int i = 0; i < 10; ++i) { AddMethod(new ::grpc::internal::RpcServiceMethod( grpcMasterService_method_names[i], ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc index a8508d2d4f..89f83f9f24 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc @@ -17,7 +17,6 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/rpc/grpc_session.h" #include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/platform/env.h" #include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { @@ -51,14 +50,9 @@ Status TestCluster::MakeTestCluster(const SessionOptions& options, int n, } for (int i = 0; i < n; ++i) { - string server_file = - strings::StrCat(testing::TensorFlowSrcRoot(), - "/core/distributed_runtime/rpc/grpc_testlib_server"); - if (!options.env->FileExists(server_file).ok()) { - return errors::Internal("Could not find grpc_testlib_server"); - } const std::vector argv( - {server_file, + {strings::StrCat(testing::TensorFlowSrcRoot(), + "/core/distributed_runtime/rpc/grpc_testlib_server"), /* see grpc_testlib_server.cc for flags */ tf_jobs, "--tf_job=localhost", strings::StrCat("--tf_task=", i), strings::StrCat("--num_cpus=", num_cpus), diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h index 2bb4d32d57..2c87156dca 100644 --- a/tensorflow/core/framework/allocator.h +++ b/tensorflow/core/framework/allocator.h @@ -67,8 +67,13 @@ struct AllocatorStats { // device memory. 
 class Allocator {
  public:
+#ifdef EIGEN_VECTORIZE_AVX512
   // Align to 64 byte boundary.
   static constexpr size_t kAllocatorAlignment = 64;
+#else
+  // Align to 32 byte boundary.
+  static constexpr size_t kAllocatorAlignment = 32;
+#endif
 
   virtual ~Allocator();
 
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 4b56d807df..3d7920a6e2 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op_gen_lib.h"
 
-#include
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto
index 10072724d2..eb689ec1e6 100644
--- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto
+++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "RemoteFusedGraphExecuteInfoProto";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
+//add go_package externally
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 80e168df97..b613effd18 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) {
 
 // On the alignment.
 //
-// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte
+// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte
 // alignment. Tensor::tensor/flat/vec/matrix methods requires the
 // buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually,
-// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires
-// the caller to ensure its result is aligned if the caller intends
-// to use those methods. In this test case, we simply make sure each
-// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576. 576 % 64 = 0.
+// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure
+// its result is aligned if the caller intends to use those methods.
+// In this test case, we simply make sure each slice is 32-byte
+// aligned: sizeof(float) * 4 * 2 = 32.
 TEST(Tensor, Slice_Basic) {
   Tensor saved;
   {  // General
-    Tensor x(DT_FLOAT, TensorShape({10, 4, 36}));
+    Tensor x(DT_FLOAT, TensorShape({10, 4, 34}));
     // Fills in known values.
     for (int i = 0; i < 10; ++i) {
       x.Slice(i, i + 1).flat<float>().setConstant(i * 1.f);
     }
     // A simple slice along dim0.
Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34}))); auto tx = x.tensor(); auto ty = y.tensor(); for (int i = 0; i < 4; ++i) { for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 36; ++k) { + for (int k = 0; k < 34; ++k) { EXPECT_EQ(ty(i, j, k), 4.0 + i); EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k)); } @@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) { auto tz = z.tensor(); EXPECT_EQ(1, z.dim_size(0)); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 36; ++k) { + for (int k = 0; k < 34; ++k) { EXPECT_EQ(tz(0, j, k), 6.0); } } @@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) { EXPECT_EQ(1, saved.dim_size(0)); auto tsaved = saved.tensor(); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 36; ++k) { + for (int k = 0; k < 34; ++k) { EXPECT_EQ(tsaved(0, j, k), 6.0); } } } { // Empty - Tensor x(DT_FLOAT, TensorShape({10, 0, 36})); + Tensor x(DT_FLOAT, TensorShape({10, 0, 34})); x.flat().setRandom(); Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34}))); } { diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index b9667998d6..72a13d4da7 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // If Op has been specifically assigned to a non-CPU device, then No. if (!n->assigned_device_name().empty() && - !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) { + !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) { result = false; reason = "Op has been assigned a runtime device that is not CPU."; } // If user has specifically assigned this op to a non-CPU device, then No. if (!n->def().device().empty() && - !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) { + !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) { result = false; reason = "User has assigned a device that is not CPU."; } @@ -2865,9 +2865,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return false; } - // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized - // path. The unoptimized path is slow. Thus we dont rewrite the node - // and use default Eigen. But for depth_radius=2, MKL DNN optimized + // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized + // path. The unoptimized path is slow. Thus we dont rewrite the node + // and use default Eigen. But for depth_radius=2, MKL DNN optimized // path is taken, i.e., eigen node is rewritten by MKl DNN node. static bool LrnRewrite(const Node* n) { CHECK_NOTNULL(n); @@ -2876,13 +2876,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CHECK_EQ(GetNodeAttr(n->def(), "depth_radius", &depth_radius).ok(), true); // if the depth_radius of LRN is not 2, don't rewrite the node by MKL DNN - // and use eigen node instead + // and use eigen node instead if (depth_radius == 2) { return true; } VLOG(1) << "LrnRewrite: The model sets depth_radius as not 2 which" << "case is not optimized by Intel MKL, thus using Eigen op" - << "for LRN "; + << "for LRN " ; return false; } @@ -3015,35 +3015,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass { std::vector* ws_tensors, bool* are_ws_tensors_added); - // Helper function used by FixMklMetaDataEdges. 
Fixes the metadata edge - // pointed by 'e_metadata' corresponding to the data edge 'e_data' in graph - // 'g'. Returns true is fixup was done; otherwise, it returns false. - bool FixMklMetaDataEdgeIfNeeded(std::unique_ptr* g, - const Edge* e_data, const Edge* e_metadata); - - // Are the input Mkl metadata edges for node 'n' in graph 'g' correctly - // connected? If not, then fix them. This is needed because a graph may have - // some input Mkl metadata edges incorrectly setup after node merge and - // rewrite passes. This could happen because GetReversePostOrder function may - // not provide topologically sorted order if a graph contains cycles. The - // function returns true if at least one Mkl metadata edge for node 'n' was - // fixed. Otherwise, it returns false. - // - // Example: - // - // X = MklConv2D(_, _, _) - // Y = MklConv2DWithBias(_, _, _, _, _, _) - // Z = MklAdd(X, Y, DummyMklTensor, Y:1) - // - // For a graph such as shown above, note that 3rd argument of MklAdd contains - // DummyMklTensor. Actually, it should be getting the Mkl metadata from - // MklConv2D op (specifically, X:2). This incorrect plumbing could be possible - // (although rare) if the Mkl NodeMerge + NodeRewrite passes visit Z before X - // (possible if X, Y, Z are part of a loop.) This function fixes the Mkl - // metadata edges only - it does not rewrite nodes nor does it modify the Mkl - // data edges (1st and 2nd arguments of MklAdd). - bool FixMklMetaDataEdges(std::unique_ptr* g, Node* n); - // Functions specific to operators to copy attributes // We need operator-specific function to copy attributes because the framework // does not provide any generic function for it. @@ -4270,92 +4241,6 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { return nullptr; } -/////////////////////////////////////////////////////////////////////////////// -// Post-rewrite Mkl metadata fixup pass -/////////////////////////////////////////////////////////////////////////////// -bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr* g, - const Edge* e_data, const Edge* e_metadata) { - if (g == nullptr || e_data == nullptr || e_metadata == nullptr) { - return false; - } - - Node* n_data = e_data->src(); - int n_data_op_slot = e_data->src_output(); - int n_metadata_op_slot = GetTensorMetaDataIndex(n_data_op_slot, - n_data->num_outputs()); - - // If the source of meta edge is a constant node (producing dummy Mkl metadata - // tensor), then we will need to fix. - if (IsConstant(e_metadata->src())) { - Node* e_metadata_dst = e_metadata->dst(); - int e_metadata_in_slot = e_metadata->dst_input(); - CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot, - e_metadata_dst, e_metadata_in_slot)); - - (*g)->RemoveEdge(e_metadata); - return true; - } - - return false; -} - -bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr* g, - Node* n) { - bool result = false; - - // If graph node is not Mkl node, then return. - DataType T = DT_INVALID; - if (!GetNodeAttr(n->def(), "T", &T).ok() || - !mkl_op_registry::IsMklOp(n->type_string(), T)) { - return result; - } - - // If it is Mkl node, then check if the input edges to this node that carry - // Mkl metadata are linked up correctly with the source node. - - // For Mkl nodes, we generate twice the number of input tensors (n for Mkl - // data tensors + n for Mkl metadata tensors). We need to check for correct - // connection of n metadata tensors only. 
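// For intuition: the deleted loop below pairs each data slot with its
// metadata slot. Assuming the contiguous ordering (all data tensors first,
// then all metadata tensors), the mapping is simply
//
//   int MetaSlot(int data_slot, int num_inputs) {
//     return num_inputs / 2 + data_slot;
//   }
//
// so a node with 4 inputs pairs data slot 1 with metadata slot 3; an
// interleaved ordering would use 2 * data_slot + 1 instead, and
// GetTensorMetaDataIndex hides that choice.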
-  int num_data_inputs = n->num_inputs() / 2;
-  for (int idx = 0; idx < num_data_inputs; idx++) {
-    // Get the edge connecting input slot with index (idx).
-    const Edge* e = nullptr;
-    TF_CHECK_OK(n->input_edge(idx, &e));
-
-    // If e is control edge, then skip.
-    if (e->IsControlEdge()) {
-      continue;
-    }
-
-    // Check that the source node for edge 'e' is Mkl node. If it is not an Mkl
-    // node, then we don't need to do anything.
-    Node* e_src = e->src();
-    if (GetNodeAttr(e_src->def(), "T", &T).ok() &&
-        mkl_op_registry::IsMklOp(e_src->type_string(), T)) {
-      // Source node for edge 'e' is Mkl node.
-      // Destination node and destination input slot of e is node 'n' and 'idx'
-      // resp.
-      CHECK_EQ(e->dst(), n);
-      CHECK_EQ(e->dst_input(), idx);
-
-      // Let's get edge that carries Mkl metadata corresponding to Mkl data edge
-      // 'e'. For that, let's first get the input slot of 'n' where the meta
-      // edge will feed the value.
-      int e_meta_in_slot = GetTensorMetaDataIndex(e->dst_input(),
-                                                  n->num_inputs());
-      const Edge* e_meta = nullptr;
-      TF_CHECK_OK(n->input_edge(e_meta_in_slot, &e_meta));
-
-      // Let's check if we need to fix this meta edge.
-      if (FixMklMetaDataEdgeIfNeeded(g, e, e_meta)) {
-        result = true;
-      }
-    }
-  }
-
-  return result;
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // Run function for the pass
 ///////////////////////////////////////////////////////////////////////////////
@@ -4422,25 +4307,6 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
   DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g);
 
-  order.clear();
-  GetReversePostOrder(**g, &order);  // This will give us topological sort.
-  for (Node* n : order) {
-    // If node is not an op or it cannot run on CPU device, then skip.
- if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) { - continue; - } - if (FixMklMetaDataEdges(g, n)) { - string node_name = n->name(); - string op_name = n->type_string(); - - VLOG(1) << "MklLayoutRewritePass: fixed metadata edges for node " - << node_name << " with op " << op_name; - result = true; - } - } - DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite+Fixup)", - &**g); - return result; } diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 7645b4a7f0..029cdcf94a 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -3518,37 +3518,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1"); } -///////////////////////////////////////////////////////////////////// -// Post-rewrite fixup pass test - -TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) { - InitGraph( - "node { name: 'A' op: 'Input'}" - "node { name: 'B' op: 'Input'}" - "node { name: 'M' op: '_MklInput'}" - "node { name: 'N' op: '_MklInput'}" - "node { name: 'C' op: '_MklConv2D'" - " attr { key: 'T' value { type: DT_FLOAT } }" - " attr { key: 'data_format' value { s: 'NCHW' } }" - " attr { key: 'use_cudnn_on_gpu' value { b: false } }" - " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" - " attr { key: 'padding' value { s: 'SAME' } }" - " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" - " input: ['A', 'B', 'M', 'N']}" - "node { name: 'D' op: 'Const' " - " attr { key: 'dtype' value { type: DT_UINT8 } }" - " attr { key: 'value' value { " - " tensor { dtype: DT_UINT8 tensor_shape { dim { size: 1 } } " - " int_val: 0 } } } }" - "node { name: 'E' op: '_MklAdd'" - " attr {key: 'T' value { type: DT_FLOAT } }" - " input: ['C', 'A', 'D', 'D']}"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(_MklConv2D);D(Const);E(_MklAdd);" - "M(_MklInput);N(_MklInput)|A->C;A->E:1;B->C:1;C->E;C:2->E:2;" - "D->E:3;M->C:2;N->C:3"); -} - ///////////////////////////////////////////////////////////////////// static void BM_MklLayoutRewritePass(int iters, int op_nodes) { diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 0c02876ac5..6749a7c571 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -610,6 +610,7 @@ class SymbolicShapeRefiner { } }; + // Compute the shape of the tensors outputed by node 'node' at output port // 'port_index' as the union of shape1 and shape2. 
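// For intuition, a stand-alone sketch of this union (illustrative only, not
// the shape_inference API): ranks must agree, and any dimension on which the
// two shapes disagree collapses to unknown, written here as -1:
//
//   std::vector<int64_t> UnionShapes(const std::vector<int64_t>& a,
//                                    const std::vector<int64_t>& b) {
//     if (a.size() != b.size()) return {};  // rank mismatch: unknown shape
//     std::vector<int64_t> out(a.size());
//     for (size_t i = 0; i < a.size(); ++i)
//       out[i] = (a[i] == b[i]) ? a[i] : -1;
//     return out;
//   }
//
// e.g. UnionShapes({4, 128}, {8, 128}) == {-1, 128}.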
ShapeHandle OutputAsUnion(const NodeDef* node, int port_index, ShapeHandle shape1, ShapeHandle shape2) { diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 8ca726df0b..1b18087cdf 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -679,7 +679,6 @@ cc_library( deps = [ ":constant_folding", ":graph_optimizer", - "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:graph_view", "//tensorflow/core/grappler:grappler_item", @@ -781,6 +780,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:scoped_allocator_ops_op_lib", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 03e36a7b9c..4dde7ed1b4 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/optimizers/constant_folding.h" #include "tensorflow/core/grappler/utils.h" -#include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace grappler { @@ -201,7 +200,8 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item, } } if (optimizable) { - VLOG(1) << "Optimizing fused batch norm node " << node.DebugString(); + VLOG(2) << "Optimizing fused batch norm node " << node.DebugString() + << std::endl; AddBatchNormNodes(optimized_graph, node); continue; } diff --git a/tensorflow/core/kernels/as_string_op.cc b/tensorflow/core/kernels/as_string_op.cc index a7757d1361..66c4aff3e3 100644 --- a/tensorflow/core/kernels/as_string_op.cc +++ b/tensorflow/core/kernels/as_string_op.cc @@ -73,7 +73,6 @@ class AsStringOp : public OpKernel { } switch (dtype) { case DT_INT8: - case DT_INT16: case DT_INT32: strings::Appendf(&format_, "d"); break; @@ -130,7 +129,6 @@ class AsStringOp : public OpKernel { ENCODE_TYPE(DT_FLOAT, float, format_); ENCODE_TYPE(DT_DOUBLE, double, format_); ENCODE_TYPE(DT_INT8, int8, format_); - ENCODE_TYPE(DT_INT16, int16, format_); case (DT_BOOL): { const auto& input_flat = input_tensor->flat(); for (int i = 0; i < input_flat.size(); ++i) { diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc index 49b90e855b..14d889e8e3 100644 --- a/tensorflow/core/kernels/cwise_op_clip.cc +++ b/tensorflow/core/kernels/cwise_op_clip.cc @@ -33,41 +33,52 @@ class ClipOp : public OpKernel { const Tensor& in0 = ctx->input(0); const Tensor& in1 = ctx->input(1); const Tensor& in2 = ctx->input(2); - OP_REQUIRES(ctx, (in0.shape() == in1.shape() || - TensorShapeUtils::IsScalar(in1.shape())) && - (in0.shape() == in2.shape() || - TensorShapeUtils::IsScalar(in2.shape())), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. 
", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); - - Tensor* out = nullptr; - OP_REQUIRES_OK( - ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); - if (out->NumElements() == 0) return; // Nothing to do for empty output auto in0_flat = in0.flat(); auto in1_flat = in1.flat(); auto in2_flat = in2.flat(); - auto out_flat = out->flat(); const Device& d = ctx->eigen_device(); + Tensor* out = nullptr; + OP_REQUIRES_OK( + ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); + auto out_flat = out->flat(); if (in1.shape() == in2.shape()) { if (in0.shape() == in1.shape()) { functor::TernaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()), + errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. ", + "input shape: ", in0.shape().DebugString(), + "clip_value_min shape: ", in1.shape().DebugString(), + "clip_value_max shape: ", in2.shape().DebugString())); functor::UnaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } } else { if (in0.shape() == in1.shape()) { + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()), + errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. ", + "input shape: ", in0.shape().DebugString(), + "clip_value_min shape: ", in1.shape().DebugString(), + "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryLeftClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { + OP_REQUIRES(ctx, + (in0.shape() == in2.shape() && + TensorShapeUtils::IsScalar(in1.shape())), + errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. 
", + "input shape: ", in0.shape().DebugString(), + "clip_value_min shape: ", in1.shape().DebugString(), + "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryRightClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc index 17a85d9773..9a3b2303a3 100644 --- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc @@ -57,7 +57,6 @@ struct DenseUpdate { template struct functor::DenseUpdate; \ template struct functor::DenseUpdate; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); -TF_CALL_int32(DEFINE_GPU_KERNELS); TF_CALL_int64(DEFINE_GPU_KERNELS); #undef DEFINE_GPU_KERNELS diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc index 5cd8e04927..e6fefe643b 100644 --- a/tensorflow/core/kernels/gather_functor.cc +++ b/tensorflow/core/kernels/gather_functor.cc @@ -37,7 +37,6 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) -TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc index 4563fc6353..39b6924d74 100644 --- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc @@ -31,7 +31,6 @@ typedef Eigen::GpuDevice GPUDevice; DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); -TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc index 4e53291b7f..7e5a9e1ec5 100644 --- a/tensorflow/core/kernels/gather_nd_op.cc +++ b/tensorflow/core/kernels/gather_nd_op.cc @@ -228,8 +228,6 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) -TF_CALL_int32(DECLARE_GPU_SPECS); -TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); @@ -241,8 +239,6 @@ TF_CALL_complex128(DECLARE_GPU_SPECS); // Registration of the GPU implementations. 
#define REGISTER_GATHER_ND_GPU(type) REGISTER_GATHER_ND_ALL_INDICES(GPU, type) -TF_CALL_int32(REGISTER_GATHER_ND_GPU); -TF_CALL_int64(REGISTER_GATHER_ND_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_ND_GPU); TF_CALL_complex64(REGISTER_GATHER_ND_GPU); TF_CALL_complex128(REGISTER_GATHER_ND_GPU); diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc index da8d2e9e3c..b03efc684f 100644 --- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc @@ -119,8 +119,6 @@ struct GatherNdSlice { DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); -TF_CALL_int32(DEFINE_GPU_SPECS); -TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index 094504d6b9..ef332ebee3 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -153,7 +153,6 @@ TF_CALL_uint64(REGISTER_GATHER_CPU); // Registration of the GPU implementations. #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type) -TF_CALL_int64(REGISTER_GATHER_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU); TF_CALL_complex64(REGISTER_GATHER_GPU); TF_CALL_complex128(REGISTER_GATHER_GPU); diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index 31d1b949ef..5eeb23d810 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -14,7 +14,6 @@ limitations under the License. #include #include -#include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" @@ -591,8 +590,8 @@ class MklConcatOp : public OpKernel { const int N = input_tensors.size(); // Get Tensor shapes. - std::vector mkl_input_shapes(N); - GetMklShapeList(context, "values", &mkl_input_shapes); + std::vector input_shapes(N); + GetMklShapeList(context, "values", &input_shapes); const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM) ? MklGetInput(context, 0) @@ -611,14 +610,19 @@ class MklConcatOp : public OpKernel { int i = 0; bool invoke_eigen = false; bool are_all_mkl_inputs = true, are_all_tf_inputs = true; - const TensorShape expected_shape = mkl_input_shapes[0].IsMklTensor() - ? mkl_input_shapes[0].GetTfShape() - : input_tensors[0].shape(); + const TensorShape expected_shape = input_shapes[0].IsMklTensor() + ? input_shapes[0].GetTfShape() + : input_tensors[0].shape(); size_t expected_dims = expected_shape.dims(); if (concat_dim < 0) concat_dim = expected_dims + concat_dim; - for (auto& s : mkl_input_shapes) { + for (auto& s : input_shapes) { + if (s == expected_shape) { + ++i; + continue; + } + TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); size_t s_dims = s_shape.dims(); @@ -661,14 +665,21 @@ class MklConcatOp : public OpKernel { // Call Eigen library if (invoke_eigen) { - CallEigenVersion(context, input_tensors, mkl_input_shapes); + TensorShapeList tf_input_shapes; + i = 0; + for (auto& s : input_shapes) { + TensorShape s_shape = + s.IsMklTensor() ? 
s.GetTfShape() : input_tensors[i].shape(); + tf_input_shapes.push_back(s_shape); + ++i; + } + CallEigenVersion(context, input_tensors, tf_input_shapes); return; } memory::dims dst_dims; - if (are_all_mkl_inputs) - dst_dims = TFShapeToMklDnnDims(mkl_input_shapes[0].GetTfShape()); + dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape()); else // When all the inputs are in Tensorflow format, we don't know // what is the input data format. In that case, we just use @@ -678,61 +689,26 @@ class MklConcatOp : public OpKernel { std::vector srcs_pd; std::vector> srcs(N, MklDnnData(&cpu_engine)); int64 dst_concat_dim_size = 0; - - bool isMklReorderNeeded = false; - memory::format mkl_common_format = memory::format::any; - if (are_all_mkl_inputs) { - mkl_common_format = - FindMklCommonFormat(mkl_input_shapes, concat_dim, - &isMklReorderNeeded, &dst_concat_dim_size); - - if (!isMklReorderNeeded) { - // All MKL tensors have a same format. Reorder is not needed. - for (int k = 0; k < N; k++) { - if (input_tensors[k].NumElements() == 0) - continue; - - auto src_md = mkl_input_shapes[k].GetMklLayout(); - srcs[k].SetUsrMem(src_md, &input_tensors[k]); - auto src_mpd = srcs[k].GetUsrMemPrimDesc(); - srcs_pd.push_back(src_mpd); - } - } else { - // MKL tensors have different formats. - // Reorder them to most common format. - for (int k = 0; k < N; k++) { - if (input_tensors[k].NumElements() == 0) - continue; - - auto src_dims = TFShapeToMklDnnDims( - mkl_input_shapes[k].GetTfShape()); - auto src_md = mkl_input_shapes[k].GetMklLayout(); - srcs[k].SetUsrMem(src_md, &input_tensors[k]); - - if (src_md.data.format != mkl_common_format) - src_md = memory::desc(src_dims, MklDnnType(), - mkl_common_format); - - srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine)); - } - } - } else { // All TF inputs - for (int k = 0; k < N; k++) { - if (input_tensors[k].NumElements() == 0) - continue; - - memory::dims src_dims = TFShapeToMklDnnDims(input_tensors[k].shape()); - dst_concat_dim_size += src_dims[concat_dim]; - - // It does not matter what data format to be used (NHWC versus NCHW). - // We just need to ensure that output uses same data format as inputs. - auto src_md = - memory::desc(src_dims, MklDnnType(), memory::format::nchw); - - srcs[k].SetUsrMem(src_md, &input_tensors[k]); - auto src_mpd = srcs[k].GetUsrMemPrimDesc(); - srcs_pd.push_back(src_mpd); - } + for (int k = 0; k < N; k++) { + bool is_mkl_tensor = input_shapes[k].IsMklTensor(); + memory::dims src_dims; + + // Same comment as dst_dims for src_dims. + src_dims = (is_mkl_tensor) + ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) + : TFShapeToMklDnnDims(input_tensors[k].shape()); + + dst_concat_dim_size += src_dims[concat_dim]; + auto src_md = + is_mkl_tensor ? input_shapes[k].GetMklLayout() : + // It does not matter what data format we use here + // (NHWC or NCHW). We just need to ensure that output + // of Concat uses same data format as input. + memory::desc(src_dims, MklDnnType(), memory::format::nchw); + + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + auto src_mpd = srcs[k].GetUsrMemPrimDesc(); + srcs_pd.push_back(src_mpd); } dst_dims[concat_dim] = dst_concat_dim_size; @@ -742,33 +718,25 @@ class MklConcatOp : public OpKernel { if (are_all_mkl_inputs) { // Since we are passing a specific format for destination, // we need to have dst_dims in MklDnn order (NCHW). 
- auto orig_tf_format = mkl_input_shapes[0].GetTfDataFormat(); + auto orig_tf_format = input_shapes[0].GetTfDataFormat(); dst_dims_in_nchw = MklDnnDimsInNCHW( dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format)); - // Set the output format same as the most common format of inputs - // to avoid layout conversions. + // We will set the output in the same format as input to avoid layout + // conversions. + // Currently we are setting dst format same as input format. + // See if we can make this choice in a better way. dst_md = memory::desc( - dst_dims_in_nchw, MklDnnType(), mkl_common_format); + dst_dims_in_nchw, MklDnnType(), + (memory::format)input_shapes[0].GetMklLayout().data.format); } else { - // All inputs are TF tensors. - // Set the output format same as input format (nchw). + // Again, format does not matter here. We just need to make it same as + // input format. dst_md = memory::desc(dst_dims, MklDnnType(), memory::format::nchw); } std::vector inputs; - std::vector net; - if (isMklReorderNeeded) { - for (int k = 0; k < input_tensors.size(); k++) { - if (input_tensors[k].NumElements() > 0) { - srcs[k].CheckReorderToOpMem(srcs_pd[k], &net); - } - } - } - for (int k = 0; k < input_tensors.size(); k++) { - if (input_tensors[k].NumElements() > 0) { - inputs.push_back(srcs[k].GetOpMem()); - } - } + for (int k = 0; k < input_tensors.size(); k++) + inputs.push_back(srcs[k].GetOpMem()); // If all inputs are in MKL format, then meaning of concat_dim needs to // change. Value of concat_dim is tied to input Tensorflow data format @@ -777,8 +745,7 @@ class MklConcatOp : public OpKernel { // But ifinput tensors are in NHWC order, then semantics need to change. // E.g., if we are concatinating over Channel (dimension 3 for NHWC), // then since MklDnn order is NCHW, concat_dim needs to be 1. 
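// A stand-alone sketch of that axis remap (an illustrative helper; the code
// below uses TfDimIdx for the general case):
//
//   int NhwcAxisToNchw(int axis) {
//     static const int kMap[4] = {0, 2, 3, 1};  // N->0, H->2, W->3, C->1
//     return kMap[axis];
//   }
//
// so concatenating over channels, axis 3 in NHWC, becomes axis 1 in NCHW.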
- if (are_all_mkl_inputs) - concat_dim = mkl_input_shapes[0].TfDimIdx(concat_dim); + if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim); auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd); @@ -791,7 +758,7 @@ class MklConcatOp : public OpKernel { dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType()); dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw, - mkl_input_shapes[0].GetTfDataFormat()); + input_shapes[0].GetTfDataFormat()); tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T))); } else { dnn_shape_dst.SetMklTensor(false); @@ -806,6 +773,7 @@ class MklConcatOp : public OpKernel { dst.SetUsrMem(dst_md, dst_tensor); auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); + std::vector net; net.push_back(concat_op); stream(stream::kind::eager).submit(net).wait(); } catch (mkldnn::error& e) { @@ -819,27 +787,15 @@ class MklConcatOp : public OpKernel { } void CallEigenVersion(OpKernelContext* context, const OpInputList& values, - const MklDnnShapeList& mkl_input_shapes) { - CHECK_EQ(values.size(), mkl_input_shapes.size()); + const TensorShapeList& input_shapes) { + CHECK_EQ(values.size(), input_shapes.size()); std::vector converted_values; - TensorShapeList tf_input_shapes; - for (int i = 0; i < mkl_input_shapes.size(); i++) { - if (mkl_input_shapes[i].IsMklTensor()) { - // do conversion from MKL to TF - Tensor tmp_tensor = - ConvertMklToTF(context, values[i], mkl_input_shapes[i]); - converted_values.push_back(tmp_tensor); - tf_input_shapes.push_back(mkl_input_shapes[i].GetTfShape()); - } else { - // no conversion since it is TF tensor already - converted_values.push_back(values[i]); - tf_input_shapes.push_back(values[i].shape()); - } - } + for (int i = 0; i < input_shapes.size(); i++) + converted_values.push_back(values[i]); // Call Eigen concat. - eigen_concat_op_.Compute(context, converted_values, tf_input_shapes); + eigen_concat_op_.Compute(context, converted_values, input_shapes); // Set output Mkl tensor for this op. MklDnnShape dnn_shape_output; @@ -856,55 +812,6 @@ class MklConcatOp : public OpKernel { output_tensor->flat().data(), output_tensor->flat().size() * sizeof(uint8)); } - - // This method finds the most commom format accross all MKL inputs - // Inputs: - // 1. input_shapes: shapes of input (MKL) tensors. - // 2. concat_dim: concat dimension. - // Outputs: - // 1. is_reorder_needed is set to true if inputs have difference formats - // It is set to false otherwise. - // 2. concat_dim_size is the size of concat_dim. - // Return: - // return the common MKL format. - memory::format FindMklCommonFormat(const MklDnnShapeList& input_shapes, - int concat_dim, bool* is_reorder_needed, int64* concat_dim_size) { - *is_reorder_needed = false; - *concat_dim_size = 0; - std::unordered_map occurrence_map; - if (input_shapes.size() == 0) - return memory::format::any; - - // Compute ocurrences of each format of all inputs. - for (int k=0; k ( - input_shapes[k].GetMklLayout().data.format); - occurrence_map[fmt] += 1; - } - - if (occurrence_map.size() == 1) { - // this means that all inputs have a same format - // return it with is_reorder_needed set false. - return static_cast( - input_shapes[0].GetMklLayout().data.format); - } - - // Input tensors have different formats. Thus, reorder is needed. - // We pick up the most common format to minimize the total - // number of input reorder. 
- memory::format commonest_format = memory::format::any; - int max_occurrence = 0; - *is_reorder_needed = true; - for (auto item : occurrence_map) { - if (item.second > max_occurrence) { - commonest_format = static_cast(item.first); - max_occurrence = item.second; - } - } - return commonest_format; - } }; #endif diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc index f857be6c32..c1da0ded1d 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc @@ -18,7 +18,6 @@ limitations under the License. // bias. #ifdef INTEL_MKL -#ifdef INTEL_MKL_ML #define USE_EIGEN_TENSOR #define EIGEN_USE_THREADS @@ -265,5 +264,4 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel { TF_CALL_float(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS } /* namespace tensorflow */ -#endif /* INTEL_MKL_ML */ #endif /* INTEL_MKL */ diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index c0dfed7d7d..279167aba2 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -199,15 +199,13 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { CHECK_NOTNULL(pool_params); CHECK_NOTNULL(dnn_data_input); TensorShape input_tensor_shape = input_tensor.shape(); - if (input_tensor.NumElements() != 0) { - memory::desc input_md = + memory::desc input_md = input_mkl_shape.IsMklTensor() ? input_mkl_shape.GetMklLayout() : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, this->data_format_tf_), MklDnnType(), this->data_format_mkldnn_); - dnn_data_input->SetUsrMem(input_md, &input_tensor); - } + dnn_data_input->SetUsrMem(input_md, &input_tensor); this->InitMklPoolParameters(context, pool_params, input_mkl_shape, input_tensor_shape); } diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index e1fc2ea128..43c5b29509 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -292,7 +292,6 @@ TF_CALL_string(REGISTER_SCATTER_ND_CPU); REGISTER_SCATTER_ND_UPDATE_GPU(type); \ REGISTER_SCATTER_ND_GPU(type); -TF_CALL_int32(REGISTER_SCATTER_ND_ALL_GPU); // TODO(b/66916790): Support half types in ScatterNd. TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ALL_GPU); TF_CALL_complex64(REGISTER_SCATTER_ND_ALL_GPU); @@ -307,8 +306,6 @@ TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU); #define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \ REGISTER_SCATTER_ND_UPDATE(type, SYCL); -TF_CALL_int32(REGISTER_SCATTER_ND_ADD_SUB_SYCL); -TF_CALL_int32(REGISTER_SCATTER_ND_UPDATE_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL); #undef REGISTER_SCATTER_ND_ADD_SUB_SYCL @@ -579,7 +576,6 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) -TF_CALL_int32(DECLARE_GPU_SPECS); // TODO(b/66916790): Support half types in ScatterNd. 
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc index 08b657f4c3..a3c21edc15 100644 --- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc @@ -170,7 +170,6 @@ struct ScatterNdFunctor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) -TF_CALL_int32(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc index 634f9ba887..bb0129fa6f 100644 --- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc +++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc @@ -216,13 +216,8 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) { } TEST_F(ScopedAllocatorConcatOpTest, Reshape) { - MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2); - - // The elements of the third parameter to ExecOp must be multiples of - // Allocator::kAllocatorAlignment in size. If they are not, the backing - // tensor allocated by PrepOp will have too many elements and reshaping - // will fail. - ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}}); + MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2); + ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}}); } TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) { diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index d65692a552..7796bf3587 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -16,14 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ - -// This file requires the following include because it uses CudaAtomicMax: -// #include "tensorflow/core/util/cuda_kernel_helper.h" - -// Unfortunately we can't add the #include, since it breaks compilation for -// non-GPU targets. This only breaks in clang, because it's more strict for -// template code and CudaAtomicMax is used in template context. - // This file requires the following include because it uses CudaAtomicMax: // #include "tensorflow/core/util/cuda_kernel_helper.h" @@ -138,4 +130,4 @@ struct Highest { } // namespace functor } // namespace tensorflow -#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index 866c5dcd52..a1f9667b78 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -1490,7 +1490,7 @@ inline void LibxsmmSparseMatMul::Compute( #endif // TENSORFLOW_USE_LIBXSMM -// Here is an overview of the SparseMatMul code. Note that we assume that the +// Here is a an overview of the SparseMatMul code. Note that we assume that the // left matrix is sparse. // // The matrix "left" is divided into a grid with blocksize of (M, KL). Each diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc index 26ab72f12e..4c2b312c34 100644 --- a/tensorflow/core/kernels/string_split_op.cc +++ b/tensorflow/core/kernels/string_split_op.cc @@ -22,7 +22,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { @@ -44,63 +43,6 @@ std::vector Split(const string& str, const string& delimiter, return char_vector; } -std::vector SplitV2(const string& str, StringPiece sep, int maxsplit) { - // This SplitV2 method matches the behavior of python's str.split: - // If sep is given, consecutive delimiters are not grouped together - // and are deemed to delimit empty strings (for example, '1,,2'.split(',') - // returns ['1', '', '2']). The sep argument may consist of multiple - // characters (for example, '1<>2<>3'.split('<>') returns ['1', '2', '3']). - // Splitting an empty string with a specified separator returns ['']. - // - // If sep is not specified or is None, a different splitting algorithm is - // applied: runs of consecutive whitespace are regarded as a single - // separator, and the result will contain no empty strings at the start or - // end if the string has leading or trailing whitespace. Consequently, - // splitting an empty string or a string consisting of just whitespace - // with a None separator returns []. - - std::vector result; - - StringPiece text(str); - if (maxsplit == 0) { - result.emplace_back(std::string(text)); - return result; - } - - if (sep.empty()) { - StringPiece token; - // Remove leading whitespaces. - str_util::RemoveLeadingWhitespace(&text); - int split = 0; - while (str_util::ConsumeNonWhitespace(&text, &token)) { - result.emplace_back(std::string(token)); - str_util::RemoveLeadingWhitespace(&text); - ++split; - if (maxsplit > 0 && split == maxsplit) { - result.emplace_back(std::string(text)); - return result; - } - } - return result; - } - auto p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); - int split = 0; - while (p != text.end()) { - StringPiece token = text.substr(0, p - text.begin()); - result.emplace_back(std::string(token)); - text.remove_prefix(token.size()); - text.remove_prefix(sep.size()); - ++split; - if (maxsplit > 0 && split == maxsplit) { - result.emplace_back(std::string(text)); - return result; - } - p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); - } - result.emplace_back(std::string(text)); - return result; -} - } // namespace class StringSplitOp : public OpKernel { @@ -180,78 +122,6 @@ class StringSplitOp : public OpKernel { bool skip_empty_; }; -class StringSplitV2Op : public OpKernel { - public: - explicit StringSplitV2Op(OpKernelConstruction* context) - : OpKernel(context), maxsplit_(-1) { - OP_REQUIRES_OK(context, context->GetAttr("maxsplit", &maxsplit_)); - } - - void Compute(OpKernelContext* ctx) override { - const Tensor* input_tensor; - OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor)); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input_tensor->shape()), - errors::InvalidArgument("input must be a vector, got shape: ", - input_tensor->shape().DebugString())); - - const auto input_vec = input_tensor->vec(); - const int64 batch_size = input_vec.dimension(0); - - const Tensor* sep_tensor; - OP_REQUIRES_OK(ctx, ctx->input("sep", &sep_tensor)); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sep_tensor->shape()), - errors::InvalidArgument("sep must be a scalar, got shape: ", - sep_tensor->shape().DebugString())); - const auto sep_vec = sep_tensor->flat(); - StringPiece sep(sep_vec(0)); - std::vector tokens; - // Guess that we'll be unpacking a 
handful of tokens per example. - static constexpr int kReserveSize = 4; - tokens.reserve(batch_size * kReserveSize); - - int64 output_size = 0; - int64 max_num_entries = 0; - std::vector num_indices(batch_size); - for (int64 i = 0; i < batch_size; ++i) { - std::vector parts = SplitV2(input_vec(i), sep, maxsplit_); - int64 n_entries = parts.size(); - num_indices[i] = n_entries; - output_size += n_entries; - max_num_entries = std::max(max_num_entries, n_entries); - tokens.insert(tokens.end(), parts.begin(), parts.end()); - } - - Tensor* sp_indices_t; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({output_size, 2}), - &sp_indices_t)); - Tensor* sp_tokens_t; - OP_REQUIRES_OK( - ctx, ctx->allocate_output(1, TensorShape({output_size}), &sp_tokens_t)); - Tensor* sp_shape_t; - OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({2}), &sp_shape_t)); - - auto sp_indices = sp_indices_t->matrix(); - auto sp_tokens = sp_tokens_t->vec(); - auto sp_shape = sp_shape_t->vec(); - sp_shape(0) = batch_size; - sp_shape(1) = max_num_entries; - size_t c = 0; - for (size_t i = 0; i < batch_size; ++i) { - for (size_t j = 0; j < num_indices[i]; ++j) { - sp_indices(c, 0) = i; - sp_indices(c, 1) = j; - sp_tokens(c) = tokens[c]; - ++c; - } - } - } - - private: - int maxsplit_; -}; - REGISTER_KERNEL_BUILDER(Name("StringSplit").Device(DEVICE_CPU), StringSplitOp); -REGISTER_KERNEL_BUILDER(Name("StringSplitV2").Device(DEVICE_CPU), - StringSplitV2Op); } // namespace tensorflow diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc index 6e589c8d1c..6e4d100b04 100644 --- a/tensorflow/core/ops/candidate_sampling_ops.cc +++ b/tensorflow/core/ops/candidate_sampling_ops.cc @@ -145,15 +145,12 @@ REGISTER_OP("ComputeAccidentalHits") int64 num_true; TF_RETURN_IF_ERROR(c->GetAttr("num_true", &num_true)); - // Validate true_classes, must be a matrix. + // Validate true_classes. ShapeHandle true_classes; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &true_classes)); DimensionHandle unused; TF_RETURN_IF_ERROR( c->WithValue(c->Dim(true_classes, 1), num_true, &unused)); - // Validate sampled_candidates, must be a vector. - ShapeHandle sampled_candidates; - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sampled_candidates)); // All three outputs are the same shape. ShapeHandle v = c->Vector(InferenceContext::kUnknownDim); diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 9dca5f53ce..15e0ca8af9 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -218,17 +218,7 @@ REGISTER_OP("MapAndBatchDataset") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn([](shape_inference::InferenceContext* c) { - // Use index from the end to retrieve the Input shapes, - // so that to avoid guessing the length of "other_arguments". - // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars. 
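(Editorial aside: the StringSplitV2 kernel removed above documents its SplitV2 helper as matching Python's str.split. Since those semantics are subtle, here is a short pure-Python refresher of the behavior the C++ code implemented; every output below is standard Python.)

```python
# Behavior the removed SplitV2 helper mirrored (plain str.split):
print('1,,2'.split(','))       # ['1', '', '2']  consecutive seps keep empties
print('1<>2<>3'.split('<>'))   # ['1', '2', '3'] multi-character separator
print('  a  b '.split())       # ['a', 'b']      sep=None groups whitespace
print(''.split())              # []              empty string, sep=None
print(''.split(','))           # ['']            empty string, explicit sep
print('a b c'.split(None, 1))  # ['a', 'b c']    maxsplit limits the splits
```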
- shape_inference::ShapeHandle unused; - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); - - return shape_inference::ScalarShape(c); - }); + .SetShapeFn(shape_inference::ScalarShape); REGISTER_OP("MapAndBatchDatasetV2") .Input("input_dataset: variant") @@ -241,17 +231,7 @@ REGISTER_OP("MapAndBatchDatasetV2") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn([](shape_inference::InferenceContext* c) { - // Use index from the end to retrieve the Input shapes, - // so that to avoid guessing the length of "other_arguments". - // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars. - shape_inference::ShapeHandle unused; - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); - - return shape_inference::ScalarShape(c); - }); + .SetShapeFn(shape_inference::ScalarShape); REGISTER_OP("PrefetchDataset") .Input("input_dataset: variant") diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index 87f4991134..d949e70c66 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -454,9 +454,7 @@ REGISTER_OP("DrawBoundingBoxes") DimensionHandle unused; TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused)); - // The rank of the input image (rank = 4) has already been restricted - // above, and the output is of the same shape as the input. - return shape_inference::UnchangedShape(c); + return shape_inference::UnchangedShapeWithRankAtLeast(c, 3); }); // -------------------------------------------------------------------------- diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index b3487122e2..1740fa152c 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1084,7 +1084,7 @@ REGISTER_OP("UnsortedSegmentProd") .Input("segment_ids: Tindices") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: numbertype") + .Attr("T: realnumbertype") .Attr("Tindices: {int32,int64}") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .SetShapeFn(UnsortedSegmentReductionShapeFn); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 41efa49ce3..fc60e807b9 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1453,7 +1453,6 @@ REGISTER_OP("QuantizedReluX") ShapeHandle unused; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); c->set_output(1, c->Scalar()); c->set_output(2, c->Scalar()); return Status::OK(); diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 4423062362..1d5c743a56 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -78,7 +78,7 @@ REGISTER_OP("ReduceJoin") REGISTER_OP("AsString") .Input("input: T") .Output("output: string") - .Attr("T: {int8, int16, int32, int64, complex64, float, double, bool}") + .Attr("T: {int32, int64, complex64, float, double, bool, int8}") .Attr("precision: int = -1") .Attr("scientific: bool = false") .Attr("shortest: bool = false") @@ -134,24 +134,6 
@@ REGISTER_OP("StringSplit") return Status::OK(); }); -REGISTER_OP("StringSplitV2") - .Input("input: string") - .Input("sep: string") - .Output("indices: int64") - .Output("values: string") - .Output("shape: int64") - .Attr("maxsplit: int = -1") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle unused; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); - - c->set_output(0, c->Matrix(InferenceContext::kUnknownDim, 2)); - c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); - c->set_output(2, c->Vector(2)); - return Status::OK(); - }); - REGISTER_OP("StringStrip") .Input("input: string") .Output("output: string") diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc index e9da3d8e32..99de364042 100644 --- a/tensorflow/core/platform/cpu_info.cc +++ b/tensorflow/core/platform/cpu_info.cc @@ -344,28 +344,5 @@ int CPUModelNum() { #endif } -int CPUIDNumSMT() { -#ifdef PLATFORM_IS_X86 - // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration - // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A) - // Section: Detecting Hardware Multi-threads Support and Topology - // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures - // Other cases not supported - uint32 eax, ebx, ecx, edx; - // Check if system supports Leaf 11 - GETCPUID(eax, ebx, ecx, edx, 0, 0); - if (eax >= 11) { - // 1) Leaf 11 available? CPUID.(EAX=11, ECX=0):EBX != 0 - // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11, - // ECX=0):ECX[15:8] is 1 - GETCPUID(eax, ebx, ecx, edx, 11, 0); - if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) { - return 1 << (eax & 0x1f); // 2 ^ SMT_Mask_Width - } - } -#endif // PLATFORM_IS_X86 - return 0; -} - } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h index 175c9ae8b1..b5be7e8b54 100644 --- a/tensorflow/core/platform/cpu_info.h +++ b/tensorflow/core/platform/cpu_info.h @@ -35,10 +35,6 @@ namespace port { // software can change it dynamically. int NumSchedulableCPUs(); -// Returns an estimate of the number of hyperthreads per physical core -// on the CPU -int NumHyperthreadsPerCore(); - // Mostly ISA related features that we care about enum CPUFeature { // Do not change numeric assignments. @@ -111,9 +107,6 @@ int CPUModelNum(); // Returns nominal core processor cycles per second of each processor. double NominalCPUFrequency(); -// Returns num of hyperthreads per physical core -int CPUIDNumSMT(); - } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index a319ccbdbe..ae81f9b5b3 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -71,8 +71,6 @@ def pyx_library( name = filename + "_cython_translation", srcs = [filename], outs = [filename.split(".")[0] + ".cpp"], - # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3 - # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH. 
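(Editorial aside on the CPUIDNumSMT removal above: that helper estimated hyperthreads per physical core from CPUID leaf 11 on x86. A rough userspace analogue, assuming the third-party psutil package, which TensorFlow itself does not use:)

```python
import psutil  # assumption: psutil is installed; illustrative only

logical = psutil.cpu_count(logical=True)    # hardware threads
physical = psutil.cpu_count(logical=False)  # physical cores
# Analogue of the removed CPUIDNumSMT()/NumHyperthreadsPerCore() pair.
ht_per_core = (logical // physical) if physical else 1
print(max(ht_per_core, 1))
```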
cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)", tools = ["@cython//:cython_binary"] + pxd_srcs, ) diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc index ff4b4436bb..72c12318ca 100644 --- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc +++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc @@ -115,17 +115,18 @@ class LibHDFS { const char* kLibHdfsDso = "libhdfs.so"; #endif char* hdfs_home = getenv("HADOOP_HDFS_HOME"); - if (hdfs_home != nullptr) { - string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); - status_ = TryLoadAndBind(path.c_str(), &handle_); - if (status_.ok()) { - return; - } + if (hdfs_home == nullptr) { + status_ = errors::FailedPrecondition( + "Environment variable HADOOP_HDFS_HOME not set"); + return; + } + string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); + status_ = TryLoadAndBind(path.c_str(), &handle_); + if (!status_.ok()) { + // try load libhdfs.so using dynamic loader's search path in case + // libhdfs.so is installed in non-standard location + status_ = TryLoadAndBind(kLibHdfsDso, &handle_); } - - // Try to load the library dynamically in case it has been installed - // to a in non-standard location. - status_ = TryLoadAndBind(kLibHdfsDso, &handle_); } Status status_; diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc index 708f32ba80..8e316472fe 100644 --- a/tensorflow/core/platform/posix/port.cc +++ b/tensorflow/core/platform/posix/port.cc @@ -74,11 +74,6 @@ int NumSchedulableCPUs() { return kDefaultCores; } -int NumHyperthreadsPerCore() { - static const int ht_per_core = tensorflow::port::CPUIDNumSMT(); - return (ht_per_core > 0) ? ht_per_core : 1; -} - void* AlignedMalloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) return memalign(minimum_alignment, size); diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index cb1fd09dbb..522a9d84fd 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -19,12 +19,12 @@ limitations under the License. // TensorFlow uses semantic versioning, see http://semver.org/. #define TF_MAJOR_VERSION 1 -#define TF_MINOR_VERSION 9 +#define TF_MINOR_VERSION 8 #define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "-rc0" +#define TF_VERSION_SUFFIX "" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 90b6533690..dffc965b14 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -42,7 +42,6 @@ limitations under the License. #ifndef INTEL_MKL_ML #include "mkldnn.hpp" -#include "tensorflow/core/lib/core/stringpiece.h" using mkldnn::engine; using mkldnn::memory; @@ -713,48 +712,15 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, return output_tensor; } #else -using mkldnn::stream; -template class MklDnnData; - template inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, const MklDnnShape& mkl_shape) { Tensor output_tensor; - try { - if (!mkl_shape.IsMklTensor()) - return mkl_tensor; // return input since it is already TF tensor - - TensorShape output_shape = mkl_shape.GetTfShape();; - - // Allocate output tensor. 
- context->allocate_temp(DataTypeToEnum::v(), - output_shape, &output_tensor); - - auto cpu_engine = engine(engine::cpu, 0); - MklDnnData input(&cpu_engine); - - // Get Mkl layout of input tensor. - auto input_mkl_md = mkl_shape.GetMklLayout(); - auto output_tf_md = mkl_shape.GetTfLayout(); - auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine); - input.SetUsrMem(input_mkl_md, &mkl_tensor); - - // reorder - if (input.IsReorderNeeded(output_tf_pd)) { - std::vector net; - CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net), - true); - stream(stream::kind::eager).submit(net).wait(); - } else { - // If not, just forward input tensor to output tensor. - CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape)); - } - } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); - LOG(FATAL) << "Operation received an exception: " << error_msg; - } + TensorShape output_shape; + + TF_CHECK_OK( + Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function")); + return output_tensor; } #endif @@ -1877,7 +1843,7 @@ class FactoryKeyCreator { template void AddAsKey(const T data) { auto buffer = reinterpret_cast(&data); - Append(StringPiece(buffer, sizeof(T))); + Append(absl::string_view(buffer, sizeof(T))); } std::string GetKey() { @@ -1888,8 +1854,8 @@ class FactoryKeyCreator { string key_; const char delimiter = 'x'; const int kMaxKeyLength = 256; - void Append(StringPiece s) { - key_.append(s.ToString()); + void Append(absl::string_view s) { + key_.append(string(s)); key_.append(1, delimiter); } }; diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md index 0b07d413da..d92f5775fa 100644 --- a/tensorflow/docs_src/community/groups.md +++ b/tensorflow/docs_src/community/groups.md @@ -1,38 +1,17 @@ # User Groups -TensorFlow has communities around the world. [Submit your community!](https://docs.google.com/forms/d/e/1FAIpQLSc_RQIUYtVgLLihzATaO_WUXkEyBDE_OoRoOXYDPmBEvHuEBA/viewform) +TensorFlow has communities around the world. 
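(Editorial aside on the mkl_util.h hunk above: FactoryKeyCreator builds a primitive-cache key by appending the raw bytes of each parameter, separated by an 'x' delimiter. A hedged Python analogue of that key-building pattern; the function name is invented for illustration.)

```python
import struct

def make_factory_key(*ints):
    # Pack each value as raw little-endian int64 bytes and join with the
    # same 'x' delimiter FactoryKeyCreator uses.
    return b'x'.join(struct.pack('<q', v) for v in ints)

print(make_factory_key(1, 224, 224, 3).hex())
```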
## Asia -* [TensorFlow China community](https://www.tensorflowers.cn) -* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) -* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) -* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) +* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_ +* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_ +* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_ * [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/) -* [TensorFlow Philippines Community](https://www.facebook.com/groups/TensorFlowPH/) -* [TensorFlow and Deep Learning Singapore](https://www.meetup.com/TensorFlow-and-Deep-Learning-Singapore/) -* [TensorFlow India](https://www.facebook.com/tensorflowindia) ## Europe * [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/) * [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/) -* [Tensorflow Belgium](https://www.meetup.com/TensorFlow-Belgium) -* [TensorFlow x Rome Meetup](https://www.meetup.com/it-IT/TensorFlow-x-Rome-Meetup) -* [TensorFlow London](https://www.meetup.com/TensorFlow-London/) -* [TensorFlow Edinburgh](https://www.meetup.com/tensorflow-edinburgh/) - -## America - -* [TensorFlow Buenos Aires](https://www.meetup.com/TensorFlow-Buenos-Aires/) - - -## Oceania -* [Melbourne TensorFlow Meetup](https://www.meetup.com/Melbourne-TensorFlow-Meetup) - - -## Africa - -* [TensorFlow Tunis Meetup](https://www.meetup.com/fr-FR/TensorFlow-Tunis-Meetup/) diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md index bbb25e20c6..f08ac74425 100644 --- a/tensorflow/docs_src/get_started/eager.md +++ b/tensorflow/docs_src/get_started/eager.md @@ -1,3 +1,3 @@ # Get Started with Eager Execution -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.9.0/samples/core/get_started/eager.ipynb) +[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.8.0/samples/core/get_started/eager.ipynb) diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md index 232d2f1547..55579d52fb 100644 --- a/tensorflow/docs_src/get_started/index.md +++ b/tensorflow/docs_src/get_started/index.md @@ -10,9 +10,9 @@ course prior to diving into TensorFlow documentation: TensorFlow is a tool for machine learning. While it contains a wide range of functionality, TensorFlow is mainly designed for deep neural network models. -The easiest way to get started with TensorFlow is by using Eager Execution. +The easiest way to get started with TensorFlow is using Eager Execution. - * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. + * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. TensorFlow provides many APIs. The remainder of this section focuses on the Estimator API which provide scalable, high-performance models. 
See the diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md index 2901848745..1abd840ab3 100644 --- a/tensorflow/docs_src/install/install_c.md +++ b/tensorflow/docs_src/install/install_c.md @@ -38,7 +38,7 @@ enable TensorFlow for C: OS="linux" # Change to "darwin" for macOS TARGET_DIRECTORY="/usr/local" curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md index 55bc0f64e7..52a2a3f8a6 100644 --- a/tensorflow/docs_src/install/install_go.md +++ b/tensorflow/docs_src/install/install_go.md @@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go: TF_TYPE="cpu" # Change to "gpu" for GPU support TARGET_DIRECTORY='/usr/local' curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.9.0-rc0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index 637231da12..1256fb99c4 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md @@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs: org.tensorflow tensorflow - 1.9.0-rc0 + 1.8.0 ``` @@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow: org.tensorflow tensorflow - 1.9.0-rc0 + 1.8.0 @@ -124,12 +124,12 @@ instead: org.tensorflow libtensorflow - 1.9.0-rc0 + 1.8.0 org.tensorflow libtensorflow_jni_gpu - 1.9.0-rc0 + 1.8.0 ``` @@ -148,7 +148,7 @@ refer to the simpler instructions above instead. Take the following steps to install TensorFlow for Java on Linux or macOS: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar), which is the TensorFlow Java Archive (JAR). 2. Decide whether you will run TensorFlow for Java on CPU(s) only or with @@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: OS=$(uname -s | tr '[:upper:]' '[:lower:]') mkdir -p ./jni curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" | tar -xz -C ./jni ### Install on Windows @@ -175,13 +175,13 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: Take the following steps to install TensorFlow for Java on Windows: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar), which is the TensorFlow Java Archive (JAR). 2. 
Download the following Java Native Interface (JNI) file appropriate for - [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.9.0-rc0.zip). + [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip). 3. Extract this .zip file. -__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. + ### Validate the installation @@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the downloaded `.jar` in your `classpath` by using the `-cp` compilation flag as follows: -
javac -cp libtensorflow-1.9.0-rc0.jar HelloTF.java
+
javac -cp libtensorflow-1.8.0.jar HelloTF.java
### Running @@ -241,11 +241,11 @@ two files are available to the JVM: For example, the following command line executes the `HelloTF` program on Linux and macOS X: -
java -cp libtensorflow-1.9.0-rc0.jar:. -Djava.library.path=./jni HelloTF
+
java -cp libtensorflow-1.8.0.jar:. -Djava.library.path=./jni HelloTF
And the following command line executes the `HelloTF` program on Windows: -
java -cp libtensorflow-1.9.0-rc0.jar;. -Djava.library.path=jni HelloTF
+
java -cp libtensorflow-1.8.0.jar;. -Djava.library.path=jni HelloTF
If the program prints Hello from version, you've successfully installed TensorFlow for Java and are ready to use the API. If the program diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index c8d706cf3c..0ed8160027 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -339,7 +339,9 @@ Docker will download the TensorFlow binary image the first time you launch it. #### GPU support -To launch a Docker container with NVidia GPU support, enter a command of the following format (this [does not require any local CUDA installation](https://github.com/nvidia/nvidia-docker/wiki/CUDA#requirements)): +Prior to installing TensorFlow with GPU support, ensure that your system meets all +[NVIDIA software requirements](#NVIDIARequirements). To launch a Docker container +with NVidia GPU support, enter a command of the following format:
 $ nvidia-docker run -it -p hostPort:containerPort TensorFlowGPUImage
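After launching such a container, a quick TensorFlow 1.x check that the GPU is actually visible (a minimal sketch; assumes a GPU-enabled TensorFlow build inside the container):

```python
import tensorflow as tf

# Should print True inside a working nvidia-docker container.
print(tf.test.is_gpu_available())
```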
@@ -436,7 +438,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      
      (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
## Validate your installation @@ -515,7 +517,7 @@ on your system: from source. To use the TensorFlow binaries, version 3.5 or higher is required. See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a list of supported GPU cards. -* [GPU drivers](http://nvidia.com/drivers) that support your version of the CUDA +* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA Toolkit. * The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This library provides advanced profiling support. To install this library, @@ -682,14 +684,14 @@ This section documents the relevant values for Linux installations. CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -701,14 +703,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp34-cp34m-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -720,14 +722,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp35-cp35m-linux_x86_64.whl
 
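(Editorial aside: as the URLs above show, only the cpXY and ABI tags vary across Python versions. A hypothetical helper that assembles the matching CPU wheel URL for the running interpreter, with the pattern inferred from the URLs in this section:)

```python
import sys

major, minor = sys.version_info[:2]
py_tag = 'cp{}{}'.format(major, minor)
abi_tag = 'none' if major == 2 else py_tag + 'm'  # cp27-none vs cp3X-cp3Xm
url = ('https://storage.googleapis.com/tensorflow/linux/cpu/'
       'tensorflow-1.8.0-{}-{}-linux_x86_64.whl'.format(py_tag, abi_tag))
print(url)
```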
@@ -739,14 +741,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp36-cp36m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp36-cp36m-linux_x86_64.whl
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md index 9d01271c5a..29a867a9e3 100644 --- a/tensorflow/docs_src/install/install_mac.md +++ b/tensorflow/docs_src/install/install_mac.md @@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv: TensorFlow in the active Virtualenv is as follows:
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl If you encounter installation problems, see [Common Installation Problems](#common-installation-problems). @@ -242,7 +242,7 @@ take the following steps: issue the following command:
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl 
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl If the preceding command fails, see [installation problems](#common-installation-problems). @@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: TensorFlow for Python 2.7:
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl @@ -522,7 +522,7 @@ The value you specify depends on your Python version.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
 
@@ -530,5 +530,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-a
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
 
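Whichever wheel is installed, the usual TensorFlow 1.x smoke test applies; this mirrors the validation step the install guides describe:

```python
import tensorflow as tf

hello = tf.constant('Hello, TensorFlow!')
with tf.Session() as sess:
    print(sess.run(hello))
print(tf.VERSION)  # expect 1.8.0 after this change
```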
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index dc6c1e36fc..5ba522b436 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -81,7 +81,7 @@ or [macOS](#PrepareMac) - + ## Prepare environment for Linux Before building TensorFlow on Linux, install the following build @@ -328,10 +328,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl` file depends on your platform. For example, the following command will install the pip package -for TensorFlow 1.9.0rc0 on Linux: +for TensorFlow 1.8.0 on Linux:
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-py2-none-any.whl
 
## Validate your installation @@ -373,9 +373,9 @@ The build and installation problems you encounter typically depend on the operating system. See the "Common installation problems" section of one of the following guides: - * @{$install_linux#common_installation_problems$Installing TensorFlow on Linux} - * @{$install_mac#common_installation_problems$Installing TensorFlow on Mac OS} - * @{$install_windows#common_installation_problems$Installing TensorFlow on Windows} + * @{$install_linux#CommonInstallationProblems$Installing TensorFlow on Linux} + * @{$install_mac#CommonInstallationProblems$Installing TensorFlow on Mac OS} + * @{$install_windows#CommonInstallationProblems$Installing TensorFlow on Windows} Beyond the errors documented in those two guides, the following table notes additional errors specific to building TensorFlow. Note that we @@ -433,8 +433,6 @@ Stack Overflow and specify the `tensorflow` tag. **Linux** - - @@ -458,7 +456,6 @@ Stack Overflow and specify the `tensorflow` tag. **Mac**
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.11.0 | N/A | N/A
tensorflow_gpu-1.9.0 | GPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.11.0 | 7 | 9
tensorflow-1.8.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.10.0 | N/A | N/A
tensorflow_gpu-1.8.0 | GPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.9.0 | 7 | 9
tensorflow-1.7.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.10.0 | N/A | N/A
- @@ -475,8 +472,6 @@ Stack Overflow and specify the `tensorflow` tag. **Windows**
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.11.0 | N/A | N/A
tensorflow-1.8.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.10.1 | N/A | N/A
tensorflow-1.7.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.10.1 | N/A | N/A
tensorflow-1.6.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.8.1 | N/A | N/A
- - diff --git a/tensorflow/docs_src/mobile/linking_libs.md b/tensorflow/docs_src/mobile/linking_libs.md index efef5dd0da..cf0db59021 100644 --- a/tensorflow/docs_src/mobile/linking_libs.md +++ b/tensorflow/docs_src/mobile/linking_libs.md @@ -27,7 +27,7 @@ called `libandroid_tensorflow_inference_java.jar`. There are three ways to include this functionality in your program: 1. Include the jcenter AAR which contains it, as in this - [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/tfmobile/build.gradle#L59-L65) + [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/build.gradle#L59-L65) 2. Download the nightly precompiled version from [ci.tensorflow.org](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/). diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md index 2b84dbb973..8b22c04d87 100644 --- a/tensorflow/docs_src/mobile/prepare_models.md +++ b/tensorflow/docs_src/mobile/prepare_models.md @@ -105,8 +105,8 @@ inline constants so everything’s in one file. To handle the conversion, you need the `freeze_graph.py` script, that’s held in [`tensorflow/python/tools/freeze_graph.py`](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py). You’ll run it like this: - bazel build tensorflow/python/tools:freeze_graph - bazel-bin/tensorflow/python/tools/freeze_graph \ + bazel build tensorflow/tools:freeze_graph + bazel-bin/tensorflow/tools/freeze_graph \ --input_graph=/tmp/model/my_graph.pb \ --input_checkpoint=/tmp/model/model.ckpt-1000 \ --output_graph=/tmp/frozen_graph.pb \ diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md index c97f74139c..2fea02d861 100644 --- a/tensorflow/docs_src/performance/quantization.md +++ b/tensorflow/docs_src/performance/quantization.md @@ -227,8 +227,8 @@ of 30.0f, and an 8-bit array, the quantized values represent the following:
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.9.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
tensorflow_gpu-1.9.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9
tensorflow-1.8.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
tensorflow_gpu-1.8.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9
tensorflow-1.7.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A
- +
Quantized | Float
0 | -10.0
128 | 10.0
255 | 30.0
128 | 10.0
Table 2: Example quantized value range diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md index b13b47184d..c4aae1d9d6 100644 --- a/tensorflow/docs_src/programmers_guide/estimators.md +++ b/tensorflow/docs_src/programmers_guide/estimators.md @@ -21,17 +21,18 @@ Note: TensorFlow also includes a deprecated `Estimator` class at Estimators provide the following benefits: -* You can run Estimator-based models on a local host or on a +* You can run Estimators-based models on a local host or on a distributed multi-server environment without changing your model. - Furthermore, you can run Estimator-based models on CPUs, GPUs, + Furthermore, you can run Estimators-based models on CPUs, GPUs, or TPUs without recoding your model. * Estimators simplify sharing implementations between model developers. -* You can develop a state of the art model with high-level intuitive code. +* You can develop a state of the art model with high-level intuitive code, In short, it is generally much easier to create models with Estimators than with the low-level TensorFlow APIs. -* Estimators are themselves built on @{tf.layers}, which +* Estimators are themselves built on tf.layers, which simplifies customization. -* Estimators build the graph for you. +* Estimators build the graph for you. In other words, you don't have to + build the graph. * Estimators provide a safe distributed training loop that controls how and when to: * build the graph @@ -56,7 +57,7 @@ the "plumbing" for you. That is, pre-made Estimators create and manage pre-made Estimators let you experiment with different model architectures by making only minimal code changes. @{tf.estimator.DNNClassifier$`DNNClassifier`}, for example, is a pre-made Estimator class that trains classification models -based on dense, feed-forward neural networks. +through dense, feed-forward neural networks. ### Structure of a pre-made Estimators program @@ -78,7 +79,7 @@ of the following four steps: an input function: def input_fn(dataset): - ... # manipulate dataset, extracting the feature dict and the label + ... # manipulate dataset, extracting feature names and the label return feature_dict, label (See @{$programmers_guide/datasets} for full details.) @@ -95,13 +96,13 @@ of the following four steps: population = tf.feature_column.numeric_column('population') crime_rate = tf.feature_column.numeric_column('crime_rate') median_education = tf.feature_column.numeric_column('median_education', - normalizer_fn=lambda x: x - global_education_mean) + normalizer_fn='lambda x: x - global_education_mean') 3. **Instantiate the relevant pre-made Estimator.** For example, here's a sample instantiation of a pre-made Estimator named `LinearClassifier`: # Instantiate an estimator, passing the feature columns. - estimator = tf.estimator.LinearClassifier( + estimator = tf.estimator.Estimator.LinearClassifier( feature_columns=[population, crime_rate, median_education], ) diff --git a/tensorflow/docs_src/programmers_guide/feature_columns.md b/tensorflow/docs_src/programmers_guide/feature_columns.md index 90f5c53a17..845194fe0e 100644 --- a/tensorflow/docs_src/programmers_guide/feature_columns.md +++ b/tensorflow/docs_src/programmers_guide/feature_columns.md @@ -528,10 +528,10 @@ suggested by the following snippet: categorical_column = ... # Create any categorical column # Represent the categorical column as an embedding column. -# This means creating an embedding vector lookup table with one element for each category. 
+# This means creating a one-hot vector with one element for each category. embedding_column = tf.feature_column.embedding_column( categorical_column=categorical_column, - dimension=embedding_dimensions) + dimension=dimension_of_embedding_vector) ``` @{$programmers_guide/embedding$Embeddings} is a significant topic within machine diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py index 86f5204ec3..03e60972aa 100644 --- a/tensorflow/examples/learn/iris.py +++ b/tensorflow/examples/learn/iris.py @@ -21,8 +21,7 @@ from __future__ import division from __future__ import print_function import os - -from six.moves.urllib.request import urlretrieve +import urllib import tensorflow as tf @@ -39,7 +38,9 @@ FEATURE_KEYS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] def maybe_download_iris_data(file_name, download_url): """Downloads the file and returns the number of data.""" if not os.path.exists(file_name): - urlretrieve(download_url, file_name) + raw = urllib.urlopen(download_url).read() + with open(file_name, 'w') as f: + f.write(raw) # The first line is a comma-separated string. The first one is the number of # total data in the file. diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc index 9b171f66ec..debd95fc62 100644 --- a/tensorflow/java/src/gen/cc/op_generator.cc +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -376,6 +376,9 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, } } // op annotations + op_class.add_annotation( + Annotation::Create("Generated", "javax.annotation") + .attributes("value = \"TensorFlow Java Op Generator\"")); if (endpoint.deprecated()) { op_class.add_annotation(Annotation::Create("Deprecated")); string explanation; @@ -412,12 +415,8 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, SourceFileWriter writer(op_file.get()); std::list dependencies; CollectOpDependencies(op, mode, &dependencies); - writer.Write(kLicense) - .EndLine() - .Write("// This class has been generated, DO NOT EDIT!") - .EndLine() - .EndLine() - .BeginType(op_class, PUBLIC | FINAL, &dependencies, &op_javadoc); + writer.Write(kLicense).EndLine().BeginType(op_class, PUBLIC | FINAL, + &dependencies, &op_javadoc); if (!op.optional_attributes().empty()) { RenderOptionsClass(op, op_class, &writer); } diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc index 941ab2699c..181fd4c5e3 100644 --- a/tensorflow/java/src/gen/cc/op_specs.cc +++ b/tensorflow/java/src/gen/cc/op_specs.cc @@ -96,7 +96,6 @@ Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) { *iterable_out = true; visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int())); } - Type type = Type::Wildcard(); if (arg_def.type() != DataType::DT_INVALID) { // resolve type from DataType diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index bd97b181ff..b2e6c60021 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -196,11 +196,11 @@ def implicit_val_and_grad(f): # TODO(cais): Remove calls to tf.constant() once the gradients functions # accept lists and np.ndarrays. 
- def grad_fn(*args, **kwds): + def grad_fn(*args): """Computes the gradient of the wrapped function.""" this_tape = tape.push_new_tape() try: - end_node = f(*args, **kwds) + end_node = f(*args) if end_node is None: raise ValueError("Cannot differentiate a function that returns None; " "did you forget to return a value from {}?".format( diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD index 20522098b0..9cd17e0407 100644 --- a/tensorflow/python/estimator/BUILD +++ b/tensorflow/python/estimator/BUILD @@ -978,10 +978,7 @@ py_test( size = "large", srcs = ["keras_test.py"], srcs_version = "PY2AND3", - tags = [ - "no_windows", - "notsan", - ], + tags = ["notsan"], deps = [ ":keras", "//tensorflow/core:protos_all_py", diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py index b18212cfcd..7cdf840c97 100644 --- a/tensorflow/python/estimator/exporter.py +++ b/tensorflow/python/estimator/exporter.py @@ -156,7 +156,7 @@ def _loss_smaller(best_eval_result, current_eval_result): return best_eval_result[default_key] > current_eval_result[default_key] -def _verify_compare_fn_args(compare_fn): +def _verify_compre_fn_args(compare_fn): """Verifies compare_fn arguments.""" args = set(util.fn_args(compare_fn)) if 'best_eval_result' not in args: @@ -265,7 +265,7 @@ class BestExporter(Exporter): self._compare_fn = compare_fn if self._compare_fn is None: raise ValueError('`compare_fn` must not be None.') - _verify_compare_fn_args(self._compare_fn) + _verify_compre_fn_args(self._compare_fn) self._saved_model_exporter = _SavedModelExporter( name, serving_input_receiver_fn, assets_extra, as_text) diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py index a6cefdece2..035c7c148c 100644 --- a/tensorflow/python/estimator/inputs/numpy_io.py +++ b/tensorflow/python/estimator/inputs/numpy_io.py @@ -136,13 +136,11 @@ def numpy_input_fn(x, values in `x` have same shape). ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict. ValueError: if x or y is an empty dict. - TypeError: `x` is not a dict or array. - ValueError: if 'shuffle' is not provided or a bool. + TypeError: `x` is not a dict or array, or if `shuffle` is not bool. """ if not isinstance(shuffle, bool): - raise ValueError('shuffle must be provided and explicitly set as boolean ' - '(it is recommended to set it as True for training); ' - 'got {}'.format(shuffle)) + raise TypeError('shuffle must be explicitly set as boolean; ' + 'got {}'.format(shuffle)) def input_fn(): """Numpy input function.""" diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py index 81b201cc5c..92d057e25d 100644 --- a/tensorflow/python/estimator/inputs/numpy_io_test.py +++ b/tensorflow/python/estimator/inputs/numpy_io_test.py @@ -286,9 +286,8 @@ class NumpyIoTest(test.TestCase): x = np.arange(32, 36) y = np.arange(4) with self.test_session(): - with self.assertRaisesRegexp(ValueError, - 'shuffle must be provided and explicitly ' - 'set as boolean'): + with self.assertRaisesRegexp(TypeError, + 'shuffle must be explicitly set as boolean'): # Default shuffle is None. 
numpy_io.numpy_input_fn(x, y) diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py index 57f8e5fd6a..938e244fb3 100644 --- a/tensorflow/python/estimator/inputs/pandas_io.py +++ b/tensorflow/python/estimator/inputs/pandas_io.py @@ -68,16 +68,15 @@ def pandas_input_fn(x, Raises: ValueError: if `x` already contains a column with the same name as `y`, or if the indexes of `x` and `y` don't match. - ValueError: if 'shuffle' is not provided or a bool. + TypeError: `shuffle` is not bool. """ if not HAS_PANDAS: raise TypeError( 'pandas_input_fn should not be called without pandas installed') if not isinstance(shuffle, bool): - raise ValueError('shuffle must be provided and explicitly set as boolean ' - '(it is recommended to set it as True for training); ' - 'got {}'.format(shuffle)) + raise TypeError('shuffle must be explicitly set as boolean; ' + 'got {}'.format(shuffle)) x = x.copy() if y is not None: diff --git a/tensorflow/python/estimator/inputs/pandas_io_test.py b/tensorflow/python/estimator/inputs/pandas_io_test.py index dcecf6dd61..e5912a3b28 100644 --- a/tensorflow/python/estimator/inputs/pandas_io_test.py +++ b/tensorflow/python/estimator/inputs/pandas_io_test.py @@ -70,9 +70,8 @@ class PandasIoTest(test.TestCase): return x, _ = self.makeTestDataFrame() y_noindex = pd.Series(np.arange(-32, -28)) - with self.assertRaisesRegexp(ValueError, - 'shuffle must be provided and explicitly ' - 'set as boolean'): + with self.assertRaisesRegexp(TypeError, + 'shuffle must be explicitly set as boolean'): # Default shuffle is None pandas_io.pandas_input_fn(x, y_noindex) diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py index 51a61adb21..8e2ec83020 100644 --- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py +++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py @@ -250,7 +250,7 @@ class _PandasFeedFn(object): num_epochs=None): if len(placeholders) != len(dataframe.columns) + 1: raise ValueError("Expected {} placeholders; got {}.".format( - len(dataframe.columns) + 1, len(placeholders))) + len(dataframe.columns), len(placeholders))) self._index_placeholder = placeholders[0] self._col_placeholders = placeholders[1:] self._dataframe = dataframe diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py index 2f439f765e..c80af08fba 100644 --- a/tensorflow/python/estimator/keras.py +++ b/tensorflow/python/estimator/keras.py @@ -70,7 +70,7 @@ def _convert_tensor(x): return x -def _any_variable_initialized(): +def _any_variable_initalized(): """Check if any variable has been initialized in the Keras model. Returns: @@ -511,7 +511,7 @@ def model_to_estimator(keras_model=None, keras_model_fn, model_dir=model_dir, config=config) # Check if we need to call get_weights: - if _any_variable_initialized(): + if _any_variable_initalized(): keras_weights = keras_model.get_weights() # Warn if config passed to estimator tries to update GPUOptions. 
If a # session has already been created, the GPUOptions passed to the first diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py index 5e094ae92b..6688a84130 100644 --- a/tensorflow/python/estimator/keras_test.py +++ b/tensorflow/python/estimator/keras_test.py @@ -31,10 +31,10 @@ from tensorflow.python.estimator import run_config as run_config_lib from tensorflow.python.estimator.inputs import numpy_io from tensorflow.python.framework import ops from tensorflow.python.framework import test_util +from tensorflow.python.keras import backend as K from tensorflow.python.keras import testing_utils from tensorflow.python.keras.applications import mobilenet from tensorflow.python.keras.optimizers import SGD -from tensorflow.python.ops.parsing_ops import gen_parsing_ops from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache @@ -146,13 +146,13 @@ def randomize_io_type(array, name): def multi_inputs_multi_outputs_model(): a = keras.layers.Input(shape=(16,), name='input_a') b = keras.layers.Input(shape=(16,), name='input_b') - m = keras.layers.Input(shape=(8,), dtype='string', name='input_m') + m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m') dense = keras.layers.Dense(8, name='dense_1') a_2 = dense(a) - # Read m - m_2 = keras.layers.Lambda(gen_parsing_ops.string_to_number)(m) - s_2 = keras.layers.Lambda(lambda k: k[0] * k[1])([m_2, a_2]) + # Apply a mask + s_2 = keras.layers.Lambda(lambda k: + K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2]) b_2 = dense(b) merged = keras.layers.concatenate([s_2, b_2], name='merge') c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged) @@ -372,13 +372,13 @@ class TestKerasEstimator(test_util.TensorFlowTestCase): def train_input_fn(): input_dict = {'input_a': a_train, 'input_b': b_train, - 'input_m': input_m_train.astype(np.str)} + 'input_m': input_m_train > 0} output_dict = {'dense_2': c_train, 'dense_3': d_train} return input_dict, output_dict def eval_input_fn(): input_dict = {'input_a': a_test, 'input_b': b_test, - 'input_m': input_m_test.astype(np.str)} + 'input_m': input_m_test > 0} output_dict = {'dense_2': c_test, 'dense_3': d_test} return input_dict, output_dict diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py index af5d709f7e..2d6925d1a8 100644 --- a/tensorflow/python/grappler/layout_optimizer_test.py +++ b/tensorflow/python/grappler/layout_optimizer_test.py @@ -1389,7 +1389,7 @@ class LayoutOptimizerTest(test.TestCase): expected_num_transposes = 3 self.assertEqual(expected_num_transposes, num_transposes) self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes) - self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes) + self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes) self.assertAllClose(output_val_ref, output_val, atol=1e-3) def testLoopWithVecAnd4D(self): @@ -1413,7 +1413,7 @@ class LayoutOptimizerTest(test.TestCase): expected_num_transposes = 2 self.assertEqual(expected_num_transposes, num_transposes) self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes) - self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes) + self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes) self.assertAllClose(output_val_ref, output_val, atol=1e-3) def testBinaryOpSecondPort(self): diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 
f608dea430..e487f583be 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -93,8 +93,6 @@ def selu(x): - To be used together with the initialization "lecun_normal". - To be used together with the dropout variant "AlphaDropout". - References: - - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) """ alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 9f91368e5b..70b6a8431a 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -724,6 +724,15 @@ class TensorBoard(Callback): for weight in layer.weights: mapped_weight_name = weight.name.replace(':', '_') tf_summary.histogram(mapped_weight_name, weight) + if self.write_grads: + grads = model.optimizer.get_gradients(model.total_loss, weight) + + def is_indexed_slices(grad): + return type(grad).__name__ == 'IndexedSlices' + + grads = [grad.values if is_indexed_slices(grad) else grad + for grad in grads] + tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) if self.write_images: w_img = array_ops.squeeze(weight) shape = K.int_shape(w_img) @@ -750,18 +759,6 @@ class TensorBoard(Callback): assert len(shape) == 4 and shape[-1] in [1, 3, 4] tf_summary.image(mapped_weight_name, w_img) - if self.write_grads: - for weight in layer.trainable_weights: - mapped_weight_name = weight.name.replace(':', '_') - grads = model.optimizer.get_gradients(model.total_loss, weight) - - def is_indexed_slices(grad): - return type(grad).__name__ == 'IndexedSlices' - - grads = [grad.values if is_indexed_slices(grad) else grad - for grad in grads] - tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) - if hasattr(layer, 'output'): tf_summary.histogram('{}_out'.format(layer.name), layer.output) self.merged = tf_summary.merge_all() diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 5062a26580..b355f4a269 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -653,8 +653,6 @@ class KerasCallbacksTest(test.TestCase): model.add( keras.layers.Dense( NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu')) - # non_trainable_weights: moving_variance, moving_mean - model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax')) model.compile( loss='categorical_crossentropy', diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index 1c9135982e..a4cd017d60 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -123,7 +123,7 @@ class Network(base_layer.Layer): # Entries are unique. Includes input and output layers. 
self._layers = [] - # Used in symbolic mode only, only in conjunction with graph-networks + # Used in symbolic mode only, only in conjunction with graph-networks self._outbound_nodes = [] self._inbound_nodes = [] diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py index 7e82db028b..6a94986b9c 100644 --- a/tensorflow/python/keras/engine/saving_test.py +++ b/tensorflow/python/keras/engine/saving_test.py @@ -482,7 +482,7 @@ class TestWholeModelSaving(test.TestCase): with h5py.File(fname, 'r') as h5file: num_names_arrays = len([attr for attr in h5file['model_weights'].attrs if attr.startswith('layer_names')]) - # The chunking of layer names array should have happened. + # The chunking of layer names array should have happened. self.assertGreater(num_names_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) @@ -527,7 +527,7 @@ class TestWholeModelSaving(test.TestCase): num_weight_arrays = len( [attr for attr in h5file['model_weights']['nested_model'].attrs if attr.startswith('weight_names')]) - # The chunking of layer names array should have happened. + # The chunking of layer names array should have happened. self.assertGreater(num_weight_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index fce6cbdb7a..89c1f1a40f 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -24,7 +24,6 @@ import numpy as np from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.eager import context -from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util @@ -410,13 +409,11 @@ class Model(Network): else: if sample_weight_mode == 'temporal': sample_weights.append(array_ops.placeholder_with_default( - constant_op.constant([[1.]], dtype=K.floatx()), - shape=[None, None], name=name + '_sample_weights')) + [[1.]], shape=[None, None], name=name + '_sample_weights')) sample_weight_modes.append('temporal') else: sample_weights.append(array_ops.placeholder_with_default( - constant_op.constant([1.], dtype=K.floatx()), - shape=[None], name=name + '_sample_weights')) + [1.], shape=[None], name=name + '_sample_weights')) sample_weight_modes.append(None) self.sample_weight_modes = sample_weight_modes self._feed_sample_weight_modes = [] diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index e8838cd3bc..2ecbff3a1c 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -732,7 +732,7 @@ def slice_arrays(arrays, indices, contiguous=True): """Slices batches out of provided arrays (workaround for eager tensors). Unfortunately eager tensors don't have the same slicing behavior as - Numpy arrays (they follow the same slicing behavior as symbolic TF tensors), + Numpy arrays (they follow the same slicing behavior as symbolic TF tensors), hence we cannot use `generic_utils.slice_arrays` directly and we have to implement this workaround based on `concat`. This has a performance cost.
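Reduced to its essentials, the concat-based workaround this docstring describes looks roughly like the following sketch (not the actual implementation; `t` is an eager tensor and `indices` a Python list of row positions):

    import tensorflow as tf

    def slice_batch(t, indices, contiguous=True):
        if contiguous:
            # A plain range slice works on eager tensors too.
            return t[indices[0]:indices[-1] + 1]
        # Eager tensors reject numpy-style fancy indexing such as t[indices],
        # so cut out single rows and concatenate them back together.
        return tf.concat([t[i:i + 1] for i in indices], axis=0)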
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py index c519e194bd..a54d6da839 100644 --- a/tensorflow/python/keras/initializers_test.py +++ b/tensorflow/python/keras/initializers_test.py @@ -71,7 +71,7 @@ class KerasInitializersTest(test.TestCase): stddev=1, seed=126), tensor_shape, - target_mean=0., target_max=2, target_min=-2) + target_mean=0., target_std=None, target_max=2) def test_constant(self): tensor_shape = (5, 6, 4) @@ -83,49 +83,49 @@ class KerasInitializersTest(test.TestCase): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - std = np.sqrt(1. / fan_in) + scale = np.sqrt(3. / fan_in) self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_max=scale, target_min=-scale) def test_glorot_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - std = np.sqrt(2. / (fan_in + fan_out)) + scale = np.sqrt(6. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_max=scale, target_min=-scale) def test_he_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - std = np.sqrt(2. / fan_in) + scale = np.sqrt(6. / fan_in) self._runner(keras.initializers.he_uniform(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_max=scale, target_min=-scale) def test_lecun_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - std = np.sqrt(1. / fan_in) + scale = np.sqrt(1. / fan_in) self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_std=None, target_max=2 * scale) def test_glorot_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - std = np.sqrt(2. / (fan_in + fan_out)) + scale = np.sqrt(2. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_std=None, target_max=2 * scale) def test_he_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - std = np.sqrt(2. / fan_in) + scale = np.sqrt(2. 
/ fan_in) self._runner(keras.initializers.he_normal(seed=123), tensor_shape, - target_mean=0., target_std=std) + target_mean=0., target_std=None, target_max=2 * scale) def test_orthogonal(self): tensor_shape = (20, 20) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index f60064ed63..5061825d38 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -19,9 +19,7 @@ from __future__ import division from __future__ import print_function import copy -import sys import types as python_types -import warnings import numpy as np @@ -716,7 +714,6 @@ class Lambda(Layer): return self.mask def get_config(self): - module = self.function.__module__ if isinstance(self.function, python_types.LambdaType): function = generic_utils.func_dump(self.function) function_type = 'lambda' @@ -724,26 +721,21 @@ class Lambda(Layer): function = self.function.__name__ function_type = 'function' - output_shape_module = None if isinstance(self._output_shape, python_types.LambdaType): output_shape = generic_utils.func_dump(self._output_shape) output_shape_type = 'lambda' - output_shape_module = self._output_shape.__module__ elif callable(self._output_shape): output_shape = self._output_shape.__name__ output_shape_type = 'function' - output_shape_module = self._output_shape.__module__ else: output_shape = self._output_shape output_shape_type = 'raw' config = { 'function': function, - 'module': module, 'function_type': function_type, 'output_shape': output_shape, 'output_shape_type': output_shape_type, - 'output_shape_module': output_shape_module, 'arguments': self.arguments } base_config = super(Lambda, self).get_config() @@ -753,16 +745,8 @@ class Lambda(Layer): def from_config(cls, config, custom_objects=None): config = config.copy() globs = globals() - module = config.pop('module', None) - if module in sys.modules: - globs.update(sys.modules[module].__dict__) - elif module is not None: - # Note: we don't know the name of the function if it's a lambda. - warnings.warn('{} is not loaded, but a Lambda layer uses it. ' - 'It may cause errors.'.format(module) - , UserWarning) if custom_objects: - globs.update(custom_objects) + globs = dict(list(globs.items()) + list(custom_objects.items())) function_type = config.pop('function_type') if function_type == 'function': # Simple lookup in custom objects @@ -776,14 +760,6 @@ class Lambda(Layer): else: raise TypeError('Unknown function type:', function_type) - output_shape_module = config.pop('output_shape_module', None) - if output_shape_module in sys.modules: - globs.update(sys.modules[output_shape_module].__dict__) - elif output_shape_module is not None: - # Note: we don't know the name of the function if it's a lambda. - warnings.warn('{} is not loaded, but a Lambda layer uses it. 
' - 'It may cause errors.'.format(output_shape_module) - , UserWarning) output_shape_type = config.pop('output_shape_type') if output_shape_type == 'function': # Simple lookup in custom objects diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py index e6e45902a8..c616d8f24f 100644 --- a/tensorflow/python/keras/models_test.py +++ b/tensorflow/python/keras/models_test.py @@ -144,19 +144,5 @@ class CheckpointingTests(test.TestCase): model.load_weights(save_prefix) self.assertEqual(12., self.evaluate(beta1_power)) -class TestModelBackend(test.TestCase): - - def test_model_backend_float64_use_cases(self): - # Test case for GitHub issue 19318 - floatx = keras.backend.floatx() - keras.backend.set_floatx('float64') - - x = keras.Input((5,)) - y = keras.layers.Dense(1)(x) - model = keras.models.Model(x, y) - model.compile('rmsprop', 'mse') - - keras.backend.set_floatx(floatx) - if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py index 94ed8ebd31..9d54add264 100644 --- a/tensorflow/python/kernel_tests/as_string_op_test.py +++ b/tensorflow/python/kernel_tests/as_string_op_test.py @@ -130,16 +130,6 @@ class AsStringOpTest(test.TestCase): result = output.eval(feed_dict={input_: int_inputs_}) self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) - def testHalfInt(self): - s = lambda strs: [x.decode("ascii") for x in strs] - - with self.test_session(): - input_ = array_ops.placeholder(dtypes.int16) - int_inputs_ = [np.iinfo(np.int16).min, np.iinfo(np.int16).max] - output = string_ops.as_string(input_) - result = output.eval(feed_dict={input_: int_inputs_}) - self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) - def testBool(self): bool_inputs_ = [False, True] s = lambda strs: [x.decode("ascii") for x in strs] diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py index 16fdedac41..08b03f8518 100644 --- a/tensorflow/python/kernel_tests/betainc_op_test.py +++ b/tensorflow/python/kernel_tests/betainc_op_test.py @@ -172,7 +172,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [gx_s.shape], tf_gout_t, gx_s.shape) - tf_logging.info("betainc gradient err = %g " % err) + print("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) # Test broadcast gradient @@ -181,7 +181,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [()], tf_gout_t, ga_s.shape) - tf_logging.info("betainc gradient err = %g " % err) + print("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py index fb52d10475..e08123b041 100644 --- a/tensorflow/python/kernel_tests/clip_ops_test.py +++ b/tensorflow/python/kernel_tests/clip_ops_test.py @@ -18,12 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np - from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gradient_checker from 
tensorflow.python.platform import test @@ -417,16 +414,6 @@ class ClipTest(test.TestCase): self.assertAllClose(np_ans, tf_ans) - def testClipByValueEmptyTensor(self): - # Test case for GitHub issue 19337 - zero = array_ops.placeholder(dtype=dtypes.float32, shape=None) - x = clip_ops.clip_by_value(zero, zero, zero) - y = clip_ops.clip_by_value(zero, 1.0, 1.0) - z = clip_ops.clip_by_value(zero, zero, 1.0) - w = clip_ops.clip_by_value(zero, 1.0, zero) - with self.test_session(use_gpu=True) as sess: - sess.run([x, y, z, w], feed_dict={zero: np.zeros((7, 0))}) - if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 80ba7dafc9..8699fd5b25 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -312,8 +312,8 @@ class Conv2DTest(test.TestCase): expected_values = self.evaluate(expected_results) computed_values = self.evaluate(computed_results) for e_value, c_value in zip(expected_values, computed_values): - tf_logging.info("expected = ", e_value) - tf_logging.info("actual = ", c_value) + print("expected = ", e_value) + print("actual = ", c_value) self.assertAllClose( e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4) @@ -337,8 +337,8 @@ class Conv2DTest(test.TestCase): for i in range(len(tensors)): conv = tensors[i] value = values[i] - tf_logging.info("expected = ", expected) - tf_logging.info("actual = ", value) + print("expected = ", expected) + print("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -547,8 +547,8 @@ class Conv2DTest(test.TestCase): # "values" consists of two tensors for two backprops value = self.evaluate(conv) self.assertShapeEqual(value, conv) - tf_logging.info("expected = ", expected) - tf_logging.info("actual = ", value) + print("expected = ", expected) + print("actual = ", value) self.assertArrayNear(expected, value.flatten(), err) def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes, @@ -723,8 +723,8 @@ class Conv2DTest(test.TestCase): data_format=data_format) value = self.evaluate(conv) self.assertShapeEqual(value, conv) - tf_logging.info("expected = ", expected) - tf_logging.info("actual = ", value) + print("expected = ", expected) + print("actual = ", value) self.assertArrayNear(expected, value.flatten(), 1e-5) def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes, @@ -912,8 +912,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - tf_logging.info("expected = ", value_2) - tf_logging.info("actual = ", value) + print("expected = ", value_2) + print("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) # Testing for backprops @@ -965,8 +965,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - tf_logging.info("expected = ", value_2) - tf_logging.info("actual = ", value) + print("expected = ", value_2) + print("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self): @@ -1178,7 +1178,7 @@ class Conv2DTest(test.TestCase): # since fp16 numerical gradients are too imprecise. 
err = np.fabs(jacob_t - reference_jacob_t).max() - tf_logging.info("conv_2d gradient error = ", err) + print("conv_2d gradient error = ", err) self.assertLess(err, 0.002) def testInputGradientValidPaddingStrideOne(self): @@ -1546,7 +1546,7 @@ class DepthwiseConv2DTest(test.TestCase): conv = nn_impl.depthwise_conv2d( t1, t2, strides=[1, stride, stride, 1], padding=padding) value = sess.run(conv) - tf_logging.info("value = ", value) + print("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1668,7 +1668,7 @@ class SeparableConv2DTest(test.TestCase): conv = array_ops.transpose(conv, [0, 2, 3, 1]) value = sess.run(conv) - tf_logging.info("value = ", value) + print("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1826,7 +1826,7 @@ class Conv2DBenchmark(test.Benchmark): wall_time = time.time() - start self.report_benchmark( name="conv_stack_iter_%d" % iter_index, wall_time=wall_time) - tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) + print("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) def GetInceptionFwdTest(input_size, filter_size, stride, padding, diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py index 58e2a8ac2a..91ebe8de99 100644 --- a/tensorflow/python/kernel_tests/gather_nd_op_test.py +++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py @@ -197,21 +197,7 @@ class GatherNdTest(test.TestCase): self.assertEqual(None, shape.ndims) self.assertEqual(None, shape[0].value) - def testBadIndicesCPU(self): - with self.test_session(use_gpu=False): - params = [0, 1, 2] - indices = [[[0], [7]]] # Make this one higher rank - gather_nd = array_ops.gather_nd(params, indices) - with self.assertRaisesOpError( - r"flat indices\[1, :\] = \[7\] does not index into param " - r"\(shape: \[3\]\)"): - gather_nd.eval() - - def _disabledTestBadIndicesGPU(self): - # TODO disabled due to different behavior on GPU and CPU - # On GPU the bad indices do not raise error but fetch 0 values - if not test.is_gpu_available(): - return + def testBadIndices(self): with self.test_session(use_gpu=True): params = [0, 1, 2] indices = [[[0], [7]]] # Make this one higher rank @@ -221,21 +207,7 @@ class GatherNdTest(test.TestCase): r"\(shape: \[3\]\)"): gather_nd.eval() - def testBadIndicesWithSlicesCPU(self): - with self.test_session(use_gpu=False): - params = [[0, 1, 2]] - indices = [[[0], [0], [1]]] # Make this one higher rank - gather_nd = array_ops.gather_nd(params, indices) - with self.assertRaisesOpError( - r"flat indices\[2, :\] = \[1\] does not index into param " - r"\(shape: \[1,3\]\)"): - gather_nd.eval() - - def _disabledTestBadIndicesWithSlicesGPU(self): - # TODO disabled due to different behavior on GPU and CPU - # On GPU the bad indices do not raise error but fetch 0 values - if not test.is_gpu_available(): - return + def testBadIndicesWithSlices(self): with self.test_session(use_gpu=True): params = [[0, 1, 2]] indices = [[[0], [0], [1]]] # Make this one higher rank diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py index 033fa95935..a2fcd751df 100644 --- a/tensorflow/python/kernel_tests/gather_op_test.py +++ b/tensorflow/python/kernel_tests/gather_op_test.py @@ -27,8 +27,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.platform import test -_TEST_TYPES = 
(dtypes.int64, dtypes.float32, - dtypes.complex64, dtypes.complex128) +_TEST_TYPES = (dtypes.float32, dtypes.complex64, dtypes.complex128) class GatherTest(test.TestCase): @@ -123,9 +122,6 @@ class GatherTest(test.TestCase): gather, [tf_params, tf_indices, tf_axis], gather_grad) self.assertEqual(indices_grad, None) self.assertEqual(axis_grad, None) - if dtype.is_integer: - self.assertEqual(params_grad, None) - continue # For axis 0, we are able to create an efficient IndexedSlices for # the gradient. if axis == 0: @@ -181,19 +177,7 @@ class GatherTest(test.TestCase): gather_t = array_ops.gather(params, indices, axis=axis) self.assertEqual(None, gather_t.shape) - def testBadIndicesCPU(self): - with self.test_session(use_gpu=False): - params = [[0, 1, 2], [3, 4, 5]] - with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): - array_ops.gather(params, [[7]], axis=0).eval() - with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"): - array_ops.gather(params, [[7]], axis=1).eval() - - def _disabledTestBadIndicesGPU(self): - # TODO disabled due to different behavior on GPU and CPU - # On GPU the bad indices do not raise error but fetch 0 values - if not test.is_gpu_available(): - return + def testBadIndices(self): with self.test_session(use_gpu=True): params = [[0, 1, 2], [3, 4, 5]] with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py index 795aa67248..a9b55854f1 100644 --- a/tensorflow/python/kernel_tests/init_ops_test.py +++ b/tensorflow/python/kernel_tests/init_ops_test.py @@ -362,33 +362,6 @@ class UniformUnitScalingInitializationTest(test.TestCase): dtype=dtypes.string) -class VarianceScalingInitializationTest(test.TestCase): - - def testNormalDistribution(self): - shape = [100, 100] - expect_mean = 0. - expect_var = 1. / shape[0] - init = init_ops.variance_scaling_initializer(distribution='normal') - - with self.test_session(use_gpu=True): - x = init(shape).eval() - - self.assertNear(np.mean(x), expect_mean, err=1e-2) - self.assertNear(np.var(x), expect_var, err=1e-2) - - def testUniformDistribution(self): - shape = [100, 100] - expect_mean = 0. - expect_var = 1. / shape[0] - init = init_ops.variance_scaling_initializer(distribution='uniform') - - with self.test_session(use_gpu=True): - x = init(shape).eval() - - self.assertNear(np.mean(x), expect_mean, err=1e-2) - self.assertNear(np.var(x), expect_var, err=1e-2) - - # TODO(vrv): move to sequence_ops_test? 
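The VarianceScalingInitializationTest removed above asserted that draws from variance_scaling_initializer have mean close to 0 and variance close to 1 / fan_in. A back-of-the-envelope numpy check of that property (mimicking the +/-2-stddev truncation of tf.truncated_normal, which shrinks the variance to roughly 0.774 / fan_in and is the reason for the 0.87962566... stddev correction removed from init_ops.py further below):

    import numpy as np

    fan_in = 100
    stddev = np.sqrt(1.0 / fan_in)
    rng = np.random.RandomState(0)
    x = rng.randn(4 * fan_in * fan_in) * stddev
    x = x[np.abs(x) < 2 * stddev]  # truncate like tf.truncated_normal
    print(x.mean(), x.var())       # mean ~ 0, var ~ 0.774 / fan_in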
class RangeTest(test.TestCase): diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index e95c729715..a0c372db7d 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -947,7 +947,7 @@ class PoolingTest(test.TestCase): output_sizes, x_init_value=x_init_value, delta=1e-2) - tf_logging.info("%s gradient error = " % func_name, err) + print("%s gradient error = " % func_name, err) self.assertLess(err, err_tolerance) def _ConstructAndTestSecondGradient(self, @@ -1024,7 +1024,7 @@ class PoolingTest(test.TestCase): input_sizes, x_init_value=x_init_value, delta=1e-2) - tf_logging.info("%s second-order gradient error = " % func_name, err) + print("%s second-order gradient error = " % func_name, err) self.assertLess(err, err_tolerance) def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu): diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py index 253e43920b..677253946e 100644 --- a/tensorflow/python/kernel_tests/py_func_test.py +++ b/tensorflow/python/kernel_tests/py_func_test.py @@ -19,7 +19,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import gc import re import numpy as np @@ -435,29 +434,13 @@ class PyFuncTest(test.TestCase): # ----- Tests shared by py_func and eager_py_func ----- def testCleanup(self): - # Delete everything created by previous tests to avoid side effects. - ops.reset_default_graph() - gc.collect() - initial_size = script_ops._py_funcs.size() - # Encapsulate the graph generation, so locals can be deleted. - def make_graphs(): - for _ in xrange(1000): - g = ops.Graph() - with g.as_default(): - c = constant_op.constant([1.], dtypes.float32) - _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32]) - _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32]) - # These ops have a reference to 'c' which has a reference to the graph. - # Checks if the functions are being deleted though the graph is referenced from them. - # (see #18292) - _ = script_ops.py_func(lambda x: x + c.shape[0], [c], [dtypes.float32]) - _ = script_ops.eager_py_func(lambda x: x + c.shape[0], [c], [dtypes.float32]) - - # Call garbage collector to enforce deletion. 
- make_graphs() - ops.reset_default_graph() - gc.collect() - self.assertEqual(initial_size, script_ops._py_funcs.size()) + for _ in xrange(1000): + g = ops.Graph() + with g.as_default(): + c = constant_op.constant([1.], dtypes.float32) + _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32]) + _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32]) + self.assertLess(script_ops._py_funcs.size(), 100) # ----- Tests for eager_py_func ----- @test_util.run_in_graph_and_eager_modes() diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py index faa4b49a8d..79fe927b8a 100644 --- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py @@ -144,9 +144,7 @@ class StatefulScatterNdTest(test.TestCase): self.assertAllClose(new, ref_var.eval()) def _VariableRankTests(self, np_scatter, tf_scatter): - for vtype in (np.int32, - np.float32, np.float64, - np.complex64, np.complex128): + for vtype in (np.float32, np.float64, np.complex64, np.complex128): for itype in (np.int32, np.int64): self._VariableRankTest(np_scatter, tf_scatter, vtype, itype) @@ -223,7 +221,7 @@ class StatefulScatterNdTest(test.TestCase): # self._VariableRankTests(_NumpyDiv, state_ops.scatter_nd_div) def _ScatterRepeatIndicesTest(self, np_scatter, tf_scatter): - for vtype in (np.int32, np.float32, np.float64): + for vtype in (np.float32, np.float64): for itype in (np.int32, np.int64): self._VariableRankTest( np_scatter, tf_scatter, vtype, itype, repeat_indices=True) diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py index 1a0fa744ae..c70a4ffce7 100644 --- a/tensorflow/python/kernel_tests/scatter_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_ops_test.py @@ -159,13 +159,7 @@ class ScatterTest(test.TestCase): # Clips small values to avoid division by zero. 
def clip_small_values(x): - threshold = 1e-4 - sign = np.sign(x) - - if isinstance(x, np.int32): - threshold = 1 - sign = np.random.choice([-1, 1]) - return threshold * sign if np.abs(x) < threshold else x + return 1e-4 * np.sign(x) if np.abs(x) < 1e-4 else x updates = np.vectorize(clip_small_values)(updates) old = _AsType(np.random.randn(*((first_dim,) + extra_shape)), vtype) @@ -187,11 +181,7 @@ class ScatterTest(test.TestCase): tf_scatter, repeat_indices=False, updates_are_scalar=False): - vtypes = [np.float32, np.float64] - if tf_scatter != state_ops.scatter_div: - vtypes.append(np.int32) - - for vtype in vtypes: + for vtype in (np.float32, np.float64): for itype in (np.int32, np.int64): self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices, updates_are_scalar) diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py index a82855dfeb..794be096b7 100644 --- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py @@ -264,9 +264,7 @@ class UnsortedSegmentTest(SegmentReductionHelper): # A subset of ops has been enabled for complex numbers self.complex_ops_list = [(np.add, None, - math_ops.unsorted_segment_sum, lambda t: 0), - (np.ndarray.__mul__, None, - math_ops.unsorted_segment_prod, lambda t: 1)] + math_ops.unsorted_segment_sum, lambda t: 0)] self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64] self.all_dtypes = (self.differentiable_dtypes + diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py index e20daccb28..a5bd1b6ee0 100644 --- a/tensorflow/python/kernel_tests/string_split_op_test.py +++ b/tensorflow/python/kernel_tests/string_split_op_test.py @@ -146,101 +146,5 @@ class StringSplitOpTest(test.TestCase): self.assertAllEqual(shape, [3, 1]) -class StringSplitV2OpTest(test.TestCase): - - def testSplitV2(self): - strings = ["pigs on the wing", "animals"] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings) - indices, values, shape = sess.run(tokens) - self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]]) - self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"]) - self.assertAllEqual(shape, [2, 4]) - - def testSplitV2MultiCharSeparator(self): - # Match Python behavior: - # >>> '1<>2<>3'.split('<>') - # ['1', '2', '3'] - # >>> "<><>4<>5<><>6<>".split("<>") - # ['', '', '4', '5', '', '6', ''] - strings = ["1<>2<>3", "<><>4<>5<><>6<>"] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings, sep="<>") - indices, values, shape = sess.run(tokens) - self.assertAllEqual( - indices, [[0, 0], [0, 1], [0, 2], - [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]]) - self.assertAllEqual(values, [b"1", b"2", b"3", - b"", b"", b"4", b"5", b"", b"6", b""]) - self.assertAllEqual(shape, [2, 7]) - - def testSplitV2SimpleSeparator(self): - # Match Python behavior: - # >>> '1,2,3'.split(',') - # ['1', '2', '3'] - # >>> '1,2,,3,'.split(',') - # ['1', '2', '', '3', ''] - strings = ["1,2,3", "4,5,,6,"] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings, sep=',') - indices, values, shape = sess.run(tokens) - self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], - [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]]) - self.assertAllEqual(values, [b"1", b"2", b"3", - b"4", b"5", b"", b"6", b""]) - self.assertAllEqual(shape, 
[2, 5]) - - def testSplitV2EmptySeparator(self): - # Match Python behavior: - # >>> '1 2 3'.split() - # ['1', '2', '3'] - #>>> ' 1 2 3 '.split() - #['1', '2', '3'] - strings = ["1 2 3", " 4 5 6 "] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings) - indices, values, shape = sess.run(tokens) - self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], - [1, 0], [1, 1], [1, 2]]) - self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"]) - self.assertAllEqual(shape, [2, 3]) - - def testSplitV2SimpleSeparatorMaxSplit(self): - # Match Python behavior: - # >>> '1,2,3'.split(',', maxsplit=1) - # ['1', '2,3'] - # >>> '4,5,,6,'.split(',', maxsplit=1) - # ['4', '5,,6,'] - strings = ["1,2,3", "4,5,,6,"] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1) - indices, values, shape = sess.run(tokens) - self.assertAllEqual(indices, [[0, 0], [0, 1], - [1, 0], [1, 1]]) - self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"]) - self.assertAllEqual(shape, [2, 2]) - - def testSplitV2EmptySeparatorMaxSplit(self): - # Match Python behavior: - # '1 2 3'.split(maxsplit=1) - # ['1', '2 3'] - # >>> " 4 5 6 ".split(maxsplit=1) - # ['4', '5 6 '] - strings = ["1 2 3", " 4 5 6 "] - - with self.test_session() as sess: - tokens = string_ops.string_split_v2(strings, maxsplit=1) - indices, values, shape = sess.run(tokens) - self.assertAllEqual(indices, [[0, 0], [0, 1], - [1, 0], [1, 1]]) - self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5 6 "]) - self.assertAllEqual(shape, [2, 2]) - - if __name__ == "__main__": test.main() diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index fae63b1132..8129334703 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -2619,10 +2619,6 @@ reverse.__doc__ = gen_array_ops.reverse_v2.__doc__ # pylint: disable=redefined-builtin @tf_export("reverse_sequence") -@deprecation.deprecated_args( - None, "seq_dim is deprecated, use seq_axis instead", "seq_dim") -@deprecation.deprecated_args( - None, "batch_dim is deprecated, use batch_axis instead", "batch_dim") def reverse_sequence(input, seq_lengths, seq_axis=None, diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py index 94c8d79335..12afcd0b51 100644 --- a/tensorflow/python/ops/gradient_checker.py +++ b/tensorflow/python/ops/gradient_checker.py @@ -283,10 +283,10 @@ def compute_gradient(x, numbers. 
For example, if `x` is complex with shape `[m]` and `y` is complex with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with - J[::2, ::2] = d(Re y)/d(Re x) - J[::2, 1::2] = d(Im y)/d(Re x) - J[1::2, ::2] = d(Re y)/d(Im x) - J[1::2, 1::2] = d(Im y)/d(Im x) + J[:m, :n] = d(Re y)/d(Re x) + J[:m, n:] = d(Im y)/d(Re x) + J[m:, :n] = d(Re y)/d(Im x) + J[m:, n:] = d(Im y)/d(Im x) Args: x: a tensor or list of tensors diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index f27d9224c1..bdcf420980 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -28,7 +28,6 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_image_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops @@ -259,14 +258,14 @@ def random_flip_up_down(image, seed=None): dimension, which is `height`. Otherwise output the image as-is. Args: - image: 4-D Tensor of shape `[batch, height, width, channels]` or - 3-D Tensor of shape `[height, width, channels]`. + image: A 3-D tensor of shape `[height, width, channels]`. seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A tensor of the same type and shape as `image`. + A 3-D tensor of the same type and shape as `image`. + Raises: ValueError: if the shape of `image` is not supported. """ @@ -281,14 +280,13 @@ def random_flip_left_right(image, seed=None): second dimension, which is `width`. Otherwise output the image as-is. Args: - image: 4-D Tensor of shape `[batch, height, width, channels]` or - 3-D Tensor of shape `[height, width, channels]`. + image: A 3-D tensor of shape `[height, width, channels]`. seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A tensor of the same type and shape as `image`. + A 3-D tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` is not supported. @@ -299,8 +297,7 @@ def _random_flip(image, flip_index, seed, scope_name): """Randomly (50% chance) flip an image along axis `flip_index`. Args: - image: 4-D Tensor of shape `[batch, height, width, channels]` or - 3-D Tensor of shape `[height, width, channels]`. + image: A 3-D tensor of shape `[height, width, channels]`. flip_index: The dimension along which to flip the image. Vertical: 0, Horizontal: 1 seed: A Python integer. Used to create a random seed. See @@ -309,37 +306,22 @@ def _random_flip(image, flip_index, seed, scope_name): scope_name: Name of the scope in which the ops are added. Returns: - A tensor of the same type and shape as `image`. + A 3-D tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` is not supported.
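The _random_flip body restored in the hunk below reduces to the following sketch: one uniform draw decides, via tf.cond, whether the whole 3-D image is reversed along `flip_index` (0 for up/down, 1 for left/right):

    import tensorflow as tf

    def random_flip_sketch(image, flip_index, seed=None):
        image = tf.convert_to_tensor(image, name='image')
        uniform_random = tf.random_uniform([], 0, 1.0, seed=seed)
        mirror_cond = tf.less(uniform_random, .5)
        # Only one branch of tf.cond runs per session step, so the whole
        # image is either flipped or left as-is.
        return tf.cond(mirror_cond,
                       lambda: tf.reverse(image, [flip_index]),
                       lambda: image)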
""" with ops.name_scope(None, scope_name, [image]) as scope: image = ops.convert_to_tensor(image, name='image') - image = _AssertAtLeast3DImage(image) - shape = image.get_shape() - if shape.ndims == 3 or shape.ndims is None: - uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) - mirror_cond = math_ops.less(uniform_random, .5) - result = control_flow_ops.cond( - mirror_cond, - lambda: array_ops.reverse(image, [flip_index]), - lambda: image, - name=scope - ) - return fix_image_flip_shape(image, result) - elif shape.ndims == 4: - uniform_random = random_ops.random_uniform( - [array_ops.shape(image)[0]], 0, 1.0, seed=seed - ) - mirror_cond = math_ops.less(uniform_random, .5) - return array_ops.where( - mirror_cond, - image, - functional_ops.map_fn(lambda x: array_ops.reverse(x, [flip_index]), image, dtype=image.dtype) - ) - else: - raise ValueError('\'image\' must have either 3 or 4 dimensions.') + image = _Assert3DImage(image) + uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) + mirror_cond = math_ops.less(uniform_random, .5) + result = control_flow_ops.cond( + mirror_cond, + lambda: array_ops.reverse(image, [flip_index]), + lambda: image, + name=scope) + return fix_image_flip_shape(image, result) @tf_export('image.flip_left_right') @@ -1652,13 +1634,13 @@ def is_jpeg(contents, name=None): @tf_export('image.decode_image') -def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): +def decode_image(contents, channels=None, name=None): """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`, and `decode_png`. Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the - appropriate operation to convert the input bytes `string` into a `Tensor` - of type `dtype`. + appropriate operation to convert the input bytes `string` into a `Tensor` of + type `uint8`. Note: `decode_gif` returns a 4-D array `[num_frames, height, width, 3]`, as opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D @@ -1670,11 +1652,10 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): contents: 0-D `string`. The encoded image bytes. channels: An optional `int`. Defaults to `0`. Number of color channels for the decoded image. - dtype: The desired DType of the returned `Tensor`. name: A name for the operation (optional) Returns: - `Tensor` with type `dtype` and shape `[height, width, num_channels]` for + `Tensor` with type `uint8` with shape `[height, width, num_channels]` for BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for GIF images. 
@@ -1698,7 +1679,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding BMP images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_decode, assert_channels]): - return convert_image_dtype(gen_image_ops.decode_bmp(contents), dtype) + return gen_image_ops.decode_bmp(contents) def _gif(): # Create assert to make sure that channels is not set to 1 @@ -1711,7 +1692,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return convert_image_dtype(gen_image_ops.decode_gif(contents), dtype) + return gen_image_ops.decode_gif(contents) def check_gif(): # Create assert op to check that bytes are GIF decodable @@ -1720,11 +1701,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): def _png(): """Decodes a PNG image.""" - return convert_image_dtype( - gen_image_ops.decode_png(contents, channels, - dtype=dtypes.uint8 - if dtype == dtypes.uint8 - else dtypes.uint16), dtype) + return gen_image_ops.decode_png(contents, channels) def check_png(): """Checks if an image is PNG.""" @@ -1740,8 +1717,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): 'images') assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return convert_image_dtype( - gen_image_ops.decode_jpeg(contents, channels), dtype) + return gen_image_ops.decode_jpeg(contents, channels) # Decode normal JPEG images (start with \xff\xd8\xff\xe0) # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1). @@ -1902,7 +1878,7 @@ def sample_distorted_bounding_box(image_size, width / height within this range. area_range: An optional list of `floats`. Defaults to `[0.05, 1]`. The cropped area of the image must contain a fraction of the - supplied image within this range. + supplied image within this range. max_attempts: An optional `int`. Defaults to `100`. Number of attempts at generating a cropped region of the image of the specified constraints.
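The decode_image hunks above drop the `dtype` argument; callers who need a non-uint8 result can compose the decode with convert_image_dtype themselves, for example (hypothetical file path, PNG case only):

    import tensorflow as tf

    contents = tf.read_file('lena_rgba.png')
    # decode_png emits uint8 or uint16; convert_image_dtype then rescales
    # the values into the float32 range [0, 1).
    image = tf.image.convert_image_dtype(
        tf.image.decode_png(contents, dtype=tf.uint16), tf.float32)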
After `max_attempts` failures, return the diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 2a6ab26e96..45499dcce0 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -533,37 +533,6 @@ class FlipImageBenchmark(test.Benchmark): iters=benchmark_rounds, wall_time=step_time) - def _benchmarkBatchedRandomFlipLeftRight(self, device, cpu_count): - image_shape = [16, 299, 299, 3] - warmup_rounds = 100 - benchmark_rounds = 1000 - config = config_pb2.ConfigProto() - if cpu_count is not None: - config.inter_op_parallelism_threads = 1 - config.intra_op_parallelism_threads = cpu_count - with session.Session("", graph=ops.Graph(), config=config) as sess: - with ops.device(device): - inputs = variables.Variable( - random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, - trainable=False, - dtype=dtypes.float32) - run_op = image_ops.random_flip_left_right(inputs) - sess.run(variables.global_variables_initializer()) - for i in xrange(warmup_rounds + benchmark_rounds): - if i == warmup_rounds: - start = time.time() - sess.run(run_op) - end = time.time() - step_time = (end - start) / benchmark_rounds - tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all") - print("benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s step_time: " - "%.2f us" % - (tag, step_time * 1e6)) - self.report_benchmark( - name="benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s" % (tag), - iters=benchmark_rounds, - wall_time=step_time) - def benchmarkFlipLeftRightCpu1(self): self._benchmarkFlipLeftRight("/cpu:0", 1) @@ -582,15 +551,6 @@ class FlipImageBenchmark(test.Benchmark): def benchmarkRandomFlipLeftRightGpu(self): self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None) - def benchmarkBatchedRandomFlipLeftRightCpu1(self): - self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", 1) - - def benchmarkBatchedRandomFlipLeftRightCpuAll(self): - self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", None) - - def benchmarkBatchedRandomFlipLeftRightGpu(self): - self._benchmarkBatchedRandomFlipLeftRight(test.gpu_device_name(), None) - class AdjustHueBenchmark(test.Benchmark): @@ -1027,7 +987,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf, seed=seed) + y = image_ops.random_flip_left_right(x_tf) self.assertTrue(y.op.name.startswith("random_flip_left_right")) count_flipped = 0 @@ -1048,50 +1008,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) - def testRandomFlipLeftRightWithBatch(self): - batch_size = 16 - seed = 42 - - # create single item of test data - x_np_raw = np.array( - [[1, 2, 3], [1, 2, 3]], dtype=np.uint8 - ).reshape([1, 2, 3, 1]) - y_np_raw = np.array( - [[3, 2, 1], [3, 2, 1]], dtype=np.uint8 - ).reshape([1, 2, 3, 1]) - - # create batched test data - x_np = np.vstack([x_np_raw for _ in range(batch_size)]) - y_np = np.vstack([y_np_raw for _ in range(batch_size)]) - - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf, seed=seed) - self.assertTrue(y.op.name.startswith("random_flip_left_right")) - - count_flipped = 0 - count_unflipped = 0 - for _ in range(100): - y_tf = y.eval() - - # check every element of the batch - for i in range(batch_size): - if y_tf[i][0][0] 
== 1: - self.assertAllEqual(y_tf[i], x_np[i]) - count_unflipped += 1 - else: - self.assertAllEqual(y_tf[i], y_np[i]) - count_flipped += 1 - - # 100 trials, each containing batch_size elements - # Mean: 50 * batch_size - # Std Dev: ~5 * sqrt(batch_size) - # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) - # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 - six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) - self.assertGreaterEqual(count_flipped, six_sigma) - self.assertGreaterEqual(count_unflipped, six_sigma) - def testInvolutionUpDown(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1141,11 +1057,9 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1]) - seed = 42 - with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf, seed=seed) + y = image_ops.random_flip_up_down(x_tf, seed=42) self.assertTrue(y.op.name.startswith("random_flip_up_down")) count_flipped = 0 count_unflipped = 0 @@ -1165,50 +1079,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) - def testRandomFlipUpDownWithBatch(self): - batch_size = 16 - seed = 42 - - # create single item of test data - x_np_raw = np.array( - [[1, 2, 3], [4, 5, 6]], dtype=np.uint8 - ).reshape([1, 2, 3, 1]) - y_np_raw = np.array( - [[4, 5, 6], [1, 2, 3]], dtype=np.uint8 - ).reshape([1, 2, 3, 1]) - - # create batched test data - x_np = np.vstack([x_np_raw for _ in range(batch_size)]) - y_np = np.vstack([y_np_raw for _ in range(batch_size)]) - - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf, seed=seed) - self.assertTrue(y.op.name.startswith("random_flip_up_down")) - - count_flipped = 0 - count_unflipped = 0 - for _ in range(100): - y_tf = y.eval() - - # check every element of the batch - for i in range(batch_size): - if y_tf[i][0][0] == 1: - self.assertAllEqual(y_tf[i], x_np[i]) - count_unflipped += 1 - else: - self.assertAllEqual(y_tf[i], y_np[i]) - count_flipped += 1 - - # 100 trials, each containing batch_size elements - # Mean: 50 * batch_size - # Std Dev: ~5 * sqrt(batch_size) - # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) - # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 - six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) - self.assertGreaterEqual(count_flipped, six_sigma) - self.assertGreaterEqual(count_unflipped, six_sigma) - def testInvolutionTranspose(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1286,7 +1156,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): #Ops that support 4D input for op in [ image_ops.flip_left_right, image_ops.flip_up_down, - image_ops.random_flip_left_right, image_ops.random_flip_up_down, image_ops.transpose_image, image_ops.rot90 ]: transformed_unknown_dims_4 = op(p_unknown_dims_4) @@ -1297,6 +1166,14 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): "must be at least three-dimensional"): op(p_wrong_rank) + for op in [ + image_ops.random_flip_left_right, + image_ops.random_flip_up_down, + ]: + with self.assertRaisesRegexp(ValueError, "must be three-dimensional"): + op(p_wrong_rank) + + def testRot90GroupOrder(self): image = 
np.arange(24, dtype=np.uint8).reshape([2, 4, 3]) with self.test_session(use_gpu=True): @@ -1331,6 +1208,41 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): y_np = np.rot90(image, k=k, axes=(1, 2)) self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k})) +class RandomFlipTest(test_util.TensorFlowTestCase): + + def testRandomLeftRight(self): + x_np = np.array([0, 1], dtype=np.uint8).reshape([1, 2, 1]) + num_iterations = 500 + + hist = [0, 0] + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_left_right(x_tf) + for _ in xrange(num_iterations): + y_np = y.eval().flatten()[0] + hist[y_np] += 1 + + # Ensure that each entry is observed within 4 standard deviations. + four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) + self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) + + def testRandomUpDown(self): + x_np = np.array([0, 1], dtype=np.uint8).reshape([2, 1, 1]) + num_iterations = 500 + + hist = [0, 0] + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_up_down(x_tf) + for _ in xrange(num_iterations): + y_np = y.eval().flatten()[0] + hist[y_np] += 1 + + # Ensure that each entry is observed within 4 standard deviations. + four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) + self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) + + class AdjustContrastTest(test_util.TensorFlowTestCase): def _testContrast(self, x_np, y_np, contrast_factor): @@ -3968,88 +3880,5 @@ class SobelEdgesTest(test_util.TensorFlowTestCase): self.assertAllClose(expected_batch, actual_sobel) -class DecodeImageTest(test_util.TensorFlowTestCase): - - def testJpegUint16(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/jpeg/testdata" - jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) - image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16) - image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), - dtypes.uint16) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testPngUint16(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/png/testdata" - png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) - image0 = image_ops.decode_image(png0, dtype=dtypes.uint16) - image1 = image_ops.convert_image_dtype( - image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.uint16) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testGifUint16(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/gif/testdata" - gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) - image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16) - image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), - dtypes.uint16) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testBmpUint16(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/bmp/testdata" - bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) - image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16) - image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), - dtypes.uint16) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testJpegFloat32(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/jpeg/testdata" - jpeg0 = 
io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) - image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32) - image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), - dtypes.float32) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testPngFloat32(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/png/testdata" - png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) - image0 = image_ops.decode_image(png0, dtype=dtypes.float32) - image1 = image_ops.convert_image_dtype( - image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.float32) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testGifFloat32(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/gif/testdata" - gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) - image0 = image_ops.decode_image(gif0, dtype=dtypes.float32) - image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), - dtypes.float32) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - def testBmpFloat32(self): - with self.test_session(use_gpu=True) as sess: - base = "tensorflow/core/lib/bmp/testdata" - bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) - image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32) - image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), - dtypes.float32) - image0, image1 = sess.run([image0, image1]) - self.assertAllEqual(image0, image1) - - if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 724fcc39cd..2df230d470 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -467,8 +467,7 @@ class VarianceScaling(Initializer): else: scale /= max(1., (fan_in + fan_out) / 2.) if self.distribution == "normal": - # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) - stddev = math.sqrt(scale) / .87962566103423978 + stddev = math.sqrt(scale) return random_ops.truncated_normal( shape, 0.0, stddev, dtype, seed=self.seed) else: diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 8276047cb6..222b8ebc9d 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -35,9 +35,8 @@ from tensorflow.python.util.tf_export import tf_export # Assert and Print are special symbols in python, so we must -# have an upper-case version of them. For users with Python 3 or Python 2.7 -# with `from __future__ import print_function`, we also allow lowercase. -@tf_export("Print", "print") +# use an upper-case version of them. +@tf_export("Print") def Print(input_, data, message=None, first_n=None, summarize=None, name=None): """Prints a list of tensors. diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 466d0dadc8..e40481f3a7 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -125,8 +125,8 @@ def abs(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` or `SparseTensor` of type `float16`, `float32`, `float64`, - `int32`, `int64`, `complex64` or `complex128`. + x: A `Tensor` or `SparseTensor` of type `float32`, `float64`, `int32`, + `int64`, `complex64` or `complex128`. name: A name for the operation (optional). 
Returns: @@ -430,10 +430,10 @@ def pow(x, y, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, - `complex64`, or `complex128`. - y: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, - `complex64`, or `complex128`. + x: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, + or `complex128`. + y: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, + or `complex128`. name: A name for the operation (optional). Returns: @@ -600,7 +600,7 @@ def round(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, or `int64`. + x: A `Tensor` of type `float32` or `float64`. name: A name for the operation (optional). Returns: @@ -1257,7 +1257,7 @@ def reduce_sum(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1397,7 +1397,7 @@ def reduce_mean(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1469,7 +1469,7 @@ def reduce_prod(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1519,7 +1519,7 @@ def reduce_min(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1568,7 +1568,7 @@ def reduce_max(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1617,7 +1617,7 @@ def reduce_all(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1675,7 +1675,7 @@ def reduce_any(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` is None, all dimensions are reduced, and a + If `axis` has no entries, all dimensions are reduced, and a tensor with a single element is returned. For example: diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index f47f38e29e..783d485892 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -621,7 +621,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): """Calculate the mean and variance based on the sufficient statistics. Args: - counts: A `Tensor` containing the total count of the data (one value). + counts: A `Tensor` containing the total count of the data (one value).
mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly shifted) sum of the elements to average over. variance_ss: A `Tensor` containing the variance sufficient statistics: the @@ -689,9 +689,6 @@ def moments( # Compute true mean while keeping the dims for proper broadcasting. mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean") # sample variance, not unbiased variance - # Note: stop_gradient does not change the gradient that gets - # backpropagated to the mean from the variance calculation, - # because that gradient is zero variance = math_ops.reduce_mean( math_ops.squared_difference(y, array_ops.stop_gradient(mean)), axes, diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 0c2f5b06c4..a0b55eb077 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1596,12 +1596,12 @@ def leaky_relu(features, alpha=0.2, name=None): Returns: The activation value. """ - with ops.name_scope(name, "LeakyRelu", [features, alpha]) as name: + with ops.name_scope(name, "LeakyRelu", [features, alpha]): features = ops.convert_to_tensor(features, name="features") if features.dtype.is_integer: features = math_ops.to_float(features) alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha") - return math_ops.maximum(alpha * features, features, name=name) + return math_ops.maximum(alpha * features, features) def _flatten_outer_dims(logits): diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 035b4735af..46a5f4fae6 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -962,16 +962,6 @@ class LeakyReluTest(test_lib.TestCase): self.assertAllClose( outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol) - def testName(self): - np_values = np.array([-2, -1, 0, 1, 2], dtype=np.float64) - outputs_with_name_set = nn_ops.leaky_relu( - constant_op.constant(np_values), - name='test_relu_op') - self.assertEqual(outputs_with_name_set.name, 'test_relu_op:0') - outputs_without_name_set = nn_ops.leaky_relu( - constant_op.constant(np_values)) - self.assertEqual(outputs_without_name_set.name, 'LeakyRelu:0') - class SwishTest(test_lib.TestCase): diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index 219562de5d..f8676ccb5f 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -23,7 +23,6 @@ import threading # Used by py_util.cc to get tracebacks. import traceback # pylint: disable=unused-import -import weakref import numpy as np import six @@ -130,14 +129,11 @@ class FuncRegistry(object): def __init__(self): self._lock = threading.Lock() self._unique_id = 0 # GUARDED_BY(self._lock) - # Only store weakrefs to the functions. The strong reference is stored in - # the graph. - self._funcs = weakref.WeakValueDictionary() + self._funcs = {} def insert(self, func): """Registers `func` and returns a unique token for this entry.""" token = self._next_unique_token() - # Store a weakref to the function self._funcs[token] = func return token @@ -190,7 +186,7 @@ class FuncRegistry(object): Raises: ValueError: if no function is registered for `token`.
""" - func = self._funcs.get(token, None) + func = self._funcs[token] if func is None: raise ValueError("callback %s is not found" % token) if isinstance(func, EagerFunc): @@ -232,6 +228,19 @@ _py_funcs = FuncRegistry() pywrap_tensorflow.InitializePyTrampoline(_py_funcs) +class CleanupFunc(object): + """A helper class to remove a registered function from _py_funcs.""" + + def __init__(self, token): + self._token = token + + def __del__(self): + if _py_funcs is not None: + # If _py_funcs is None, the program is most likely in shutdown, and the + # _py_funcs object has been destroyed already. + _py_funcs.remove(self._token) + + def _internal_py_func(func, inp, Tout, @@ -261,15 +270,17 @@ def _internal_py_func(func, # bound to that of the outer graph instead. graph = graph._outer_graph + cleanup = CleanupFunc(token) + # TODO(zhifengc): Consider adding a Graph method to collect # `cleanup` objects in one of its member. - if not hasattr(graph, "_py_funcs_used_in_graph"): - graph._py_funcs_used_in_graph = [] + if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"): + graph._cleanup_py_funcs_used_in_graph = [] - # Store a reference to the function in the graph to ensure it stays alive - # as long as the graph lives. When the graph is destroyed, the function - # is left to the garbage collector for destruction as well. - graph._py_funcs_used_in_graph.append(func) + # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph + # will be destroyed and their __del__ will remove the 'token' from + # the funcs registry. + graph._cleanup_py_funcs_used_in_graph.append(cleanup) # pylint: enable=protected-access if eager: diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index c3b16a7bd5..0130233746 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -84,8 +84,6 @@ def _convert_to_sparse_tensors(sp_inputs): # pylint: disable=protected-access @tf_export("sparse_concat") -@deprecation.deprecated_args( - None, "concat_dim is deprecated, use axis instead", "concat_dim") def sparse_concat(axis, sp_inputs, name=None, @@ -599,8 +597,6 @@ class KeywordRequired(object): @tf_export("sparse_split") -@deprecation.deprecated_args( - None, "split_dim is deprecated, use axis instead", "split_dim") def sparse_split(keyword_required=KeywordRequired(), sp_input=None, num_split=None, diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index 0280c89c10..ae79c01949 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -91,59 +91,6 @@ def string_split(source, delimiter=" ", skip_empty=True): # pylint: disable=inv shape.set_shape([2]) return sparse_tensor.SparseTensor(indices, values, shape) -@tf_export("strings.split") -def string_split_v2(source, sep=None, maxsplit=-1): - """Split elements of `source` based on `sep` into a `SparseTensor`. - - Let N be the size of source (typically N will be the batch size). Split each - element of `source` based on `sep` and return a `SparseTensor` - containing the split tokens. Empty tokens are ignored. - - For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c', - then the output will be - - st.indices = [0, 0; - 0, 1; - 1, 0; - 1, 1; - 1, 2] - st.shape = [2, 3] - st.values = ['hello', 'world', 'a', 'b', 'c'] - - If `sep` is given, consecutive delimiters are not grouped together and are - deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and - sep of `"<>"` returns `["1", "2", "", "3"]`. 
If `sep` is None or an empty - string, consecutive whitespace are regarded as a single separator, and the - result will contain no empty strings at the start or end if the string has - leading or trailing whitespace. - - Note that the above-mentioned behavior matches python's str.split. - - Args: - source: `1-D` string `Tensor`, the strings to split. - sep: `0-D` string `Tensor`, the delimiter character. - maxsplit: An `int`. If `maxsplit > 0`, limits the number of splits in the result. - - Raises: - ValueError: If sep is not a string. - - Returns: - A `SparseTensor` of rank `2`, the strings split according to the delimiter. - The first column of the indices corresponds to the row in `source` and the - second column corresponds to the index of the split component in this row. - """ - if sep is None: - sep = '' - sep = ops.convert_to_tensor(sep, dtype=dtypes.string) - source = ops.convert_to_tensor(source, dtype=dtypes.string) - - indices, values, shape = gen_string_ops.string_split_v2( - source, sep=sep, maxsplit=maxsplit) - indices.set_shape([None, 2]) - values.set_shape([None]) - shape.set_shape([2]) - return sparse_tensor.SparseTensor(indices, values, shape) - def _reduce_join_reduction_dims(x, axis, reduction_indices): """Returns range(rank(x) - 1, 0, -1) if reduction_indices is None.""" diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index 47414c28af..f49e2d314d 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -1786,23 +1786,6 @@ class variable_scope(object): assert v.name == "foo/bar/v:0" ``` - Simple example of how to reenter a premade variable scope safely: - - ```python - with tf.variable_scope("foo") as vs: - pass - - # Re-enter the variable scope. - with tf.variable_scope(vs, - auxiliary_name_scope=False) as vs1: - # Restore the original name_scope. - with tf.name_scope(vs1.original_name_scope): - v = tf.get_variable("v", [1]) - assert v.name == "foo/v:0" - c = tf.constant([1], name="c") - assert c.name == "foo/c:0" - ``` - Basic example of sharing a variable AUTO_REUSE: ```python @@ -1941,9 +1924,7 @@ class variable_scope(object): (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. auxiliary_name_scope: If `True`, we create an auxiliary name scope with - the scope. If `False`, we don't create it. Note that the argument is - not inherited, and it only takes effect once when creating. You - should only use it for re-entering a premade variable scope. + the scope. If `False`, we don't touch name scope. Returns: A scope that can be captured and reused.
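The split semantics described in the removed `string_split_v2` docstring mirror Python's built-in `str.split`. A plain-Python sketch of the two modes (an illustration only: `split_rows` is a hypothetical helper name, not the TensorFlow kernel):

    def split_rows(source, sep=None):
        # With an explicit `sep`, consecutive separators delimit empty tokens;
        # with sep=None, runs of whitespace collapse and no empty tokens appear.
        if sep:
            return [s.split(sep) for s in source]
        return [s.split() for s in source]

    assert split_rows(["1<>2<><>3"], sep="<>") == [["1", "2", "", "3"]]
    assert split_rows(["hello world", "a b c"]) == [["hello", "world"],
                                                    ["a", "b", "c"]]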
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py old mode 100644 new mode 100755 diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index b59f8e1f98..522965990b 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -1719,7 +1719,7 @@ def tf_py_build_info_genrule(): name="py_build_info_gen", outs=["platform/build_info.py"], cmd= - "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), + "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), local=1, tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],) diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py index 671b7e387e..bca9fa49eb 100644 --- a/tensorflow/tools/api/generator/create_python_api.py +++ b/tensorflow/tools/api/generator/create_python_api.py @@ -41,11 +41,7 @@ _GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit. # Generated by: tensorflow/tools/api/generator/create_python_api.py script. \"\"\"%s \"\"\" - -from __future__ import print_function - """ -_GENERATED_FILE_FOOTER = "\n\ndel print_function\n" class SymbolExposedTwiceError(Exception): @@ -153,7 +149,6 @@ class _ModuleInitCodeBuilder(object): _names_with_underscore = [%s] __all__ = [_s for _s in dir() if not _s.startswith('_')] __all__.extend([_s for _s in _names_with_underscore]) -__all__.remove('print_function') ''' % underscore_names_str return module_text_map @@ -338,8 +333,7 @@ def create_api_files( if module or not root_init_template: contents = ( _GENERATED_FILE_HEADER % - get_module_docstring(module, package, api_name) + - text + _GENERATED_FILE_FOOTER) + get_module_docstring(module, package, api_name) + text) else: # Read base init file with open(root_init_template, 'r') as root_init_template_file: diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt index 10171b3d60..5bb3b3c444 100644 --- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt @@ -58,7 +58,7 @@ tf_module { } member_method { name: "decode_image" - argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"\", \'None\'], " + argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "decode_jpeg" diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index 3051c4437e..dc2bd40096 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -1532,10 +1532,6 @@ tf_module { name: "pow" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } - member_method { - name: "print" - argspec: "args=[\'input_\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " - } member_method { name: "py_func" argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt index b641c39feb..a3fbe95bba 100644 
--- a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt @@ -4,8 +4,4 @@ tf_module { name: "regex_full_match" argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } - member_method { - name: "split" - argspec: "args=[\'source\', \'sep\', \'maxsplit\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], " - } } diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index 883bb93647..5fa75e1d61 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -322,10 +322,6 @@ create_activate_virtualenv_and_install_tensorflow() { pip install -v ${PIP_FLAGS} ${WHL_PATH} || \ die "pip install (forcing to reinstall tensorflow) FAILED" echo "Successfully installed pip package ${TF_WHEEL_PATH}" - - # Force downgrade setuptools. - pip install --upgrade setuptools==39.1.0 - } ################################################################################ diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user index b216e3549f..d4bf546d40 100755 --- a/tensorflow/tools/ci_build/builds/with_the_same_user +++ b/tensorflow/tools/ci_build/builds/with_the_same_user @@ -40,7 +40,7 @@ if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then ADDUSER_OPTS="--force-badname" fi -getent group "${CI_BUILD_GID}" || addgroup ${ADDUSER_OPTS} --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" +getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \ --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \ --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \ diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh index 1f0fd0387a..072dd6ab99 100755 --- a/tensorflow/tools/ci_build/ci_build.sh +++ b/tensorflow/tools/ci_build/ci_build.sh @@ -134,12 +134,6 @@ if [[ $? != "0" ]]; then die "ERROR: docker build failed. Dockerfile is at ${DOCKERFILE_PATH}" fi -# If caller wants the with_the_same_user script to allow bad usernames, -# pass the var to the docker environment -if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then - CI_BUILD_USER_FORCE_BADNAME_ENV="-e CI_BUILD_USER_FORCE_BADNAME=yes" -fi - # Run the command inside the container. echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..." 
mkdir -p ${WORKSPACE}/bazel-ci_build-cache @@ -154,7 +148,6 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_BUILD_GROUP=$(id -g -n)" \ -e "CI_BUILD_GID=$(id -g)" \ -e "CI_TENSORFLOW_SUBMODULE_PATH=${CI_TENSORFLOW_SUBMODULE_PATH}" \ - ${CI_BUILD_USER_FORCE_BADNAME_ENV} \ -v ${WORKSPACE}:/workspace \ -w /workspace \ ${GPU_EXTRA_PARAMS} \ diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py index 148526492d..420d390d2b 100755 --- a/tensorflow/tools/ci_build/copy_binary.py +++ b/tensorflow/tools/ci_build/copy_binary.py @@ -32,8 +32,7 @@ import shutil import tempfile import zipfile -TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}" - "\.\d.dev[\d]{0,8})-(.+)\.whl") +TF_NIGHTLY_REGEX = r"(.+)tf_nightly(|_gpu)-(\d\.\d\.\d.dev[\d]{0,8})-(.+)\.whl" BINARY_STRING_TEMPLATE = "%s-%s-%s.whl" diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 88f1d04193..60290df833 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -115,7 +115,3 @@ pip2 install keras_applications==1.0.2 pip3 install keras_applications==1.0.2 pip2 install keras_preprocessing==1.0.1 pip3 install keras_preprocessing==1.0.1 - -# Install last working version of setuptools. -pip2 install --upgrade setuptools==39.1.0 -pip3 install --upgrade setuptools==39.1.0 diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index acd69ef346..edb9d4b929 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -39,6 +39,7 @@ if [[ -z $pip35_version ]]; then fi set -e +pip3.5 install --upgrade setuptools pip3.5 install --upgrade pip pip3.5 install --upgrade virtualenv @@ -85,7 +86,4 @@ pip3.5 install --upgrade termcolor pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 -# Install last working version of setuptools. -pip3.5 install --upgrade setuptools==39.1.0 - # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh index 323b30f48e..5635977731 100755 --- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh @@ -49,6 +49,7 @@ cd Python-3.6.1 make altinstall ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3 +pip3 install --upgrade setuptools pip3 install --upgrade pip pip3 install --upgrade virtualenv @@ -100,8 +101,4 @@ pip3 install --upgrade termcolor # Keras pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 - -# Install last working version of setuptools. -pip3 install --upgrade setuptools==39.1.0 - # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh deleted file mode 100755 index 10a09a415a..0000000000 --- a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# Usage: basic_mkl_test.sh - -# Helper function to traverse directories up until given file is found. -function upsearch () { - test / == "$PWD" && return || \ - test -e "$1" && echo "$PWD" && return || \ - cd .. && upsearch "$1" -} - -# Set up WORKSPACE. -WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}" - -BUILD_TAG=mkl-ci-test CI_BUILD_USER_FORCE_BADNAME=yes ${WORKSPACE}/tensorflow/tools/ci_build/ci_build.sh cpu tensorflow/tools/ci_build/linux/cpu/run_mkl.sh diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh index b8bce57c87..1bd1852ffc 100755 --- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh +++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh @@ -79,7 +79,6 @@ if [[ $1 == "PI_ONE" ]]; then --linkopt=-L${OPENBLAS_INSTALL_PATH}/lib/ --linkopt=-l:libopenblas.a" echo "Building for the Pi One/Zero, with no NEON support" - WHEEL_ARCH=linux_armv6l else PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4 --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR @@ -87,7 +86,6 @@ else --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8' - WHEEL_ARCH=linux_armv7l echo "Building for the Pi Two/Three, with NEON acceleration" fi @@ -102,8 +100,6 @@ bazel build -c opt ${PI_COPTS} \ --copt=-fomit-frame-pointer --cpu=armeabi \ --crosstool_top=@local_config_arm_compiler//:toolchain \ --verbose_failures \ - //tensorflow:libtensorflow.so \ - //tensorflow:libtensorflow_framework.so \ //tensorflow/tools/benchmark:benchmark_model \ //tensorflow/tools/pip_package:build_pip_package @@ -116,12 +112,10 @@ BDIST_OPTS="--universal" \ bazel-bin/tensorflow/tools/pip_package/build_pip_package "${OUTDIR}" OLD_FN=$(ls "${OUTDIR}" | grep -m 1 \.whl) -SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-'${WHEEL_ARCH}'.whl/; print' +SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-any.whl/; print' NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}") mv "${OUTDIR}/${OLD_FN}" "${OUTDIR}/${NEW_FN}" cp bazel-bin/tensorflow/tools/benchmark/benchmark_model "${OUTDIR}" -cp bazel-bin/tensorflow/libtensorflow.so "${OUTDIR}" -cp bazel-bin/tensorflow/libtensorflow_framework.so "${OUTDIR}" echo "Output can be found here:" find "${OUTDIR}" diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl index f8f63e276c..47539b2423 100644 --- a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl +++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl @@ -31,11 +31,7 @@ def _def_file_filter_configure_impl(repository_ctx): vc_path = find_vc_path(repository_ctx) if vc_path == "visual-studio-not-found": auto_configure_fail("Visual C++ build tools not found on your machine") - - undname = find_msvc_tool(repository_ctx, vc_path, 
"undname.exe") - if undname == None: - auto_configure_fail("Couldn't find undname.exe under %s, please check your VC installation and set BAZEL_VC environment variable correctly." % vc_path) - undname_bin_path = undname.replace("\\", "\\\\") + undname_bin_path = find_msvc_tool(repository_ctx, vc_path, "undname.exe").replace("\\", "\\\\") repository_ctx.template( "def_file_filter.py", diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh index b0114721bd..06c2b997cb 100755 --- a/tensorflow/tools/dist_test/local_test.sh +++ b/tensorflow/tools/dist_test/local_test.sh @@ -64,6 +64,9 @@ die() { # Configurations DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster" +# Use TensorFlow v1.5.0 for Python 2.7 and CPU only as we set num_gpus to 0 in the below +DEFAULT_WHL_FILE_LOCATION="https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl" + # Parse input arguments LEAVE_CONTAINER_RUNNING=0 MODEL_NAME="" @@ -74,7 +77,8 @@ SYNC_REPLICAS_FLAG="" WHL_FILE_LOCATION=${1} if [[ -z "${WHL_FILE_LOCATION}" ]]; then - echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." + WHL_FILE_LOCATION=${DEFAULT_WHL_FILE_LOCATION} + echo "use default whl file location" fi while true; do @@ -127,11 +131,7 @@ echo "Building in temporary directory: ${BUILD_DIR}" cp -r ${DIR}/* "${BUILD_DIR}"/ || \ die "Failed to copy files to ${BUILD_DIR}" -# Download whl file into the build context directory. -if [[ -z "${WHL_FILE_LOCATION}" ]]; then - pip2 download --no-deps tf-nightly - cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl -elif [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then +if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then # Download whl file into the build context directory. wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \ die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}" diff --git a/tensorflow/tools/dist_test/remote_test.sh b/tensorflow/tools/dist_test/remote_test.sh index e188c88c8f..935535312d 100755 --- a/tensorflow/tools/dist_test/remote_test.sh +++ b/tensorflow/tools/dist_test/remote_test.sh @@ -108,7 +108,7 @@ fi # Parse command-line arguments. WHL_URL=${1} if [[ -z "${WHL_URL}" ]]; then - echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." + die "whl URL is not specified" fi # Create docker build context directory. @@ -121,13 +121,8 @@ cp -r ${DIR}/* ${BUILD_DIR}/ || \ die "Failed to copy files to ${BUILD_DIR}" # Download whl file into the build context directory. -if [[ -z "${WHL_URL}" ]]; then - pip2 download --no-deps tf-nightly - cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl -else - wget -P "${BUILD_DIR}" ${WHL_URL} || \ - die "Failed to download tensorflow whl file from URL: ${WHL_URL}" -fi +wget -P "${BUILD_DIR}" ${WHL_URL} || \ + die "Failed to download tensorflow whl file from URL: ${WHL_URL}" # Build docker image for test. docker build ${NO_CACHE_FLAG} \ diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 57a491255e..406d134699 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -76,7 +76,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . 
# TODO(craigcitro): Don't install the pip package, since it makes it # more difficult to experiment with local changes. Instead, just add diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl index 6796ad70e5..a6cd44ced1 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl +++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl @@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel LABEL maintainer="Clayne Robison" # These arguments are parameterized. Use --build-args to override. -ARG TF_BRANCH=r1.9 +ARG TF_BRANCH=r1.8 ARG WHL_DIR=/whl RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 204b5b4dba..2fe47f3356 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -13,8 +13,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusparse-dev-9-0 \ curl \ git \ - libcudnn7=7.1.4.18-1+cuda9.0 \ - libcudnn7-dev=7.1.4.18-1+cuda9.0 \ + libcudnn7=7.0.5.15-1+cuda9.0 \ + libcudnn7-dev=7.0.5.15-1+cuda9.0 \ libcurl3-dev \ libfreetype6-dev \ libhdf5-serial-dev \ @@ -85,7 +85,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . # Configure the build for our CUDA configuration. ENV CI_BUILD_PYTHON python diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu index 9197651ff4..bff4a20392 100644 --- a/tensorflow/tools/docker/Dockerfile.gpu +++ b/tensorflow/tools/docker/Dockerfile.gpu @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusolver-9-0 \ cuda-cusparse-9-0 \ curl \ - libcudnn7=7.1.4.18-1+cuda9.0 \ + libcudnn7=7.0.5.15-1+cuda9.0 \ libfreetype6-dev \ libhdf5-serial-dev \ libpng12-dev \ diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 620fef9363..5910f0625e 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -61,7 +61,6 @@ COMMON_PIP_DEPS = [ "//tensorflow/contrib/autograph/core:core", "//tensorflow/contrib/autograph/impl:impl", "//tensorflow/contrib/autograph/lang:lang", - "//tensorflow/contrib/autograph/operators:operators", "//tensorflow/contrib/autograph/pyct:pyct", "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis", "//tensorflow/contrib/boosted_trees:boosted_trees_pip", diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh index f7e42ce536..0c4065bc77 100755 --- a/tensorflow/tools/pip_package/build_pip_package.sh +++ b/tensorflow/tools/pip_package/build_pip_package.sh @@ -41,15 +41,51 @@ function is_windows() { fi } -function prepare_src() { +function main() { if [ $# -lt 1 ] ; then echo "No destination dir provided" exit 1 fi - TMPDIR="$1" - mkdir -p "$TMPDIR" - echo $(date) : "=== Preparing sources in dir: ${TMPDIR}" + DEST=$(real_path $1) + TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX) + + PKG_NAME_FLAG="" + GPU_BUILD=0 + NIGHTLY_BUILD=0 + PROJECT_NAME="" + while true; do + if [[ "$1" == "--nightly_flag" ]]; then + NIGHTLY_BUILD=1 + elif [[ "$1" == "--gpu" ]]; then + GPU_BUILD=1 + elif [[ "$1" == "--gpudirect" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpudirect" + elif [[ "$1" == 
"--project_name" ]]; then + shift + if [[ -z "$1" ]]; then + break + fi + PROJECT_NAME="$1" + fi + shift + + if [[ -z "$1" ]]; then + break + fi + done + + if [[ -n ${PROJECT_NAME} ]]; then + PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" + elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly_gpu" + elif [[ ${NIGHTLY_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly" + elif [[ ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpu" + fi + + echo $(date) : "=== Using tmpdir: ${TMPDIR}" if [ ! -d bazel-bin/tensorflow ]; then echo "Could not find bazel-bin. Did you run from the root of the build tree?" @@ -119,28 +155,17 @@ function prepare_src() { # over so user defined ops can be compiled. mkdir -p ${TMPDIR}/google mkdir -p ${TMPDIR}/third_party - pushd ${RUNFILES%org_tensorflow} > /dev/null + pushd ${RUNFILES%org_tensorflow} for header in $(find protobuf_archive -name \*.h); do mkdir -p "${TMPDIR}/google/$(dirname ${header})" cp "$header" "${TMPDIR}/google/$(dirname ${header})/" done - popd > /dev/null + popd cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR} cp tensorflow/tools/pip_package/README ${TMPDIR} cp tensorflow/tools/pip_package/setup.py ${TMPDIR} -} - -function build_wheel() { - if [ $# -lt 2 ] ; then - echo "No src and dest dir provided" - exit 1 - fi - - TMPDIR="$1" - DEST="$2" - PKG_NAME_FLAG="$3" # Before we leave the top-level directory, make sure we know how to # call python. @@ -148,110 +173,15 @@ function build_wheel() { source tools/python_bin_path.sh fi - pushd ${TMPDIR} > /dev/null + pushd ${TMPDIR} rm -f MANIFEST echo $(date) : "=== Building wheel" "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel ${PKG_NAME_FLAG} >/dev/null mkdir -p ${DEST} cp dist/* ${DEST} - popd > /dev/null + popd + rm -rf ${TMPDIR} echo $(date) : "=== Output wheel file is in: ${DEST}" } -function usage() { - echo "Usage:" - echo "$0 [--src srcdir] [--dst dstdir] [options]" - echo "$0 dstdir [options]" - echo "" - echo " --src prepare sources in srcdir" - echo " will use temporary dir if not specified" - echo "" - echo " --dst build wheel in dstdir" - echo " if dstdir is not set do not build, only prepare sources" - echo "" - echo " Options:" - echo " --project_name set project name to name" - echo " --gpu build tensorflow_gpu" - echo " --gpudirect build tensorflow_gpudirect" - echo " --nightly_flag build tensorflow nightly" - echo "" - exit 1 -} - -function main() { - PKG_NAME_FLAG="" - PROJECT_NAME="" - GPU_BUILD=0 - NIGHTLY_BUILD=0 - SRCDIR="" - DSTDIR="" - CLEANSRC=1 - while true; do - if [[ "$1" == "--help" ]]; then - usage - exit 1 - elif [[ "$1" == "--nightly_flag" ]]; then - NIGHTLY_BUILD=1 - elif [[ "$1" == "--gpu" ]]; then - GPU_BUILD=1 - elif [[ "$1" == "--gpudirect" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpudirect" - elif [[ "$1" == "--project_name" ]]; then - shift - if [[ -z "$1" ]]; then - break - fi - PROJECT_NAME="$1" - elif [[ "$1" == "--src" ]]; then - shift - SRCDIR="$(real_path $1)" - CLEANSRC=0 - elif [[ "$1" == "--dst" ]]; then - shift - DSTDIR="$(real_path $1)" - else - DSTDIR="$(real_path $1)" - fi - shift - - if [[ -z "$1" ]]; then - break - fi - done - - if [[ -z "$DSTDIR" ]] && [[ -z "$SRCDIR" ]]; then - echo "No destination dir provided" - usage - exit 1 - fi - - if [[ -z "$SRCDIR" ]]; then - # make temp srcdir if none set - SRCDIR="$(mktemp -d -t tmp.XXXXXXXXXX)" - fi - - prepare_src "$SRCDIR" - - if [[ -z 
"$DSTDIR" ]]; then - # only want to prepare sources - exit - fi - - if [[ -n ${PROJECT_NAME} ]]; then - PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" - elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly_gpu" - elif [[ ${NIGHTLY_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly" - elif [[ ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpu" - fi - - build_wheel "$SRCDIR" "$DSTDIR" "$PKG_NAME_FLAG" - - if [[ $CLEANSRC -ne 0 ]]; then - rm -rf "${TMPDIR}" - fi -} - main "$@" diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 97f625e7e9..d25a9e77b1 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n') # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.9.0-rc0' +_VERSION = '1.8.0' REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', @@ -54,7 +54,6 @@ REQUIRED_PACKAGES = [ 'numpy >= 1.13.3', 'six >= 1.10.0', 'protobuf >= 3.4.0', - 'setuptools <= 39.1.0', 'tensorboard >= 1.8.0, < 1.9.0', 'termcolor >= 1.1.0', ] diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc index 15d7c70281..29add6d5ea 100644 --- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc +++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc @@ -814,9 +814,6 @@ void Generator::Generate(const FileDescriptor& fd) { // Add header to cc file. SetOutput(&cc_); Print("// GENERATED FILE - DO NOT MODIFY"); - Print(); - Print("#include "); // for `std::stable_sort()` - Print(); headers = {GetProtoTextHeaderName(fd, true /* impl */)}; AddHeadersToCurrentSection(headers); Print(); diff --git a/tensorflow/tools/quantization/quantize_graph_test.py b/tensorflow/tools/quantization/quantize_graph_test.py index 92bb5127da..df71840b64 100644 --- a/tensorflow/tools/quantization/quantize_graph_test.py +++ b/tensorflow/tools/quantization/quantize_graph_test.py @@ -119,8 +119,8 @@ def are_tensors_near(a, b, tolerance): flat_a = a.flatten() flat_b = b.flatten() if len(flat_a) != len(flat_b): - tf_logging.info("Tensors are different sizes: " + str(len(flat_a)) + " vs " - + str(len(flat_b))) + print("Tensors are different sizes: " + str(len(flat_a)) + " vs " + str( + len(flat_b))) return False value_count = len(flat_a) how_many_different = 0 @@ -140,10 +140,10 @@ def are_tensors_near(a, b, tolerance): if how_many_different == 0: return True else: - tf_logging.info("Tensors have {0} different values ({1}%), with mean" - " difference {2} and mean absolute difference {3}".format( - how_many_different, proportion_different * 100, - mean_difference, mean_abs_difference)) + print("Tensors have {0} different values ({1}%), with mean difference" + " {2} and mean absolute difference {3}".format( + how_many_different, proportion_different * 100, mean_difference, + mean_abs_difference)) return False diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py index c030575109..9c45359ee1 100644 --- a/tensorflow/tools/test/upload_test_benchmarks.py +++ b/tensorflow/tools/test/upload_test_benchmarks.py @@ -89,6 +89,7 @@ import shutil from six import text_type from google.cloud import datastore +from six import text_type def is_real_file(dirpath, fname): diff --git 
a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 4f3df570a5..dbec66216a 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""): mkl_repository( name = "mkl_linux", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", ], - sha256 = "d2305244fdc9b87db7426ed4496e87a4b3977ad3374d73b8000e8b7a5b7aa725", - strip_prefix = "mklml_lnx_2018.0.3.20180406", + sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146", + strip_prefix = "mklml_lnx_2018.0.2.20180127", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_windows", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip", - "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip", + "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip" ], - sha256 = "a584a5bf1c8d2ad70b90d12b52652030e9a338217719064fdb84b7ad0d693694", - strip_prefix = "mklml_win_2018.0.3.20180406", + sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded", + strip_prefix = "mklml_win_2018.0.2.20180127", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_darwin", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz" ], - sha256 = "094e3dfd61c816136dc8d12a45cc611ce26c5f4828176a3644cd0b0efa15a25b", - strip_prefix = "mklml_mac_2018.0.3.20180406", + sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943", + strip_prefix = "mklml_mac_2018.0.2.20180127", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) @@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "mkl_dnn", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.14.tar.gz", - "https://github.com/intel/mkl-dnn/archive/v0.14.tar.gz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz", + "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz", ], - sha256 = "efebc53882856afec86457a2da644693f5d59c68772d41d640d6b60a8efc4eb0", - strip_prefix = "mkl-dnn-0.14", + sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f", + strip_prefix = "mkl-dnn-0.13", build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"), ) @@ -187,11 +187,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "highwayhash", urls = [ - "http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", - "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", 
+ "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", + "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", ], - sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37", - strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968", + sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9", + strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b", build_file = clean_dep("//third_party:highwayhash.BUILD"), ) diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD index e54c1a4501..07bb6645eb 100644 --- a/third_party/eigen.BUILD +++ b/third_party/eigen.BUILD @@ -64,7 +64,6 @@ cc_library( # This define (mostly) guarantees we don't link any problematic # code. We use it, but we do not rely on it, as evidenced above. "EIGEN_MPL2_ONLY", - "EIGEN_MAX_ALIGN_BYTES=64", ], includes = ["."], visibility = ["//visibility:public"], diff --git a/third_party/highwayhash.BUILD b/third_party/highwayhash.BUILD index 08cb84ea2c..1b8e40765e 100644 --- a/third_party/highwayhash.BUILD +++ b/third_party/highwayhash.BUILD @@ -10,7 +10,6 @@ cc_library( srcs = ["highwayhash/sip_hash.cc"], hdrs = [ "highwayhash/sip_hash.h", - "highwayhash/endianess.h", "highwayhash/state_helpers.h", ], visibility = ["//visibility:public"], diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD index 663a218733..4418ac32fc 100644 --- a/third_party/jpeg/jpeg.BUILD +++ b/third_party/jpeg/jpeg.BUILD @@ -291,10 +291,8 @@ cc_library( "jchuff.h", "jconfig.h", "jdct.h", - "jerror.h", "jinclude.h", "jmorecfg.h", - "jpegint.h", "jpeglib.h", "jsimd.h", "jsimddct.h", diff --git a/third_party/png.BUILD b/third_party/png.BUILD index 17c5449cc0..76ab32d69c 100644 --- a/third_party/png.BUILD +++ b/third_party/png.BUILD @@ -28,14 +28,7 @@ cc_library( "pngwrite.c", "pngwtran.c", "pngwutil.c", - ] + select({ - "@org_tensorflow//tensorflow:linux_ppc64le": [ - "powerpc/powerpc_init.c", - "powerpc/filter_vsx_intrinsics.c", - ], - "//conditions:default": [ - ], - }), + ], hdrs = [ "png.h", "pngconf.h", diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index 3c7e5c8469..954f21f5f8 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -6,7 +6,6 @@ * `PYTHON_LIB_PATH`: Location of python libraries. 
""" -_BAZEL_SH = "BAZEL_SH" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" _PYTHON_LIB_PATH = "PYTHON_LIB_PATH" _TF_PYTHON_CONFIG_REPO = "TF_PYTHON_CONFIG_REPO" @@ -153,22 +152,6 @@ def _get_python_bin(repository_ctx): _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", ""))) -def _get_bash_bin(repository_ctx): - """Gets the bash bin path.""" - bash_bin = repository_ctx.os.environ.get(_BAZEL_SH) - if bash_bin != None: - return bash_bin - else: - bash_bin_path = repository_ctx.which("bash") - if bash_bin_path != None: - return str(bash_bin_path) - else: - _fail("Cannot find bash in PATH, please make sure " + - "bash is installed and add its directory in PATH, or --define " + - "%s='/path/to/bash'.\nPATH=%s" % ( - _BAZEL_SH, repository_ctx.os.environ.get("PATH", ""))) - - def _get_python_lib(repository_ctx, python_bin): """Gets the python lib path.""" python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH) @@ -201,14 +184,14 @@ def _get_python_lib(repository_ctx, python_bin): " print(paths[0])\n" + "END") cmd = '%s - %s' % (python_bin, print_lib) - result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) + result = repository_ctx.execute(["bash", "-c", cmd]) return result.stdout.strip('\n') def _check_python_lib(repository_ctx, python_lib): """Checks the python lib path.""" cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib) - result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) + result = repository_ctx.execute(["bash", "-c", cmd]) if result.return_code == 1: _fail("Invalid python library path: %s" % python_lib) @@ -216,7 +199,7 @@ def _check_python_lib(repository_ctx, python_lib): def _check_python_bin(repository_ctx, python_bin): """Checks the python bin path.""" cmd = '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin) - result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) + result = repository_ctx.execute(["bash", "-c", cmd]) if result.return_code == 1: _fail("--define %s='%s' is not executable. Is it the python binary?" % ( _PYTHON_BIN_PATH, python_bin)) @@ -311,7 +294,6 @@ def _python_autoconf_impl(repository_ctx): python_configure = repository_rule( implementation = _python_autoconf_impl, environ = [ - _BAZEL_SH, _PYTHON_BIN_PATH, _PYTHON_LIB_PATH, _TF_PYTHON_CONFIG_REPO, diff --git a/third_party/repo.bzl b/third_party/repo.bzl index cb67d3e961..36f5aa5bde 100644 --- a/third_party/repo.bzl +++ b/third_party/repo.bzl @@ -17,6 +17,7 @@ _SINGLE_URL_WHITELIST = depset([ "arm_compiler", "ortools_archive", + "gemmlowp", ]) def _is_windows(ctx): @@ -87,9 +88,7 @@ def _tf_http_archive(ctx): if ctx.attr.patch_file != None: _apply_patch(ctx, ctx.attr.patch_file) if ctx.attr.build_file != None: - # Use BUILD.bazel to avoid conflict with third party projects with - # BUILD or build (directory) underneath. - ctx.template("BUILD.bazel", ctx.attr.build_file, { + ctx.template("BUILD", ctx.attr.build_file, { "%prefix%": ".." if _repos_are_siblings() else "external", }, False) -- cgit v1.2.3 From 6070ae0e148f50dbc8f36e1654f0a3f53b8b067e Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 18 Jun 2018 21:00:34 -0700 Subject: Merge changes from github. 
PiperOrigin-RevId: 201110240 --- CONTRIBUTING.md | 2 +- README.md | 1 + RELEASE.md | 67 +++- configure.py | 5 + tensorflow/BUILD | 4 +- tensorflow/c/generate-pc.sh | 11 +- tensorflow/cc/gradients/math_grad.cc | 1 + tensorflow/cc/gradients/nn_grad.cc | 47 +++ tensorflow/cc/gradients/nn_grad_test.cc | 84 ++++- tensorflow/compiler/aot/codegen_test_h.golden | 4 +- .../compiler/aot/embedded_protocol_buffers.h | 2 +- tensorflow/compiler/aot/runtime.h | 4 +- tensorflow/compiler/aot/runtime_test.cc | 16 +- tensorflow/compiler/xla/service/cpu/BUILD | 18 +- tensorflow/compiler/xla/service/cpu/cpu_runtime.cc | 2 + tensorflow/compiler/xla/service/cpu/cpu_runtime.h | 1 + tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 8 +- .../compiler/xla/service/cpu/runtime_fft_impl.h | 20 +- .../xla/service/cpu/runtime_single_threaded_fft.cc | 32 ++ .../xla/service/cpu/runtime_single_threaded_fft.h | 31 ++ .../compiler/xla/service/cpu/simple_orc_jit.cc | 2 + tensorflow/compiler/xla/service/pattern_matcher.h | 2 +- .../compiler/xla/service/tuple_simplifier.cc | 7 + tensorflow/compiler/xla/service/tuple_simplifier.h | 9 +- .../compiler/xla/service/tuple_simplifier_test.cc | 77 ++++ tensorflow/contrib/autograph/__init__.py | 3 + tensorflow/contrib/cmake/tf_c.cmake | 22 +- tensorflow/contrib/cmake/tf_cc_ops.cmake | 2 +- tensorflow/contrib/cmake/tf_python.cmake | 3 +- tensorflow/contrib/cmake/tools/create_def_file.py | 9 +- .../bijectors/sinh_arcsinh_bijector_test.py | 28 +- tensorflow/contrib/eager/python/datasets.py | 3 +- .../python/examples/notebooks/4_high_level.ipynb | 4 +- .../feature_column/sequence_feature_column.py | 22 +- .../feature_column/sequence_feature_column_test.py | 41 ++ tensorflow/contrib/ffmpeg/__init__.py | 1 - tensorflow/contrib/ffmpeg/ffmpeg_ops.py | 1 - tensorflow/contrib/framework/__init__.py | 3 +- .../ops/fused_conv2d_bias_activation_op_test.py | 11 +- .../src_impl/hexagon_controller.c | 2 +- tensorflow/contrib/lite/download_dependencies.sh | 4 +- .../contrib/lite/examples/minimal/minimal.cc | 2 +- .../contrib/lite/g3doc/tf_ops_compatibility.md | 14 +- tensorflow/contrib/lite/java/ovic/README.md | 4 +- .../kernels/internal/reference/reference_ops.h | 4 +- tensorflow/contrib/lite/python/interpreter.py | 2 +- .../interpreter_wrapper/interpreter_wrapper.cc | 9 +- .../interpreter_wrapper/interpreter_wrapper.h | 3 +- tensorflow/contrib/lite/python/lite.py | 11 + tensorflow/contrib/lite/toco/import_tensorflow.cc | 2 +- tensorflow/contrib/lite/toco/toco_port.cc | 6 + tensorflow/contrib/lite/toco/toco_port.h | 18 + tensorflow/contrib/makefile/compile_nsync.sh | 2 +- .../contrib/makefile/download_dependencies.sh | 4 +- .../contrib/metrics/python/ops/metric_ops.py | 2 +- tensorflow/contrib/mpi_collectives/kernels/ring.h | 2 +- .../contrib/opt/python/training/adamax_test.py | 6 +- .../opt/python/training/model_average_optimizer.py | 2 +- tensorflow/contrib/periodic_resample/BUILD | 19 +- .../kernels/periodic_resample_op.cc | 5 + .../kernels/periodic_resample_op.h | 415 +++++++++++++++------ .../contrib/periodic_resample/ops/array_ops.cc | 53 ++- .../periodic_resample/ops/array_ops_test.cc | 41 ++ .../kernel_tests/periodic_resample_op_test.py | 27 +- .../python/ops/periodic_resample_op.py | 8 +- .../predictor/contrib_estimator_predictor.py | 5 +- .../contrib/predictor/core_estimator_predictor.py | 5 +- .../contrib/predictor/predictor_factories.py | 24 +- .../contrib/predictor/predictor_factories_test.py | 19 + .../contrib/predictor/saved_model_predictor.py | 6 +- tensorflow/contrib/quantize/README.md 
| 2 +- .../contrib/slim/python/slim/evaluation_test.py | 25 +- tensorflow/contrib/summary/summary.py | 5 +- .../contrib/tensor_forest/client/eval_metrics.py | 45 +-- .../contrib/tensor_forest/python/tensor_forest.py | 34 +- .../tensor_forest/python/tensor_forest_test.py | 45 +++ .../contrib/tensorrt/convert/convert_graph.cc | 66 ++-- .../contrib/tensorrt/convert/convert_nodes.cc | 97 +++-- tensorflow/contrib/tpu/python/tpu/datasets.py | 16 +- tensorflow/contrib/tpu/python/tpu/datasets_test.py | 26 ++ tensorflow/core/BUILD | 9 +- .../core/api_def/base_api/api_def_Selu.pbtxt | 4 + .../api_def/base_api/api_def_StringSplitV2.pbtxt | 48 +++ .../api_def/python_api/api_def_StringSplitV2.pbtxt | 4 + tensorflow/core/common_runtime/bfc_allocator.cc | 8 +- tensorflow/core/common_runtime/bfc_allocator.h | 3 +- .../direct_session_with_tracking_alloc_test.cc | 16 + .../common_runtime/mkl_threadpool_device_test.cc | 53 +++ tensorflow/core/common_runtime/process_util.cc | 11 +- .../core/common_runtime/threadpool_device.cc | 25 +- .../rpc/grpc_master_service_impl.cc | 4 +- .../core/distributed_runtime/rpc/grpc_testlib.cc | 10 +- tensorflow/core/framework/allocator.h | 5 - tensorflow/core/framework/op_gen_lib.cc | 1 + .../remote_fused_graph_execute_info.proto | 2 +- tensorflow/core/framework/tensor_test.cc | 24 +- tensorflow/core/graph/mkl_layout_pass.cc | 148 +++++++- tensorflow/core/graph/mkl_layout_pass_test.cc | 31 ++ .../core/grappler/clusters/single_machine_test.cc | 8 +- tensorflow/core/grappler/costs/graph_properties.cc | 1 - tensorflow/core/grappler/optimizers/BUILD | 2 +- tensorflow/core/grappler/optimizers/remapper.cc | 4 +- tensorflow/core/kernels/as_string_op.cc | 2 + tensorflow/core/kernels/cwise_op_clip.cc | 43 +-- .../core/kernels/dense_update_functor_gpu.cu.cc | 1 + tensorflow/core/kernels/gather_functor.cc | 1 + tensorflow/core/kernels/gather_functor_gpu.cu.cc | 1 + tensorflow/core/kernels/gather_nd_op.cc | 4 + tensorflow/core/kernels/gather_nd_op_gpu.cu.cc | 2 + tensorflow/core/kernels/gather_op.cc | 1 + tensorflow/core/kernels/mkl_concat_op.cc | 213 ++++++++--- tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc | 2 + tensorflow/core/kernels/mkl_pooling_ops_common.h | 6 +- tensorflow/core/kernels/scatter_nd_op.cc | 4 + tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc | 1 + .../core/kernels/scoped_allocator_ops_test.cc | 9 +- tensorflow/core/kernels/segment_reduction_ops.h | 10 +- tensorflow/core/kernels/sparse_matmul_op.cc | 2 +- tensorflow/core/kernels/string_split_op.cc | 130 +++++++ tensorflow/core/ops/candidate_sampling_ops.cc | 5 +- tensorflow/core/ops/dataset_ops.cc | 24 +- tensorflow/core/ops/image_ops.cc | 4 +- tensorflow/core/ops/math_ops.cc | 2 +- tensorflow/core/ops/nn_ops.cc | 1 + tensorflow/core/ops/string_ops.cc | 20 +- tensorflow/core/platform/cpu_info.cc | 23 ++ tensorflow/core/platform/cpu_info.h | 7 + tensorflow/core/platform/default/build_config.bzl | 2 + .../core/platform/hadoop/hadoop_file_system.cc | 21 +- tensorflow/core/platform/posix/port.cc | 5 + tensorflow/core/public/version.h | 4 +- tensorflow/core/util/mkl_util.h | 50 ++- tensorflow/docs_src/community/groups.md | 29 +- tensorflow/docs_src/get_started/eager.md | 2 +- tensorflow/docs_src/get_started/index.md | 4 +- tensorflow/docs_src/install/install_c.md | 2 +- tensorflow/docs_src/install/install_go.md | 2 +- tensorflow/docs_src/install/install_java.md | 24 +- tensorflow/docs_src/install/install_linux.md | 24 +- tensorflow/docs_src/install/install_mac.md | 10 +- tensorflow/docs_src/install/install_sources.md | 
17 +- tensorflow/docs_src/mobile/linking_libs.md | 2 +- tensorflow/docs_src/mobile/prepare_models.md | 4 +- tensorflow/docs_src/performance/quantization.md | 2 +- .../docs_src/programmers_guide/estimators.md | 19 +- .../docs_src/programmers_guide/feature_columns.md | 4 +- tensorflow/examples/learn/iris.py | 7 +- tensorflow/java/src/gen/cc/op_generator.cc | 11 +- tensorflow/java/src/gen/cc/op_specs.cc | 1 + tensorflow/python/eager/backprop.py | 4 +- tensorflow/python/estimator/BUILD | 5 +- tensorflow/python/estimator/exporter.py | 4 +- tensorflow/python/estimator/inputs/numpy_io.py | 8 +- .../python/estimator/inputs/numpy_io_test.py | 5 +- tensorflow/python/estimator/inputs/pandas_io.py | 7 +- .../python/estimator/inputs/pandas_io_test.py | 5 +- .../estimator/inputs/queues/feeding_functions.py | 2 +- tensorflow/python/estimator/keras.py | 4 +- tensorflow/python/estimator/keras_test.py | 14 +- tensorflow/python/keras/activations.py | 2 + tensorflow/python/keras/callbacks.py | 21 +- tensorflow/python/keras/callbacks_test.py | 2 + tensorflow/python/keras/engine/network.py | 2 +- tensorflow/python/keras/engine/saving_test.py | 4 +- tensorflow/python/keras/engine/training.py | 7 +- tensorflow/python/keras/engine/training_eager.py | 2 +- tensorflow/python/keras/initializers_test.py | 26 +- tensorflow/python/keras/layers/core.py | 26 +- tensorflow/python/keras/models_test.py | 14 + .../python/kernel_tests/as_string_op_test.py | 10 + tensorflow/python/kernel_tests/betainc_op_test.py | 4 +- tensorflow/python/kernel_tests/clip_ops_test.py | 13 + tensorflow/python/kernel_tests/conv_ops_test.py | 32 +- .../python/kernel_tests/gather_nd_op_test.py | 32 +- tensorflow/python/kernel_tests/gather_op_test.py | 20 +- tensorflow/python/kernel_tests/init_ops_test.py | 27 ++ tensorflow/python/kernel_tests/pooling_ops_test.py | 4 +- tensorflow/python/kernel_tests/py_func_test.py | 31 +- .../python/kernel_tests/scatter_nd_ops_test.py | 6 +- tensorflow/python/kernel_tests/scatter_ops_test.py | 14 +- .../kernel_tests/segment_reduction_ops_test.py | 4 +- .../python/kernel_tests/string_split_op_test.py | 96 +++++ tensorflow/python/ops/array_ops.py | 4 + tensorflow/python/ops/gradient_checker.py | 8 +- tensorflow/python/ops/image_ops_impl.py | 74 ++-- tensorflow/python/ops/image_ops_test.py | 261 ++++++++++--- tensorflow/python/ops/init_ops.py | 3 +- tensorflow/python/ops/logging_ops.py | 5 +- tensorflow/python/ops/math_ops.py | 28 +- tensorflow/python/ops/nn_impl.py | 5 +- tensorflow/python/ops/nn_ops.py | 4 +- tensorflow/python/ops/nn_test.py | 10 + tensorflow/python/ops/script_ops.py | 35 +- tensorflow/python/ops/sparse_ops.py | 4 + tensorflow/python/ops/string_ops.py | 53 +++ tensorflow/python/ops/variable_scope.py | 21 +- .../python/tools/import_pb_to_tensorboard.py | 0 tensorflow/tensorflow.bzl | 2 +- .../tools/api/generator/create_python_api.py | 8 +- tensorflow/tools/api/golden/tensorflow.image.pbtxt | 2 +- tensorflow/tools/api/golden/tensorflow.pbtxt | 4 + .../tools/api/golden/tensorflow.strings.pbtxt | 4 + tensorflow/tools/ci_build/builds/pip.sh | 4 + .../tools/ci_build/builds/with_the_same_user | 2 +- tensorflow/tools/ci_build/ci_build.sh | 7 + tensorflow/tools/ci_build/copy_binary.py | 3 +- .../tools/ci_build/install/install_pip_packages.sh | 4 + .../install/install_python3.5_pip_packages.sh | 4 +- .../install/install_python3.6_pip_packages.sh | 5 +- .../tools/ci_build/linux/mkl/basic-mkl-test.sh | 29 ++ tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 8 +- 
.../def_file_filter/def_file_filter_configure.bzl | 6 +- tensorflow/tools/dist_test/local_test.sh | 12 +- tensorflow/tools/dist_test/remote_test.sh | 11 +- tensorflow/tools/docker/Dockerfile.devel | 2 +- tensorflow/tools/docker/Dockerfile.devel-cpu-mkl | 2 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 6 +- tensorflow/tools/docker/Dockerfile.gpu | 2 +- tensorflow/tools/pip_package/BUILD | 1 + tensorflow/tools/pip_package/build_pip_package.sh | 160 +++++--- tensorflow/tools/pip_package/setup.py | 3 +- .../proto_text/gen_proto_text_functions_lib.cc | 3 + .../tools/quantization/quantize_graph_test.py | 12 +- tensorflow/tools/test/upload_test_benchmarks.py | 1 - tensorflow/workspace.bzl | 40 +- third_party/eigen.BUILD | 1 + third_party/highwayhash.BUILD | 1 + third_party/jpeg/jpeg.BUILD | 2 + third_party/png.BUILD | 9 +- third_party/py/python_configure.bzl | 24 +- third_party/repo.bzl | 5 +- 231 files changed, 3338 insertions(+), 905 deletions(-) create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h create mode 100644 tensorflow/contrib/periodic_resample/ops/array_ops_test.cc create mode 100644 tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt create mode 100644 tensorflow/core/common_runtime/mkl_threadpool_device_test.cc mode change 100755 => 100644 tensorflow/python/tools/import_pb_to_tensorboard.py create mode 100755 tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh (limited to 'tensorflow/cc') diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8669c25c45..db4b1581ae 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,7 +90,7 @@ Bazel BUILD files also need to include a license section, e.g., Changes to TensorFlow C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). -Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do: +Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on ubuntu:16.04, do: ```bash apt-get install -y clang-tidy diff --git a/README.md b/README.md index 6fb4486d0d..63853137cf 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ $ python 42 >>> sess.close() ``` +For examples of how to perform specific tasks in TensorFlow, see the [tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/). ## Contribution guidelines diff --git a/RELEASE.md b/RELEASE.md index 84d9d52868..e09e9c6190 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,62 @@ +# Release 1.9.0 + +## Major Features And Improvements +* Update tf.keras to the Keras 2.1.6 API. +* `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`. +* Added support for core feature columns and losses in gradient boosted trees estimators. +* The distributions.Bijector API supports broadcasting for Bijectors with new API changes. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/distributions/bijectors/Bijector) for more details. +* Layered variable names have changed in the following conditions: + * Using `tf.keras.layers` with custom variable scopes. + * Using `tf.layers` in a subclassed `tf.keras.Model` class. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers) for more details. + +## Breaking Changes + * If you're opening empty variable scopes, replace `variable_scope('', ...)` by `variable_scope(tf.get_variable_scope(), ...)`.
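A minimal migration sketch for the breaking change above, assuming a TensorFlow 1.x runtime (the variable name `v` and the `reuse` setting are illustrative, not part of the release note):

    import tensorflow as tf

    # Before (no longer supported): tf.variable_scope('', ...)
    # After: pass the current VariableScope object explicitly.
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        v = tf.get_variable("v", shape=[1])  # resolved against the current scope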
+ +## Bug Fixes and Other Changes +* `tf.data`: + * The `DatasetBase::DebugString()` method is now `const`. + * Added the `tf.contrib.data.sample_from_datasets()` API for randomly sampling from multiple datasets. +* Eager Execution: +* `tf.keras`: + * Move Keras code out of _impl folder and remove API files. + * `tf.keras.Model.save_weights` now saves in TensorFlow format by default. + * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods. +* Accelerated Linear Algebra (XLA): +* TensorFlow Debugger (tfdbg): fix an issue in which the TensorBoard Debugger Plugin could not handle total source file size exceeding gRPC message size limit (4 MB). +* `tf.contrib`: + * Add `tf.contrib.data.choose_from_datasets()`. + * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings. Two arguments were removed from `make_csv_dataset`. + * `tf.contrib.framework.zero_initializer` supports ResourceVariable. + * Adding "constrained_optimization" to tensorflow/contrib. +* Other: + * Add GCS Configuration Ops. + * Changing signature of `MakeIterator` to enable propagating error status. + * KL divergence for two Dirichlet distributions. + * More consistent GcsFileSystem behavior for certain reads past EOF. + * Update benchmark for tf.scan to match ranges across eager and graph modes. + * Fixed bug in the `tf.reduce_prod` gradient for complex dtypes. + * Add optional `args` argument to `Dataset.from_generator()`. + * Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"), which would previously raise an error. This will correspond to an attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only be accessed indirectly (e.g. through getattr and setattr). To set this up the user will first need to explicitly add the variable to the hparam object (e.g. "hparams.add_hparam(name='a.b', value=0.0)"). + * Benchmark for tf.scan in graph and eager modes. + * Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D. + * Making ids unique in `nn.embedding_lookup_sparse`. This helps to reduce RPC calls for looking up the embeddings when there are repeated ids in the batch. + * Support indicator column in boosted trees. + * Prevent `tf.gradients()` from backpropagating through integer tensors. + * LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`. + * Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now support arbitrary. + * Added `tf.train.Checkpoint` for reading/writing object-based checkpoints. + * `Dataset.list_files()` now produces deterministic results when `shuffle=False` or a `seed` is passed. + * Added LinearOperatorKronecker, a dense-free implementation of the Kronecker Product. + * Allow LinearOperator to broadcast. + * SavedModelBuilder will now deduplicate asset names that point to files with the same basename and the same contents. Note that this may result in new asset files included in SavedModels in cases where assets with the same name but different contents were previously overwriting each other. + + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +Abdullah Alrasheed, Achal Shah, Ad-530, ADiegoCAlonso, Aditya Yogi, Ag Ramesh, akindyakov, Andy Kernahan, Anya Petrova, Aurelien Geron, Ben, Ben Barsdell, Bhavani-Subramanian, braincodercn, Brett Koonce, Brian Nemsick, Brian Zier, Bryan Heden, candy.dc, cclauss, Clayne Robison, ctiijima, Dalmo Cirne, David Norman, David T.H.
Kao, DosLin, ekelsen, Elson Rodriguez, Erik Smistad, Felix Abecassis, Fergal Cotter, fo40225, foo0x29a, Freedom" Koan-Sin Tan, FréDéRic Branchaud-Charron, gdh1995, Geoffrey Irving, Giuseppe, gracehoney, Guido Zuidhof, Guillaume Klein, Guozhong Zhuang, Haggai, Harald Husum, imsheridan, Ivan Zhang, Jan Zikes, Jayaram Bobba, Jesse Benson, Jesse Gumz, Jiajia Li, Jie, jinghuangintel, Jingwen, jjsjann123, Joe Yearsley, Joel Hestness, Joel Shor, josephyearsley, Junpeng Lao, Karol M. Langner, Kb Sriram, krantideep95, Krish Ravindranath, Letian Feng, Loo Rong Jie, Lukas Geiger, Maciej, Mahmoud Abuzaina, ManHyuk, Mark Ryan, mbhuiyan, Michal Turek, Mostafa Alaa, Myungsung Kwak, Nand Dalal, Nehal J Wani, Neil Tenenholtz, ngc92, Nicholas Nadeau, P.Eng., Avs, Niranjan Hasabnis, P-Hidringer, Paul Van Eck, Peng Yu, Qing Zhao, Qingying Chen, Quanlong, Rajendra Arora, Rholais Lii, rmanyari, Robin Richtsfeld, Russell Klopfer, Sagi, Sam Sendelbach, Sandeep N Gupta, Sandip Giri, Sarah Edkins, Scott Tseng, Sdalbsoo, Sergii Khomenko, Seungwoo Choi (Biggie), Seyed Majid Azimi, Shaoning Zeng, shengfuintel, Siu Kei, Muk, Smit Shilu, soonson, Stefan Schweter, Sukhwan Kim, Sunitha Kambhampati, Taehoon Lee, tamimaddari82, Tang, Wenyi, Ted Chang, u2takey, Utkarsh Upadhyay, Vadim Markovtsev, voegtlel, Wai Hon Law, wangsiyu, Wenhao Hu, wenhao.hu, William D. Irons, Yan Facai (颜发才), Yanbo Liang, Yihong Wang, Yilei (Dolee) Yang, Yong Tang, Yuan (Terry) Tang + # Release 1.8.0 ## Major Features And Improvements @@ -404,14 +463,6 @@ answered questions, and were part of inspiring discussions. # Release 1.4.0 -## Major Features And Improvements -* `tf.keras` is now part of the core TensorFlow API. -* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of - the core TensorFlow API. - * The API is now subject to backwards compatibility guarantees. - -# Release 1.4.0 - ## Major Features And Improvements * `tf.keras` is now part of the core TensorFlow API. * [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of diff --git a/configure.py b/configure.py index bde7af8c0e..ada342a50a 100644 --- a/configure.py +++ b/configure.py @@ -1397,6 +1397,10 @@ def set_grpc_build_flags(): write_to_bazelrc('build --define grpc_no_ares=true') +def set_build_strip_flag(): + write_to_bazelrc('build --strip=always') + + def set_windows_build_flags(): if is_windows(): # The non-monolithic build is not supported yet @@ -1519,6 +1523,7 @@ def main(): set_grpc_build_flags() set_cc_opt_flags(environ_cp) + set_build_strip_flag() set_windows_build_flags() if get_var( diff --git a/tensorflow/BUILD b/tensorflow/BUILD index a73c4ca3aa..6d134dbb80 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -475,7 +475,7 @@ tf_cc_shared_object( # excludes all but a subset of function names. # On MacOS, the linker does not support version_script, but has an # an "-exported_symbols_list" command. -z defs disallows undefined -# symbols in object files and -s strips the output. +# symbols in object files. 
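# Note that binary stripping is now requested globally through the
# "--strip=always" bazel flag written by configure.py (see
# set_build_strip_flag above), which is why the per-target "-s"
# linker flag is removed from the linkopts below.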
tf_cc_shared_object( name = "libtensorflow.so", @@ -489,7 +489,6 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow/c:version_script.lds)", ], @@ -515,7 +514,6 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow:tf_version_script.lds)", ], diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh index 02a6a58b61..7184ad68fb 100755 --- a/tensorflow/c/generate-pc.sh +++ b/tensorflow/c/generate-pc.sh @@ -15,10 +15,12 @@ # ============================================================================== TF_PREFIX='/usr/local' +LIBDIR='lib' usage() { echo "Usage: $0 OPTIONS" echo -e "-p, --prefix\tset installation prefix (default: /usr/local)" + echo -e "-l, --libdir\tset lib directory (default: lib)" echo -e "-v, --version\tset TensorFlow version" echo -e "-h, --help\tdisplay this message" } @@ -26,7 +28,7 @@ usage() { [ $# == 0 ] && usage && exit 0 # read the options -ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@") +ARGS=$(getopt -o p:l:v:h --long prefix:,libdir:,version:,help -n $0 -- "$@") eval set -- "$ARGS" # extract options and their arguments into variables. @@ -38,6 +40,11 @@ while true ; do "") shift 2 ;; *) TF_PREFIX=$2 ; shift 2 ;; esac ;; + -l|--libdir) + case "$2" in + "") shift 2 ;; + *) LIBDIR=$2 ; shift 2 ;; + esac ;; -v|--version) case "$2" in "") shift 2 ;; @@ -55,7 +62,7 @@ echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX" cat << EOF > tensorflow.pc prefix=${TF_PREFIX} exec_prefix=\${prefix} -libdir=\${exec_prefix}/lib +libdir=\${exec_prefix}/${LIBDIR} includedir=\${prefix}/include Name: TensorFlow diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 52c177212a..35a01e0341 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -38,6 +38,7 @@ REGISTER_NO_GRADIENT_OP("NotEqual"); REGISTER_NO_GRADIENT_OP("LogicalAnd"); REGISTER_NO_GRADIENT_OP("LogicalOr"); REGISTER_NO_GRADIENT_OP("LogicalNot"); +REGISTER_NO_GRADIENT_OP("Floor"); // Conjugate helper function returns the conjugate of an Output if it // is complex valued. 
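The `REGISTER_NO_GRADIENT_OP("Floor")` entry above is the whole fix: floor is piecewise constant, so its derivative is zero almost everywhere and no meaningful gradient can flow through it. For contrast with the gradient helpers added in `nn_grad.cc` below, here is a minimal sketch of the registry pattern for a differentiable op. It is illustrative only and assumes the standard `GradFunc` signature; `Square` already has a real gradient registered in TensorFlow, so this exact registration should not be added to the tree:

```c++
#include <vector>

#include "tensorflow/cc/framework/grad_op_registry.h"
#include "tensorflow/cc/ops/standard_ops.h"

namespace tensorflow {
namespace ops {
namespace {

// Sketch only: d(x^2)/dx = 2x, so the upstream gradient dy maps to
// dx = dy * 2 * x. Helpers receive one grad input per op output and
// push one grad output per op input.
Status SquareGradSketch(const Scope& scope, const Operation& op,
                        const std::vector<Output>& grad_inputs,
                        std::vector<Output>* grad_outputs) {
  auto x = op.input(0);
  auto two = Cast(scope, Const(scope, 2), x.type());
  grad_outputs->push_back(Mul(scope, grad_inputs[0], Mul(scope, two, x)));
  return scope.status();
}
REGISTER_GRADIENT_OP("Square", SquareGradSketch);

}  // anonymous namespace
}  // namespace ops
}  // namespace tensorflow
```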
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index 0cb3132e94..c73482d5f4 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -255,6 +255,53 @@ Status LRNGradHelper(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("LRN", LRNGradHelper); +Status SoftplusGradHelper(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + auto dx = internal::SoftplusGrad(scope, grad_inputs[0], op.input(0)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Softplus", SoftplusGradHelper); + +Status SoftsignGradHelper(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + auto dx = internal::SoftsignGrad(scope, grad_inputs[0], op.input(0)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Softsign", SoftsignGradHelper); + +Status FractionalAvgPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + bool overlapping; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); + auto dx = internal::FractionalAvgPoolGrad( + scope, Shape(scope, op.input(0), Shape::OutType(DT_INT64)), + grad_inputs[0], op.output(1), op.output(2), + internal::FractionalAvgPoolGrad::Overlapping(overlapping)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("FractionalAvgPool", FractionalAvgPoolGradHelper); + +Status FractionalMaxPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + bool overlapping; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); + auto dx = internal::FractionalMaxPoolGrad( + scope, op.input(0), op.output(0), grad_inputs[0], op.output(1), + op.output(2), internal::FractionalMaxPoolGrad::Overlapping(overlapping)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("FractionalMaxPool", FractionalMaxPoolGradHelper); + } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index c4eba7ecb0..b4d457a9d1 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -28,6 +28,8 @@ namespace { using ops::BiasAdd; using ops::Conv2D; using ops::Elu; +using ops::FractionalAvgPool; +using ops::FractionalMaxPool; using ops::L2Loss; using ops::LogSoftmax; using ops::LRN; @@ -41,6 +43,8 @@ using ops::Relu; using ops::Relu6; using ops::Selu; using ops::Softmax; +using ops::Softplus; +using ops::Softsign; class NNGradTest : public ::testing::Test { protected: @@ -71,22 +75,30 @@ class NNGradTest : public ::testing::Test { EXPECT_LT(max_error, 1e-3); } - // Sets tensor with random values, ensuring that the max value is largest by - // a reasonable amount. - // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which - // perturbations by the numeric gradient computation in the gradient checker - // can change the max value if values are too close together. + // Sets tensor with random values, ensuring that any two elements are at + // least a reasonable amount apart.
+ // This is an issue for max pooling operations, in which perturbations by the + // numeric gradient computation in the gradient checker can change the max + // value if a pool has values that are too close together. template <typename T> - void SetRandomValuesWithBumpedMax(Tensor* tensor) { + void SetRandomValuesForMaxPooling(Tensor* tensor) { auto tensor_flat = tensor->flat<T>(); - tensor_flat.setRandom(); - int32 max_index = 0; - for (size_t i = 1; i < tensor->NumElements(); i++) { - if (tensor_flat(i) > tensor_flat(max_index)) { - max_index = i; - } + // First set the array to an increasing sequence of values spaced + // a reasonable amount apart. + T cur = 0; + for (size_t i = 0; i < tensor->NumElements(); i++) { + tensor_flat(i) = cur; + cur += 5e-2; + } + // Fisher-Yates shuffle the array + for (size_t i = tensor->NumElements() - 1; i >= 1; i--) { + // j <- random integer 0 <= j <= i + size_t j = random::New64() % (i + 1); + // swap values at i, j + T tmp = tensor_flat(i); + tensor_flat(i) = tensor_flat(j); + tensor_flat(j) = tmp; } - tensor_flat(max_index) += 1e-2; } Scope scope_; @@ -189,7 +201,7 @@ TEST_F(NNGradTest, MaxPoolGradHelper) { const std::vector<int> strides{1, 2, 2, 1}; auto y = MaxPool(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -202,7 +214,7 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) { Tensor strides = test::AsTensor({1, 2, 2, 1}, {4}); auto y = MaxPoolV2(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -215,7 +227,7 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) { const std::vector<int> strides{1, 3, 3, 3, 1}; auto y = MaxPool3D(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -248,5 +260,45 @@ TEST_F(NNGradTest, LRN){ RunTest(x, x_shape, y, x_shape); } +TEST_F(NNGradTest, SoftplusGrad) { + TensorShape shape({3, 7}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Softplus(scope_, x); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, SoftsignGrad) { + TensorShape shape({3, 7}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Softsign(scope_, x); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, FractionalAvgPoolGradHelper) { + TensorShape x_shape({1, 3, 7, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Force consistent pooling regions for unit testing. + auto y = FractionalAvgPool( + scope_, x, {1, 1.2, 1.9, 1}, + FractionalAvgPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( + 2)); + TensorShape y_shape({1, 2, 3, 1}); + RunTest(x, x_shape, y.output, y_shape); +} + +TEST_F(NNGradTest, FractionalMaxPoolGradHelper) { + TensorShape x_shape({1, 3, 7, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Force consistent pooling regions for unit testing.
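+  // FractionalMaxPool draws its pooling boundaries pseudorandomly, so
+  // Deterministic(true) with fixed Seed/Seed2 pins the regions; otherwise
+  // the symbolic and numeric gradients would be checked against different
+  // poolings. Spacing the inputs apart (SetRandomValuesForMaxPooling below)
+  // keeps the checker's perturbations from flipping the pool maximum.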
+ auto y = FractionalMaxPool( + scope_, x, {1, 1.2, 1.9, 1}, + FractionalMaxPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( + 2)); + Tensor x_init_value = Tensor(DT_FLOAT, x_shape); + SetRandomValuesForMaxPooling(&x_init_value); + TensorShape y_shape({1, 2, 3, 1}); + RunTest(x, x_init_value, y.output, y_shape); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 6e050cf564..6641d45e83 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -56,9 +56,9 @@ namespace bar { // // Memory stats: // arg bytes total: 104 -// arg bytes aligned: 128 +// arg bytes aligned: 192 // temp bytes total: 126 -// temp bytes aligned: 224 +// temp bytes aligned: 320 class MyClass : public tensorflow::XlaCompiledCpuFunction { public: // Number of input arguments for the compiled computation. diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h index ebfe4806c2..4e194a6aba 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.h +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -71,7 +71,7 @@ struct ProtobufToEmbed { const ::tensorflow::protobuf::MessageLite* message; }; -// Embeds a a sequence of protocol buffers into an object file. +// Embeds a sequence of protocol buffers into an object file. // // `target_triple` is the target triple for the target architecture for the // generated object file. diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h index d085864f00..d1a669ceb1 100644 --- a/tensorflow/compiler/aot/runtime.h +++ b/tensorflow/compiler/aot/runtime.h @@ -25,8 +25,8 @@ namespace tensorflow { namespace tfcompile { namespace runtime { -// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. -static constexpr size_t kAlign = 32; +// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. +static constexpr size_t kAlign = 64; // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1 // values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc index 6d603a02eb..06ec623eb2 100644 --- a/tensorflow/compiler/aot/runtime_test.cc +++ b/tensorflow/compiler/aot/runtime_test.cc @@ -24,7 +24,7 @@ namespace runtime { namespace { TEST(Runtime, AlignmentValue) { - // We've chosen 32 byte alignment for the tfcompile runtime to mimic the + // We've chosen 64 byte alignment for the tfcompile runtime to mimic the // regular tensorflow allocator, which was chosen to play nicely with Eigen. // The tfcompile runtime also has a requirement that comes from the xla // generated code, on the relation: buffer_size >= 16 ? 
2 * sizeof(void*) : 8 @@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) { EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0); static constexpr intptr_t sizesB[1] = {3}; - EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64); static constexpr intptr_t sizesC[1] = {32}; - EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64); static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192); + EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320); } void* add_ptr(void* base, uintptr_t delta) { @@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) { EXPECT_NE(base, nullptr); EXPECT_EQ(bufD[0], add_ptr(base, 0)); EXPECT_EQ(bufD[1], nullptr); - EXPECT_EQ(bufD[2], add_ptr(base, 32)); + EXPECT_EQ(bufD[2], add_ptr(base, 64)); EXPECT_EQ(bufD[3], nullptr); - EXPECT_EQ(bufD[4], add_ptr(base, 64)); - EXPECT_EQ(bufD[5], add_ptr(base, 128)); - EXPECT_EQ(bufD[6], add_ptr(base, 160)); + EXPECT_EQ(bufD[4], add_ptr(base, 128)); + EXPECT_EQ(bufD[5], add_ptr(base, 192)); + EXPECT_EQ(bufD[6], add_ptr(base, 256)); for (int i = 0; i < 7; ++i) { const intptr_t size = sizesD[i]; if (size != -1) { diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index d82922a359..1067b38f93 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -178,6 +178,7 @@ cc_library( ":runtime_matmul", ":runtime_matmul_mkl", ":runtime_single_threaded_conv2d", + ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", "@llvm//:execution_engine", "@llvm//:core", @@ -516,7 +517,6 @@ cc_library( deps = [ "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//third_party/eigen3", ], @@ -578,6 +578,22 @@ cc_library( ], ) +cc_library( + name = "runtime_single_threaded_fft", + srcs = [ + "runtime_fft_impl.h", + "runtime_single_threaded_fft.cc", + ], + hdrs = ["runtime_single_threaded_fft.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:framework_lite", + "//third_party/eigen3", + ], +) + cc_library( name = "runtime_single_threaded_matmul", srcs = ["runtime_single_threaded_matmul.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 215405f680..54c52bc08f 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -51,6 +51,8 @@ extern const char* const kEigenConvF16SymbolName = extern const char* const kEigenConvF32SymbolName = "__xla_cpu_runtime_EigenConvF32"; extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft"; +extern const char* const kEigenSingleThreadedFftSymbolName = + "__xla_cpu_runtime_EigenSingleThreadedFft"; extern const char* const kEigenSingleThreadedMatMulF16SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF16"; extern const char* const kEigenSingleThreadedMatMulF32SymbolName = diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index 1dce6efa5c..aa0e967123 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -52,6 +52,7 @@ extern const char* const 
kMKLSingleThreadedMatMulF64SymbolName; extern const char* const kEigenConvF16SymbolName; extern const char* const kEigenConvF32SymbolName; extern const char* const kEigenFftSymbolName; +extern const char* const kEigenSingleThreadedFftSymbolName; extern const char* const kEigenSingleThreadedMatMulF16SymbolName; extern const char* const kEigenSingleThreadedMatMulF32SymbolName; extern const char* const kEigenSingleThreadedMatMulF64SymbolName; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 2c20be155f..758b8c62b4 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1172,7 +1172,13 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); - const char* fn_name = runtime::kEigenFftSymbolName; + + bool multi_threaded_eigen = + hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen(); + const char* fn_name = multi_threaded_eigen + ? runtime::kEigenFftSymbolName + : runtime::kEigenSingleThreadedFftSymbolName; + llvm::Function* fft_func = llvm::cast( module_->getOrInsertFunction(fn_name, fft_type)); fft_func->setCallingConv(llvm::CallingConv::C); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h index 984cb0616e..0bf693edd0 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h @@ -21,8 +21,6 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/numeric_types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/types.h" // 'tensorflow' namespace is used so that int64 and other types don't require @@ -71,11 +69,9 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; - TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = fft_shape[i]; out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; - temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -88,8 +84,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank); // Compute the full FFT using a temporary tensor. - Tensor temp(DataTypeToEnum::v(), temp_shape); - auto full_fft = temp.flat_inner_dims(); + Eigen::Tensor full_fft(in_dims); + const Eigen::DSizes zero_start_indices; full_fft.device(device) = input.template fft(axes); @@ -112,11 +108,9 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; - TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; out_dims[i + 1] = fft_shape[i]; - temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -129,8 +123,7 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, // region we will slice from input given fft_shape. 
We slice input to // fft_shape on its inner-most dimensions, except the last (which we // slice to fft_shape[-1] / 2 + 1). - Tensor temp(DataTypeToEnum::v(), temp_shape); - auto full_fft = temp.flat_inner_dims(); + Eigen::Tensor full_fft(out_dims); // Calculate the starting point and range of the source of // negative frequency part. @@ -179,7 +172,6 @@ template void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, int32 fft_type, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { - CHECK(::xla::FftType_IsValid(fft_type)) << fft_type; switch (fft_type) { case ::xla::FftType::FFT: EigenFftC2C( @@ -204,7 +196,8 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, input_batch, fft_length0, fft_length1, fft_length2); break; default: - LOG(FATAL) << "Unsupported FFT type: " << fft_type; + // Unsupported FFT type + abort(); } } @@ -230,7 +223,8 @@ void EigenFftImpl(const EigenDevice& device, void* out, void* operand, fft_length1, fft_length2); break; default: - LOG(FATAL) << "Unsupported FFT rank " << fft_rank; + // Unsupported FFT rank + abort(); } } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc new file mode 100644 index 0000000000..2613ddb127 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" + +#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h" +#include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/types.h" + +using tensorflow::int32; +using tensorflow::int64; + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft( + const void* run_options_ptr, void* out, void* operand, int32 fft_type, + int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, + int64 fft_length2) { + tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, fft_type, + fft_rank, input_batch, fft_length0, fft_length1, + fft_length2); +} diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h new file mode 100644 index 0000000000..dcd133d012 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ + +#include "tensorflow/core/platform/types.h" + +extern "C" { + +extern void __xla_cpu_runtime_EigenSingleThreadedFft( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, + void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, + tensorflow::int64 input_batch, tensorflow::int64 fft_length0, + tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); + +} // extern "C" + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 8d8c5e4c44..c4c90515ac 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -38,6 +38,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h" +#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h" #include "tensorflow/compiler/xla/types.h" @@ -202,6 +203,7 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32); + REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedFft); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64); diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index d3bc47e61e..2515222cf2 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -204,7 +204,7 @@ class LayoutPattern { // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. constexpr LayoutPattern> EqualTo( - const Layout* layout) const { + const ::xla::Layout* layout) const { return LayoutPattern>( LayoutPatternEqualImpl(impl_, layout), matched_layout_); } diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc index e536c8afbf..77bdcc9de0 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc @@ -30,10 +30,17 @@ limitations under the License. 
namespace xla { +TupleSimplifier::TupleSimplifier(bool exclude_entry_computation) : + exclude_entry_computation_(exclude_entry_computation) {} + StatusOr TupleSimplifier::Run(HloModule* module) { // Initially add all GTE and Tuple instructions to the worklist. std::queue worklist; for (auto* computation : module->computations()) { + if (exclude_entry_computation_ && + computation == module->entry_computation()) { + continue; + } for (auto* instruction : computation->instructions()) { if (instruction->opcode() == HloOpcode::kTuple || instruction->opcode() == HloOpcode::kGetTupleElement) { diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.h b/tensorflow/compiler/xla/service/tuple_simplifier.h index e5e9b10b5b..7509501883 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.h +++ b/tensorflow/compiler/xla/service/tuple_simplifier.h @@ -27,13 +27,20 @@ namespace xla { // the module. class TupleSimplifier : public HloPassInterface { public: - TupleSimplifier() {} + TupleSimplifier() : TupleSimplifier(/*exclude_entry_computation=*/false) {} + explicit TupleSimplifier(bool exclude_entry_computation); ~TupleSimplifier() override {} tensorflow::StringPiece name() const override { return "tuple-simplifier"; } // Run tuple simplification on the given computation. Returns whether the // computation was changed. StatusOr Run(HloModule* module) override; + + private: + // When set, this pipeline stage will perform optimization of all computations + // apart from the module's entry computation. This is used by Graphcore's + // backend. + bool exclude_entry_computation_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc index ca9ae91281..d3635eae81 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc @@ -42,6 +42,12 @@ class TupleSimplifierTest : public HloTestBase { TF_ASSERT_OK(changed_status.status()); EXPECT_EQ(change_expected, changed_status.ValueOrDie()); } + void Run(HloModule* module, bool change_expected, bool exclude_entry) { + TupleSimplifier simplifier(exclude_entry); + auto changed_status = simplifier.Run(module); + TF_ASSERT_OK(changed_status.status()); + EXPECT_EQ(change_expected, changed_status.ValueOrDie()); + } const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); const Shape tuple_shape_ = ShapeUtil::MakeTupleShape( @@ -211,5 +217,76 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) { EXPECT_THAT(computation->root_instruction(), tuple); } +TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) { + // Verify that the root computation can be excluded + auto module = CreateNewModule(); + + HloInstruction* p0; + HloInstruction* p1; + HloComputation* c0; + HloComputation* c1; + HloComputation* entry; + + { + HloComputation::Builder builder(TestName() + "_1"); + p0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 1)); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 2)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); + + c0 = module->AddEmbeddedComputation(builder.Build()); + } + { + HloComputation::Builder builder(TestName() + "_2"); + p1 = 
builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 1)); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 2)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); + + c1 = module->AddEmbeddedComputation(builder.Build()); + } + { + HloComputation::Builder builder(TestName() + "_Entry"); + HloInstruction* tuple_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* call0 = builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c0)); + HloInstruction* call1 = builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c1)); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, call0, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, call1, 1)); + HloInstruction* tuple0 = + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1})); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 0)); + HloInstruction* gte3 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 1)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte2, gte3})); + + entry = module->AddEntryComputation(builder.Build()); + } + + Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/ true); + + EXPECT_THAT(c0->root_instruction(), p0); + EXPECT_THAT(c1->root_instruction(), p1); + EXPECT_THAT(entry->instruction_count(), 9); +} + } // namespace } // namespace xla diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py index 8fd83ef376..361cf2d77c 100644 --- a/tensorflow/contrib/autograph/__init__.py +++ b/tensorflow/contrib/autograph/__init__.py @@ -23,6 +23,7 @@ from __future__ import print_function # TODO(mdan): Bring only the relevant symbols to the top level. 
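+# 'operators' provides the overloaded runtime counterparts that converted
+# code calls into; importing it here makes them reachable from the package's
+# top level.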
from tensorflow.contrib.autograph import utils +from tensorflow.contrib.autograph import operators from tensorflow.contrib.autograph.impl.api import convert from tensorflow.contrib.autograph.impl.api import converted_call from tensorflow.contrib.autograph.impl.api import do_not_convert @@ -43,6 +44,8 @@ _allowed_symbols = [ 'do_not_convert', 'to_code', 'to_graph', + # Overloaded operators + 'operators', # Python language "extensions" 'set_element_type', 'set_loop_options', diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake index bda5e26f43..2e0a2fcef4 100644 --- a/tensorflow/contrib/cmake/tf_c.cmake +++ b/tensorflow/contrib/cmake/tf_c.cmake @@ -37,13 +37,15 @@ add_dependencies( tf_core_lib tf_protos_cc) -add_library(tf_c_python_api OBJECT - "${tensorflow_source_dir}/tensorflow/c/python_api.cc" - "${tensorflow_source_dir}/tensorflow/c/python_api.h" -) -add_dependencies( - tf_c_python_api - tf_c - tf_core_lib - tf_core_framework - tf_protos_cc) +if(tensorflow_BUILD_PYTHON_BINDINGS) + add_library(tf_c_python_api OBJECT + "${tensorflow_source_dir}/tensorflow/c/python_api.cc" + "${tensorflow_source_dir}/tensorflow/c/python_api.h" + ) + add_dependencies( + tf_c_python_api + tf_c + tf_core_lib + tf_core_framework + tf_protos_cc) +endif() diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake index f73da0b8ab..6c90cf398c 100644 --- a/tensorflow/contrib/cmake/tf_cc_ops.cmake +++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake @@ -155,7 +155,7 @@ if (WIN32) set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib") endif() else (WIN32) - set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so") + set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}") endif (WIN32) add_custom_target(tf_extension_ops) diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index a0c3ddd28b..9244604489 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -715,7 +715,7 @@ if(WIN32) endif() else() add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so) endif() @@ -832,7 +832,6 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/) - add_custom_command(TARGET tf_python_build_pip_package POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README ${CMAKE_CURRENT_BINARY_DIR}/tf_python/) diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py index cffe069aa3..4f957f1e0b 100644 --- a/tensorflow/contrib/cmake/tools/create_def_file.py +++ b/tensorflow/contrib/cmake/tools/create_def_file.py @@ -44,7 +44,8 @@ UNDNAME = "undname.exe" DUMPBIN = "dumpbin.exe" # Exclude if matched -EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::") 
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::|Internal|" + r"python_op_gen_internal|grappler") # Include if matched before exclude INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" @@ -56,6 +57,10 @@ INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" r"tensorflow::ops::internal::Enter|" r"tensorflow::strings::internal::AppendPieces|" r"tensorflow::strings::internal::CatPieces|" + r"tensorflow::errors::Internal|" + r"tensorflow::Tensor::CopyFromInternal|" + r"tensorflow::kernel_factory::" + r"OpKernelRegistrar::InitInternal|" r"tensorflow::io::internal::JoinPathImpl") # Include if matched after exclude @@ -64,7 +69,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|" r"tensorflow::|" r"functor::|" r"\?nsync_|" - r"perftools::gputools") + r"stream_executor::") # We want to identify data members explicitly in the DEF file, so that no one # can implicitly link against the DLL if they use one of the variables exported diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py index 45760a29ee..795f1993ba 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py @@ -151,16 +151,24 @@ class SinhArcsinhBijectorTest(test.TestCase): self.assertAllClose(y, bijector.forward(x).eval(), rtol=1e-4, atol=0.) self.assertAllClose(x, bijector.inverse(y).eval(), rtol=1e-4, atol=0.) - # Do the numpy calculation in float128 to avoid inf/nan. - y_float128 = np.float128(y) - self.assertAllClose( - np.log(np.cosh( - np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( - y_float128**2 + 1)) - - np.log(tailweight), - bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), - rtol=1e-4, - atol=0.) + # On IBM PPC systems, longdouble (np.float128) has the same range as double, even though it + # can carry more precision. An 8-byte double cannot hold the square of the largest float64 + # values, so the test below would overflow to inf. This check avoids that error by skipping + # the square calculation and the corresponding assert when the values are too large. + + if np.amax(y) <= np.sqrt(np.finfo(np.float128).max) and \ + np.fabs(np.amin(y)) <= np.sqrt(np.fabs(np.finfo(np.float128).min)): + + # Do the numpy calculation in float128 to avoid inf/nan. + y_float128 = np.float128(y) + self.assertAllClose( + np.log(np.cosh( + np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt( + y_float128**2 + 1)) - + np.log(tailweight), + bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), + rtol=1e-4, + atol=0.)
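+    # The forward and inverse log-det-Jacobians of a bijector are negatives
+    # of each other at corresponding points y = f(x); the check below
+    # verifies that identity directly, without needing float128.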
self.assertAllClose( -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(), bijector.forward_log_det_jacobian(x, event_ndims=0).eval(), diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py index d7909dd5a2..adf92c27ea 100644 --- a/tensorflow/contrib/eager/python/datasets.py +++ b/tensorflow/contrib/eager/python/datasets.py @@ -106,7 +106,8 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase): target_device=target, buffer_size=10, container="", - shared_name=_generate_shared_name("function_buffer_resource")) + shared_name=_generate_shared_name( + "contrib_eager_iterator_function_buffer_resource")) self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter( # pylint: disable=line-too-long handle=self._buffer_resource_handle, handle_device=self._device) diff --git a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb index 4fe3a0e3f3..5749f22ac5 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb @@ -68,7 +68,7 @@ "# simply construct the object. Most layers take as a first argument the number\n", "# of output dimensions / channels.\n", "layer = tf.keras.layers.Dense(100)\n", - "# The number of input dimensionss is often unnecessary, as it can be inferred\n", + "# The number of input dimensions is often unnecessary, as it can be inferred\n", "# the first time the layer is used, but it can be provided if you want to \n", "# specify it manually, which is useful in some complex models.\n", "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))" @@ -267,7 +267,7 @@ " * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n", " * `call`, where you do the forward computation\n", "\n", - "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes requires to create the variables will need to be explicitly specified." + "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified." ] }, { diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index 84a413c791..05bcdac2ca 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -346,7 +346,8 @@ def sequence_numeric_column( key, shape=(1,), default_value=0., - dtype=dtypes.float32): + dtype=dtypes.float32, + normalizer_fn=None): """Returns a feature column that represents sequences of numeric data. 
Example: @@ -370,6 +371,12 @@ def sequence_numeric_column( default_value: A single value compatible with `dtype` that is used for padding the sparse data into a dense `Tensor`. dtype: The type of values. + normalizer_fn: If not `None`, a function that can be used to normalize the + value of the tensor after `default_value` is applied for parsing. + Normalizer function takes the input `Tensor` as its argument, and returns + the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that + even though the most common use case of this function is normalization, it + can be used for any kind of Tensorflow transformations. Returns: A `_SequenceNumericColumn`. @@ -383,12 +390,16 @@ def sequence_numeric_column( if not (dtype.is_integer or dtype.is_floating): raise ValueError('dtype must be convertible to float. ' 'dtype: {}, key: {}'.format(dtype, key)) + if normalizer_fn is not None and not callable(normalizer_fn): + raise TypeError( + 'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn)) return _SequenceNumericColumn( key, shape=shape, default_value=default_value, - dtype=dtype) + dtype=dtype, + normalizer_fn=normalizer_fn) def _assert_all_equal_and_return(tensors, name=None): @@ -407,7 +418,7 @@ class _SequenceNumericColumn( fc._SequenceDenseColumn, collections.namedtuple( '_SequenceNumericColumn', - ['key', 'shape', 'default_value', 'dtype'])): + ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])): """Represents sequences of numeric data.""" @property @@ -419,7 +430,10 @@ class _SequenceNumericColumn( return {self.key: parsing_ops.VarLenFeature(self.dtype)} def _transform_feature(self, inputs): - return inputs.get(self.key) + input_tensor = inputs.get(self.key) + if self.normalizer_fn is not None: + input_tensor = self.normalizer_fn(input_tensor) + return input_tensor @property def _variable_shape(self): diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py index ee74cf56dc..45d7b74046 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test from tensorflow.python.training import monitored_session @@ -947,6 +948,7 @@ class SequenceNumericColumnTest(test.TestCase): self.assertEqual((1,), a.shape) self.assertEqual(0., a.default_value) self.assertEqual(dtypes.float32, a.dtype) + self.assertIsNone(a.normalizer_fn) def test_shape_saved_as_tuple(self): a = sfc.sequence_numeric_column('aaa', shape=[1, 2]) @@ -965,6 +967,10 @@ class SequenceNumericColumnTest(test.TestCase): ValueError, 'dtype must be convertible to float'): sfc.sequence_numeric_column('aaa', dtype=dtypes.string) + def test_normalizer_fn_must_be_callable(self): + with self.assertRaisesRegexp(TypeError, 'must be a callable'): + sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable') + def test_get_sequence_dense_tensor(self): sparse_input = sparse_tensor.SparseTensorValue( # example 0, values [[0.], [1]] @@ -985,6 +991,41 @@ class SequenceNumericColumnTest(test.TestCase): self.assertAllEqual( expected_dense_tensor, 
dense_tensor.eval(session=sess)) + def test_get_sequence_dense_tensor_with_normalizer_fn(self): + + def _increment_two(input_sparse_tensor): + return sparse_ops.sparse_add( + input_sparse_tensor, + sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2)) + ) + + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, values [[0.], [1]] + # example 1, [[10.]] + indices=((0, 0), (0, 1), (1, 0)), + values=(0., 1., 10.), + dense_shape=(2, 2)) + + # Before _increment_two: + # [[0.], [1.]], + # [[10.], [0.]], + # After _increment_two: + # [[2.], [1.]], + # [[10.], [2.]], + expected_dense_tensor = [ + [[2.], [1.]], + [[10.], [2.]], + ] + numeric_column = sfc.sequence_numeric_column( + 'aaa', normalizer_fn=_increment_two) + + dense_tensor, _ = numeric_column._get_sequence_dense_tensor( + _LazyBuilder({'aaa': sparse_input})) + + with monitored_session.MonitoredSession() as sess: + self.assertAllEqual( + expected_dense_tensor, dense_tensor.eval(session=sess)) + def test_get_sequence_dense_tensor_with_shape(self): """Tests get_sequence_dense_tensor with shape !=(1,).""" sparse_input = sparse_tensor.SparseTensorValue( diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py index daba965a98..484ffee3e7 100644 --- a/tensorflow/contrib/ffmpeg/__init__.py +++ b/tensorflow/contrib/ffmpeg/__init__.py @@ -28,7 +28,6 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio -from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.python.util.all_util import remove_undocumented diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py index 020b5c99c6..b1b5126d9e 100644 --- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py +++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py @@ -21,7 +21,6 @@ from __future__ import print_function from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py -from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.util import loader from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index 10d1ecc738..dc49383c5c 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -119,14 +119,13 @@ from tensorflow.python.framework.smart_cond import smart_cond from tensorflow.python.framework.smart_cond import smart_constant_value from tensorflow.python.framework.tensor_spec import BoundedTensorSpec from tensorflow.python.framework.tensor_spec import TensorSpec -from tensorflow.python.ops.array_ops import broadcast_to from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d from tensorflow.python.util.all_util import remove_undocumented -_allowed_symbols = ['nest', 'broadcast_to'] +_allowed_symbols = ['nest'] _nest_allowed_symbols = [ 'assert_same_structure', 'is_sequence', diff --git 
a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py index 65cb94b5a4..a955e21b72 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py @@ -301,8 +301,8 @@ class FusedConv2DBiasActivationTest(test.TestCase): conv = tensors[i] value = values[i] ref_value = ref_values[i] - print("expected = ", ref_value) - print("actual = ", value) + tf_logging.info("expected = %s", ref_value) + tf_logging.info("actual = %s", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -843,7 +843,8 @@ class FusedConvInt8Tests(test.TestCase): vertical_stride, padding_type) output_width = CalculateConvolvedOutputDim(input_width, filter_width, horizontal_stride, padding_type) - print("output_height=", output_height, ", output_width=", output_width) + tf_logging.info("output_height=%s, output_width=%s", output_height, + output_width) side_input, _, _ = gen_array_ops.quantize_v2( random_ops.random_uniform( @@ -880,8 +881,8 @@ class FusedConvInt8Tests(test.TestCase): with self.test_session( use_gpu=True, config=NoMemoryOptimizationConfig()) as sess: actual_y, expected_y = sess.run([actual, expected]) - print("actual_y = ", actual_y) - print("expected_y = ", expected_y) + tf_logging.info("actual_y = %s", actual_y) + tf_logging.info("expected_y = %s", expected_y) self.assertTrue(np.array_equal(actual_y, expected_y)) def testFusedConvInt8(self): diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c index 6a5d982dc8..2e5c84704f 100644 --- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c +++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c @@ -19,7 +19,7 @@ limitations under the License. #include "hexagon_controller.h" -#include +#include #include #include "adspmsgd.h" diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh index 436c3e1d4c..840015a7fa 100755 --- a/tensorflow/contrib/lite/download_dependencies.sh +++ b/tensorflow/contrib/lite/download_dependencies.sh @@ -30,9 +30,7 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once -# the archive has been propagated in mirror.bazel.build.
-GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip" diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc index 106e3b0270..8b0ace96cc 100644 --- a/tensorflow/contrib/lite/examples/minimal/minimal.cc +++ b/tensorflow/contrib/lite/examples/minimal/minimal.cc @@ -38,7 +38,7 @@ using namespace tflite; int main(int argc, char *argv[]) { if(argc != 2) { - fprintf(stderr, "Usage: %s \n"); + fprintf(stderr, "minimal \n"); return 1; } const char* filename = argv[1]; diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md index bb2e615eac..965273f0f0 100644 --- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md +++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md @@ -128,7 +128,6 @@ TensorFlow operation not listed above are likely unsupported. Notably, the following common ops are not supported at the moment: * [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space) -* [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather) * [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear) * [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh) @@ -306,6 +305,19 @@ Options { } ``` +**GATHER** + +``` +Inputs { + 0: params tensor + 1: indices tensor + 2: axis tensor (optional) +} +Outputs { + 0: a tensor with same type as the params tensor. +} +``` + **GREATER** ``` diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md index 5efa70987e..26349347fa 100644 --- a/tensorflow/contrib/lite/java/ovic/README.md +++ b/tensorflow/contrib/lite/java/ovic/README.md @@ -2,7 +2,7 @@ This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018) -## Pre-requesits +## Pre-requisite Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK. @@ -49,7 +49,7 @@ Once you have a submission that follows the instructions from the [competition s You can call the validator binary below to verify that your model fits the format requirements. This often helps you to catch size mismatches (e.g. output should be [1, 1001] instead of [1,1,1,1001]). 
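One way to inspect those shapes before running the validator is to load the candidate model with the TFLite Python interpreter and print its output details. A minimal sketch (an aside, not part of the validator itself; the model path is illustrative):

```python
# Sketch: sanity-check a submission's output shape with the TFLite
# Python interpreter before calling the OVIC validator.
from tensorflow.contrib.lite.python import interpreter as interpreter_wrapper

interpreter = interpreter_wrapper.Interpreter(model_path='/path/to/my_model.lite')
interpreter.allocate_tensors()
for detail in interpreter.get_output_details():
    print(detail['shape'])  # expect [1, 1001], not [1, 1, 1, 1001]
```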
Let say the submission file is located at `/path/to/my_model.lite`, then call: ```sh -bazel build --cxxopt--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all +bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all bazel-bin/tensorflow/contrib/lite/java/ovic/ovic_validator /path/to/my_model.lite ``` diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index a2f192bbc2..1908f7fa6c 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -1934,7 +1934,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // The quantization of the input, output arrays is as follows: // - The input activations are quantized as uint8 on the interval // [-1, 127/128]. -// The rationale for that is that that is the natural interval for output +// The rationale for that is that is the natural interval for output // activations (see next point) and these need to be concatenated together. // We could accommodate different ranges by re-scaling, but we empirically // found that setting the input activations range to be [-1, 127/128] in the @@ -1999,7 +1999,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // However, for a fixed-point implementation in 16-bit integers, using 5 // integer bits to represent the [-16, 16] range would leave only 11 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive -// representable values. Notice that that is higher than the +// representable values. Notice that is higher than the // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic. // Using [-8, 8] thus seems like the better compromise overall, enjoying // an increment of 2.4e-4 between representable values and a worst-case diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py index 9400e757b9..fd90823425 100644 --- a/tensorflow/contrib/lite/python/interpreter.py +++ b/tensorflow/contrib/lite/python/interpreter.py @@ -55,7 +55,7 @@ class Interpreter(object): elif model_content and not model_path: self._interpreter = ( _interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer( - model_content, len(model_content))) + model_content)) if not self._interpreter: raise ValueError( 'Failed to create model from {} bytes'.format(len(model_content))) diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc index f705551fcb..b283551c45 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc @@ -397,9 +397,14 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile( } InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer( - const char* data, size_t len) { + PyObject* data) { + char * buf = nullptr; + Py_ssize_t length; + if (PY_TO_CPPSTRING(data, &buf, &length) == -1) { + return nullptr; + } std::unique_ptr model = - tflite::FlatBufferModel::BuildFromBuffer(data, len); + tflite::FlatBufferModel::BuildFromBuffer(buf, length); return model ? 
new InterpreterWrapper(std::move(model)) : nullptr; } diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h index b0ed7c4559..cbeb53bee7 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h @@ -40,8 +40,7 @@ class InterpreterWrapper { static InterpreterWrapper* CreateWrapperCPPFromFile(const char* model_path); // SWIG caller takes ownership of pointer. - static InterpreterWrapper* CreateWrapperCPPFromBuffer(const char* data, - size_t len); + static InterpreterWrapper* CreateWrapperCPPFromBuffer(PyObject* data); ~InterpreterWrapper(); bool AllocateTensors(); diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py index 0913cd2c5c..88dda7290b 100644 --- a/tensorflow/contrib/lite/python/lite.py +++ b/tensorflow/contrib/lite/python/lite.py @@ -34,6 +34,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from six import PY3 + from google.protobuf import text_format as _text_format from google.protobuf.message import DecodeError from tensorflow.contrib.lite.python import lite_constants as constants @@ -54,6 +56,7 @@ from tensorflow.python.framework.importer import import_graph_def from tensorflow.python.ops.variables import global_variables_initializer from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import tag_constants +# from tensorflow.python.util.all_util import remove_undocumented class TocoConverter(object): @@ -203,6 +206,12 @@ class TocoConverter(object): except (_text_format.ParseError, DecodeError): try: print("Ignore 'tcmalloc: large alloc' warnings.") + + if not isinstance(file_content, str): + if PY3: + file_content = file_content.decode('utf-8') + else: + file_content = file_content.encode('utf-8') _text_format.Merge(file_content, graph_def) except (_text_format.ParseError, DecodeError): raise ValueError( @@ -382,3 +391,5 @@ def _freeze_graph(sess, output_tensors): output_arrays) else: return sess.graph_def + +# remove_undocumented(__name__) diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index 4465f953ba..caca199d2e 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -178,7 +178,7 @@ ArrayDataType ConvertDataType(tensorflow::DataType dtype) { else if (dtype == DT_STRING) return ArrayDataType::kString; else - LOG(INFO) << "Unsupported data type in placehoder op: " << dtype; + LOG(INFO) << "Unsupported data type in placeholder op: " << dtype; return ArrayDataType::kNone; } diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc index 1b21c8bc60..de76fd4032 100644 --- a/tensorflow/contrib/lite/toco/toco_port.cc +++ b/tensorflow/contrib/lite/toco/toco_port.cc @@ -20,6 +20,12 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" +#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__) +namespace std { +double round(double x) { return ::round(x); } +} // namespace std +#endif + namespace toco { namespace port { void CopyToBuffer(const string& src, char* dest) { diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h index 5c019cb2bf..17f82b9dd7 100644 --- a/tensorflow/contrib/lite/toco/toco_port.h +++ b/tensorflow/contrib/lite/toco/toco_port.h @@ -34,6 +34,24 @@ limitations under the License. #define TFLITE_PROTO_NS google::protobuf #endif +#ifdef __ANDROID__ +#include +namespace std { + +template +std::string to_string(T value) +{ + std::ostringstream os ; + os << value ; + return os.str() ; +} + +#ifdef __ARM_ARCH_7A__ +double round(double x); +#endif +} +#endif + namespace toco { namespace port { diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh index e8c6edd7ba..a28fc3a87f 100755 --- a/tensorflow/contrib/makefile/compile_nsync.sh +++ b/tensorflow/contrib/makefile/compile_nsync.sh @@ -270,7 +270,7 @@ for arch in $archs; do PLATFORM_LDFLAGS=-pthread MKDEP=${CC} -M -std=c++11 PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \ - ../../platform/c++11/src/per_thread_waiter.cc \ + ../../platform/posix/src/per_thread_waiter.c \ ../../platform/c++11/src/yield.cc \ ../../platform/c++11/src/time_rep_timespec.cc \ ../../platform/c++11/src/nsync_panic.cc diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index eff9081e35..48953e2e38 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -27,9 +27,7 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once -# the archive has been propagated in mirror.bazel.build. -GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 2ed99d50a4..a6be2084aa 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -2503,7 +2503,7 @@ def _compute_recall_at_precision(tp, fp, fn, precision, name): name: An optional variable_scope name. Returns: - The recall at a the given `precision`. + The recall at a given `precision`. 
""" precisions = math_ops.div(tp, tp + fp + _EPSILON) tf_index = math_ops.argmin( diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.h b/tensorflow/contrib/mpi_collectives/kernels/ring.h index 1d56d588bc..c001615d3f 100644 --- a/tensorflow/contrib/mpi_collectives/kernels/ring.h +++ b/tensorflow/contrib/mpi_collectives/kernels/ring.h @@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI(); * has the fully accumulated Segment 1; and so on. The scatter-reduce is * complete. * - * Next, the allgather distributes these fully accumululated chunks across all + * Next, the allgather distributes these fully accumulated chunks across all * nodes. Communication proceeds in the same ring, once again in N-1 steps. At * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i). * For example, at the first iteration, the following transfers will occur: diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py index 21bf3f5313..915e6504e1 100644 --- a/tensorflow/contrib/opt/python/training/adamax_test.py +++ b/tensorflow/contrib/opt/python/training/adamax_test.py @@ -224,8 +224,10 @@ class AdaMaxOptimizerTest(test.TestCase): var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0), + rtol=1e-2) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1), + rtol=1e-2) if use_resource: self.assertEqual("var0_%d/AdaMax:0" % (i,), opt.get_slot(var=var0, name="m").name) diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py index a7c97a1da2..b6b10e500b 100644 --- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py +++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py @@ -62,7 +62,7 @@ class ModelAverageCustomGetter(object): """ def __init__(self, worker_device): - """Create a new `ElasticAverageCustomGetter`. + """Create a new `ModelAverageCustomGetter`. Args: worker_device: String. Name of the `worker` job. 
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD index 6ca7fe8b6e..f2171efc95 100644 --- a/tensorflow/contrib/periodic_resample/BUILD +++ b/tensorflow/contrib/periodic_resample/BUILD @@ -6,12 +6,13 @@ exports_files(["LICENSE"]) load( "//tensorflow:tensorflow.bzl", - "py_test", + "tf_cc_test", "tf_gen_op_libs", "tf_custom_op_library", "tf_custom_op_py_library", "tf_gen_op_wrapper_py", ) +load("//tensorflow:tensorflow.bzl", "py_test") cc_library( name = "all_ops", @@ -84,6 +85,22 @@ py_test( ":init_py", "//tensorflow/contrib/util:util_py", "//tensorflow/python:framework_test_lib", + "//tensorflow/python:gradient_checker", + ], +) + +tf_cc_test( + name = "periodic_resample_op_cc_test", + size = "small", + srcs = [ + "ops/array_ops_test.cc", + ], + deps = [ + ":all_ops", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", ], ) diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc index e18923c8aa..514689cf45 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc @@ -22,4 +22,9 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU), PeriodicResampleOp); + +REGISTER_KERNEL_BUILDER(Name("PeriodicResampleOpGrad") + .Device(DEVICE_CPU), + PeriodicResampleOpGrad); + } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h index 3ab588c458..42fba81a5c 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h @@ -25,92 +25,202 @@ #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/work_sharder.h" namespace { -template -IndexT compute_input_index( - IndexVecT* target_dimensions, const IndexT& output_index, - const IndexVecT& original_dimensions, const int& adjustable_dimension, - const std::vector& dimension_ceiling, - const std::vector& cumulative_dimensions, IndexT* result, - std::vector* output_indices, const int& rank) { - *result = 0; - output_indices->clear(); +// Computes input tensor index for given output index during forward +// propagation through periodic_resample operation. +class InputIndexer { + public: + InputIndexer(const std::vector& output_dimensions, + const tensorflow::TensorShape& input_shape, + int adjustable_dimension) + : output_dimensions_(output_dimensions), + adjustable_dimension_(adjustable_dimension), + rank_(input_shape.dims()), + linear_output_index_(0), + linear_input_index_(0), + adjustable_dimension_carriage_sum_(0) { + auto input_dimensions = TensorShapeToVector(input_shape); + // factors by which input_dimensions increases/decreases w.r.t. 
+ // output_dimensions + dimension_ceiling_ = + ComputeDimensionCeiling(output_dimensions, input_dimensions); + cumulative_dimensions_ = ComputeCumulativeDimensions(); + + output_indices_.resize(output_dimensions_.size()); + input_indices_.resize(output_dimensions_.size()); + + // Compute index_factors + index_factors_.resize(rank_); + tensorflow::int64 last_index_factor = 1; + for (auto r = rank_ - 1; r >= 0; --r) { + index_factors_[r] = last_index_factor; + last_index_factor *= input_dimensions[r]; + } + } + + tensorflow::int64 linear_input_index() const { return linear_input_index_; } + + void MoveToOutputIndex(tensorflow::int64 output_index); + void IncrementOutputIndex(); + + private: + void RecomputeInputAdjustableDimensionIndex() { + tensorflow::int64 index = adjustable_dimension_carriage_sum_; + index *= output_dimensions_[adjustable_dimension_]; + index += output_indices_[adjustable_dimension_]; + input_indices_[adjustable_dimension_] = index; + } + + std::vector TensorShapeToVector( + const tensorflow::TensorShape& tensor_shape); + + std::vector ComputeDimensionCeiling( + const std::vector& output_dimensions, + const std::vector& input_dimensions); + + std::vector ComputeCumulativeDimensions(); + + const std::vector output_dimensions_; + std::vector dimension_ceiling_; + std::vector index_factors_; + std::vector cumulative_dimensions_; + std::vector output_indices_; + std::vector input_indices_; + + const int adjustable_dimension_; + const int rank_; + tensorflow::int64 linear_output_index_; + tensorflow::int64 linear_input_index_; + tensorflow::int64 adjustable_dimension_carriage_sum_; +}; + +void InputIndexer::MoveToOutputIndex(tensorflow::int64 output_index) { + linear_output_index_ = output_index; + linear_input_index_ = 0; // un-rasterize the output index auto last_reduced_i = output_index; - for (auto r = rank - 1; r >= 0; --r) { - (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r]; + for (auto r = rank_ - 1; r >= 0; --r) { + output_indices_[r] = last_reduced_i % output_dimensions_[r]; last_reduced_i = - (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r]; + (last_reduced_i - output_indices_[r]) / output_dimensions_[r]; } + tensorflow::int64 carriage_sum = 0; + for (int qi = 0; qi < rank_; ++qi) { + if (qi == adjustable_dimension_) continue; + carriage_sum += cumulative_dimensions_[qi] * + (output_indices_[qi] % dimension_ceiling_[qi]); + } + adjustable_dimension_carriage_sum_ = carriage_sum; + // rasterize the input index - IndexT last_index_factor = 1; - for (auto r = rank - 1; r >= 0; --r) { - IndexT index = 0; - if (r != adjustable_dimension) - index = (*output_indices)[r] / dimension_ceiling[r]; - else { - for (int qi = 0; qi < rank; ++qi) { - if (qi == adjustable_dimension) continue; - index += cumulative_dimensions[qi] * - ((*output_indices)[qi] % dimension_ceiling[qi]); - } - index *= (*target_dimensions)[adjustable_dimension]; - index += (*output_indices)[r]; + for (auto r = rank_ - 1; r >= 0; --r) { + if (r != adjustable_dimension_) { + input_indices_[r] = output_indices_[r] / dimension_ceiling_[r]; + } else { + RecomputeInputAdjustableDimensionIndex(); } - *result += last_index_factor * index; - last_index_factor *= original_dimensions[r]; } + for (auto r = rank_ - 1; r >= 0; --r) { + linear_input_index_ += index_factors_[r] * input_indices_[r]; + } +} + +void InputIndexer::IncrementOutputIndex() { + linear_output_index_++; + for (auto r = rank_ - 1; r >= 0; --r) { + auto old_carriage_sum_increment = + cumulative_dimensions_[r] * + 
(output_indices_[r] % dimension_ceiling_[r]); + output_indices_[r] = (output_indices_[r] + 1) % output_dimensions_[r]; + if (r != adjustable_dimension_) { + auto new_input_index = output_indices_[r] / dimension_ceiling_[r]; + linear_input_index_ += + (new_input_index - input_indices_[r]) * index_factors_[r]; + + input_indices_[r] = new_input_index; + + auto new_carriage_sum_increment = + cumulative_dimensions_[r] * + (output_indices_[r] % dimension_ceiling_[r]); - return *result; + adjustable_dimension_carriage_sum_ = adjustable_dimension_carriage_sum_ - + old_carriage_sum_increment + + new_carriage_sum_increment; + } + + if (output_indices_[r] != 0) { + // No more carries to higher indices. + break; + } + } + auto old_adjustable_dimension_input_index = + input_indices_[adjustable_dimension_]; + RecomputeInputAdjustableDimensionIndex(); + linear_input_index_ += (input_indices_[adjustable_dimension_] - + old_adjustable_dimension_input_index) * + index_factors_[adjustable_dimension_]; } -template // both types are needed here b/c IndexVecT and - // InputDataT are not related - void - fill_periodic_tensor( - tensorflow::OpKernelContext* context, - const IndexVecT& desired_shape, - const tensorflow::Tensor& input_tensor) { - // input is a strided array (last index is fastest, C-ordered) - auto input = input_tensor.flat(); - const int rank = input_tensor.dims(); - // original and target dimensions - std::vector original_dimensions(rank), - target_dimensions(rank); - tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1); - // factors by which original_dimensions increases/decreases w.r.t. - // target_dimensions - std::vector dimension_ceiling(rank), - cumulative_dimensions(rank); - // index of adjustable dimension - int adjustable_dimension; - tensorflow::TensorShape output_shape; +std::vector InputIndexer::TensorShapeToVector( + const tensorflow::TensorShape& tensor_shape) { + std::vector result(tensor_shape.dims()); + int count = 0; + for (const auto dim_info : tensor_shape) { + result[count] = dim_info.size; + ++count; + } + return result; +} - // requires that the rank of the input tensor and length of the desired shape - // are equal - OP_REQUIRES(context, rank == desired_shape.size(), - tensorflow::errors::InvalidArgument( - "periodic_resample expects the rank of the input tensor, ", - rank, ", to be the same as the length of the desired shape, ", - desired_shape.size(), ".")); +std::vector InputIndexer::ComputeDimensionCeiling( + const std::vector& output_dimensions, + const std::vector& input_dimensions) { + std::vector dimension_ceiling(input_dimensions.size()); + for (size_t i = 0; i < input_dimensions.size(); ++i) { + dimension_ceiling[i] = (output_dimensions[i] + input_dimensions[i] - 1) / + input_dimensions[i]; + } + return dimension_ceiling; +} - bool found = false; - const auto& input_tensor_shape = input_tensor.shape(); +std::vector InputIndexer::ComputeCumulativeDimensions() { + std::vector cumulative_dimensions(rank_); + int count = 0; + for (int i = 0; i < rank_; ++i) { + if (count == 0) { + cumulative_dimensions[count] = 1; + } else { + cumulative_dimensions[count] = + cumulative_dimensions[count - 1] * dimension_ceiling_[count - 1]; + } + ++count; + } + return cumulative_dimensions; +} +template +void process_desired_shape(tensorflow::OpKernelContext* context, + const tensorflow::TensorShape& input_tensor_shape, + const IndexVecT& desired_shape, + int* adjustable_dimension, + std::vector* target_dimensions, + tensorflow::int64* output_size) { + 
tensorflow::int64 new_sliced_size = 1; + bool found = false; + const int rank = input_tensor_shape.dims(); for (int i = 0; i < rank; ++i) { - // if (desired_shape(i) < 1) { if (desired_shape[i] < 1) { // only one index can be adjustable OP_REQUIRES(context, !found, tensorflow::errors::InvalidArgument( "periodic_resample expects only " "one index to be marked as adjustable.")); - adjustable_dimension = i; + *adjustable_dimension = i; found = true; } else { OP_REQUIRES( @@ -122,9 +232,8 @@ template +void +do_periodic_resample_op(tensorflow::OpKernelContext* context, + const tensorflow::TensorShape& original_shape, + const tensorflow::PartialTensorShape& desired_shape, + const tensorflow::Tensor& source_tensor) { + const int rank = source_tensor.dims(); + + // requires that the rank of the input tensor and length of the desired shape + // are equal + OP_REQUIRES(context, rank == desired_shape.dims(), + tensorflow::errors::InvalidArgument( + "periodic_resample expects the rank of the input tensor, ", + rank, ", to be the same as the length of the desired shape, ", + desired_shape.dims(), ".")); + + std::vector target_dimensions(rank); + tensorflow::int64 new_size = 0; + // index of adjustable dimension + int adjustable_dimension = 0; + process_desired_shape(context, original_shape, desired_shape.dim_sizes(), + &adjustable_dimension, &target_dimensions, &new_size); // ensure that the new dimension is greater than zero OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0, @@ -160,11 +293,14 @@ template allocate_output(0, output_shape, &output_tensor)); auto output = output_tensor->flat(); - // memory is allocated for these variables outside the inner loop for - // efficiency (although, I could create a separate class scope for - // this purpose instead) - tensorflow::int64 result = 0; - std::vector output_indices(target_dimensions.size()); + // input is a strided array (last index is fastest, C-ordered) + auto input = source_tensor.flat(); // Fill output tensor with periodically resampled input tensor values - for (tensorflow::int64 output_index = 0; output_index < new_size; - ++output_index) { - output(output_index) = input(compute_input_index( - &target_dimensions, output_index, original_dimensions, - adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result, - &output_indices, rank)); - } + InputIndexer input_indexer(target_dimensions, original_shape, + adjustable_dimension); + + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + auto fill_output_tensor = [&input_indexer, &output, &input]( + tensorflow::int64 start, tensorflow::int64 limit) { + InputIndexer local_indexer(input_indexer); + local_indexer.MoveToOutputIndex(start); + for (tensorflow::int64 output_index = start; output_index < limit; + ++output_index) { + if (mode == Mode::kForward) { + output(output_index) = input(local_indexer.linear_input_index()); + } else { + output(local_indexer.linear_input_index()) = input(output_index); + } + local_indexer.IncrementOutputIndex(); + } + }; + ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers, + new_size, costPerFillIndex, fill_output_tensor); } +#define DATA_TYPE_SWITCH(data_type, context, CASE) \ + switch (data_type) { \ + CASE(float) \ + CASE(double) \ + CASE(tensorflow::int32) \ + CASE(tensorflow::int64) \ + default: \ + context->CtxFailure(__FILE__, __LINE__, \ + tensorflow::errors::InvalidArgument( \ + "Unsuppored tensor elements type")); \ + break; \ + } + void create_output_tensor( tensorflow::OpKernelContext* 
context, const tensorflow::Tensor& input_tensor, const tensorflow::DataType& input_tensor_type, - const tensorflow::PartialTensorShape& desired_shape_tensor) { - auto desired_shape = desired_shape_tensor.dim_sizes(); - - // obligatory type switch - switch (input_tensor_type) { - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, input_tensor); + const tensorflow::PartialTensorShape& desired_shape) { +#define CASE(type) \ + case tensorflow::DataTypeToEnum::value: \ + do_periodic_resample_op( \ + context, input_tensor.shape(), desired_shape, input_tensor); \ break; - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, input_tensor); - break; - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, - input_tensor); - break; - case tensorflow::DataTypeToEnum::value: - fill_periodic_tensor(context, desired_shape, - input_tensor); + + DATA_TYPE_SWITCH(input_tensor_type, context, CASE); +#undef CASE +} + +void create_grad_tensor(tensorflow::OpKernelContext* context, + const tensorflow::Tensor& grad_tensor, + const tensorflow::DataType& grad_tensor_type, + const tensorflow::TensorShape& original_shape, + const tensorflow::PartialTensorShape& desired_shape) { +#define CASE(type) \ + case tensorflow::DataTypeToEnum::value: \ + do_periodic_resample_op( \ + context, original_shape, desired_shape, grad_tensor); \ break; - default:; - } + + DATA_TYPE_SWITCH(grad_tensor_type, context, CASE); +#undef CASE } } // namespace @@ -238,4 +400,25 @@ class PeriodicResampleOp : public tensorflow::OpKernel { tensorflow::PartialTensorShape desired_shape; }; +class PeriodicResampleOpGrad : public tensorflow::OpKernel { + public: + explicit PeriodicResampleOpGrad(tensorflow::OpKernelConstruction* context) + : tensorflow::OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("original_shape", &original_shape)); + OP_REQUIRES_OK(context, context->GetAttr("desired_shape", &desired_shape)); + } + + void Compute(tensorflow::OpKernelContext* context) override { + const tensorflow::Tensor& grad_tensor = context->input(0); + const tensorflow::DataType grad_tensor_type = context->input_dtype(0); + create_grad_tensor(context, grad_tensor, grad_tensor_type, original_shape, + desired_shape); + } + + private: + tensorflow::TensorShape original_shape; + tensorflow::PartialTensorShape desired_shape; +}; + #endif // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_ diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc index 82bd796956..fd38cd09b4 100644 --- a/tensorflow/contrib/periodic_resample/ops/array_ops.cc +++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc @@ -26,7 +26,42 @@ REGISTER_OP("PeriodicResample") .Input("values: T") .Attr("shape: shape") .Output("output: T") - .SetShapeFn(shape_inference::ExplicitShape) + .SetShapeFn([](shape_inference::InferenceContext* c) { + tensorflow::PartialTensorShape desired_shape; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &desired_shape)); + shape_inference::ShapeHandle input_tensor_shape = c->input(0); + shape_inference::DimensionHandle num_input_elements = + c->NumElements(input_tensor_shape); + shape_inference::ShapeHandle result_shape_handle; + if (!shape_inference::InferenceContext::ValueKnown(num_input_elements)) { + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape( + desired_shape, &result_shape_handle)); + } else { + const int rank = c->Rank(input_tensor_shape); + std::vector target_dimensions(rank); 
+ tensorflow::int64 new_sliced_size = 1; + int adjustable_dimension = 0; + for (int i = 0; i < rank; ++i) { + if (desired_shape.dim_size(i) < 1) { + adjustable_dimension = i; + } else { + target_dimensions[i] = desired_shape.dim_size(i); + new_sliced_size *= target_dimensions[i]; + } + } + target_dimensions[adjustable_dimension] = + shape_inference::InferenceContext::Value( + num_input_elements) / new_sliced_size; + tensorflow::TensorShape result_shape; + for (int i = 0; i < rank; ++i) { + result_shape.AddDim(target_dimensions[i]); + } + TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape( + result_shape, &result_shape_handle)); + } + c->set_output(0, result_shape_handle); + return Status::OK(); + }) .Doc(R"doc( Periodically resample elements of a tensor to conform to `shape`. @@ -101,4 +136,20 @@ output: Periodically resampled tensor that has dimensions specified as in )doc"); + +REGISTER_OP("PeriodicResampleOpGrad") + .Attr("T: numbertype") + .Input("grad: T") + .Attr("original_shape: shape") + .Attr("desired_shape: shape") + .Output("grad_values: T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + tensorflow::TensorShape original_shape; + TF_RETURN_IF_ERROR(c->GetAttr("original_shape", &original_shape)); + shape_inference::ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(original_shape, &s)); + c->set_output(0, s); + return Status::OK(); +}); + } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc new file mode 100644 index 0000000000..43b7c1799f --- /dev/null +++ b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc @@ -0,0 +1,41 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/shape_inference_testutil.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +TEST(ArrayOpsTest, PeriodicResample_ShapeFn) { + ShapeInferenceTestOp op("PeriodicResample"); + // Case 1: output shape can be fully inferreed. + PartialTensorShape shape({4, 4, -1}); + TensorShapeProto shape_proto; + shape.AsProto(&shape_proto); + + TF_ASSERT_OK(NodeDefBuilder("test", "PeriodicResample") + .Input({"values", 0, DT_INT32}) + .Attr("shape", shape_proto) + .Finalize(&op.node_def)); + INFER_OK(op, "[2,2,4]", "[4,4,1]"); + // Case 2: output shape can not be inferred - report desired shape. 
+ INFER_OK(op, "[2,2,?]", "[4,4,?]"); +} + +} // end namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py index a25de55e18..31a6fe1d94 100644 --- a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py +++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py @@ -21,8 +21,11 @@ from __future__ import print_function import numpy from tensorflow.contrib.periodic_resample import periodic_resample +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -93,7 +96,6 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): def testPeriodicResampleErrors(self): input_tensor = numpy.zeros(shape=[1, 2, 2, 4]) with self.test_session(): - variables.global_variables_initializer().run() with self.assertRaisesWithPredicateMatch( errors_impl.InvalidArgumentError, 'Dimension 3 input tensor has size 4, desired shape has size 1'): @@ -103,6 +105,29 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase): '4, to be the same as the length of the desired shape, 3'): periodic_resample(input_tensor, [None, 4, 4]).eval() + def testPeriodicResampleGradient(self): + desired_shape = numpy.array([4, 4, None]) + result_shape = (4, 4, 1) + input_shape = (2, 2, 4) + with self.test_session() as sess: + x = array_ops.placeholder(dtypes.float32, shape=input_shape) + output = periodic_resample(x, desired_shape) + error = gradient_checker.compute_gradient_error( + x, input_shape, output, result_shape) + self.assertLess(error, 1e-4) + + def testPeriodicResampleShapeInference(self): + with self.test_session() as sess: + # Case 1: output shape can be fully inferreed. + x = array_ops.placeholder(dtypes.float32, shape=(2, 2, 4)) + output = periodic_resample(x, [4, 4, None]) + self.assertEqual(output.shape, [4, 4, 1]) + # Case 2: output shape can not be inferred - report desired shape. 
+ x = array_ops.placeholder(dtypes.float32, shape=(2, 2, None)) + output = periodic_resample(x, [4, 4, None]) + self.assertTrue(output.shape.is_compatible_with([4, 4, None])) + self.assertEqual(output.shape[2].value, None) + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py index 348623d8f8..470e300ccb 100644 --- a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py +++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py @@ -21,11 +21,17 @@ from __future__ import print_function # pylint: disable=unused-import from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op -from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample +from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample, periodic_resample_op_grad from tensorflow.contrib.util import loader +from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader # pylint: enable=unused-import _periodic_resample_op = loader.load_op_library( resource_loader.get_path_to_datafile('_periodic_resample_op.so')) + +@ops.RegisterGradient("PeriodicResample") +def _periodic_resample_grad_cc(op, grad): + return periodic_resample_op_grad( + grad, op.inputs[0].shape, op.get_attr('shape')) diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor.py b/tensorflow/contrib/predictor/contrib_estimator_predictor.py index b7a98c68e2..af3b2ad1b5 100644 --- a/tensorflow/contrib/predictor/contrib_estimator_predictor.py +++ b/tensorflow/contrib/predictor/contrib_estimator_predictor.py @@ -34,7 +34,8 @@ class ContribEstimatorPredictor(predictor.Predictor): prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None): + graph=None, + config=None): """Initialize a `ContribEstimatorPredictor`. Args: @@ -48,6 +49,7 @@ class ContribEstimatorPredictor(predictor.Predictor): multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. """ self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -58,6 +60,7 @@ class ContribEstimatorPredictor(predictor.Predictor): checkpoint_path = saver.latest_checkpoint(estimator.model_dir) self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( + config=config, checkpoint_filename_with_path=checkpoint_path)) input_alternative_key = ( diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py index d78d94c269..a725072e72 100644 --- a/tensorflow/contrib/predictor/core_estimator_predictor.py +++ b/tensorflow/contrib/predictor/core_estimator_predictor.py @@ -51,7 +51,8 @@ class CoreEstimatorPredictor(predictor.Predictor): estimator, serving_input_receiver_fn, output_key=None, - graph=None): + graph=None, + config=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -62,6 +63,7 @@ class CoreEstimatorPredictor(predictor.Predictor): `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. 
""" self._graph = graph or ops.Graph() with self._graph.as_default(): @@ -71,6 +73,7 @@ class CoreEstimatorPredictor(predictor.Predictor): checkpoint_dir = estimator.model_dir self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( + config=config, checkpoint_dir=checkpoint_dir)) feed_tensor_info = signature_def.inputs diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py index 6e77e934fe..f275bc15ad 100644 --- a/tensorflow/contrib/predictor/predictor_factories.py +++ b/tensorflow/contrib/predictor/predictor_factories.py @@ -30,7 +30,8 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=None, output_alternative_key=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `tf.contrib.learn.Estimator`. Args: @@ -44,6 +45,7 @@ def from_contrib_estimator(estimator, multi-headed models. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -62,13 +64,15 @@ def from_contrib_estimator(estimator, prediction_input_fn, input_alternative_key=input_alternative_key, output_alternative_key=output_alternative_key, - graph=graph) + graph=graph, + config=config) def from_estimator(estimator, serving_input_receiver_fn, output_key=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `tf.python.estimator.Estimator`. Args: @@ -79,6 +83,7 @@ def from_estimator(estimator, `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. @@ -93,14 +98,19 @@ def from_estimator(estimator, 'tf.contrib.learn.Estimator. You likely want to call ' 'from_contrib_estimator.') return core_estimator_predictor.CoreEstimatorPredictor( - estimator, serving_input_receiver_fn, output_key=output_key, graph=graph) + estimator, + serving_input_receiver_fn, + output_key=output_key, + graph=graph, + config=config) def from_saved_model(export_dir, signature_def_key=None, signature_def=None, tags=None, - graph=None): + graph=None, + config=None): """Constructs a `Predictor` from a `SavedModel` on disk. Args: @@ -115,6 +125,7 @@ def from_saved_model(export_dir, `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Returns: An initialized `Predictor`. 
@@ -128,4 +139,5 @@ def from_saved_model(export_dir, signature_def_key=signature_def_key, signature_def=signature_def, tags=tags, - graph=graph) + graph=graph, + config=config) diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py index 578d9424b2..a2ef1dc3af 100644 --- a/tensorflow/contrib/predictor/predictor_factories_test.py +++ b/tensorflow/contrib/predictor/predictor_factories_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.contrib.predictor import predictor_factories from tensorflow.contrib.predictor import testing_common +from tensorflow.core.protobuf import config_pb2 from tensorflow.python.platform import test MODEL_DIR_NAME = 'contrib/predictor/test_export_dir' @@ -41,6 +42,11 @@ class PredictorFactoriesTest(test.TestCase): """Test loading from_saved_model with tags.""" predictor_factories.from_saved_model(self._export_dir, tags='serve') + def testFromSavedModelWithSessionConfig(self): + """Test loading from_saved_model with session config.""" + predictor_factories.from_saved_model( + self._export_dir, config=config_pb2.ConfigProto()) + def testFromSavedModelWithBadTags(self): """Test that loading fails for bad tags.""" bad_tags_regex = ('.*? could not be found in SavedModel') @@ -53,6 +59,13 @@ class PredictorFactoriesTest(test.TestCase): predictor_factories.from_contrib_estimator( estimator, input_fn, output_alternative_key='sum') + def testFromContribEstimatorWithSessionConfig(self): + estimator = testing_common.get_arithmetic_estimator(core=False) + input_fn = testing_common.get_arithmetic_input_fn(core=False) + predictor_factories.from_contrib_estimator( + estimator, input_fn, output_alternative_key='sum', + config=config_pb2.ConfigProto()) + def testFromContribEstimatorWithCoreEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=True) input_fn = testing_common.get_arithmetic_input_fn(core=True) @@ -64,6 +77,12 @@ class PredictorFactoriesTest(test.TestCase): input_fn = testing_common.get_arithmetic_input_fn(core=True) predictor_factories.from_estimator(estimator, input_fn) + def testFromCoreEstimatorWithSessionConfig(self): + estimator = testing_common.get_arithmetic_estimator(core=True) + input_fn = testing_common.get_arithmetic_input_fn(core=True) + predictor_factories.from_estimator( + estimator, input_fn, config=config_pb2.ConfigProto()) + def testFromCoreEstimatorWithContribEstimatorRaises(self): estimator = testing_common.get_arithmetic_estimator(core=False) input_fn = testing_common.get_arithmetic_input_fn(core=False) diff --git a/tensorflow/contrib/predictor/saved_model_predictor.py b/tensorflow/contrib/predictor/saved_model_predictor.py index 0dbca0f813..95da6d04ed 100644 --- a/tensorflow/contrib/predictor/saved_model_predictor.py +++ b/tensorflow/contrib/predictor/saved_model_predictor.py @@ -121,7 +121,8 @@ class SavedModelPredictor(predictor.Predictor): input_names=None, output_names=None, tags=None, - graph=None): + graph=None, + config=None): """Initialize a `CoreEstimatorPredictor`. Args: @@ -142,6 +143,7 @@ class SavedModelPredictor(predictor.Predictor): the correct `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be done. + config: `ConfigProto` proto used to configure the session. Raises: ValueError: If more than one of signature_def_key OR signature_def OR (input_names AND output_names) is specified. 
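For context on why a session config is worth threading through: a serving process often needs to constrain the session that `SavedModelPredictor` creates. The field values below are illustrative:

```python
# Illustrative ConfigProto for an inference session; this proto is what
# SavedModelPredictor now forwards to session.Session(config=...).
from tensorflow.core.protobuf import config_pb2

config = config_pb2.ConfigProto()
config.gpu_options.allow_growth = True    # claim GPU memory on demand
config.intra_op_parallelism_threads = 2   # bound the CPU thread pools
config.inter_op_parallelism_threads = 2
```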
@@ -152,7 +154,7 @@ class SavedModelPredictor(predictor.Predictor): self._graph = graph or ops.Graph() with self._graph.as_default(): - self._session = session.Session() + self._session = session.Session(config=config) loader.load(self._session, tags.split(','), export_dir) if input_names is None: diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md index c83623ec94..27a933c0f9 100644 --- a/tensorflow/contrib/quantize/README.md +++ b/tensorflow/contrib/quantize/README.md @@ -6,7 +6,7 @@ inference. The details of the transformation implemented in this package is described here [1]. This is done using the -[fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization). +[fake quantization op](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization). Literature has shown that fixed point networks provide comparable performance to floating point networks [2]. This is achieved by modeling the quantization diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py index 94fc12ca81..3d0308aaf3 100644 --- a/tensorflow/contrib/slim/python/slim/evaluation_test.py +++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py @@ -26,7 +26,6 @@ import time import numpy as np from tensorflow.contrib.framework.python.ops import variables as variables_lib -from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.contrib.slim.python.slim import evaluation from tensorflow.contrib.training.python.training import evaluation as evaluation_lib from tensorflow.core.protobuf import saver_pb2 @@ -37,6 +36,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics from tensorflow.python.ops import variables from tensorflow.python.platform import flags from tensorflow.python.platform import gfile @@ -89,8 +89,8 @@ class EvaluationTest(test.TestCase): self._predictions, self._scale = TestModel(self._inputs) def testFinalOpsOnEvaluationLoop(self): - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) # Create checkpoint and log directories: @@ -136,9 +136,10 @@ class EvaluationTest(test.TestCase): self.assertTrue(obj.hook_was_run) def _create_names_to_metrics(self, predictions, labels): - accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels) - accuracy1, update_op1 = metric_ops.streaming_accuracy(predictions + 1, - labels) + accuracy0, update_op0 = metrics.accuracy( + labels=labels, predictions=predictions) + accuracy1, update_op1 = metrics.accuracy( + labels=labels, predictions=predictions + 1) names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1} names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1} @@ -198,8 +199,8 @@ class EvaluationTest(test.TestCase): predictions_limited = input.limit_epochs(self._predictions, num_epochs=1) labels_limited = input.limit_epochs(self._labels, num_epochs=1) - value_op, update_op = metric_ops.streaming_accuracy( - predictions_limited, labels_limited) + value_op, update_op = metrics.accuracy( + labels=labels_limited, 
predictions=predictions_limited) init_op = control_flow_ops.group(variables.global_variables_initializer(), variables.local_variables_initializer()) @@ -260,8 +261,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) # Run the evaluation and verify the results: accuracy_value = evaluation.evaluate_once( @@ -276,8 +277,8 @@ class SingleEvaluationTest(test.TestCase): self._prepareCheckpoint(checkpoint_path) # Next, determine the metric to evaluate: - value_op, update_op = metric_ops.streaming_accuracy(self._predictions, - self._labels) + value_op, update_op = metrics.accuracy( + labels=self._labels, predictions=self._predictions) dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir') dumping_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False) diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py index 99ced53e11..d22b80ac88 100644 --- a/tensorflow/contrib/summary/summary.py +++ b/tensorflow/contrib/summary/summary.py @@ -21,6 +21,7 @@ from @{tf.summary.merge_all} to @{tf.summary.FileWriter}. To use with eager execution enabled, write your code as follows: +```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -30,9 +31,11 @@ with summary_writer.as_default(), tf.contrib.summary.always_record_summaries(): tf.contrib.summary.scalar("loss", my_loss) # In this case every call to tf.contrib.summary.scalar will generate a record # ... +``` To use it with graph execution, write your code as follows: +```python global_step = tf.train.get_or_create_global_step() summary_writer = tf.contrib.summary.create_file_writer( train_dir, flush_millis=10000) @@ -53,7 +56,7 @@ with tf.Session(...) as sess: while not_done_training: sess.run([train_op, tf.contrib.summary.all_summary_ops()]) # ... 
- +``` """ from __future__ import absolute_import diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py index e893e1d1c8..d8236a0a6f 100644 --- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py +++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py @@ -21,10 +21,10 @@ import numpy as np from tensorflow.contrib import losses from tensorflow.contrib.learn.python.learn.estimators import prediction_key -from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics from tensorflow.python.ops import nn INFERENCE_PROB_NAME = prediction_key.PredictionKey.PROBABILITIES @@ -38,12 +38,13 @@ def _top_k_generator(k): targets = math_ops.to_int32(targets) if targets.get_shape().ndims > 1: targets = array_ops.squeeze(targets, axis=[1]) - return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k)) + return metrics.mean(nn.in_top_k(probabilities, targets, k)) return _top_k def _accuracy(predictions, targets, weights=None): - return metric_ops.streaming_accuracy(predictions, targets, weights=weights) + return metrics.accuracy( + labels=targets, predictions=predictions, weights=weights) def _r2(probabilities, targets, weights=None): @@ -53,7 +54,7 @@ def _r2(probabilities, targets, weights=None): squares_residuals = math_ops.reduce_sum( math_ops.square(targets - probabilities), 0) score = 1 - math_ops.reduce_sum(squares_residuals / squares_total) - return metric_ops.streaming_mean(score, weights=weights) + return metrics.mean(score, weights=weights) def _squeeze_and_onehot(targets, depth): @@ -62,7 +63,7 @@ def _squeeze_and_onehot(targets, depth): def _sigmoid_entropy(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.sigmoid_cross_entropy(probabilities, _squeeze_and_onehot( targets, @@ -71,7 +72,7 @@ def _sigmoid_entropy(probabilities, targets, weights=None): def _softmax_entropy(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.sparse_softmax_cross_entropy(probabilities, math_ops.to_int32(targets)), weights=weights) @@ -82,7 +83,7 @@ def _predictions(predictions, unused_targets, **unused_kwargs): def _class_log_loss(probabilities, targets, weights=None): - return metric_ops.streaming_mean( + return metrics.mean( losses.log_loss(probabilities, _squeeze_and_onehot(targets, array_ops.shape(probabilities)[1])), @@ -90,34 +91,36 @@ def _class_log_loss(probabilities, targets, weights=None): def _precision(predictions, targets, weights=None): - return metric_ops.streaming_precision(predictions, targets, weights=weights) + return metrics.precision( + labels=targets, predictions=predictions, weights=weights) def _precision_at_thresholds(predictions, targets, weights=None): - return metric_ops.streaming_precision_at_thresholds( - array_ops.slice(predictions, [0, 1], [-1, 1]), - targets, - np.arange( - 0, 1, 0.01, dtype=np.float32), + return metrics.precision_at_thresholds( + labels=targets, + predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), + thresholds=np.arange(0, 1, 0.01, dtype=np.float32), weights=weights) def _recall(predictions, targets, weights=None): - return metric_ops.streaming_recall(predictions, targets, weights=weights) + return metrics.recall( + labels=targets, predictions=predictions, weights=weights) def _recall_at_thresholds(predictions, targets, 
weights=None): - return metric_ops.streaming_recall_at_thresholds( - array_ops.slice(predictions, [0, 1], [-1, 1]), - targets, - np.arange( - 0, 1, 0.01, dtype=np.float32), + return metrics.recall_at_thresholds( + labels=targets, + predictions=array_ops.slice(predictions, [0, 1], [-1, 1]), + thresholds=np.arange(0, 1, 0.01, dtype=np.float32), weights=weights) def _auc(probs, targets, weights=None): - return metric_ops.streaming_auc(array_ops.slice(probs, [0, 1], [-1, 1]), - targets, weights=weights) + return metrics.auc( + labels=targets, + predictions=array_ops.slice(probs, [0, 1], [-1, 1]), + weights=weights) _EVAL_METRICS = { diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index 7a35a70bbe..6f62cd11a9 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -295,7 +295,7 @@ def get_epoch_variable(): # A simple container to hold the training variables for a single tree. -class TreeTrainingVariables(object): +class TreeVariables(object): """Stores tf.Variables for training a single random tree. Uses tf.get_variable to get tree-specific names so that this can be used @@ -303,7 +303,7 @@ class TreeTrainingVariables(object): then relies on restoring that model to evaluate). """ - def __init__(self, params, tree_num, training): + def __init__(self, params, tree_num, training, tree_config='', tree_stat=''): if (not hasattr(params, 'params_proto') or not isinstance(params.params_proto, _params_proto.TensorForestParams)): @@ -315,27 +315,28 @@ class TreeTrainingVariables(object): # TODO(gilberth): Manually shard this to be able to fit it on # multiple machines. self.stats = stats_ops.fertile_stats_variable( - params, '', self.get_tree_name('stats', tree_num)) + params, tree_stat, self.get_tree_name('stats', tree_num)) self.tree = model_ops.tree_variable( - params, '', self.stats, self.get_tree_name('tree', tree_num)) + params, tree_config, self.stats, self.get_tree_name('tree', tree_num)) def get_tree_name(self, name, num): return '{0}-{1}'.format(name, num) -class ForestTrainingVariables(object): +class ForestVariables(object): """A container for a forests training data, consisting of multiple trees. - Instantiates a TreeTrainingVariables object for each tree. We override the + Instantiates a TreeVariables object for each tree. We override the __getitem__ and __setitem__ function so that usage looks like this: - forest_variables = ForestTrainingVariables(params) + forest_variables = ForestVariables(params) ... forest_variables.tree ... """ def __init__(self, params, device_assigner, training=True, - tree_variables_class=TreeTrainingVariables): + tree_variables_class=TreeVariables, + tree_configs=None, tree_stats=None): self.variables = [] # Set up some scalar variables to run through the device assigner, then # we can use those to colocate everything related to a tree. 
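The effect of the new `tree_configs`/`tree_stats` arguments is that a forest can be constructed directly from serialized tree protos, as the test added below exercises. A condensed sketch (the single-leaf tree proto is illustrative; a trained export would also contain split nodes):

```python
# Sketch: seed RandomForestGraphs with a serialized tree so inference
# can run from a restored model.
from google.protobuf.json_format import ParseDict
from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2
from tensorflow.contrib.tensor_forest.python import tensor_forest

tree_dict = {'decisionTree': {'nodes': [
    {'leaf': {'vector': {'value': [{'floatValue': 0.0},
                                   {'floatValue': 1.0}]}}}]}}
tree_config = ParseDict(
    tree_dict, generic_tree_model_pb2.Model()).SerializeToString()

hparams = tensor_forest.ForestHParams(
    num_classes=2, num_features=2, num_trees=1,
    max_nodes=1000, split_after_samples=25).fill()
graph_builder = tensor_forest.RandomForestGraphs(
    hparams, tree_configs=[tree_config])
probs, paths, var = graph_builder.inference_graph([[-1., 0.], [1., 0.]])
```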
@@ -347,7 +348,13 @@ class ForestTrainingVariables(object): for i in range(params.num_trees): with ops.device(self.device_dummies[i].device): - self.variables.append(tree_variables_class(params, i, training)) + kwargs = {} + if tree_configs is not None: + kwargs.update(dict(tree_config=tree_configs[i])) + if tree_stats is not None: + kwargs.update(dict(tree_stat=tree_stats[i])) + self.variables.append(tree_variables_class( + params, i, training, **kwargs)) def __setitem__(self, t, val): self.variables[t] = val @@ -361,9 +368,11 @@ class RandomForestGraphs(object): def __init__(self, params, + tree_configs=None, + tree_stats=None, device_assigner=None, variables=None, - tree_variables_class=TreeTrainingVariables, + tree_variables_class=TreeVariables, tree_graphs=None, training=True): self.params = params @@ -371,9 +380,10 @@ class RandomForestGraphs(object): device_assigner or framework_variables.VariableDeviceChooser()) logging.info('Constructing forest with params = ') logging.info(self.params.__dict__) - self.variables = variables or ForestTrainingVariables( + self.variables = variables or ForestVariables( self.params, device_assigner=self.device_assigner, training=training, - tree_variables_class=tree_variables_class) + tree_variables_class=tree_variables_class, + tree_configs=tree_configs, tree_stats=tree_stats) tree_graph_class = tree_graphs or RandomTreeGraphs self.trees = [ tree_graph_class(self.variables[i], self.params, i) diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py index bbe627b157..1c9c81827e 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py @@ -18,10 +18,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from google.protobuf.json_format import ParseDict +from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2 as _tree_proto from tensorflow.contrib.tensor_forest.python import tensor_forest from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util +from tensorflow.python.ops import resources +from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -110,6 +114,47 @@ class TensorForestTest(test_util.TensorFlowTestCase): self.assertTrue(isinstance(paths, ops.Tensor)) self.assertTrue(isinstance(var, ops.Tensor)) + def testInferenceFromRestoredModel(self): + input_data = [[-1., 0.], [-1., 2.], # node 1 + [1., 0.], [1., -2.]] # node 2 + expected_prediction = [[0.0, 1.0], [0.0, 1.0], + [0.0, 1.0], [0.0, 1.0]] + hparams = tensor_forest.ForestHParams( + num_classes=2, + num_features=2, + num_trees=1, + max_nodes=1000, + split_after_samples=25).fill() + tree_weight = {'decisionTree': + {'nodes': + [{'binaryNode': + {'rightChildId': 2, + 'leftChildId': 1, + 'inequalityLeftChildTest': + {'featureId': {'id': '0'}, + 'threshold': {'floatValue': 0}}}}, + {'leaf': {'vector': + {'value': [{'floatValue': 0.0}, + {'floatValue': 1.0}]}}, + 'nodeId': 1}, + {'leaf': {'vector': + {'value': [{'floatValue': 0.0}, + {'floatValue': 1.0}]}}, + 'nodeId': 2}]}} + restored_tree_param = ParseDict(tree_weight, + _tree_proto.Model()).SerializeToString() + graph_builder = tensor_forest.RandomForestGraphs(hparams, + [restored_tree_param]) + probs, paths, var = graph_builder.inference_graph(input_data) +
self.assertTrue(isinstance(probs, ops.Tensor)) + self.assertTrue(isinstance(paths, ops.Tensor)) + self.assertTrue(isinstance(var, ops.Tensor)) + with self.test_session(): + variables.global_variables_initializer().run() + resources.initialize_resources(resources.shared_resources()).run() + self.assertEqual(probs.eval().shape, (4, 2)) + self.assertEqual(probs.eval().tolist(), expected_prediction) + def testTrainingConstructionClassificationSparse(self): input_data = sparse_tensor.SparseTensor( indices=[[0, 0], [0, 3], [1, 0], [1, 7], [2, 1], [3, 9]], diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index b7b26cfb1c..da4dd5a14c 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -91,8 +91,11 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph, if (!subgraph_node_ids.count(edge->src()->id()) && !edge->src()->IsSource() && !edge->IsControlEdge()) { incoming_edges->insert(edge); + VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() + << " Y, "; } else { - VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, "; + VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() + << " N, "; } } } @@ -106,10 +109,12 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph, for (const tensorflow::Edge* edge : node->out_edges()) { if (!subgraph_node_ids.count(edge->dst()->id()) && !edge->dst()->IsSink() && !edge->IsControlEdge()) { - VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, "; + VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() + << " Y, "; outgoing_edges->insert(edge); } else { - VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, "; + VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() + << " N, "; } } } @@ -181,29 +186,27 @@ struct ConvertGraphParams { static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) { GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids, &p->subgraph_incoming_edges); + + std::set<std::pair<int, int>> unique_tensors; + // Add only unique input source nodes.
If output of an outside node is shared + // between multiple nodes inside the engine, only one edge should be created. for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) { - p->subgraph_inputs.push_back({edge->src()->id(), edge->src_output()}); - } - auto output_name_to_index_map = BuildTensorNameMap(p->output_names); - std::set<std::pair<int, int>> subgraph_outputs_set; - // Collect outputs referenced from output_names - for (int node_id : p->subgraph_node_ids) { - tensorflow::Node* node = p->graph.FindNodeId(node_id); - if (output_name_to_index_map.count(node->name())) { - for (int index : output_name_to_index_map.at(node->name())) { - subgraph_outputs_set.insert({node_id, index}); - } - } + unique_tensors.insert({edge->src()->id(), edge->src_output()}); } + p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(), + unique_tensors.end()); GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids, &p->subgraph_outgoing_edges); + unique_tensors.clear(); + // Similar to above: if multiple outside nodes are sharing the output of an + // internal node, only one output port should be created and shared between + // outputs. for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) { - subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()}); + unique_tensors.insert({edge->src()->id(), edge->src_output()}); } - p->subgraph_outputs.reserve(subgraph_outputs_set.size()); + p->subgraph_outputs.reserve(unique_tensors.size()); p->subgraph_outputs.insert(p->subgraph_outputs.begin(), - subgraph_outputs_set.begin(), - subgraph_outputs_set.end()); + unique_tensors.begin(), unique_tensors.end()); return tensorflow::Status::OK(); } @@ -225,7 +228,6 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) { for (auto in_edge : params->subgraph_incoming_edges) { // loop over incoming edges and // attach them to calib node - // tensorflow::Node* src_node = in_edge->src(); auto src_output = in_edge->src_output(); auto dst_node = in_edge->dst(); auto dst_input = in_edge->dst_input(); @@ -257,19 +259,24 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) { for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) { subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i}); } + std::set<std::pair<int, int>> unique_tensors; for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) { std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()}; + if (unique_tensors.count(old_src)) continue; + unique_tensors.insert(old_src); int new_src_output = subgraph_edge_to_input_map.at(old_src); params->graph.AddEdge(edge->src(), edge->src_output(), trt_node, new_src_output); + VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output() + << " -> " << trt_node->name() << ":" << new_src_output; params->graph.RemoveEdge(edge); } - - VLOG(2) << "new wiring edges: " << trt_node->in_edges().size(); - for (const tensorflow::Edge* edge : trt_node->in_edges()) { - VLOG(2) << edge->src()->name() << " port: " << edge->src_output(); + if (VLOG_IS_ON(2)) { + VLOG(2) << "new edge count: " << trt_node->in_edges().size(); + for (const tensorflow::Edge* edge : trt_node->in_edges()) { + VLOG(2) << edge->src()->name() << " port: " << edge->src_output(); + } } - TF_RETURN_IF_ERROR(status); // Re-map outgoing edges to use the new TRT node instead of the orig subgraph @@ -283,6 +290,8 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) { int new_src_output = subgraph_edge_to_output_map.at(old_src); TF_RETURN_IF_ERROR(params->graph.UpdateEdge( trt_node,
new_src_output, edge->dst(), edge->dst_input())); + VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> " + << edge->dst()->name() << ":" << edge->dst_input(); } // Remove the original subgraph for (int node_id : params->subgraph_node_ids) { @@ -317,9 +326,12 @@ tensorflow::Status ConvertCalibGraphToInferGraph( tensorflow::GraphConstructorOptions(), graph_def, &graph)); // get calib nodes std::vector calib_nodes; - for (auto node : graph.op_nodes()) { + std::vector topo_order; + tensorflow::GetPostOrder(graph, &topo_order); + for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) { + auto node = *rit; if (node->type_string() == "TRTCalibOp") { - VLOG(1) << "Found Calib Node"; + VLOG(1) << "Found Calib Node " << node->name(); calib_nodes.push_back(node); } } diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 96e0700862..4e4d295538 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -362,10 +362,11 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights, break; } case tensorflow::DataType::DT_HALF: { - Reorder2({k, c}, static_cast(iweights.GetValues()), - istrides, static_cast( - const_cast(oweights->GetValues())), - ostrides); + Reorder2( + {k, c}, static_cast(iweights.GetValues()), + istrides, + static_cast(const_cast(oweights->GetValues())), + ostrides); break; } default: @@ -1179,9 +1180,9 @@ tensorflow::Status BinaryTensorOpTensor( CHECK_EQ_TYPE(tensor_r->getType(), dtype); auto op_pair = ops.find(node_def.op()); if (op_pair == ops.end()) - return tensorflow::errors::Unimplemented("binary op: " + node_def.op() + - " not supported at: " + - node_def.name()); + return tensorflow::errors::Unimplemented( + "binary op: " + node_def.op() + + " not supported at: " + node_def.name()); nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise( *const_cast(tensor_l), @@ -2138,9 +2139,7 @@ void Converter::register_op_converters() { } } // namespace -tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) { - return tensorflow::errors::Unimplemented("Not implemented yet"); -} + tensorflow::Status ConvertCalibrationNodeToEngineNode( tensorflow::Graph& graph, tensorflow::Node* c_node) { const auto ndef = c_node->def(); @@ -2164,9 +2163,23 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( for (auto n : graph.op_nodes()) { node_maps.insert({n->name(), n}); } + std::set subgraph_ids; + for (const auto internal_node : segment_nodes) { + subgraph_ids.insert(node_maps.at(internal_node)->id()); + } + if (VLOG_IS_ON(2)) { + string node_names = StrCat(c_node->name(), " segment nodes= "); + + for (const auto& node_name : segment_nodes) { + StrAppend(&node_names, node_name, ", "); + } + VLOG(2) << node_names; + } + VLOG(1) << "Output Nodes:"; std::vector out_types; std::vector out_edges; + for (auto& i : output_nodes) { auto node_port = tensorflow::str_util::Split(i, ":"); VLOG(1) << " " << i << " in graph " << node_maps.count(i); @@ -2186,18 +2199,24 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( out_types.push_back(out_node->output_type(0)); } for (auto out_edge : out_node->out_edges()) { + if (subgraph_ids.count(out_edge->dst()->id())) + continue; // skip internal edges; if (out_edge->src_output() == port) { out_edges.push_back(out_edge); - break; + VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":" + << out_edge->src_output() << " -> " << out_edge->dst()->name() + << 
":" << out_edge->dst_input(); } } } else { LOG(WARNING) << " couldn't find output node " << out_node_name; } } - VLOG(1) << "Input Nodes:"; - for (auto& i : input_names) { - VLOG(1) << " " << i << " in graph " << node_maps.count(i); + if (VLOG_IS_ON(1)) { + VLOG(1) << c_node->name() << " Input Nodes:"; + for (auto& i : input_names) { + VLOG(1) << " Input " << i << " in graph " << node_maps.count(i); + } } auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance(); auto resmgr = trt_rm->getManager("TRTCalibOps"); @@ -2231,14 +2250,24 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( calib_res->builder_ = nullptr; tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp"); std::vector income_edges; + income_edges.resize(c_node->num_inputs()); for (const auto in_edge : c_node->in_edges()) { auto src = in_edge->src(); int dest_port = in_edge->dst_input(); - income_edges.emplace_back(src->name(), in_edge->src_output(), - c_node->input_type(dest_port)); + VLOG(1) << "Incoming connection " << src->name() << ":" + << in_edge->src_output() << " -> " << c_node->name() << ":" + << dest_port; + income_edges.at(dest_port) = {src->name(), in_edge->src_output(), + c_node->input_type(dest_port)}; } tensorflow::gtl::ArraySlice input_list( income_edges); + if (VLOG_IS_ON(2)) { + for (const auto& inp : input_list) { + VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " " + << tensorflow::DataTypeString(inp.data_type); + } + } op_builder.Input(input_list); tensorflow::NodeDef engine_node; const char* engine_plan_data = static_cast(engine_plan->data()); @@ -2255,13 +2284,26 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode( } auto trt_engine_node = graph.AddNode(engine_node, &status); TF_RETURN_IF_ERROR(status); - for (size_t i = 0; i < out_edges.size(); i++) { - VLOG(1) << "Connecting trt_engine_node output " << i << " with " - << out_edges.at(i)->dst()->name() << " port " - << out_edges.at(i)->dst_input(); - TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i, - out_edges.at(i)->dst(), - out_edges.at(i)->dst_input())); + std::map port_map; + for (size_t t = 0; t < output_nodes.size(); t++) { + port_map.insert({output_nodes.at(t), t}); + } + for (auto& i : out_edges) { + string s(i->src()->name()); + if (i->src_output()) StrAppend(&s, ":", i->src_output()); + int out_port = port_map.at(s); + VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port + << " -> " << i->dst()->name() << ":" << i->dst_input(); + TF_RETURN_IF_ERROR( + graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input())); + } + for (const auto ed : trt_engine_node->in_edges()) { + VLOG(1) << "In Edge " << ed->src()->name() << ":" << ed->src_output() + << " -> " << ed->dst()->name() << ":" << ed->dst_input(); + } + for (const auto ed : trt_engine_node->out_edges()) { + VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output() + << " -> " << ed->dst()->name() << ":" << ed->dst_input(); } VLOG(1) << "Segment nodes:"; for (auto& i : segment_nodes) { @@ -2332,6 +2374,7 @@ tensorflow::Status ConvertSubgraph( std::vector* output_names, std::vector* output_dtypes, const string& engine_name) { + std::set added_tensors; for (const std::pair& input : s.input_inds) { VLOG(2) << "parsing input. 
Node id= " << input.first; int node_id = input.first; @@ -2374,7 +2417,6 @@ tensorflow::Status ConvertSubgraph( auto op_info = op_info_vec.at(shape_inference_output_idx); tensorflow::DataType tf_dtype = op_info.dtype(); - input_dtypes->push_back(tf_dtype); nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT); auto type_status = ConvertDType(tf_dtype, &dtype); @@ -2410,8 +2452,10 @@ tensorflow::Status ConvertSubgraph( if (output_idx != 0) { input_tensor_name = StrCat(node_name, ":", output_idx); } - + if (added_tensors.count(input_tensor_name)) continue; + added_tensors.insert(input_tensor_name); input_names->push_back(input_tensor_name); + input_dtypes->push_back(tf_dtype); nvinfer1::ITensor* input_tensor = converter.network()->addInput( input_tensor_name.c_str(), dtype, input_dim_pseudo_chw); @@ -2435,6 +2479,7 @@ tensorflow::Status ConvertSubgraph( // Gather output metadata int trt_engine_op_output_idx = 0; + added_tensors.clear(); for (const std::pair& output : s.output_inds) { int node_id = output.first; int output_idx = output.second; @@ -2451,6 +2496,8 @@ tensorflow::Status ConvertSubgraph( if (output_idx != 0) tensorflow::strings::StrAppend(&tensor_name, ":", output_idx); VLOG(2) << "Output tensor name: " << tensor_name; + if (added_tensors.count(tensor_name)) continue; + added_tensors.insert(tensor_name); output_names->push_back(tensor_name); auto tensor_or_weights = converter.get_tensor(tensor_name); if (!tensor_or_weights.is_tensor()) { diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py index 2e472a2805..d879170b68 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets.py @@ -166,11 +166,21 @@ def StreamingFilesDataset(files, return remote_iterator.get_next() def MapFn(unused_input): - return functional_ops.remote_call( + if isinstance(source_dataset.output_types, dtypes.DType): + output_types = [source_dataset.output_types] + elif isinstance(source_dataset.output_types, (list, tuple)): + output_types = source_dataset.output_types + else: + raise ValueError('source dataset has invalid output types') + remote_calls = functional_ops.remote_call( args=[source_handle], - Tout=[dtypes.string], + Tout=output_types, f=LoadingFunc, - target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0] + target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job) + if len(remote_calls) == 1: + return remote_calls[0] + else: + return remote_calls with ops.device('/job:%s' % worker_job): output_dataset = dataset_ops.Dataset.range(2).repeat().map( diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py index 918cf0ed8e..b58d05eac5 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py @@ -26,6 +26,8 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_shape from tensorflow.python.lib.io import python_io from tensorflow.python.platform import test from tensorflow.python.training import server_lib @@ -162,6 +164,30 @@ class DatasetsTest(test.TestCase): self.assertEqual(set(all_contents), set(retrieved_values)) + def testArbitraryReaderFuncFromDatasetGenerator(self): + + def my_generator(): + yield (1, [1] * 10) + + def 
gen_dataset(dummy): + return dataset_ops.Dataset.from_generator( + my_generator, (dtypes.int64, dtypes.int64), + (tensor_shape.TensorShape([]), tensor_shape.TensorShape([10]))) + + dataset = datasets.StreamingFilesDataset( + dataset_ops.Dataset.range(10), filetype=gen_dataset) + + iterator = dataset.make_initializable_iterator() + self._sess.run(iterator.initializer) + get_next = iterator.get_next() + + retrieved_values = self._sess.run(get_next) + + self.assertIsInstance(retrieved_values, (list, tuple)) + self.assertEqual(len(retrieved_values), 2) + self.assertEqual(retrieved_values[0], 1) + self.assertItemsEqual(retrieved_values[1], [1] * 10) + def testUnexpectedFiletypeString(self): with self.assertRaises(ValueError): datasets.StreamingFilesDataset( diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index c72ba2daff..a0cf59852b 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -700,7 +700,9 @@ cc_library( srcs = ["platform/stacktrace_handler.cc"], hdrs = ["platform/stacktrace_handler.h"], deps = [ + ":abi", ":lib_platform", + ":stacktrace", ], ) @@ -3090,6 +3092,8 @@ cc_library( # we now need at least "str_util". ":lib", ":lib_platform", + ":stacktrace_handler", + ":test_lite", "//tensorflow/core/platform/default/build_config:test_lite_main", ], alwayslink = 1, ) @@ -3570,7 +3574,10 @@ tf_cc_tests_gpu( tf_cc_test_mkl( name = "mkl_runtime_tests", size = "small", - srcs = ["common_runtime/mkl_cpu_allocator_test.cc"], + srcs = [ + "common_runtime/mkl_cpu_allocator_test.cc", + "common_runtime/mkl_threadpool_device_test.cc", + ], linkstatic = 1, deps = [ ":core", diff --git a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt index cbe76de415..985f09312f 100644 --- a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt @@ -4,6 +4,10 @@ op { description: < 0`, limit of the split of the result. +END + } + summary: "Split elements of `source` based on `sep` into a `SparseTensor`." + description: <<END +If `sep` is given, consecutive delimiters are not grouped together and are +deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and +sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty +string, runs of consecutive whitespace are regarded as a single separator, +and the result will contain no empty strings at the start or end if the +string has leading or trailing whitespace. + +Note that the above-mentioned behavior matches Python's str.split. +END +} diff --git a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt new file mode 100644 index 0000000000..0e8576fb01 --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "StringSplitV2" + visibility: HIDDEN +} diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 8f2a419756..9cda17867b 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -86,7 +86,7 @@ BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) { return &(chunks_[h]); } -bool BFCAllocator::Extend(size_t rounded_bytes) { +bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { size_t available_bytes = memory_limit_ - total_region_allocated_bytes_; // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
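The flooring in the statement that follows is plain integer arithmetic: dividing and re-multiplying truncates available_bytes down to a multiple of the minimum allocation size. A minimal standalone sketch (assuming kMinAllocationSize is 256, its value in bfc_allocator.h; the constant is restated here only for illustration):

#include <cstddef>
#include <iostream>

constexpr std::size_t kMinAllocationSize = 256;  // assumed: 1 << kMinAllocationBits

// Rounds bytes down to the nearest multiple of kMinAllocationSize.
std::size_t FloorToMinAllocation(std::size_t bytes) {
  return (bytes / kMinAllocationSize) * kMinAllocationSize;
}

int main() {
  std::cout << FloorToMinAllocation(1000) << "\n";  // 768
  std::cout << FloorToMinAllocation(512) << "\n";   // 512, already a multiple
  return 0;
}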
available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize; @@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t rounded_bytes) { // Try allocating. size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes); - void* mem_addr = suballocator_->Alloc(32, bytes); + void* mem_addr = suballocator_->Alloc(alignment, bytes); if (mem_addr == nullptr && !started_backpedal_) { // Only backpedal once. started_backpedal_ = true; @@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t rounded_bytes) { while (mem_addr == nullptr) { bytes = RoundedBytes(bytes * kBackpedalFactor); if (bytes < rounded_bytes) break; - mem_addr = suballocator_->Alloc(32, bytes); + mem_addr = suballocator_->Alloc(alignment, bytes); } } @@ -261,7 +261,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, } // Try to extend - if (Extend(rounded_bytes)) { + if (Extend(unused_alignment, rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes); if (ptr != nullptr) { return ptr; diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index ba5a3eea3a..52aedb1e9c 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -305,7 +305,8 @@ class BFCAllocator : public VisitableAllocator { // Try to add a new memory region that can satisfy an allocation of // 'rounded_bytes' bytes. Returns true on success and false on // failure. - bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); + bool Extend(size_t alignment, size_t rounded_bytes) + EXCLUSIVE_LOCKS_REQUIRED(lock_); // Returns a pointer to an underlying allocated chunk of size // 'rounded_bytes'. diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index 6e08e33f8e..486f0be698 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -105,9 +105,25 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) { EXPECT_EQ(2, shape.dim(0).size()); EXPECT_EQ(1, shape.dim(1).size()); if (node->name() == y->name()) { +#ifdef INTEL_MKL + // If MKL is used, the graph goes through several additional + // graph rewrite passes. In TF, every time a graph pass + // happens, "constant" nodes are allocated + // and deallocated. Each allocation calls + // FindChunkPtr of BFCAllocator, + // which increments the value of AllocationId. + // Thus the AllocationIds are larger when MKL is used; + // currently they are 19 and 20 instead of 21 and 22. + EXPECT_EQ(19, cm->AllocationId(node, 0)); +#else EXPECT_EQ(21, cm->AllocationId(node, 0)); +#endif } else { +#ifdef INTEL_MKL + EXPECT_EQ(20, cm->AllocationId(node, 0)); +#else EXPECT_EQ(22, cm->AllocationId(node, 0)); +#endif } } EXPECT_LE(0, cm->MaxExecutionTime(node)); diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc new file mode 100644 index 0000000000..5d583a8360 --- /dev/null +++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc @@ -0,0 +1,53 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef INTEL_MKL + +#include "tensorflow/core/common_runtime/threadpool_device.h" + +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +#ifdef _OPENMP +TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) { + SessionOptions options; + unsetenv("OMP_NUM_THREADS"); + + ThreadPoolDevice* tp = new ThreadPoolDevice( + options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); + + const int ht = port::NumHyperthreadsPerCore(); + EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht); +} + +TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) { + SessionOptions options; + setenv("OMP_NUM_THREADS", "314", 1); + + ThreadPoolDevice* tp = new ThreadPoolDevice( + options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); + + EXPECT_EQ(omp_get_max_threads(), 314); +} +#endif // _OPENMP + +} // namespace tensorflow + +#endif // INTEL_MKL diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc index 21912236d0..a5d31b75c7 100644 --- a/tensorflow/core/common_runtime/process_util.cc +++ b/tensorflow/core/common_runtime/process_util.cc @@ -16,8 +16,10 @@ limitations under the License. #include "tensorflow/core/common_runtime/process_util.h" #ifdef INTEL_MKL +#ifdef _OPENMP #include -#endif +#endif // _OPENMP +#endif // INTEL_MKL #include #include "tensorflow/core/lib/core/threadpool.h" @@ -57,7 +59,10 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { // MKL library executes ops in parallel using OMP threads // Set inter_op conservatively to avoid thread oversubscription that could // lead to severe perf degradations and OMP resource exhaustion - const int mkl_intra_op = omp_get_max_threads(); + int mkl_intra_op = 1; +#ifdef _OPENMP + mkl_intra_op = omp_get_max_threads(); +#endif // _OPENMP CHECK_GE(mkl_intra_op, 1); const int32 mkl_inter_op = std::max( (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2); @@ -68,7 +73,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { #else // Default to using the number of cores available in the process. return port::NumSchedulableCPUs(); -#endif +#endif // INTEL_MKL } thread::ThreadPool* NewThreadPoolFromSessionOptions( diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index f7a07fe503..74a87215e1 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -31,7 +31,11 @@ limitations under the License. 
#include "tensorflow/core/public/session_options.h" #ifdef INTEL_MKL +#ifdef _OPENMP +#include +#endif #include "tensorflow/core/common_runtime/mkl_cpu_allocator.h" +#include "tensorflow/core/platform/cpu_info.h" #endif namespace tensorflow { @@ -43,7 +47,26 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, : LocalDevice(options, Device::BuildDeviceAttributes( name, DEVICE_CPU, memory_limit, locality)), allocator_(allocator), - scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {} + scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) { +#ifdef INTEL_MKL +#ifdef _OPENMP + const char* user_omp_threads = getenv("OMP_NUM_THREADS"); + if (user_omp_threads == nullptr) { + // OMP_NUM_THREADS controls MKL's intra-op parallelization + // Default to available physical cores + const int mkl_intra_op = port::NumSchedulableCPUs(); + const int ht = port::NumHyperthreadsPerCore(); + omp_set_num_threads((mkl_intra_op + ht - 1) / ht); + } else { + uint64 user_val = 0; + if (strings::safe_strtou64(user_omp_threads, &user_val)) { + // Superflous but triggers OpenMP loading + omp_set_num_threads(user_val); + } + } +#endif // _OPENMP +#endif // INTEL_MKL +} ThreadPoolDevice::~ThreadPoolDevice() {} diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc index 1cea1b1462..770a0fcf14 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc @@ -147,7 +147,9 @@ MasterService::Stub::Stub( } MasterService::AsyncService::AsyncService() { - for (int i = 0; i < 10; ++i) { + int method_len = sizeof(grpcMasterService_method_names) / + sizeof(grpcMasterService_method_names[0]); + for (int i = 0; i < method_len; ++i) { AddMethod(new ::grpc::internal::RpcServiceMethod( grpcMasterService_method_names[i], ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc index 89f83f9f24..a8508d2d4f 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/rpc/grpc_session.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { @@ -50,9 +51,14 @@ Status TestCluster::MakeTestCluster(const SessionOptions& options, int n, } for (int i = 0; i < n; ++i) { + string server_file = + strings::StrCat(testing::TensorFlowSrcRoot(), + "/core/distributed_runtime/rpc/grpc_testlib_server"); + if (!options.env->FileExists(server_file).ok()) { + return errors::Internal("Could not find grpc_testlib_server"); + } const std::vector argv( - {strings::StrCat(testing::TensorFlowSrcRoot(), - "/core/distributed_runtime/rpc/grpc_testlib_server"), + {server_file, /* see grpc_testlib_server.cc for flags */ tf_jobs, "--tf_job=localhost", strings::StrCat("--tf_task=", i), strings::StrCat("--num_cpus=", num_cpus), diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h index 2c87156dca..2bb4d32d57 100644 --- a/tensorflow/core/framework/allocator.h +++ b/tensorflow/core/framework/allocator.h @@ -67,13 +67,8 @@ struct AllocatorStats { // device memory. 
class Allocator { public: -#ifdef EIGEN_VECTORIZE_AVX512 // Align to 64 byte boundary. static constexpr size_t kAllocatorAlignment = 64; -#else - // Align to 32 byte boundary. - static constexpr size_t kAllocatorAlignment = 32; -#endif virtual ~Allocator(); diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc index 3d7920a6e2..4b56d807df 100644 --- a/tensorflow/core/framework/op_gen_lib.cc +++ b/tensorflow/core/framework/op_gen_lib.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/framework/op_gen_lib.h" +#include #include #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/lib/core/errors.h" diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto index eb689ec1e6..10072724d2 100644 --- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto +++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto @@ -5,7 +5,7 @@ option cc_enable_arenas = true; option java_outer_classname = "RemoteFusedGraphExecuteInfoProto"; option java_multiple_files = true; option java_package = "org.tensorflow.framework"; -//add go_package externally +option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework"; import "tensorflow/core/framework/graph.proto"; import "tensorflow/core/framework/tensor_shape.proto"; import "tensorflow/core/framework/types.proto"; diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc index b613effd18..80e168df97 100644 --- a/tensorflow/core/framework/tensor_test.cc +++ b/tensorflow/core/framework/tensor_test.cc @@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) { // On the alignment. // -// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte +// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte // alignment. Tensor::tensor/flat/vec/matrix methods requires the // buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually, -// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure -// its result is aligned if the caller intends to use those methods. -// In this test case, we simply make sure each slice is 32-byte -// aligned: sizeof(float) * 4 * 2 = 32. +// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires +// the caller to ensure its result is aligned if the caller intends +// to use those methods. In this test case, we simply make sure each +// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576. 576 % 64 = 0. TEST(Tensor, Slice_Basic) { Tensor saved; { // General - Tensor x(DT_FLOAT, TensorShape({10, 4, 34})); + Tensor x(DT_FLOAT, TensorShape({10, 4, 36})); // Fills in known values. for (int i = 0; i < 10; ++i) { x.Slice(i, i + 1).flat().setConstant(i * 1.f); } // A simple slice along dim0. 
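The 34 -> 36 dimension change running through this test is exactly the alignment arithmetic described above: each dim0 slice of a {10, 4, 36} float tensor spans 4 * 36 floats, so consecutive slices start 576 bytes apart, and 576 is a multiple of the 64-byte allocator alignment; the old inner size of 34 gave a 544-byte stride, which 64 does not divide. As a compile-time check:

static_assert(sizeof(float) * 4 * 36 == 576, "bytes per dim0 slice");
static_assert(576 % 64 == 0, "new stride is 64-byte aligned");
static_assert((sizeof(float) * 4 * 34) % 64 != 0, "old stride was not");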
Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36}))); auto tx = x.tensor<float, 3>(); auto ty = y.tensor<float, 3>(); for (int i = 0; i < 4; ++i) { for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(ty(i, j, k), 4.0 + i); EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k)); } @@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) { auto tz = z.tensor<float, 3>(); EXPECT_EQ(1, z.dim_size(0)); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(tz(0, j, k), 6.0); } } @@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) { EXPECT_EQ(1, saved.dim_size(0)); auto tsaved = saved.tensor<float, 3>(); for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 34; ++k) { + for (int k = 0; k < 36; ++k) { EXPECT_EQ(tsaved(0, j, k), 6.0); } } } { // Empty - Tensor x(DT_FLOAT, TensorShape({10, 0, 34})); + Tensor x(DT_FLOAT, TensorShape({10, 0, 36})); x.flat<float>().setRandom(); Tensor y = x.Slice(4, 8); - EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34}))); + EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36}))); } { diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 72a13d4da7..b9667998d6 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // If Op has been specifically assigned to a non-CPU device, then No. if (!n->assigned_device_name().empty() && - !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) { + !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) { result = false; reason = "Op has been assigned a runtime device that is not CPU."; } // If user has specifically assigned this op to a non-CPU device, then No. if (!n->def().device().empty() && - !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) { + !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) { result = false; reason = "User has assigned a device that is not CPU."; } @@ -2865,9 +2865,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return false; } - // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized - // path. The unoptimized path is slow. Thus we dont rewrite the node - // and use default Eigen. But for depth_radius=2, MKL DNN optimized + // If the depth_radius of LRN is not 2, then MKL DNN takes the unoptimized + // path. The unoptimized path is slow. Thus we don't rewrite the node + // and use default Eigen. But for depth_radius=2, the MKL DNN optimized // path is taken, i.e., the Eigen node is rewritten by the MKL DNN node. static bool LrnRewrite(const Node* n) { CHECK_NOTNULL(n); @@ -2876,13 +2876,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CHECK_EQ(GetNodeAttr(n->def(), "depth_radius", &depth_radius).ok(), true); // if the depth_radius of LRN is not 2, don't rewrite the node by MKL DNN - // and use eigen node instead + // and use the Eigen node instead if (depth_radius == 2) { return true; } VLOG(1) << "LrnRewrite: The model sets depth_radius as not 2 which" << "case is not optimized by Intel MKL, thus using Eigen op" - << "for LRN " ; + << "for LRN "; return false; } @@ -3015,6 +3015,35 @@ class MklLayoutRewritePass : public GraphOptimizationPass { std::vector<NodeBuilder::NodeOut>* ws_tensors, bool* are_ws_tensors_added); + // Helper function used by FixMklMetaDataEdges.
Fixes the metadata edge + // pointed to by 'e_metadata' corresponding to the data edge 'e_data' in graph + // 'g'. Returns true if the fixup was done; otherwise, it returns false. + bool FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g, + const Edge* e_data, const Edge* e_metadata); + + // Are the input Mkl metadata edges for node 'n' in graph 'g' correctly + // connected? If not, then fix them. This is needed because a graph may have + // some input Mkl metadata edges incorrectly set up after the node merge and + // rewrite passes. This could happen because the GetReversePostOrder function + // may not provide a topologically sorted order if a graph contains cycles. + // The function returns true if at least one Mkl metadata edge for node 'n' + // was fixed. Otherwise, it returns false. + // + // Example: + // + // X = MklConv2D(_, _, _) + // Y = MklConv2DWithBias(_, _, _, _, _, _) + // Z = MklAdd(X, Y, DummyMklTensor, Y:1) + // + // For a graph such as shown above, note that the 3rd argument of MklAdd + // contains DummyMklTensor. Actually, it should be getting the Mkl metadata + // from the MklConv2D op (specifically, X:2). This incorrect plumbing could be + // possible (although rare) if the Mkl NodeMerge + NodeRewrite passes visit Z + // before X (possible if X, Y, Z are part of a loop). This function fixes the + // Mkl metadata edges only - it does not rewrite nodes nor does it modify the + // Mkl data edges (1st and 2nd arguments of MklAdd). + bool FixMklMetaDataEdges(std::unique_ptr<Graph>* g, Node* n); + // Functions specific to operators to copy attributes // We need operator-specific function to copy attributes because the framework // does not provide any generic function for it. @@ -4241,6 +4270,92 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { return nullptr; } +/////////////////////////////////////////////////////////////////////////////// +// Post-rewrite Mkl metadata fixup pass +/////////////////////////////////////////////////////////////////////////////// +bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g, + const Edge* e_data, const Edge* e_metadata) { + if (g == nullptr || e_data == nullptr || e_metadata == nullptr) { + return false; + } + + Node* n_data = e_data->src(); + int n_data_op_slot = e_data->src_output(); + int n_metadata_op_slot = GetTensorMetaDataIndex(n_data_op_slot, + n_data->num_outputs()); + + // If the source of the meta edge is a constant node (producing a dummy Mkl + // metadata tensor), then we need to fix it. + if (IsConstant(e_metadata->src())) { + Node* e_metadata_dst = e_metadata->dst(); + int e_metadata_in_slot = e_metadata->dst_input(); + CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot, + e_metadata_dst, e_metadata_in_slot)); + + (*g)->RemoveEdge(e_metadata); + return true; + } + + return false; +} + +bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr<Graph>* g, + Node* n) { + bool result = false; + + // If the graph node is not an Mkl node, then return. + DataType T = DT_INVALID; + if (!GetNodeAttr(n->def(), "T", &T).ok() || + !mkl_op_registry::IsMklOp(n->type_string(), T)) { + return result; + } + + // If it is an Mkl node, then check if the input edges to this node that + // carry Mkl metadata are linked up correctly with the source node. + + // For Mkl nodes, we generate twice the number of input tensors (n for Mkl + // data tensors + n for Mkl metadata tensors). We need to check for correct + // connection of the n metadata tensors only.
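The slot arithmetic used in the loop below: an Mkl op with n data inputs carries 2n inputs in total, and GetTensorMetaDataIndex (from mkl_util.h) maps a data slot to its metadata slot under whichever tensor ordering is configured. A simplified model of the two orderings (function names here are illustrative, not the real helper):

#include <cassert>

// Contiguous ordering: data tensors occupy slots [0, n), metadata [n, 2n).
int MetaSlotContiguous(int data_slot, int total_inputs) {
  return total_inputs / 2 + data_slot;
}

// Interleaved ordering: each data slot is followed by its metadata slot.
int MetaSlotInterleaved(int data_slot) { return 2 * data_slot + 1; }

int main() {
  // A node with 2 data inputs has 4 input slots in total.
  assert(MetaSlotContiguous(0, 4) == 2);
  assert(MetaSlotContiguous(1, 4) == 3);
  assert(MetaSlotInterleaved(0) == 1);
  assert(MetaSlotInterleaved(1) == 3);
  return 0;
}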
+ int num_data_inputs = n->num_inputs() / 2; + for (int idx = 0; idx < num_data_inputs; idx++) { + // Get the edge connecting the input slot with index (idx). + const Edge* e = nullptr; + TF_CHECK_OK(n->input_edge(idx, &e)); + + // If e is a control edge, then skip. + if (e->IsControlEdge()) { + continue; + } + + // Check that the source node for edge 'e' is an Mkl node. If it is not an + // Mkl node, then we don't need to do anything. + Node* e_src = e->src(); + if (GetNodeAttr(e_src->def(), "T", &T).ok() && + mkl_op_registry::IsMklOp(e_src->type_string(), T)) { + // The source node for edge 'e' is an Mkl node. The destination node and + // destination input slot of e are node 'n' and 'idx', respectively. + CHECK_EQ(e->dst(), n); + CHECK_EQ(e->dst_input(), idx); + + // Let's get the edge that carries the Mkl metadata corresponding to the + // Mkl data edge 'e'. For that, let's first get the input slot of 'n' + // where the meta edge will feed the value. + int e_meta_in_slot = GetTensorMetaDataIndex(e->dst_input(), + n->num_inputs()); + const Edge* e_meta = nullptr; + TF_CHECK_OK(n->input_edge(e_meta_in_slot, &e_meta)); + + // Let's check if we need to fix this meta edge. + if (FixMklMetaDataEdgeIfNeeded(g, e, e_meta)) { + result = true; + } + } + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// // Run function for the pass /////////////////////////////////////////////////////////////////////////////// @@ -4307,6 +4422,25 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) { DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g); + order.clear(); + GetReversePostOrder(**g, &order); // This will give us a topological sort. + for (Node* n : order) { + // If the node is not an op or it cannot run on a CPU device, then skip.
+ if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) { + continue; + } + if (FixMklMetaDataEdges(g, n)) { + string node_name = n->name(); + string op_name = n->type_string(); + + VLOG(1) << "MklLayoutRewritePass: fixed metadata edges for node " + << node_name << " with op " << op_name; + result = true; + } + } + DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite+Fixup)", + &**g); + return result; } diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 029cdcf94a..7645b4a7f0 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -3518,6 +3518,37 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1"); } +///////////////////////////////////////////////////////////////////// +// Post-rewrite fixup pass test + +TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'M' op: '_MklInput'}" + "node { name: 'N' op: '_MklInput'}" + "node { name: 'C' op: '_MklConv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }" + " input: ['A', 'B', 'M', 'N']}" + "node { name: 'D' op: 'Const' " + " attr { key: 'dtype' value { type: DT_UINT8 } }" + " attr { key: 'value' value { " + " tensor { dtype: DT_UINT8 tensor_shape { dim { size: 1 } } " + " int_val: 0 } } } }" + "node { name: 'E' op: '_MklAdd'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['C', 'A', 'D', 'D']}"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);B(Input);C(_MklConv2D);D(Const);E(_MklAdd);" + "M(_MklInput);N(_MklInput)|A->C;A->E:1;B->C:1;C->E;C:2->E:2;" + "D->E:3;M->C:2;N->C:3"); +} + ///////////////////////////////////////////////////////////////////// static void BM_MklLayoutRewritePass(int iters, int op_nodes) { diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc index 352f08fede..31b19cfcfd 100644 --- a/tensorflow/core/grappler/clusters/single_machine_test.cc +++ b/tensorflow/core/grappler/clusters/single_machine_test.cc @@ -546,7 +546,7 @@ TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) { TF_CHECK_OK(cluster_->GetPeakMemoryUsage(&device_peak_memory_before)); EXPECT_EQ(device_peak_memory_before.size(), 1); // There might be a bit memory used before session's running anything. - EXPECT_LT(device_peak_memory_before.begin()->second, 200); + EXPECT_LT(device_peak_memory_before.begin()->second, 400); RunMetadata metadata; TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata)); @@ -567,8 +567,8 @@ TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) { // Check memory used by resources are released after cluster destruction. 
EXPECT_EQ(device_peak_memory_before.size(), 1); EXPECT_EQ(device_peak_memory_after.size(), 1); - EXPECT_LT(device_peak_memory_before.begin()->second, 200); - EXPECT_LT(device_peak_memory_after.begin()->second, 200); + EXPECT_LT(device_peak_memory_before.begin()->second, 400); + EXPECT_LT(device_peak_memory_after.begin()->second, 400); } TEST_F(SingleMachineTest, PeakMemory) { @@ -597,7 +597,7 @@ TEST_F(SingleMachineTest, PeakMemory) { device_peak_memory.end()); cpu_memory = device_peak_memory["/job:localhost/replica:0/task:0/device:CPU:0"]; - EXPECT_LT(cpu_memory, 100); + EXPECT_LT(cpu_memory, 200); } TEST_F(SingleMachineTest, PeakMemoryStatsNotEnabled) { diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 6749a7c571..0c02876ac5 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -610,7 +610,6 @@ class SymbolicShapeRefiner { } }; - // Compute the shape of the tensors outputed by node 'node' at output port // 'port_index' as the union of shape1 and shape2. ShapeHandle OutputAsUnion(const NodeDef* node, int port_index, ShapeHandle shape1, ShapeHandle shape2) { diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 1b18087cdf..8ca726df0b 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -679,6 +679,7 @@ cc_library( deps = [ ":constant_folding", ":graph_optimizer", + "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:graph_view", "//tensorflow/core/grappler:grappler_item", @@ -780,7 +781,6 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:scoped_allocator_ops_op_lib", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 4dde7ed1b4..03e36a7b9c 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/optimizers/constant_folding.h" #include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace grappler { @@ -200,8 +201,7 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item, } } if (optimizable) { - VLOG(2) << "Optimizing fused batch norm node " << node.DebugString() - << std::endl; + VLOG(1) << "Optimizing fused batch norm node " << node.DebugString(); AddBatchNormNodes(optimized_graph, node); continue; } diff --git a/tensorflow/core/kernels/as_string_op.cc b/tensorflow/core/kernels/as_string_op.cc index 66c4aff3e3..a7757d1361 100644 --- a/tensorflow/core/kernels/as_string_op.cc +++ b/tensorflow/core/kernels/as_string_op.cc @@ -73,6 +73,7 @@ class AsStringOp : public OpKernel { } switch (dtype) { case DT_INT8: + case DT_INT16: case DT_INT32: strings::Appendf(&format_, "d"); break; @@ -129,6 +130,7 @@ class AsStringOp : public OpKernel { ENCODE_TYPE(DT_FLOAT, float, format_); ENCODE_TYPE(DT_DOUBLE, double, format_); ENCODE_TYPE(DT_INT8, int8, format_); + ENCODE_TYPE(DT_INT16, int16, format_); case (DT_BOOL): { const auto& input_flat = input_tensor->flat(); for (int i = 0; i < input_flat.size(); ++i) { diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc index 14d889e8e3..49b90e855b 100644 --- a/tensorflow/core/kernels/cwise_op_clip.cc +++ b/tensorflow/core/kernels/cwise_op_clip.cc @@ -33,52 +33,41 @@ class ClipOp : public OpKernel { const Tensor& in0 = ctx->input(0); const Tensor& in1 = ctx->input(1); const Tensor& in2 = ctx->input(2); + OP_REQUIRES(ctx, (in0.shape() == in1.shape() || + TensorShapeUtils::IsScalar(in1.shape())) && + (in0.shape() == in2.shape() || + TensorShapeUtils::IsScalar(in2.shape())), + errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. ", + "input shape: ", in0.shape().DebugString(), + "clip_value_min shape: ", in1.shape().DebugString(), + "clip_value_max shape: ", in2.shape().DebugString())); + + Tensor* out = nullptr; + OP_REQUIRES_OK( + ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); + if (out->NumElements() == 0) return; // Nothing to do for empty output auto in0_flat = in0.flat(); auto in1_flat = in1.flat(); auto in2_flat = in2.flat(); + auto out_flat = out->flat(); const Device& d = ctx->eigen_device(); - Tensor* out = nullptr; - OP_REQUIRES_OK( - ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out)); - auto out_flat = out->flat(); if (in1.shape() == in2.shape()) { if (in0.shape() == in1.shape()) { functor::TernaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. ", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::UnaryClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } } else { if (in0.shape() == in1.shape()) { - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. 
", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryLeftClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } else { - OP_REQUIRES(ctx, - (in0.shape() == in2.shape() && - TensorShapeUtils::IsScalar(in1.shape())), - errors::InvalidArgument( - "clip_value_min and clip_value_max must be either of " - "the same shape as input, or a scalar. ", - "input shape: ", in0.shape().DebugString(), - "clip_value_min shape: ", in1.shape().DebugString(), - "clip_value_max shape: ", in2.shape().DebugString())); functor::BinaryRightClipOp()(d, in0_flat, in1_flat, in2_flat, out_flat); } diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc index 9a3b2303a3..17a85d9773 100644 --- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc @@ -57,6 +57,7 @@ struct DenseUpdate { template struct functor::DenseUpdate; \ template struct functor::DenseUpdate; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); +TF_CALL_int32(DEFINE_GPU_KERNELS); TF_CALL_int64(DEFINE_GPU_KERNELS); #undef DEFINE_GPU_KERNELS diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc index e6fefe643b..5cd8e04927 100644 --- a/tensorflow/core/kernels/gather_functor.cc +++ b/tensorflow/core/kernels/gather_functor.cc @@ -37,6 +37,7 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc index 39b6924d74..4563fc6353 100644 --- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc @@ -31,6 +31,7 @@ typedef Eigen::GpuDevice GPUDevice; DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); +TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc index 7e5a9e1ec5..4e53291b7f 100644 --- a/tensorflow/core/kernels/gather_nd_op.cc +++ b/tensorflow/core/kernels/gather_nd_op.cc @@ -228,6 +228,8 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); +TF_CALL_int64(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); @@ -239,6 +241,8 @@ TF_CALL_complex128(DECLARE_GPU_SPECS); // Registration of the GPU implementations. 
#define REGISTER_GATHER_ND_GPU(type) REGISTER_GATHER_ND_ALL_INDICES(GPU, type) +TF_CALL_int32(REGISTER_GATHER_ND_GPU); +TF_CALL_int64(REGISTER_GATHER_ND_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_ND_GPU); TF_CALL_complex64(REGISTER_GATHER_ND_GPU); TF_CALL_complex128(REGISTER_GATHER_ND_GPU); diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc index b03efc684f..da8d2e9e3c 100644 --- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc @@ -119,6 +119,8 @@ struct GatherNdSlice { DEFINE_GPU_SPECS_INDEX(T, int32); \ DEFINE_GPU_SPECS_INDEX(T, int64); +TF_CALL_int32(DEFINE_GPU_SPECS); +TF_CALL_int64(DEFINE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); TF_CALL_complex64(DEFINE_GPU_SPECS); TF_CALL_complex128(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index ef332ebee3..094504d6b9 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -153,6 +153,7 @@ TF_CALL_uint64(REGISTER_GATHER_CPU); // Registration of the GPU implementations. #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type) +TF_CALL_int64(REGISTER_GATHER_GPU); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU); TF_CALL_complex64(REGISTER_GATHER_GPU); TF_CALL_complex128(REGISTER_GATHER_GPU); diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index 5eeb23d810..31d1b949ef 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -14,6 +14,7 @@ limitations under the License. #include #include +#include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" @@ -590,8 +591,8 @@ class MklConcatOp : public OpKernel { const int N = input_tensors.size(); // Get Tensor shapes. - std::vector input_shapes(N); - GetMklShapeList(context, "values", &input_shapes); + std::vector mkl_input_shapes(N); + GetMklShapeList(context, "values", &mkl_input_shapes); const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM) ? MklGetInput(context, 0) @@ -610,19 +611,14 @@ class MklConcatOp : public OpKernel { int i = 0; bool invoke_eigen = false; bool are_all_mkl_inputs = true, are_all_tf_inputs = true; - const TensorShape expected_shape = input_shapes[0].IsMklTensor() - ? input_shapes[0].GetTfShape() - : input_tensors[0].shape(); + const TensorShape expected_shape = mkl_input_shapes[0].IsMklTensor() + ? mkl_input_shapes[0].GetTfShape() + : input_tensors[0].shape(); size_t expected_dims = expected_shape.dims(); if (concat_dim < 0) concat_dim = expected_dims + concat_dim; - for (auto& s : input_shapes) { - if (s == expected_shape) { - ++i; - continue; - } - + for (auto& s : mkl_input_shapes) { TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); size_t s_dims = s_shape.dims(); @@ -665,21 +661,14 @@ class MklConcatOp : public OpKernel { // Call Eigen library if (invoke_eigen) { - TensorShapeList tf_input_shapes; - i = 0; - for (auto& s : input_shapes) { - TensorShape s_shape = - s.IsMklTensor() ? 
s.GetTfShape() : input_tensors[i].shape(); - tf_input_shapes.push_back(s_shape); - ++i; - } - CallEigenVersion(context, input_tensors, tf_input_shapes); + CallEigenVersion(context, input_tensors, mkl_input_shapes); return; } memory::dims dst_dims; + if (are_all_mkl_inputs) - dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape()); + dst_dims = TFShapeToMklDnnDims(mkl_input_shapes[0].GetTfShape()); else // When all the inputs are in Tensorflow format, we don't know // what is the input data format. In that case, we just use @@ -689,26 +678,61 @@ class MklConcatOp : public OpKernel { std::vector<memory::primitive_desc> srcs_pd; std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine)); int64 dst_concat_dim_size = 0; - for (int k = 0; k < N; k++) { - bool is_mkl_tensor = input_shapes[k].IsMklTensor(); - memory::dims src_dims; - - // Same comment as dst_dims for src_dims. - src_dims = (is_mkl_tensor) - ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) - : TFShapeToMklDnnDims(input_tensors[k].shape()); - - dst_concat_dim_size += src_dims[concat_dim]; - auto src_md = - is_mkl_tensor ? input_shapes[k].GetMklLayout() : - // It does not matter what data format we use here - // (NHWC or NCHW). We just need to ensure that output - // of Concat uses same data format as input. - memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw); - - srcs[k].SetUsrMem(src_md, &input_tensors[k]); - auto src_mpd = srcs[k].GetUsrMemPrimDesc(); - srcs_pd.push_back(src_mpd); + + bool isMklReorderNeeded = false; + memory::format mkl_common_format = memory::format::any; + if (are_all_mkl_inputs) { + mkl_common_format = + FindMklCommonFormat(mkl_input_shapes, concat_dim, + &isMklReorderNeeded, &dst_concat_dim_size); + + if (!isMklReorderNeeded) { + // All MKL tensors have the same format. Reorder is not needed. + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + auto src_md = mkl_input_shapes[k].GetMklLayout(); + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + auto src_mpd = srcs[k].GetUsrMemPrimDesc(); + srcs_pd.push_back(src_mpd); + } + } else { + // MKL tensors have different formats. + // Reorder them to the most common format. + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + auto src_dims = TFShapeToMklDnnDims( + mkl_input_shapes[k].GetTfShape()); + auto src_md = mkl_input_shapes[k].GetMklLayout(); + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + + if (src_md.data.format != mkl_common_format) + src_md = memory::desc(src_dims, MklDnnType<T>(), + mkl_common_format); + + srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine)); + } + } + } else { // All TF inputs + for (int k = 0; k < N; k++) { + if (input_tensors[k].NumElements() == 0) + continue; + + memory::dims src_dims = TFShapeToMklDnnDims(input_tensors[k].shape()); + dst_concat_dim_size += src_dims[concat_dim]; + + // It does not matter what data format is used here (NHWC versus NCHW). + // We just need to ensure that the output uses the same data format as + // the inputs. + auto src_md = + memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw); + + srcs[k].SetUsrMem(src_md, &input_tensors[k]); + auto src_mpd = srcs[k].GetUsrMemPrimDesc(); + srcs_pd.push_back(src_mpd); + } } dst_dims[concat_dim] = dst_concat_dim_size; @@ -718,25 +742,33 @@ class MklConcatOp : public OpKernel { if (are_all_mkl_inputs) { // Since we are passing a specific format for destination, // we need to have dst_dims in MklDnn order (NCHW).
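// Illustrative aside (hypothetical values): if the first input's TF shape is
// {8, 224, 224, 3} with data format NHWC, the MklDnnDimsInNCHW call below
// permutes the dims vector into {8, 3, 224, 224}, MklDnn's canonical NCHW
// order, before the destination memory::desc is built from it.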
- auto orig_tf_format = input_shapes[0].GetTfDataFormat(); + auto orig_tf_format = mkl_input_shapes[0].GetTfDataFormat(); dst_dims_in_nchw = MklDnnDimsInNCHW( dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format)); - // We will set the output in the same format as input to avoid layout - // conversions. - // Currently we are setting dst format same as input format. - // See if we can make this choice in a better way. + // Set the output format the same as the most common format of the inputs + // to avoid layout conversions. dst_md = memory::desc( - dst_dims_in_nchw, MklDnnType<T>(), - (memory::format)input_shapes[0].GetMklLayout().data.format); + dst_dims_in_nchw, MklDnnType<T>(), mkl_common_format); } else { - // Again, format does not matter here. We just need to make it same as - // input format. + // All inputs are TF tensors. + // Set the output format the same as the input format (nchw). dst_md = memory::desc(dst_dims, MklDnnType<T>(), memory::format::nchw); } std::vector<primitive::at> inputs; - for (int k = 0; k < input_tensors.size(); k++) - inputs.push_back(srcs[k].GetOpMem()); + std::vector<primitive> net; + if (isMklReorderNeeded) { + for (int k = 0; k < input_tensors.size(); k++) { + if (input_tensors[k].NumElements() > 0) { + srcs[k].CheckReorderToOpMem(srcs_pd[k], &net); + } + } + } + for (int k = 0; k < input_tensors.size(); k++) { + if (input_tensors[k].NumElements() > 0) { + inputs.push_back(srcs[k].GetOpMem()); + } + } // If all inputs are in MKL format, then meaning of concat_dim needs to // change. Value of concat_dim is tied to the input Tensorflow data format. @@ -745,7 +777,8 @@ class MklConcatOp : public OpKernel { // But if input tensors are in NHWC order, then semantics need to change. // E.g., if we are concatenating over Channel (dimension 3 for NHWC), // then since MklDnn order is NCHW, concat_dim needs to be 1.
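// Worked example for the comment above (hypothetical shapes): with NHWC
// inputs, concatenating over channels means concat_dim == 3 in TF terms.
// TfDimIdx translates that TF dimension into MklDnn's NCHW ordering:
//   NHWC index: N=0, H=1, W=2, C=3
//   NCHW index: N=0, C=1, H=2, W=3
// so the remapping below turns concat_dim 3 (C in NHWC) into 1 (C in NCHW).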
- if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim); + if (are_all_mkl_inputs) + concat_dim = mkl_input_shapes[0].TfDimIdx(concat_dim); auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd); @@ -758,7 +791,7 @@ class MklConcatOp : public OpKernel { dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType<T>()); dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw, - input_shapes[0].GetTfDataFormat()); + mkl_input_shapes[0].GetTfDataFormat()); tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T))); } else { dnn_shape_dst.SetMklTensor(false); @@ -773,7 +806,6 @@ class MklConcatOp : public OpKernel { dst.SetUsrMem(dst_md, dst_tensor); auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); - std::vector<primitive> net; net.push_back(concat_op); stream(stream::kind::eager).submit(net).wait(); } catch (mkldnn::error& e) { @@ -787,15 +819,27 @@ class MklConcatOp : public OpKernel { } void CallEigenVersion(OpKernelContext* context, const OpInputList& values, - const TensorShapeList& input_shapes) { - CHECK_EQ(values.size(), input_shapes.size()); + const MklDnnShapeList& mkl_input_shapes) { + CHECK_EQ(values.size(), mkl_input_shapes.size()); std::vector<Tensor> converted_values; - for (int i = 0; i < input_shapes.size(); i++) - converted_values.push_back(values[i]); + TensorShapeList tf_input_shapes; + for (int i = 0; i < mkl_input_shapes.size(); i++) { + if (mkl_input_shapes[i].IsMklTensor()) { + // Do conversion from MKL to TF. + Tensor tmp_tensor = + ConvertMklToTF<T>(context, values[i], mkl_input_shapes[i]); + converted_values.push_back(tmp_tensor); + tf_input_shapes.push_back(mkl_input_shapes[i].GetTfShape()); + } else { + // No conversion needed since it is already a TF tensor. + converted_values.push_back(values[i]); + tf_input_shapes.push_back(values[i].shape()); + } + } // Call Eigen concat. - eigen_concat_op_.Compute(context, converted_values, input_shapes); + eigen_concat_op_.Compute(context, converted_values, tf_input_shapes); // Set output Mkl tensor for this op. MklDnnShape dnn_shape_output; @@ -812,6 +856,55 @@ class MklConcatOp : public OpKernel { output_tensor->flat<uint8>().data(), output_tensor->flat<uint8>().size() * sizeof(uint8)); } + + // This method finds the most common format across all MKL inputs. + // Inputs: + // 1. input_shapes: shapes of input (MKL) tensors. + // 2. concat_dim: concat dimension. + // Outputs: + // 1. is_reorder_needed is set to true if inputs have different formats; + // it is set to false otherwise. + // 2. concat_dim_size is the size of concat_dim. + // Return: + // the common MKL format. + memory::format FindMklCommonFormat(const MklDnnShapeList& input_shapes, + int concat_dim, bool* is_reorder_needed, int64* concat_dim_size) { + *is_reorder_needed = false; + *concat_dim_size = 0; + std::unordered_map<int, int> occurrence_map; + if (input_shapes.size() == 0) + return memory::format::any; + + // Compute occurrences of each format of all inputs. + for (int k = 0; k < input_shapes.size(); k++) { + auto src_dims = TFShapeToMklDnnDims(input_shapes[k].GetTfShape()); + *concat_dim_size += src_dims[concat_dim]; + int fmt = static_cast<int>( + input_shapes[k].GetMklLayout().data.format); + occurrence_map[fmt] += 1; + } + + if (occurrence_map.size() == 1) { + // This means that all inputs have the same format; + // return it with is_reorder_needed set to false. + return static_cast<memory::format>( + input_shapes[0].GetMklLayout().data.format); + } + + // Input tensors have different formats. Thus, reorder is needed. + // We pick the most common format to minimize the total + // number of input reorders.
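// Illustrative aside (hypothetical formats): if three inputs arrive as
// {nChw16c, nhwc, nChw16c}, occurrence_map ends up as {nChw16c: 2, nhwc: 1},
// so the argmax loop below returns nChw16c with *is_reorder_needed == true,
// and only the single nhwc input has to be reordered rather than two.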
+ memory::format commonest_format = memory::format::any; + int max_occurrence = 0; + *is_reorder_needed = true; + for (auto item : occurrence_map) { + if (item.second > max_occurrence) { + commonest_format = static_cast<memory::format>(item.first); + max_occurrence = item.second; + } + } + return commonest_format; + } }; #endif diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc index c1da0ded1d..f857be6c32 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc @@ -18,6 +18,7 @@ limitations under the License. // bias. #ifdef INTEL_MKL +#ifdef INTEL_MKL_ML #define USE_EIGEN_TENSOR #define EIGEN_USE_THREADS @@ -264,4 +265,5 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel { TF_CALL_float(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS } /* namespace tensorflow */ +#endif /* INTEL_MKL_ML */ #endif /* INTEL_MKL */ diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index 279167aba2..c0dfed7d7d 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -199,13 +199,15 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> { CHECK_NOTNULL(pool_params); CHECK_NOTNULL(dnn_data_input); TensorShape input_tensor_shape = input_tensor.shape(); - memory::desc input_md = + if (input_tensor.NumElements() != 0) { + memory::desc input_md = input_mkl_shape.IsMklTensor() ? input_mkl_shape.GetMklLayout() : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, this->data_format_tf_), MklDnnType<T>(), this->data_format_mkldnn_); - dnn_data_input->SetUsrMem(input_md, &input_tensor); + dnn_data_input->SetUsrMem(input_md, &input_tensor); + } this->InitMklPoolParameters(context, pool_params, input_mkl_shape, input_tensor_shape); } diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index 43c5b29509..e1fc2ea128 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -292,6 +292,7 @@ TF_CALL_string(REGISTER_SCATTER_ND_CPU); REGISTER_SCATTER_ND_UPDATE_GPU(type); \ REGISTER_SCATTER_ND_GPU(type); +TF_CALL_int32(REGISTER_SCATTER_ND_ALL_GPU); // TODO(b/66916790): Support half types in ScatterNd. TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ALL_GPU); TF_CALL_complex64(REGISTER_SCATTER_ND_ALL_GPU); @@ -306,6 +307,8 @@ TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU); #define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \ REGISTER_SCATTER_ND_UPDATE(type, SYCL); +TF_CALL_int32(REGISTER_SCATTER_ND_ADD_SUB_SYCL); +TF_CALL_int32(REGISTER_SCATTER_ND_UPDATE_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL); #undef REGISTER_SCATTER_ND_ADD_SUB_SYCL @@ -576,6 +579,7 @@ namespace functor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); // TODO(b/66916790): Support half types in ScatterNd.
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc index a3c21edc15..08b657f4c3 100644 --- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc @@ -170,6 +170,7 @@ struct ScatterNdFunctor { DECLARE_GPU_SPECS_INDEX(T, int32); \ DECLARE_GPU_SPECS_INDEX(T, int64) +TF_CALL_int32(DECLARE_GPU_SPECS); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); TF_CALL_complex64(DECLARE_GPU_SPECS); TF_CALL_complex128(DECLARE_GPU_SPECS); diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc index bb0129fa6f..634f9ba887 100644 --- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc +++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc @@ -216,8 +216,13 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) { } TEST_F(ScopedAllocatorConcatOpTest, Reshape) { - MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2); - ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}}); + MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2); + + // The elements of the third parameter to ExecOp must be multiples of + // Allocator::kAllocatorAlignment in size. If they are not, the backing + // tensor allocated by PrepOp will have too many elements and reshaping + // will fail. + ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}}); } TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) { diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index 7796bf3587..d65692a552 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -16,6 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ // This file requires the following include because it uses CudaAtomicMax: // #include "tensorflow/core/util/cuda_kernel_helper.h" + +// Unfortunately we can't add the #include, since it breaks compilation for +// non-GPU targets. This only breaks in clang, because it's more strict for +// template code and CudaAtomicMax is used in template context. @@ -130,4 +134,4 @@ struct Highest { } // namespace functor } // namespace tensorflow -#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ +#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index a1f9667b78..866c5dcd52 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -1490,7 +1490,7 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute( #endif // TENSORFLOW_USE_LIBXSMM -// Here is a an overview of the SparseMatMul code. Note that we assume that the +// Here is an overview of the SparseMatMul code. Note that we assume that the // left matrix is sparse. // // The matrix "left" is divided into a grid with blocksize of (M, KL). Each diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc index 4c2b312c34..26ab72f12e 100644 --- a/tensorflow/core/kernels/string_split_op.cc +++ b/tensorflow/core/kernels/string_split_op.cc @@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { @@ -43,6 +44,63 @@ std::vector Split(const string& str, const string& delimiter, return char_vector; } +std::vector SplitV2(const string& str, StringPiece sep, int maxsplit) { + // This SplitV2 method matches the behavior of python's str.split: + // If sep is given, consecutive delimiters are not grouped together + // and are deemed to delimit empty strings (for example, '1,,2'.split(',') + // returns ['1', '', '2']). The sep argument may consist of multiple + // characters (for example, '1<>2<>3'.split('<>') returns ['1', '2', '3']). + // Splitting an empty string with a specified separator returns ['']. + // + // If sep is not specified or is None, a different splitting algorithm is + // applied: runs of consecutive whitespace are regarded as a single + // separator, and the result will contain no empty strings at the start or + // end if the string has leading or trailing whitespace. Consequently, + // splitting an empty string or a string consisting of just whitespace + // with a None separator returns []. + + std::vector result; + + StringPiece text(str); + if (maxsplit == 0) { + result.emplace_back(std::string(text)); + return result; + } + + if (sep.empty()) { + StringPiece token; + // Remove leading whitespaces. + str_util::RemoveLeadingWhitespace(&text); + int split = 0; + while (str_util::ConsumeNonWhitespace(&text, &token)) { + result.emplace_back(std::string(token)); + str_util::RemoveLeadingWhitespace(&text); + ++split; + if (maxsplit > 0 && split == maxsplit) { + result.emplace_back(std::string(text)); + return result; + } + } + return result; + } + auto p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); + int split = 0; + while (p != text.end()) { + StringPiece token = text.substr(0, p - text.begin()); + result.emplace_back(std::string(token)); + text.remove_prefix(token.size()); + text.remove_prefix(sep.size()); + ++split; + if (maxsplit > 0 && split == maxsplit) { + result.emplace_back(std::string(text)); + return result; + } + p = std::search(text.begin(), text.end(), sep.begin(), sep.end()); + } + result.emplace_back(std::string(text)); + return result; +} + } // namespace class StringSplitOp : public OpKernel { @@ -122,6 +180,78 @@ class StringSplitOp : public OpKernel { bool skip_empty_; }; +class StringSplitV2Op : public OpKernel { + public: + explicit StringSplitV2Op(OpKernelConstruction* context) + : OpKernel(context), maxsplit_(-1) { + OP_REQUIRES_OK(context, context->GetAttr("maxsplit", &maxsplit_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* input_tensor; + OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor)); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input_tensor->shape()), + errors::InvalidArgument("input must be a vector, got shape: ", + input_tensor->shape().DebugString())); + + const auto input_vec = input_tensor->vec(); + const int64 batch_size = input_vec.dimension(0); + + const Tensor* sep_tensor; + OP_REQUIRES_OK(ctx, ctx->input("sep", &sep_tensor)); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sep_tensor->shape()), + errors::InvalidArgument("sep must be a scalar, got shape: ", + sep_tensor->shape().DebugString())); + const auto sep_vec = sep_tensor->flat(); + StringPiece sep(sep_vec(0)); + std::vector tokens; + // Guess that we'll be unpacking a 
handful of tokens per example. + static constexpr int kReserveSize = 4; + tokens.reserve(batch_size * kReserveSize); + + int64 output_size = 0; + int64 max_num_entries = 0; + std::vector<int64> num_indices(batch_size); + for (int64 i = 0; i < batch_size; ++i) { + std::vector<string> parts = SplitV2(input_vec(i), sep, maxsplit_); + int64 n_entries = parts.size(); + num_indices[i] = n_entries; + output_size += n_entries; + max_num_entries = std::max(max_num_entries, n_entries); + tokens.insert(tokens.end(), parts.begin(), parts.end()); + } + + Tensor* sp_indices_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({output_size, 2}), + &sp_indices_t)); + Tensor* sp_tokens_t; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(1, TensorShape({output_size}), &sp_tokens_t)); + Tensor* sp_shape_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({2}), &sp_shape_t)); + + auto sp_indices = sp_indices_t->matrix<int64>(); + auto sp_tokens = sp_tokens_t->vec<string>(); + auto sp_shape = sp_shape_t->vec<int64>(); + sp_shape(0) = batch_size; + sp_shape(1) = max_num_entries; + size_t c = 0; + for (size_t i = 0; i < batch_size; ++i) { + for (size_t j = 0; j < num_indices[i]; ++j) { + sp_indices(c, 0) = i; + sp_indices(c, 1) = j; + sp_tokens(c) = tokens[c]; + ++c; + } + } + } + + private: + int maxsplit_; +}; + REGISTER_KERNEL_BUILDER(Name("StringSplit").Device(DEVICE_CPU), StringSplitOp); +REGISTER_KERNEL_BUILDER(Name("StringSplitV2").Device(DEVICE_CPU), + StringSplitV2Op); } // namespace tensorflow diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc index 6e4d100b04..6e589c8d1c 100644 --- a/tensorflow/core/ops/candidate_sampling_ops.cc +++ b/tensorflow/core/ops/candidate_sampling_ops.cc @@ -145,12 +145,15 @@ REGISTER_OP("ComputeAccidentalHits") int64 num_true; TF_RETURN_IF_ERROR(c->GetAttr("num_true", &num_true)); - // Validate true_classes. + // Validate true_classes; it must be a matrix. ShapeHandle true_classes; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &true_classes)); DimensionHandle unused; TF_RETURN_IF_ERROR( c->WithValue(c->Dim(true_classes, 1), num_true, &unused)); + // Validate sampled_candidates; it must be a vector. + ShapeHandle sampled_candidates; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sampled_candidates)); // All three outputs are the same shape. ShapeHandle v = c->Vector(InferenceContext::kUnknownDim); diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 15e0ca8af9..9dca5f53ce 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -218,7 +218,17 @@ REGISTER_OP("MapAndBatchDataset") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + // Use indices from the end to retrieve the input shapes, + // to avoid guessing the length of "other_arguments". + // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars.
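// Illustrative aside (hypothetical op instance): with, say, four entries in
// "other_arguments", the inputs are laid out as
//   0: input_dataset, 1-4: other_arguments,
//   5: batch_size, 6: num_parallel_batches, 7: drop_remainder
// so num_inputs() == 8 and c->input(c->num_inputs() - 3) picks out batch_size
// no matter how long "other_arguments" is, which is what the checks below
// rely on.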
+ shape_inference::ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); + + return shape_inference::ScalarShape(c); + }); REGISTER_OP("MapAndBatchDatasetV2") .Input("input_dataset: variant") @@ -231,7 +241,17 @@ REGISTER_OP("MapAndBatchDatasetV2") .Attr("Targuments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + // Use indices from the end to retrieve the input shapes, + // to avoid guessing the length of "other_arguments". + // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars. + shape_inference::ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused)); + + return shape_inference::ScalarShape(c); + }); REGISTER_OP("PrefetchDataset") .Input("input_dataset: variant") diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index d949e70c66..87f4991134 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -454,7 +454,9 @@ REGISTER_OP("DrawBoundingBoxes") DimensionHandle unused; TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused)); - return shape_inference::UnchangedShapeWithRankAtLeast(c, 3); + // The rank of the input image (rank = 4) has already been restricted + // above, and the output is of the same shape as the input.
+ return shape_inference::UnchangedShape(c); }); // -------------------------------------------------------------------------- diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 1740fa152c..b3487122e2 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1084,7 +1084,7 @@ REGISTER_OP("UnsortedSegmentProd") .Input("segment_ids: Tindices") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: realnumbertype") + .Attr("T: numbertype") .Attr("Tindices: {int32,int64}") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .SetShapeFn(UnsortedSegmentReductionShapeFn); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index fc60e807b9..41efa49ce3 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1453,6 +1453,7 @@ REGISTER_OP("QuantizedReluX") ShapeHandle unused; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); c->set_output(1, c->Scalar()); c->set_output(2, c->Scalar()); return Status::OK(); diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 1d5c743a56..4423062362 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -78,7 +78,7 @@ REGISTER_OP("ReduceJoin") REGISTER_OP("AsString") .Input("input: T") .Output("output: string") - .Attr("T: {int32, int64, complex64, float, double, bool, int8}") + .Attr("T: {int8, int16, int32, int64, complex64, float, double, bool}") .Attr("precision: int = -1") .Attr("scientific: bool = false") .Attr("shortest: bool = false") @@ -134,6 +134,24 @@ REGISTER_OP("StringSplit") return Status::OK(); }); +REGISTER_OP("StringSplitV2") + .Input("input: string") + .Input("sep: string") + .Output("indices: int64") + .Output("values: string") + .Output("shape: int64") + .Attr("maxsplit: int = -1") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + + c->set_output(0, c->Matrix(InferenceContext::kUnknownDim, 2)); + c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); + c->set_output(2, c->Vector(2)); + return Status::OK(); + }); + REGISTER_OP("StringStrip") .Input("input: string") .Output("output: string") diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc index 99de364042..e9da3d8e32 100644 --- a/tensorflow/core/platform/cpu_info.cc +++ b/tensorflow/core/platform/cpu_info.cc @@ -344,5 +344,28 @@ int CPUModelNum() { #endif } +int CPUIDNumSMT() { +#ifdef PLATFORM_IS_X86 + // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration + // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A) + // Section: Detecting Hardware Multi-threads Support and Topology + // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures. + // Other cases are not supported. + uint32 eax, ebx, ecx, edx; + // Check if system supports Leaf 11 + GETCPUID(eax, ebx, ecx, edx, 0, 0); + if (eax >= 11) { + // 1) Leaf 11 available?
CPUID.(EAX=11, ECX=0):EBX != 0 + // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11, + // ECX=0):ECX[15:8] is 1 + GETCPUID(eax, ebx, ecx, edx, 11, 0); + if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) { + return 1 << (eax & 0x1f); // 2 ^ SMT_Mask_Width + } + } +#endif // PLATFORM_IS_X86 + return 0; +} + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h index b5be7e8b54..175c9ae8b1 100644 --- a/tensorflow/core/platform/cpu_info.h +++ b/tensorflow/core/platform/cpu_info.h @@ -35,6 +35,10 @@ namespace port { // software can change it dynamically. int NumSchedulableCPUs(); +// Returns an estimate of the number of hyperthreads per physical core +// on the CPU. +int NumHyperthreadsPerCore(); + // Mostly ISA related features that we care about enum CPUFeature { // Do not change numeric assignments. @@ -107,6 +111,9 @@ int CPUModelNum(); // Returns nominal core processor cycles per second of each processor. double NominalCPUFrequency(); +// Returns the number of hyperthreads per physical core. +int CPUIDNumSMT(); + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index ae81f9b5b3..a319ccbdbe 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -71,6 +71,8 @@ def pyx_library( name = filename + "_cython_translation", srcs = [filename], outs = [filename.split(".")[0] + ".cpp"], + # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3 + # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH. cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)", tools = ["@cython//:cython_binary"] + pxd_srcs, ) diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc index 72c12318ca..ff4b4436bb 100644 --- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc +++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc @@ -115,18 +115,17 @@ class LibHDFS { const char* kLibHdfsDso = "libhdfs.so"; #endif char* hdfs_home = getenv("HADOOP_HDFS_HOME"); - if (hdfs_home == nullptr) { - status_ = errors::FailedPrecondition( - "Environment variable HADOOP_HDFS_HOME not set"); - return; - } - string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); - status_ = TryLoadAndBind(path.c_str(), &handle_); - if (!status_.ok()) { - // try load libhdfs.so using dynamic loader's search path in case - // libhdfs.so is installed in non-standard location - status_ = TryLoadAndBind(kLibHdfsDso, &handle_); + if (hdfs_home != nullptr) { + string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso); + status_ = TryLoadAndBind(path.c_str(), &handle_); + if (status_.ok()) { + return; + } } + + // Try to load the library dynamically in case it has been installed + // in a non-standard location. + status_ = TryLoadAndBind(kLibHdfsDso, &handle_); } Status status_; diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc index 8e316472fe..708f32ba80 100644 --- a/tensorflow/core/platform/posix/port.cc +++ b/tensorflow/core/platform/posix/port.cc @@ -74,6 +74,11 @@ int NumSchedulableCPUs() { return kDefaultCores; } +int NumHyperthreadsPerCore() { + static const int ht_per_core = tensorflow::port::CPUIDNumSMT(); + return (ht_per_core > 0) ? ht_per_core : 1; +}
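// Illustrative aside (hypothetical CPUID values): on a part with two-way SMT,
// CPUID leaf 11 with ECX=0 typically reports ECX[15:8] == 1 (an SMT level)
// and EAX[4:0] == 1 (the SMT mask width), so CPUIDNumSMT() returns
// 1 << 1 == 2 and NumHyperthreadsPerCore() above reports 2; when leaf 11 is
// unavailable, CPUIDNumSMT() returns 0 and the fallback of 1 is used instead.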
+ void* AlignedMalloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) return memalign(minimum_alignment, size); diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 522a9d84fd..cb1fd09dbb 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -19,12 +19,12 @@ limitations under the License. // TensorFlow uses semantic versioning, see http://semver.org/. #define TF_MAJOR_VERSION 1 -#define TF_MINOR_VERSION 8 +#define TF_MINOR_VERSION 9 #define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "" +#define TF_VERSION_SUFFIX "-rc0" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index dffc965b14..90b6533690 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -42,6 +42,7 @@ limitations under the License. #ifndef INTEL_MKL_ML #include "mkldnn.hpp" +#include "tensorflow/core/lib/core/stringpiece.h" using mkldnn::engine; using mkldnn::memory; @@ -712,15 +713,48 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, return output_tensor; } #else +using mkldnn::stream; +template <typename T> class MklDnnData; + template <typename T> inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, const MklDnnShape& mkl_shape) { Tensor output_tensor; - TensorShape output_shape; - - TF_CHECK_OK( - Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function")); - + try { + if (!mkl_shape.IsMklTensor()) + return mkl_tensor; // Return the input since it is already a TF tensor. + + TensorShape output_shape = mkl_shape.GetTfShape(); + + // Allocate output tensor. + context->allocate_temp(DataTypeToEnum<T>::v(), + output_shape, &output_tensor); + + auto cpu_engine = engine(engine::cpu, 0); + MklDnnData<T> input(&cpu_engine); + + // Get Mkl layout of input tensor. + auto input_mkl_md = mkl_shape.GetMklLayout(); + auto output_tf_md = mkl_shape.GetTfLayout(); + auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine); + input.SetUsrMem(input_mkl_md, &mkl_tensor); + + // Reorder if needed. + if (input.IsReorderNeeded(output_tf_pd)) { + std::vector<primitive> net; + CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net), + true); + stream(stream::kind::eager).submit(net).wait(); + } else { + // If not, just forward input tensor to output tensor.
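// Illustrative aside: IsReorderNeeded compares the tensor's current MKL
// layout against the requested TF layout. For a hypothetical blocked input
// layout such as nChw16c, a reorder primitive is created and submitted to the
// eager stream above; when the layouts already match, the CopyFrom below only
// shares the underlying buffer and moves no data.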
+ CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape)); + } + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + LOG(FATAL) << "Operation received an exception: " << error_msg; + } return output_tensor; } #endif @@ -1843,7 +1877,7 @@ class FactoryKeyCreator { template <typename T> void AddAsKey(const T data) { auto buffer = reinterpret_cast<const char*>(&data); - Append(absl::string_view(buffer, sizeof(T))); + Append(StringPiece(buffer, sizeof(T))); } std::string GetKey() { @@ -1854,8 +1888,8 @@ class FactoryKeyCreator { string key_; const char delimiter = 'x'; const int kMaxKeyLength = 256; - void Append(absl::string_view s) { - key_.append(string(s)); + void Append(StringPiece s) { + key_.append(s.ToString()); key_.append(1, delimiter); } }; diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md index d92f5775fa..0b07d413da 100644 --- a/tensorflow/docs_src/community/groups.md +++ b/tensorflow/docs_src/community/groups.md @@ -1,17 +1,38 @@ # User Groups -TensorFlow has communities around the world. +TensorFlow has communities around the world. [Submit your community!](https://docs.google.com/forms/d/e/1FAIpQLSc_RQIUYtVgLLihzATaO_WUXkEyBDE_OoRoOXYDPmBEvHuEBA/viewform) ## Asia -* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_ -* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_ -* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_ +* [TensorFlow China community](https://www.tensorflowers.cn) +* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) +* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) +* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) * [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/) +* [TensorFlow Philippines Community](https://www.facebook.com/groups/TensorFlowPH/) +* [TensorFlow and Deep Learning Singapore](https://www.meetup.com/TensorFlow-and-Deep-Learning-Singapore/) +* [TensorFlow India](https://www.facebook.com/tensorflowindia) ## Europe * [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/) * [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/) +* [Tensorflow Belgium](https://www.meetup.com/TensorFlow-Belgium) +* [TensorFlow x Rome Meetup](https://www.meetup.com/it-IT/TensorFlow-x-Rome-Meetup) +* [TensorFlow London](https://www.meetup.com/TensorFlow-London/) +* [TensorFlow Edinburgh](https://www.meetup.com/tensorflow-edinburgh/) + +## America + +* [TensorFlow Buenos Aires](https://www.meetup.com/TensorFlow-Buenos-Aires/) + + +## Oceania +* [Melbourne TensorFlow Meetup](https://www.meetup.com/Melbourne-TensorFlow-Meetup) + + +## Africa + +* [TensorFlow Tunis Meetup](https://www.meetup.com/fr-FR/TensorFlow-Tunis-Meetup/) diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md index f08ac74425..bbb25e20c6 100644 --- a/tensorflow/docs_src/get_started/eager.md +++ b/tensorflow/docs_src/get_started/eager.md @@ -1,3 +1,3 @@ # Get Started with Eager Execution -[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.8.0/samples/core/get_started/eager.ipynb) +[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.9.0/samples/core/get_started/eager.ipynb) diff --git
a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md index 55579d52fb..232d2f1547 100644 --- a/tensorflow/docs_src/get_started/index.md +++ b/tensorflow/docs_src/get_started/index.md @@ -10,9 +10,9 @@ course prior to diving into TensorFlow documentation: TensorFlow is a tool for machine learning. While it contains a wide range of functionality, TensorFlow is mainly designed for deep neural network models. -The easiest way to get started with TensorFlow is using Eager Execution. +The easiest way to get started with TensorFlow is by using Eager Execution. - * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. + * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. TensorFlow provides many APIs. The remainder of this section focuses on the Estimator API which provide scalable, high-performance models. See the diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md index 1abd840ab3..2901848745 100644 --- a/tensorflow/docs_src/install/install_c.md +++ b/tensorflow/docs_src/install/install_c.md @@ -38,7 +38,7 @@ enable TensorFlow for C: OS="linux" # Change to "darwin" for macOS TARGET_DIRECTORY="/usr/local" curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md index 52a2a3f8a6..55bc0f64e7 100644 --- a/tensorflow/docs_src/install/install_go.md +++ b/tensorflow/docs_src/install/install_go.md @@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go: TF_TYPE="cpu" # Change to "gpu" for GPU support TARGET_DIRECTORY='/usr/local' curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.9.0-rc0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index 1256fb99c4..637231da12 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md @@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs: <dependency> <groupId>org.tensorflow</groupId> <artifactId>tensorflow</artifactId> - <version>1.8.0</version> + <version>1.9.0-rc0</version> </dependency> ``` @@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow: <dependency> <groupId>org.tensorflow</groupId> <artifactId>tensorflow</artifactId> - <version>1.8.0</version> + <version>1.9.0-rc0</version> </dependency> @@ -124,12 +124,12 @@ instead: <dependency> <groupId>org.tensorflow</groupId> <artifactId>libtensorflow</artifactId> - <version>1.8.0</version> + <version>1.9.0-rc0</version> </dependency> <dependency> <groupId>org.tensorflow</groupId> <artifactId>libtensorflow_jni_gpu</artifactId> - <version>1.8.0</version> + <version>1.9.0-rc0</version> </dependency> ``` @@ -148,7 +148,7 @@ refer to the simpler instructions above instead. Take the following steps to install TensorFlow for Java on Linux or macOS: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar), which is the TensorFlow Java Archive (JAR). 2.
Decide whether you will run TensorFlow for Java on CPU(s) only or with @@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: OS=$(uname -s | tr '[:upper:]' '[:lower:]') mkdir -p ./jni curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" | tar -xz -C ./jni ### Install on Windows @@ -175,13 +175,13 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: Take the following steps to install TensorFlow for Java on Windows: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar), which is the TensorFlow Java Archive (JAR). 2. Download the following Java Native Interface (JNI) file appropriate for - [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip). + [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.9.0-rc0.zip). 3. Extract this .zip file. - +__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. ### Validate the installation @@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the downloaded `.jar` in your `classpath` by using the `-cp` compilation flag as follows: -
javac -cp libtensorflow-1.8.0.jar HelloTF.java
+
javac -cp libtensorflow-1.9.0-rc0.jar HelloTF.java
### Running @@ -241,11 +241,11 @@ two files are available to the JVM: For example, the following command line executes the `HelloTF` program on Linux and macOS X: -
java -cp libtensorflow-1.8.0.jar:. -Djava.library.path=./jni HelloTF
+
java -cp libtensorflow-1.9.0-rc0.jar:. -Djava.library.path=./jni HelloTF
And the following command line executes the `HelloTF` program on Windows: -
java -cp libtensorflow-1.8.0.jar;. -Djava.library.path=jni HelloTF
+
java -cp libtensorflow-1.9.0-rc0.jar;. -Djava.library.path=jni HelloTF
If the program prints Hello from version, you've successfully installed TensorFlow for Java and are ready to use the API. If the program diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index 0ed8160027..c8d706cf3c 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -339,9 +339,7 @@ Docker will download the TensorFlow binary image the first time you launch it. #### GPU support -Prior to installing TensorFlow with GPU support, ensure that your system meets all -[NVIDIA software requirements](#NVIDIARequirements). To launch a Docker container -with NVidia GPU support, enter a command of the following format: +To launch a Docker container with NVidia GPU support, enter a command of the following format (this [does not require any local CUDA installation](https://github.com/nvidia/nvidia-docker/wiki/CUDA#requirements)):
 $ nvidia-docker run -it -p hostPort:containerPort TensorFlowGPUImage
@@ -438,7 +436,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      
      (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
## Validate your installation @@ -517,7 +515,7 @@ on your system: from source. To use the TensorFlow binaries, version 3.5 or higher is required. See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a list of supported GPU cards. -* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA +* [GPU drivers](http://nvidia.com/drivers) that support your version of the CUDA Toolkit. * The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This library provides advanced profiling support. To install this library, @@ -684,14 +682,14 @@ This section documents the relevant values for Linux installations. CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp27-none-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp27-none-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -703,14 +701,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -722,14 +720,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
 
@@ -741,14 +739,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md index 29a867a9e3..9d01271c5a 100644 --- a/tensorflow/docs_src/install/install_mac.md +++ b/tensorflow/docs_src/install/install_mac.md @@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv: TensorFlow in the active Virtualenv is as follows:
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl If you encounter installation problems, see [Common Installation Problems](#common-installation-problems). @@ -242,7 +242,7 @@ take the following steps: issue the following command:
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl 
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl If the preceding command fails, see [installation problems](#common-installation-problems). @@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: TensorFlow for Python 2.7:
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl @@ -522,7 +522,7 @@ The value you specify depends on your Python version.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
 
@@ -530,5 +530,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index 5ba522b436..dc6c1e36fc 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -81,7 +81,7 @@ or [macOS](#PrepareMac) - + ## Prepare environment for Linux Before building TensorFlow on Linux, install the following build @@ -328,10 +328,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl` file depends on your platform. For example, the following command will install the pip package -for TensorFlow 1.8.0 on Linux: +for TensorFlow 1.9.0rc0 on Linux:
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-py2-none-any.whl
 
## Validate your installation @@ -373,9 +373,9 @@ The build and installation problems you encounter typically depend on the operating system. See the "Common installation problems" section of one of the following guides: - * @{$install_linux#CommonInstallationProblems$Installing TensorFlow on Linux} - * @{$install_mac#CommonInstallationProblems$Installing TensorFlow on Mac OS} - * @{$install_windows#CommonInstallationProblems$Installing TensorFlow on Windows} + * @{$install_linux#common_installation_problems$Installing TensorFlow on Linux} + * @{$install_mac#common_installation_problems$Installing TensorFlow on Mac OS} + * @{$install_windows#common_installation_problems$Installing TensorFlow on Windows} Beyond the errors documented in those two guides, the following table notes additional errors specific to building TensorFlow. Note that we @@ -433,6 +433,8 @@ Stack Overflow and specify the `tensorflow` tag. **Linux**
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.9.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.11.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.9.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.11.0</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
@@ -456,6 +458,7 @@ Stack Overflow and specify the `tensorflow` tag. **Mac**
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.9.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.11.0</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
@@ -472,6 +475,8 @@ Stack Overflow and specify the `tensorflow` tag. **Windows**
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.9.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.9.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/mobile/linking_libs.md b/tensorflow/docs_src/mobile/linking_libs.md index cf0db59021..efef5dd0da 100644 --- a/tensorflow/docs_src/mobile/linking_libs.md +++ b/tensorflow/docs_src/mobile/linking_libs.md @@ -27,7 +27,7 @@ called `libandroid_tensorflow_inference_java.jar`. There are three ways to include this functionality in your program: 1. Include the jcenter AAR which contains it, as in this - [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/build.gradle#L59-L65) + [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/tfmobile/build.gradle#L59-L65) 2. Download the nightly precompiled version from [ci.tensorflow.org](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/). diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md index 8b22c04d87..2b84dbb973 100644 --- a/tensorflow/docs_src/mobile/prepare_models.md +++ b/tensorflow/docs_src/mobile/prepare_models.md @@ -105,8 +105,8 @@ inline constants so everything’s in one file. To handle the conversion, you need the `freeze_graph.py` script, that’s held in [`tensorflow/python/tools/freeze_graph.py`](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py). You’ll run it like this: - bazel build tensorflow/tools:freeze_graph - bazel-bin/tensorflow/tools/freeze_graph \ + bazel build tensorflow/python/tools:freeze_graph + bazel-bin/tensorflow/python/tools/freeze_graph \ --input_graph=/tmp/model/my_graph.pb \ --input_checkpoint=/tmp/model/model.ckpt-1000 \ --output_graph=/tmp/frozen_graph.pb \ diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md index 2fea02d861..c97f74139c 100644 --- a/tensorflow/docs_src/performance/quantization.md +++ b/tensorflow/docs_src/performance/quantization.md @@ -227,8 +227,8 @@ of 30.0f, and an 8-bit array, the quantized values represent the following:
<tr><th>Quantized</th><th>Float</th></tr>
<tr><td>0</td><td>-10.0</td></tr>
-<tr><td>255</td><td>30.0</td></tr>
<tr><td>128</td><td>10.0</td></tr>
+<tr><td>255</td><td>30.0</td></tr>
Table 2: Example quantized value range diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md index c4aae1d9d6..b13b47184d 100644 --- a/tensorflow/docs_src/programmers_guide/estimators.md +++ b/tensorflow/docs_src/programmers_guide/estimators.md @@ -21,18 +21,17 @@ Note: TensorFlow also includes a deprecated `Estimator` class at Estimators provide the following benefits: -* You can run Estimators-based models on a local host or on a +* You can run Estimator-based models on a local host or on a distributed multi-server environment without changing your model. - Furthermore, you can run Estimators-based models on CPUs, GPUs, + Furthermore, you can run Estimator-based models on CPUs, GPUs, or TPUs without recoding your model. * Estimators simplify sharing implementations between model developers. -* You can develop a state of the art model with high-level intuitive code, +* You can develop a state of the art model with high-level intuitive code. In short, it is generally much easier to create models with Estimators than with the low-level TensorFlow APIs. -* Estimators are themselves built on tf.layers, which +* Estimators are themselves built on @{tf.layers}, which simplifies customization. -* Estimators build the graph for you. In other words, you don't have to - build the graph. +* Estimators build the graph for you. * Estimators provide a safe distributed training loop that controls how and when to: * build the graph @@ -57,7 +56,7 @@ the "plumbing" for you. That is, pre-made Estimators create and manage pre-made Estimators let you experiment with different model architectures by making only minimal code changes. @{tf.estimator.DNNClassifier$`DNNClassifier`}, for example, is a pre-made Estimator class that trains classification models -through dense, feed-forward neural networks. +based on dense, feed-forward neural networks. ### Structure of a pre-made Estimators program @@ -79,7 +78,7 @@ of the following four steps: an input function: def input_fn(dataset): - ... # manipulate dataset, extracting feature names and the label + ... # manipulate dataset, extracting the feature dict and the label return feature_dict, label (See @{$programmers_guide/datasets} for full details.) @@ -96,13 +95,13 @@ of the following four steps: population = tf.feature_column.numeric_column('population') crime_rate = tf.feature_column.numeric_column('crime_rate') median_education = tf.feature_column.numeric_column('median_education', - normalizer_fn='lambda x: x - global_education_mean') + normalizer_fn=lambda x: x - global_education_mean) 3. **Instantiate the relevant pre-made Estimator.** For example, here's a sample instantiation of a pre-made Estimator named `LinearClassifier`: # Instantiate an estimator, passing the feature columns. - estimator = tf.estimator.Estimator.LinearClassifier( + estimator = tf.estimator.LinearClassifier( feature_columns=[population, crime_rate, median_education], ) diff --git a/tensorflow/docs_src/programmers_guide/feature_columns.md b/tensorflow/docs_src/programmers_guide/feature_columns.md index 845194fe0e..90f5c53a17 100644 --- a/tensorflow/docs_src/programmers_guide/feature_columns.md +++ b/tensorflow/docs_src/programmers_guide/feature_columns.md @@ -528,10 +528,10 @@ suggested by the following snippet: categorical_column = ... # Create any categorical column # Represent the categorical column as an embedding column. -# This means creating a one-hot vector with one element for each category.
+# This means creating an embedding vector lookup table with one element for each category. embedding_column = tf.feature_column.embedding_column( categorical_column=categorical_column, - dimension=dimension_of_embedding_vector) + dimension=embedding_dimensions) ``` @{$programmers_guide/embedding$Embeddings} is a significant topic within machine diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py index 03e60972aa..86f5204ec3 100644 --- a/tensorflow/examples/learn/iris.py +++ b/tensorflow/examples/learn/iris.py @@ -21,7 +21,8 @@ from __future__ import division from __future__ import print_function import os -import urllib + +from six.moves.urllib.request import urlretrieve import tensorflow as tf @@ -38,9 +39,7 @@ FEATURE_KEYS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] def maybe_download_iris_data(file_name, download_url): """Downloads the file and returns the number of data.""" if not os.path.exists(file_name): - raw = urllib.urlopen(download_url).read() - with open(file_name, 'w') as f: - f.write(raw) + urlretrieve(download_url, file_name) # The first line is a comma-separated string. The first one is the number of # total data in the file. diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc index debd95fc62..9b171f66ec 100644 --- a/tensorflow/java/src/gen/cc/op_generator.cc +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -376,9 +376,6 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, } } // op annotations - op_class.add_annotation( - Annotation::Create("Generated", "javax.annotation") - .attributes("value = \"TensorFlow Java Op Generator\"")); if (endpoint.deprecated()) { op_class.add_annotation(Annotation::Create("Deprecated")); string explanation; @@ -415,8 +412,12 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, SourceFileWriter writer(op_file.get()); std::list dependencies; CollectOpDependencies(op, mode, &dependencies); - writer.Write(kLicense).EndLine().BeginType(op_class, PUBLIC | FINAL, - &dependencies, &op_javadoc); + writer.Write(kLicense) + .EndLine() + .Write("// This class has been generated, DO NOT EDIT!") + .EndLine() + .EndLine() + .BeginType(op_class, PUBLIC | FINAL, &dependencies, &op_javadoc); if (!op.optional_attributes().empty()) { RenderOptionsClass(op, op_class, &writer); } diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc index 181fd4c5e3..941ab2699c 100644 --- a/tensorflow/java/src/gen/cc/op_specs.cc +++ b/tensorflow/java/src/gen/cc/op_specs.cc @@ -96,6 +96,7 @@ Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) { *iterable_out = true; visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int())); } + Type type = Type::Wildcard(); if (arg_def.type() != DataType::DT_INVALID) { // resolve type from DataType diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index b2e6c60021..bd97b181ff 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -196,11 +196,11 @@ def implicit_val_and_grad(f): # TODO(cais): Remove calls to tf.constant() once the gradients functions # accept lists and np.ndarrays. 
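The `grad_fn` change that follows forwards keyword arguments through the eager gradient wrapper. A usage sketch of what that enables, under the TF 1.x `tf.contrib.eager` API (the variable, loss function, and values here are illustrative, not part of the patch):

```
import tensorflow as tf
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()

w = tfe.Variable(3.0)

def loss_fn(x, scale=1.0):
    # `scale` is a keyword argument: before this change, grad_fn only
    # forwarded *args, so keyword arguments were silently unsupported.
    return scale * (w * x - 1.0) ** 2

val_and_grad = tfe.implicit_value_and_gradients(loss_fn)
value, grads_and_vars = val_and_grad(tf.constant(2.0), scale=0.5)
print(value.numpy())  # 0.5 * (3*2 - 1)^2 = 12.5
print([(g.numpy(), v.name) for g, v in grads_and_vars])
```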
- def grad_fn(*args): + def grad_fn(*args, **kwds): """Computes the gradient of the wrapped function.""" this_tape = tape.push_new_tape() try: - end_node = f(*args) + end_node = f(*args, **kwds) if end_node is None: raise ValueError("Cannot differentiate a function that returns None; " "did you forget to return a value from {}?".format( diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD index 9cd17e0407..20522098b0 100644 --- a/tensorflow/python/estimator/BUILD +++ b/tensorflow/python/estimator/BUILD @@ -978,7 +978,10 @@ py_test( size = "large", srcs = ["keras_test.py"], srcs_version = "PY2AND3", - tags = ["notsan"], + tags = [ + "no_windows", + "notsan", + ], deps = [ ":keras", "//tensorflow/core:protos_all_py", diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py index 7cdf840c97..b18212cfcd 100644 --- a/tensorflow/python/estimator/exporter.py +++ b/tensorflow/python/estimator/exporter.py @@ -156,7 +156,7 @@ def _loss_smaller(best_eval_result, current_eval_result): return best_eval_result[default_key] > current_eval_result[default_key] -def _verify_compre_fn_args(compare_fn): +def _verify_compare_fn_args(compare_fn): """Verifies compare_fn arguments.""" args = set(util.fn_args(compare_fn)) if 'best_eval_result' not in args: @@ -265,7 +265,7 @@ class BestExporter(Exporter): self._compare_fn = compare_fn if self._compare_fn is None: raise ValueError('`compare_fn` must not be None.') - _verify_compre_fn_args(self._compare_fn) + _verify_compare_fn_args(self._compare_fn) self._saved_model_exporter = _SavedModelExporter( name, serving_input_receiver_fn, assets_extra, as_text) diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py index 035c7c148c..a6cefdece2 100644 --- a/tensorflow/python/estimator/inputs/numpy_io.py +++ b/tensorflow/python/estimator/inputs/numpy_io.py @@ -136,11 +136,13 @@ def numpy_input_fn(x, values in `x` have same shape). ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict. ValueError: if x or y is an empty dict. - TypeError: `x` is not a dict or array, or if `shuffle` is not bool. + TypeError: `x` is not a dict or array. + ValueError: if 'shuffle' is not provided or a bool. """ if not isinstance(shuffle, bool): - raise TypeError('shuffle must be explicitly set as boolean; ' - 'got {}'.format(shuffle)) + raise ValueError('shuffle must be provided and explicitly set as boolean ' + '(it is recommended to set it as True for training); ' + 'got {}'.format(shuffle)) def input_fn(): """Numpy input function.""" diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py index 92d057e25d..81b201cc5c 100644 --- a/tensorflow/python/estimator/inputs/numpy_io_test.py +++ b/tensorflow/python/estimator/inputs/numpy_io_test.py @@ -286,8 +286,9 @@ class NumpyIoTest(test.TestCase): x = np.arange(32, 36) y = np.arange(4) with self.test_session(): - with self.assertRaisesRegexp(TypeError, - 'shuffle must be explicitly set as boolean'): + with self.assertRaisesRegexp(ValueError, + 'shuffle must be provided and explicitly ' + 'set as boolean'): # Default shuffle is None. 
numpy_io.numpy_input_fn(x, y) diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py index 938e244fb3..57f8e5fd6a 100644 --- a/tensorflow/python/estimator/inputs/pandas_io.py +++ b/tensorflow/python/estimator/inputs/pandas_io.py @@ -68,15 +68,16 @@ def pandas_input_fn(x, Raises: ValueError: if `x` already contains a column with the same name as `y`, or if the indexes of `x` and `y` don't match. - TypeError: `shuffle` is not bool. + ValueError: if 'shuffle' is not provided or a bool. """ if not HAS_PANDAS: raise TypeError( 'pandas_input_fn should not be called without pandas installed') if not isinstance(shuffle, bool): - raise TypeError('shuffle must be explicitly set as boolean; ' - 'got {}'.format(shuffle)) + raise ValueError('shuffle must be provided and explicitly set as boolean ' + '(it is recommended to set it as True for training); ' + 'got {}'.format(shuffle)) x = x.copy() if y is not None: diff --git a/tensorflow/python/estimator/inputs/pandas_io_test.py b/tensorflow/python/estimator/inputs/pandas_io_test.py index e5912a3b28..dcecf6dd61 100644 --- a/tensorflow/python/estimator/inputs/pandas_io_test.py +++ b/tensorflow/python/estimator/inputs/pandas_io_test.py @@ -70,8 +70,9 @@ class PandasIoTest(test.TestCase): return x, _ = self.makeTestDataFrame() y_noindex = pd.Series(np.arange(-32, -28)) - with self.assertRaisesRegexp(TypeError, - 'shuffle must be explicitly set as boolean'): + with self.assertRaisesRegexp(ValueError, + 'shuffle must be provided and explicitly ' + 'set as boolean'): # Default shuffle is None pandas_io.pandas_input_fn(x, y_noindex) diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py index 8e2ec83020..51a61adb21 100644 --- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py +++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py @@ -250,7 +250,7 @@ class _PandasFeedFn(object): num_epochs=None): if len(placeholders) != len(dataframe.columns) + 1: raise ValueError("Expected {} placeholders; got {}.".format( - len(dataframe.columns), len(placeholders))) + len(dataframe.columns) + 1, len(placeholders))) self._index_placeholder = placeholders[0] self._col_placeholders = placeholders[1:] self._dataframe = dataframe diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py index c80af08fba..2f439f765e 100644 --- a/tensorflow/python/estimator/keras.py +++ b/tensorflow/python/estimator/keras.py @@ -70,7 +70,7 @@ def _convert_tensor(x): return x -def _any_variable_initalized(): +def _any_variable_initialized(): """Check if any variable has been initialized in the Keras model. Returns: @@ -511,7 +511,7 @@ def model_to_estimator(keras_model=None, keras_model_fn, model_dir=model_dir, config=config) # Check if we need to call get_weights: - if _any_variable_initalized(): + if _any_variable_initialized(): keras_weights = keras_model.get_weights() # Warn if config passed to estimator tries to update GPUOptions. 
If a # session has already been created, the GPUOptions passed to the first diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py index 6688a84130..5e094ae92b 100644 --- a/tensorflow/python/estimator/keras_test.py +++ b/tensorflow/python/estimator/keras_test.py @@ -31,10 +31,10 @@ from tensorflow.python.estimator import run_config as run_config_lib from tensorflow.python.estimator.inputs import numpy_io from tensorflow.python.framework import ops from tensorflow.python.framework import test_util -from tensorflow.python.keras import backend as K from tensorflow.python.keras import testing_utils from tensorflow.python.keras.applications import mobilenet from tensorflow.python.keras.optimizers import SGD +from tensorflow.python.ops.parsing_ops import gen_parsing_ops from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache @@ -146,13 +146,13 @@ def randomize_io_type(array, name): def multi_inputs_multi_outputs_model(): a = keras.layers.Input(shape=(16,), name='input_a') b = keras.layers.Input(shape=(16,), name='input_b') - m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m') + m = keras.layers.Input(shape=(8,), dtype='string', name='input_m') dense = keras.layers.Dense(8, name='dense_1') a_2 = dense(a) - # Apply a mask - s_2 = keras.layers.Lambda(lambda k: - K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2]) + # Read m + m_2 = keras.layers.Lambda(gen_parsing_ops.string_to_number)(m) + s_2 = keras.layers.Lambda(lambda k: k[0] * k[1])([m_2, a_2]) b_2 = dense(b) merged = keras.layers.concatenate([s_2, b_2], name='merge') c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged) @@ -372,13 +372,13 @@ class TestKerasEstimator(test_util.TensorFlowTestCase): def train_input_fn(): input_dict = {'input_a': a_train, 'input_b': b_train, - 'input_m': input_m_train > 0} + 'input_m': input_m_train.astype(np.str)} output_dict = {'dense_2': c_train, 'dense_3': d_train} return input_dict, output_dict def eval_input_fn(): input_dict = {'input_a': a_test, 'input_b': b_test, - 'input_m': input_m_test > 0} + 'input_m': input_m_test.astype(np.str)} output_dict = {'dense_2': c_test, 'dense_3': d_test} return input_dict, output_dict diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index e487f583be..f608dea430 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -93,6 +93,8 @@ def selu(x): - To be used together with the initialization "lecun_normal". - To be used together with the dropout variant "AlphaDropout". 
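As context for this `selu` docstring hunk, a standalone NumPy sketch of the activation, using the `alpha`/`scale` constants that appear just below in the hunk (illustrative only, not the tf.keras implementation):

```
import numpy as np

# Standalone sketch of selu with the alpha/scale constants from the
# surrounding context lines.
ALPHA = 1.6732632423543772848170429916717
SCALE = 1.0507009873554804934193349852946

def selu(x):
    x = np.asarray(x, dtype=np.float64)
    return SCALE * np.where(x > 0.0, x, ALPHA * (np.exp(x) - 1.0))

# Negative inputs saturate toward -SCALE * ALPHA (~ -1.7581).
print(selu([-5.0, -1.0, 0.0, 1.0]))
```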
+ References: + - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) """ alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 70b6a8431a..9f91368e5b 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -724,15 +724,6 @@ class TensorBoard(Callback): for weight in layer.weights: mapped_weight_name = weight.name.replace(':', '_') tf_summary.histogram(mapped_weight_name, weight) - if self.write_grads: - grads = model.optimizer.get_gradients(model.total_loss, weight) - - def is_indexed_slices(grad): - return type(grad).__name__ == 'IndexedSlices' - - grads = [grad.values if is_indexed_slices(grad) else grad - for grad in grads] - tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) if self.write_images: w_img = array_ops.squeeze(weight) shape = K.int_shape(w_img) @@ -759,6 +750,18 @@ class TensorBoard(Callback): assert len(shape) == 4 and shape[-1] in [1, 3, 4] tf_summary.image(mapped_weight_name, w_img) + if self.write_grads: + for weight in layer.trainable_weights: + mapped_weight_name = weight.name.replace(':', '_') + grads = model.optimizer.get_gradients(model.total_loss, weight) + + def is_indexed_slices(grad): + return type(grad).__name__ == 'IndexedSlices' + + grads = [grad.values if is_indexed_slices(grad) else grad + for grad in grads] + tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) + if hasattr(layer, 'output'): tf_summary.histogram('{}_out'.format(layer.name), layer.output) self.merged = tf_summary.merge_all() diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index b355f4a269..5062a26580 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -653,6 +653,8 @@ class KerasCallbacksTest(test.TestCase): model.add( keras.layers.Dense( NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu')) + # non_trainable_weights: moving_variance, moving_mean + model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax')) model.compile( loss='categorical_crossentropy', diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index a4cd017d60..1c9135982e 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -123,7 +123,7 @@ class Network(base_layer.Layer): # Entries are unique. Includes input and output layers. self._layers = [] - # Used in symbolic mode only, only in conjonction with graph-networks + # Used in symbolic mode only, only in conjunction with graph-networks self._outbound_nodes = [] self._inbound_nodes = [] diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py index 6a94986b9c..7e82db028b 100644 --- a/tensorflow/python/keras/engine/saving_test.py +++ b/tensorflow/python/keras/engine/saving_test.py @@ -482,7 +482,7 @@ class TestWholeModelSaving(test.TestCase): with h5py.File(fname, 'r') as h5file: num_names_arrays = len([attr for attr in h5file['model_weights'].attrs if attr.startswith('layer_names')]) - # The chunking of layer names array should have happend. + # The chunking of layer names array should have happened. 
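The `TensorBoard(write_grads=True)` hunk above moves the gradient histograms under `layer.trainable_weights`, which is what the new `BatchNormalization` test exercises. A minimal graph-mode TF 1.x sketch of why that matters (the architecture mirrors the test but the sizes are illustrative):

```
import tensorflow as tf
from tensorflow import keras

model = keras.models.Sequential([
    keras.layers.Dense(8, input_dim=4, activation='relu'),
    keras.layers.BatchNormalization(),  # adds non-trainable moving stats
    keras.layers.Dense(3, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='sgd')

for layer in model.layers:
    # moving_mean/moving_variance are in layer.weights but not in
    # layer.trainable_weights; they have no gradient w.r.t. the loss,
    # so the histogram loop must skip them.
    for weight in layer.trainable_weights:
        grads = model.optimizer.get_gradients(model.total_loss, weight)
```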
self.assertGreater(num_names_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) @@ -527,7 +527,7 @@ class TestWholeModelSaving(test.TestCase): num_weight_arrays = len( [attr for attr in h5file['model_weights']['nested_model'].attrs if attr.startswith('weight_names')]) - # The chunking of layer names array should have happend. + # The chunking of layer names array should have happened. self.assertGreater(num_weight_arrays, 0) out2 = model.predict(x) self.assertAllClose(out, out2, atol=1e-05) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 89c1f1a40f..fce6cbdb7a 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util @@ -409,11 +410,13 @@ class Model(Network): else: if sample_weight_mode == 'temporal': sample_weights.append(array_ops.placeholder_with_default( - [[1.]], shape=[None, None], name=name + '_sample_weights')) + constant_op.constant([[1.]], dtype=K.floatx()), + shape=[None, None], name=name + '_sample_weights')) sample_weight_modes.append('temporal') else: sample_weights.append(array_ops.placeholder_with_default( - [1.], shape=[None], name=name + '_sample_weights')) + constant_op.constant([1.], dtype=K.floatx()), + shape=[None], name=name + '_sample_weights')) sample_weight_modes.append(None) self.sample_weight_modes = sample_weight_modes self._feed_sample_weight_modes = [] diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index 2ecbff3a1c..e8838cd3bc 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -732,7 +732,7 @@ def slice_arrays(arrays, indices, contiguous=True): """Slices batches out of provided arrays (workaround for eager tensors). Unfortunately eager tensors don't have the same slicing behavior as - Numpy arrays (they folow the same slicing behavior as symbolic TF tensors), + Numpy arrays (they follow the same slicing behavior as symbolic TF tensors), hence we cannot use `generic_utils.slice_arrays` directly and we have to implement this workaround based on `concat`. This has a performance cost. diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py index a54d6da839..c519e194bd 100644 --- a/tensorflow/python/keras/initializers_test.py +++ b/tensorflow/python/keras/initializers_test.py @@ -71,7 +71,7 @@ class KerasInitializersTest(test.TestCase): stddev=1, seed=126), tensor_shape, - target_mean=0., target_std=None, target_max=2) + target_mean=0., target_max=2, target_min=-2) def test_constant(self): tensor_shape = (5, 6, 4) @@ -83,49 +83,49 @@ class KerasInitializersTest(test.TestCase): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(3. / fan_in) + std = np.sqrt(1. 
/ fan_in) self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_glorot_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(6. / (fan_in + fan_out)) + std = np.sqrt(2. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_he_uniform(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(6. / fan_in) + std = np.sqrt(2. / fan_in) self._runner(keras.initializers.he_uniform(seed=123), tensor_shape, - target_mean=0., target_max=scale, target_min=-scale) + target_mean=0., target_std=std) def test_lecun_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(1. / fan_in) + std = np.sqrt(1. / fan_in) self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_glorot_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, fan_out = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(2. / (fan_in + fan_out)) + std = np.sqrt(2. / (fan_in + fan_out)) self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_he_normal(self): tensor_shape = (5, 6, 4, 2) with self.test_session(): fan_in, _ = init_ops._compute_fans(tensor_shape) - scale = np.sqrt(2. / fan_in) + std = np.sqrt(2. 
/ fan_in) self._runner(keras.initializers.he_normal(seed=123), tensor_shape, - target_mean=0., target_std=None, target_max=2 * scale) + target_mean=0., target_std=std) def test_orthogonal(self): tensor_shape = (20, 20) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index 5061825d38..f60064ed63 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -19,7 +19,9 @@ from __future__ import division from __future__ import print_function import copy +import sys import types as python_types +import warnings import numpy as np @@ -714,6 +716,7 @@ class Lambda(Layer): return self.mask def get_config(self): + module = self.function.__module__ if isinstance(self.function, python_types.LambdaType): function = generic_utils.func_dump(self.function) function_type = 'lambda' @@ -721,21 +724,26 @@ class Lambda(Layer): function = self.function.__name__ function_type = 'function' + output_shape_module = None if isinstance(self._output_shape, python_types.LambdaType): output_shape = generic_utils.func_dump(self._output_shape) output_shape_type = 'lambda' + output_shape_module = self._output_shape.__module__ elif callable(self._output_shape): output_shape = self._output_shape.__name__ output_shape_type = 'function' + output_shape_module = self._output_shape.__module__ else: output_shape = self._output_shape output_shape_type = 'raw' config = { 'function': function, + 'module': module, 'function_type': function_type, 'output_shape': output_shape, 'output_shape_type': output_shape_type, + 'output_shape_module': output_shape_module, 'arguments': self.arguments } base_config = super(Lambda, self).get_config() @@ -745,8 +753,16 @@ class Lambda(Layer): def from_config(cls, config, custom_objects=None): config = config.copy() globs = globals() + module = config.pop('module', None) + if module in sys.modules: + globs.update(sys.modules[module].__dict__) + elif module is not None: + # Note: we don't know the name of the function if it's a lambda. + warnings.warn('{} is not loaded, but a Lambda layer uses it. ' + 'It may cause errors.'.format(module) + , UserWarning) if custom_objects: - globs = dict(list(globs.items()) + list(custom_objects.items())) + globs.update(custom_objects) function_type = config.pop('function_type') if function_type == 'function': # Simple lookup in custom objects @@ -760,6 +776,14 @@ class Lambda(Layer): else: raise TypeError('Unknown function type:', function_type) + output_shape_module = config.pop('output_shape_module', None) + if output_shape_module in sys.modules: + globs.update(sys.modules[output_shape_module].__dict__) + elif output_shape_module is not None: + # Note: we don't know the name of the function if it's a lambda. + warnings.warn('{} is not loaded, but a Lambda layer uses it. 
' + 'It may cause errors.'.format(output_shape_module) + , UserWarning) output_shape_type = config.pop('output_shape_type') if output_shape_type == 'function': # Simple lookup in custom objects diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py index c616d8f24f..e6e45902a8 100644 --- a/tensorflow/python/keras/models_test.py +++ b/tensorflow/python/keras/models_test.py @@ -144,5 +144,19 @@ class CheckpointingTests(test.TestCase): model.load_weights(save_prefix) self.assertEqual(12., self.evaluate(beta1_power)) +class TestModelBackend(test.TestCase): + + def test_model_backend_float64_use_cases(self): + # Test case for GitHub issue 19318 + floatx = keras.backend.floatx() + keras.backend.set_floatx('float64') + + x = keras.Input((5,)) + y = keras.layers.Dense(1)(x) + model = keras.models.Model(x, y) + model.compile('rmsprop', 'mse') + + keras.backend.set_floatx(floatx) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py index 9d54add264..94ed8ebd31 100644 --- a/tensorflow/python/kernel_tests/as_string_op_test.py +++ b/tensorflow/python/kernel_tests/as_string_op_test.py @@ -130,6 +130,16 @@ class AsStringOpTest(test.TestCase): result = output.eval(feed_dict={input_: int_inputs_}) self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) + def testHalfInt(self): + s = lambda strs: [x.decode("ascii") for x in strs] + + with self.test_session(): + input_ = array_ops.placeholder(dtypes.int16) + int_inputs_ = [np.iinfo(np.int16).min, np.iinfo(np.int16).max] + output = string_ops.as_string(input_) + result = output.eval(feed_dict={input_: int_inputs_}) + self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_]) + def testBool(self): bool_inputs_ = [False, True] s = lambda strs: [x.decode("ascii") for x in strs] diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py index 08b03f8518..16fdedac41 100644 --- a/tensorflow/python/kernel_tests/betainc_op_test.py +++ b/tensorflow/python/kernel_tests/betainc_op_test.py @@ -172,7 +172,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [gx_s.shape], tf_gout_t, gx_s.shape) - print("betainc gradient err = %g " % err) + tf_logging.info("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) # Test broadcast gradient @@ -181,7 +181,7 @@ class BetaincTest(test.TestCase): tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s) err = gradient_checker.compute_gradient_error( [tf_gx_s], [()], tf_gout_t, ga_s.shape) - print("betainc gradient err = %g " % err) + tf_logging.info("betainc gradient err = %g " % err) self.assertLess(err, err_tolerance) diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py index e08123b041..fb52d10475 100644 --- a/tensorflow/python/kernel_tests/clip_ops_test.py +++ b/tensorflow/python/kernel_tests/clip_ops_test.py @@ -18,9 +18,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np + from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gradient_checker from 
tensorflow.python.platform import test @@ -414,6 +417,16 @@ class ClipTest(test.TestCase): self.assertAllClose(np_ans, tf_ans) + def testClipByValueEmptyTensor(self): + # Test case for GitHub issue 19337 + zero = array_ops.placeholder(dtype=dtypes.float32, shape=None) + x = clip_ops.clip_by_value(zero, zero, zero) + y = clip_ops.clip_by_value(zero, 1.0, 1.0) + z = clip_ops.clip_by_value(zero, zero, 1.0) + w = clip_ops.clip_by_value(zero, 1.0, zero) + with self.test_session(use_gpu=True) as sess: + sess.run([x, y, z, w], feed_dict={zero: np.zeros((7, 0))}) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 8699fd5b25..80ba7dafc9 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -312,8 +312,8 @@ class Conv2DTest(test.TestCase): expected_values = self.evaluate(expected_results) computed_values = self.evaluate(computed_results) for e_value, c_value in zip(expected_values, computed_values): - print("expected = ", e_value) - print("actual = ", c_value) + tf_logging.info("expected = ", e_value) + tf_logging.info("actual = ", c_value) self.assertAllClose( e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4) @@ -337,8 +337,8 @@ class Conv2DTest(test.TestCase): for i in range(len(tensors)): conv = tensors[i] value = values[i] - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -547,8 +547,8 @@ class Conv2DTest(test.TestCase): # "values" consists of two tensors for two backprops value = self.evaluate(conv) self.assertShapeEqual(value, conv) - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) self.assertArrayNear(expected, value.flatten(), err) def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes, @@ -723,8 +723,8 @@ class Conv2DTest(test.TestCase): data_format=data_format) value = self.evaluate(conv) self.assertShapeEqual(value, conv) - print("expected = ", expected) - print("actual = ", value) + tf_logging.info("expected = ", expected) + tf_logging.info("actual = ", value) self.assertArrayNear(expected, value.flatten(), 1e-5) def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes, @@ -912,8 +912,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - print("expected = ", value_2) - print("actual = ", value) + tf_logging.info("expected = ", value_2) + tf_logging.info("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) # Testing for backprops @@ -965,8 +965,8 @@ class Conv2DTest(test.TestCase): value_2 = sess.run(conv_2) self.assertShapeEqual(value, conv) self.assertShapeEqual(value_2, conv_2) - print("expected = ", value_2) - print("actual = ", value) + tf_logging.info("expected = ", value_2) + tf_logging.info("actual = ", value) self.assertArrayNear(value_2.flatten(), value.flatten(), err) def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self): @@ -1178,7 +1178,7 @@ class Conv2DTest(test.TestCase): # since fp16 numerical gradients are too imprecise. 
err = np.fabs(jacob_t - reference_jacob_t).max() - print("conv_2d gradient error = ", err) + tf_logging.info("conv_2d gradient error = ", err) self.assertLess(err, 0.002) def testInputGradientValidPaddingStrideOne(self): @@ -1546,7 +1546,7 @@ class DepthwiseConv2DTest(test.TestCase): conv = nn_impl.depthwise_conv2d( t1, t2, strides=[1, stride, stride, 1], padding=padding) value = sess.run(conv) - print("value = ", value) + tf_logging.info("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1668,7 +1668,7 @@ class SeparableConv2DTest(test.TestCase): conv = array_ops.transpose(conv, [0, 2, 3, 1]) value = sess.run(conv) - print("value = ", value) + tf_logging.info("value = ", value) self.assertArrayNear(expected, np.ravel(value), 1e-5) self.assertShapeEqual(value, conv) @@ -1826,7 +1826,7 @@ class Conv2DBenchmark(test.Benchmark): wall_time = time.time() - start self.report_benchmark( name="conv_stack_iter_%d" % iter_index, wall_time=wall_time) - print("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) + tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time)) def GetInceptionFwdTest(input_size, filter_size, stride, padding, diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py index 91ebe8de99..58e2a8ac2a 100644 --- a/tensorflow/python/kernel_tests/gather_nd_op_test.py +++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py @@ -197,7 +197,21 @@ class GatherNdTest(test.TestCase): self.assertEqual(None, shape.ndims) self.assertEqual(None, shape[0].value) - def testBadIndices(self): + def testBadIndicesCPU(self): + with self.test_session(use_gpu=False): + params = [0, 1, 2] + indices = [[[0], [7]]] # Make this one higher rank + gather_nd = array_ops.gather_nd(params, indices) + with self.assertRaisesOpError( + r"flat indices\[1, :\] = \[7\] does not index into param " + r"\(shape: \[3\]\)"): + gather_nd.eval() + + def _disabledTestBadIndicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [0, 1, 2] indices = [[[0], [7]]] # Make this one higher rank @@ -207,7 +221,21 @@ class GatherNdTest(test.TestCase): r"\(shape: \[3\]\)"): gather_nd.eval() - def testBadIndicesWithSlices(self): + def testBadIndicesWithSlicesCPU(self): + with self.test_session(use_gpu=False): + params = [[0, 1, 2]] + indices = [[[0], [0], [1]]] # Make this one higher rank + gather_nd = array_ops.gather_nd(params, indices) + with self.assertRaisesOpError( + r"flat indices\[2, :\] = \[1\] does not index into param " + r"\(shape: \[1,3\]\)"): + gather_nd.eval() + + def _disabledTestBadIndicesWithSlicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [[0, 1, 2]] indices = [[[0], [0], [1]]] # Make this one higher rank diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py index a2fcd751df..033fa95935 100644 --- a/tensorflow/python/kernel_tests/gather_op_test.py +++ b/tensorflow/python/kernel_tests/gather_op_test.py @@ -27,7 +27,8 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.platform import test -_TEST_TYPES = 
(dtypes.float32, dtypes.complex64, dtypes.complex128) +_TEST_TYPES = (dtypes.int64, dtypes.float32, + dtypes.complex64, dtypes.complex128) class GatherTest(test.TestCase): @@ -122,6 +123,9 @@ class GatherTest(test.TestCase): gather, [tf_params, tf_indices, tf_axis], gather_grad) self.assertEqual(indices_grad, None) self.assertEqual(axis_grad, None) + if dtype.is_integer: + self.assertEqual(params_grad, None) + continue # For axis 0, we are able to create an efficient IndexedSlices for # the gradient. if axis == 0: @@ -177,7 +181,19 @@ class GatherTest(test.TestCase): gather_t = array_ops.gather(params, indices, axis=axis) self.assertEqual(None, gather_t.shape) - def testBadIndices(self): + def testBadIndicesCPU(self): + with self.test_session(use_gpu=False): + params = [[0, 1, 2], [3, 4, 5]] + with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): + array_ops.gather(params, [[7]], axis=0).eval() + with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"): + array_ops.gather(params, [[7]], axis=1).eval() + + def _disabledTestBadIndicesGPU(self): + # TODO disabled due to different behavior on GPU and CPU + # On GPU the bad indices do not raise error but fetch 0 values + if not test.is_gpu_available(): + return with self.test_session(use_gpu=True): params = [[0, 1, 2], [3, 4, 5]] with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"): diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py index a9b55854f1..795aa67248 100644 --- a/tensorflow/python/kernel_tests/init_ops_test.py +++ b/tensorflow/python/kernel_tests/init_ops_test.py @@ -362,6 +362,33 @@ class UniformUnitScalingInitializationTest(test.TestCase): dtype=dtypes.string) +class VarianceScalingInitializationTest(test.TestCase): + + def testNormalDistribution(self): + shape = [100, 100] + expect_mean = 0. + expect_var = 1. / shape[0] + init = init_ops.variance_scaling_initializer(distribution='normal') + + with self.test_session(use_gpu=True): + x = init(shape).eval() + + self.assertNear(np.mean(x), expect_mean, err=1e-2) + self.assertNear(np.var(x), expect_var, err=1e-2) + + def testUniformDistribution(self): + shape = [100, 100] + expect_mean = 0. + expect_var = 1. / shape[0] + init = init_ops.variance_scaling_initializer(distribution='uniform') + + with self.test_session(use_gpu=True): + x = init(shape).eval() + + self.assertNear(np.mean(x), expect_mean, err=1e-2) + self.assertNear(np.var(x), expect_var, err=1e-2) + + # TODO(vrv): move to sequence_ops_test? 
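The new `VarianceScalingInitializationTest` checks first and second moments rather than exact values. A NumPy-only sketch of the statistics it expects for a `[100, 100]` tensor, assuming the initializer defaults `scale=1.0` and `mode='fan_in'`:

```
import numpy as np

# For shape [100, 100], fan_in = 100, so both distributions should give
# mean ~0 and variance ~1/fan_in.
fan_in = 100
shape = (100, 100)

# distribution='normal' uses a truncated normal with
# stddev = sqrt(scale / fan_in); a plain normal is close enough here.
normal = np.random.normal(0.0, np.sqrt(1.0 / fan_in), size=shape)

# distribution='uniform' samples [-limit, limit] with
# limit = sqrt(3 * scale / fan_in), which has the same variance.
limit = np.sqrt(3.0 / fan_in)
uniform = np.random.uniform(-limit, limit, size=shape)

for name, x in [('normal', normal), ('uniform', uniform)]:
    print(name, abs(x.mean()) < 1e-2, abs(x.var() - 1.0 / fan_in) < 1e-2)
```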
class RangeTest(test.TestCase): diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index a0c372db7d..e95c729715 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -947,7 +947,7 @@ class PoolingTest(test.TestCase): output_sizes, x_init_value=x_init_value, delta=1e-2) - print("%s gradient error = " % func_name, err) + tf_logging.info("%s gradient error = " % func_name, err) self.assertLess(err, err_tolerance) def _ConstructAndTestSecondGradient(self, @@ -1024,7 +1024,7 @@ class PoolingTest(test.TestCase): input_sizes, x_init_value=x_init_value, delta=1e-2) - print("%s second-order gradient error = " % func_name, err) + tf_logging.info("%s second-order gradient error = " % func_name, err) self.assertLess(err, err_tolerance) def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu): diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py index 677253946e..253e43920b 100644 --- a/tensorflow/python/kernel_tests/py_func_test.py +++ b/tensorflow/python/kernel_tests/py_func_test.py @@ -19,6 +19,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import gc import re import numpy as np @@ -434,13 +435,29 @@ class PyFuncTest(test.TestCase): # ----- Tests shared by py_func and eager_py_func ----- def testCleanup(self): - for _ in xrange(1000): - g = ops.Graph() - with g.as_default(): - c = constant_op.constant([1.], dtypes.float32) - _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32]) - _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32]) - self.assertLess(script_ops._py_funcs.size(), 100) + # Delete everything created by previous tests to avoid side effects. + ops.reset_default_graph() + gc.collect() + initial_size = script_ops._py_funcs.size() + # Encapsulate the graph generation, so locals can be deleted. + def make_graphs(): + for _ in xrange(1000): + g = ops.Graph() + with g.as_default(): + c = constant_op.constant([1.], dtypes.float32) + _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32]) + _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32]) + # These ops have a reference to 'c' which has a reference to the graph. + # Checks if the functions are being deleted though the graph is referenced from them. + # (see #18292) + _ = script_ops.py_func(lambda x: x + c.shape[0], [c], [dtypes.float32]) + _ = script_ops.eager_py_func(lambda x: x + c.shape[0], [c], [dtypes.float32]) + + # Call garbage collector to enforce deletion. 
+ make_graphs() + ops.reset_default_graph() + gc.collect() + self.assertEqual(initial_size, script_ops._py_funcs.size()) # ----- Tests for eager_py_func ----- @test_util.run_in_graph_and_eager_modes() diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py index 79fe927b8a..faa4b49a8d 100644 --- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py @@ -144,7 +144,9 @@ class StatefulScatterNdTest(test.TestCase): self.assertAllClose(new, ref_var.eval()) def _VariableRankTests(self, np_scatter, tf_scatter): - for vtype in (np.float32, np.float64, np.complex64, np.complex128): + for vtype in (np.int32, + np.float32, np.float64, + np.complex64, np.complex128): for itype in (np.int32, np.int64): self._VariableRankTest(np_scatter, tf_scatter, vtype, itype) @@ -221,7 +223,7 @@ class StatefulScatterNdTest(test.TestCase): # self._VariableRankTests(_NumpyDiv, state_ops.scatter_nd_div) def _ScatterRepeatIndicesTest(self, np_scatter, tf_scatter): - for vtype in (np.float32, np.float64): + for vtype in (np.int32, np.float32, np.float64): for itype in (np.int32, np.int64): self._VariableRankTest( np_scatter, tf_scatter, vtype, itype, repeat_indices=True) diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py index c70a4ffce7..1a0fa744ae 100644 --- a/tensorflow/python/kernel_tests/scatter_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_ops_test.py @@ -159,7 +159,13 @@ class ScatterTest(test.TestCase): # Clips small values to avoid division by zero. def clip_small_values(x): - return 1e-4 * np.sign(x) if np.abs(x) < 1e-4 else x + threshold = 1e-4 + sign = np.sign(x) + + if isinstance(x, np.int32): + threshold = 1 + sign = np.random.choice([-1, 1]) + return threshold * sign if np.abs(x) < threshold else x updates = np.vectorize(clip_small_values)(updates) old = _AsType(np.random.randn(*((first_dim,) + extra_shape)), vtype) @@ -181,7 +187,11 @@ class ScatterTest(test.TestCase): tf_scatter, repeat_indices=False, updates_are_scalar=False): - for vtype in (np.float32, np.float64): + vtypes = [np.float32, np.float64] + if tf_scatter != state_ops.scatter_div: + vtypes.append(np.int32) + + for vtype in vtypes: for itype in (np.int32, np.int64): self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices, updates_are_scalar) diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py index 794be096b7..a82855dfeb 100644 --- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py @@ -264,7 +264,9 @@ class UnsortedSegmentTest(SegmentReductionHelper): # A subset of ops has been enabled for complex numbers self.complex_ops_list = [(np.add, None, - math_ops.unsorted_segment_sum, lambda t: 0)] + math_ops.unsorted_segment_sum, lambda t: 0), + (np.ndarray.__mul__, None, + math_ops.unsorted_segment_prod, lambda t: 1)] self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64] self.all_dtypes = (self.differentiable_dtypes + diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py index a5bd1b6ee0..e20daccb28 100644 --- a/tensorflow/python/kernel_tests/string_split_op_test.py +++ b/tensorflow/python/kernel_tests/string_split_op_test.py @@ -146,5 +146,101 @@ class 
StringSplitOpTest(test.TestCase): self.assertAllEqual(shape, [3, 1]) +class StringSplitV2OpTest(test.TestCase): + + def testSplitV2(self): + strings = ["pigs on the wing", "animals"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]]) + self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"]) + self.assertAllEqual(shape, [2, 4]) + + def testSplitV2MultiCharSeparator(self): + # Match Python behavior: + # >>> '1<>2<>3'.split('<>') + # ['1', '2', '3'] + # >>> "<><>4<>5<><>6<>".split("<>") + # ['', '', '4', '5', '', '6', ''] + strings = ["1<>2<>3", "<><>4<>5<><>6<>"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep="<>") + indices, values, shape = sess.run(tokens) + self.assertAllEqual( + indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]]) + self.assertAllEqual(values, [b"1", b"2", b"3", + b"", b"", b"4", b"5", b"", b"6", b""]) + self.assertAllEqual(shape, [2, 7]) + + def testSplitV2SimpleSeparator(self): + # Match Python behavior: + # >>> '1,2,3'.split(',') + # ['1', '2', '3'] + # >>> '1,2,,3,'.split(',') + # ['1', '2', '', '3', ''] + strings = ["1,2,3", "4,5,,6,"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep=',') + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]]) + self.assertAllEqual(values, [b"1", b"2", b"3", + b"4", b"5", b"", b"6", b""]) + self.assertAllEqual(shape, [2, 5]) + + def testSplitV2EmptySeparator(self): + # Match Python behavior: + # >>> '1 2 3'.split() + # ['1', '2', '3'] + #>>> ' 1 2 3 '.split() + #['1', '2', '3'] + strings = ["1 2 3", " 4 5 6 "] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], + [1, 0], [1, 1], [1, 2]]) + self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"]) + self.assertAllEqual(shape, [2, 3]) + + def testSplitV2SimpleSeparatorMaxSplit(self): + # Match Python behavior: + # >>> '1,2,3'.split(',', maxsplit=1) + # ['1', '2,3'] + # >>> '4,5,,6,'.split(',', maxsplit=1) + # ['4', '5,,6,'] + strings = ["1,2,3", "4,5,,6,"] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], + [1, 0], [1, 1]]) + self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"]) + self.assertAllEqual(shape, [2, 2]) + + def testSplitV2EmptySeparatorMaxSplit(self): + # Match Python behavior: + # '1 2 3'.split(maxsplit=1) + # ['1', '2 3'] + # >>> " 4 5 6 ".split(maxsplit=1) + # ['4', '5 6 '] + strings = ["1 2 3", " 4 5 6 "] + + with self.test_session() as sess: + tokens = string_ops.string_split_v2(strings, maxsplit=1) + indices, values, shape = sess.run(tokens) + self.assertAllEqual(indices, [[0, 0], [0, 1], + [1, 0], [1, 1]]) + self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5 6 "]) + self.assertAllEqual(shape, [2, 2]) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 8129334703..fae63b1132 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -2619,6 +2619,10 @@ reverse.__doc__ = 
gen_array_ops.reverse_v2.__doc__ # pylint: disable=redefined-builtin @tf_export("reverse_sequence") +@deprecation.deprecated_args( + None, "seq_dim is deprecated, use seq_axis instead", "seq_dim") +@deprecation.deprecated_args( + None, "batch_dim is deprecated, use batch_axis instead", "batch_dim") def reverse_sequence(input, seq_lengths, seq_axis=None, diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py index 12afcd0b51..94c8d79335 100644 --- a/tensorflow/python/ops/gradient_checker.py +++ b/tensorflow/python/ops/gradient_checker.py @@ -283,10 +283,10 @@ def compute_gradient(x, numbers. For example, if `x` is complex with shape `[m]` and `y` is complex with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with - J[:m, :n] = d(Re y)/d(Re x) - J[:m, n:] = d(Im y)/d(Re x) - J[m:, :n] = d(Re y)/d(Im x) - J[m:, n:] = d(Im y)/d(Im x) + J[::2, ::2] = d(Re y)/d(Re x) + J[::2, 1::2] = d(Im y)/d(Re x) + J[1::2, ::2] = d(Re y)/d(Im x) + J[1::2, 1::2] = d(Im y)/d(Im x) Args: x: a tensor or list of tensors diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index bdcf420980..f27d9224c1 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_image_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops @@ -258,14 +259,14 @@ def random_flip_up_down(image, seed=None): dimension, which is `height`. Otherwise output the image as-is. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A 3-D tensor of the same type and shape as `image`. - + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. """ @@ -280,13 +281,14 @@ def random_flip_left_right(image, seed=None): second dimension, which is `width`. Otherwise output the image as-is. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. seed: A Python integer. Used to create a random seed. See @{tf.set_random_seed} for behavior. Returns: - A 3-D tensor of the same type and shape as `image`. + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. @@ -297,7 +299,8 @@ def random_flip_left_right(image, seed=None): def _random_flip(image, flip_index, seed, scope_name): """Randomly (50% chance) flip an image along axis `flip_index`. Args: - image: A 3-D tensor of shape `[height, width, channels].` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. flip_index: The dimension along which to flip the image. Vertical: 0, Horizontal: 1 seed: A Python integer. Used to create a random seed. See @@ -306,22 +309,37 @@ def _random_flip(image, flip_index, seed, scope_name): scope_name: Name of the scope in which the ops are added. 
Returns: - A 3-D tensor of the same type and shape as `image`. + A tensor of the same type and shape as `image`. Raises: ValueError: if the shape of `image` not supported. """ with ops.name_scope(None, scope_name, [image]) as scope: image = ops.convert_to_tensor(image, name='image') - image = _Assert3DImage(image) - uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) - mirror_cond = math_ops.less(uniform_random, .5) - result = control_flow_ops.cond( - mirror_cond, - lambda: array_ops.reverse(image, [flip_index]), - lambda: image, - name=scope) - return fix_image_flip_shape(image, result) + image = _AssertAtLeast3DImage(image) + shape = image.get_shape() + if shape.ndims == 3 or shape.ndims is None: + uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) + mirror_cond = math_ops.less(uniform_random, .5) + result = control_flow_ops.cond( + mirror_cond, + lambda: array_ops.reverse(image, [flip_index]), + lambda: image, + name=scope + ) + return fix_image_flip_shape(image, result) + elif shape.ndims == 4: + uniform_random = random_ops.random_uniform( + [array_ops.shape(image)[0]], 0, 1.0, seed=seed + ) + mirror_cond = math_ops.less(uniform_random, .5) + return array_ops.where( + mirror_cond, + image, + functional_ops.map_fn(lambda x: array_ops.reverse(x, [flip_index]), image, dtype=image.dtype) + ) + else: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') @tf_export('image.flip_left_right') @@ -1634,13 +1652,13 @@ def is_jpeg(contents, name=None): @tf_export('image.decode_image') -def decode_image(contents, channels=None, name=None): +def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`, and `decode_png`. Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the - appropriate operation to convert the input bytes `string` into a `Tensor` of - type `uint8`. + appropriate operation to convert the input bytes `string` into a `Tensor` + of type `dtype`. Note: `decode_gif` returns a 4-D array `[num_frames, height, width, 3]`, as opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D @@ -1652,10 +1670,11 @@ def decode_image(contents, channels=None, name=None): contents: 0-D `string`. The encoded image bytes. channels: An optional `int`. Defaults to `0`. Number of color channels for the decoded image. + dtype: The desired DType of the returned `Tensor`. name: A name for the operation (optional) Returns: - `Tensor` with type `uint8` with shape `[height, width, num_channels]` for + `Tensor` with type `dtype` and shape `[height, width, num_channels]` for BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for GIF images. 
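With the `dtype` argument added to `decode_image` above, the decoded data is routed through `convert_image_dtype`, so float outputs come back rescaled. A usage sketch ("example.png" is a hypothetical path; any PNG/JPEG/GIF/BMP file works):

```
import tensorflow as tf

contents = tf.read_file('example.png')
image_u8 = tf.image.decode_image(contents)                     # uint8, as before
image_f32 = tf.image.decode_image(contents, dtype=tf.float32)  # rescaled to [0, 1]

with tf.Session() as sess:
    u8, f32 = sess.run([image_u8, image_f32])
    print(u8.dtype, f32.dtype, f32.max() <= 1.0)
```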
@@ -1679,7 +1698,7 @@ def decode_image(contents, channels=None, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding BMP images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_decode, assert_channels]): - return gen_image_ops.decode_bmp(contents) + return convert_image_dtype(gen_image_ops.decode_bmp(contents), dtype) def _gif(): # Create assert to make sure that channels is not set to 1 @@ -1692,7 +1711,7 @@ def decode_image(contents, channels=None, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return gen_image_ops.decode_gif(contents) + return convert_image_dtype(gen_image_ops.decode_gif(contents), dtype) def check_gif(): # Create assert op to check that bytes are GIF decodable @@ -1701,7 +1720,11 @@ def decode_image(contents, channels=None, name=None): def _png(): """Decodes a PNG image.""" - return gen_image_ops.decode_png(contents, channels) + return convert_image_dtype( + gen_image_ops.decode_png(contents, channels, + dtype=dtypes.uint8 + if dtype == dtypes.uint8 + else dtypes.uint16), dtype) def check_png(): """Checks if an image is PNG.""" @@ -1717,7 +1740,8 @@ def decode_image(contents, channels=None, name=None): 'images') assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return gen_image_ops.decode_jpeg(contents, channels) + return convert_image_dtype( + gen_image_ops.decode_jpeg(contents, channels), dtype) # Decode normal JPEG images (start with \xff\xd8\xff\xe0) # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1). @@ -1878,7 +1902,7 @@ def sample_distorted_bounding_box(image_size, width / height within this range. area_range: An optional list of `floats`. Defaults to `[0.05, 1]`. The cropped area of the image must contain a fraction of the - supplied image within in this range. + supplied image within this range. max_attempts: An optional `int`. Defaults to `100`. Number of attempts at generating a cropped region of the image of the specified constraints. 
After `max_attempts` failures, return the diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 45499dcce0..2a6ab26e96 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -533,6 +533,37 @@ class FlipImageBenchmark(test.Benchmark): iters=benchmark_rounds, wall_time=step_time) + def _benchmarkBatchedRandomFlipLeftRight(self, device, cpu_count): + image_shape = [16, 299, 299, 3] + warmup_rounds = 100 + benchmark_rounds = 1000 + config = config_pb2.ConfigProto() + if cpu_count is not None: + config.inter_op_parallelism_threads = 1 + config.intra_op_parallelism_threads = cpu_count + with session.Session("", graph=ops.Graph(), config=config) as sess: + with ops.device(device): + inputs = variables.Variable( + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, + trainable=False, + dtype=dtypes.float32) + run_op = image_ops.random_flip_left_right(inputs) + sess.run(variables.global_variables_initializer()) + for i in xrange(warmup_rounds + benchmark_rounds): + if i == warmup_rounds: + start = time.time() + sess.run(run_op) + end = time.time() + step_time = (end - start) / benchmark_rounds + tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all") + print("benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s step_time: " + "%.2f us" % + (tag, step_time * 1e6)) + self.report_benchmark( + name="benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s" % (tag), + iters=benchmark_rounds, + wall_time=step_time) + def benchmarkFlipLeftRightCpu1(self): self._benchmarkFlipLeftRight("/cpu:0", 1) @@ -551,6 +582,15 @@ class FlipImageBenchmark(test.Benchmark): def benchmarkRandomFlipLeftRightGpu(self): self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None) + def benchmarkBatchedRandomFlipLeftRightCpu1(self): + self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", 1) + + def benchmarkBatchedRandomFlipLeftRightCpuAll(self): + self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", None) + + def benchmarkBatchedRandomFlipLeftRightGpu(self): + self._benchmarkBatchedRandomFlipLeftRight(test.gpu_device_name(), None) + class AdjustHueBenchmark(test.Benchmark): @@ -987,7 +1027,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf) + y = image_ops.random_flip_left_right(x_tf, seed=seed) self.assertTrue(y.op.name.startswith("random_flip_left_right")) count_flipped = 0 @@ -1008,6 +1048,50 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) + def testRandomFlipLeftRightWithBatch(self): + batch_size = 16 + seed = 42 + + # create single item of test data + x_np_raw = np.array( + [[1, 2, 3], [1, 2, 3]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + y_np_raw = np.array( + [[3, 2, 1], [3, 2, 1]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + + # create batched test data + x_np = np.vstack([x_np_raw for _ in range(batch_size)]) + y_np = np.vstack([y_np_raw for _ in range(batch_size)]) + + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_left_right(x_tf, seed=seed) + self.assertTrue(y.op.name.startswith("random_flip_left_right")) + + count_flipped = 0 + count_unflipped = 0 + for _ in range(100): + y_tf = y.eval() + + # check every element of the batch + for i in range(batch_size): + if y_tf[i][0][0] 
== 1: + self.assertAllEqual(y_tf[i], x_np[i]) + count_unflipped += 1 + else: + self.assertAllEqual(y_tf[i], y_np[i]) + count_flipped += 1 + + # 100 trials, each containing batch_size elements + # Mean: 50 * batch_size + # Std Dev: ~5 * sqrt(batch_size) + # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) + # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 + six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) + self.assertGreaterEqual(count_flipped, six_sigma) + self.assertGreaterEqual(count_unflipped, six_sigma) + def testInvolutionUpDown(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1057,9 +1141,11 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1]) + seed = 42 + with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf, seed=42) + y = image_ops.random_flip_up_down(x_tf, seed=seed) self.assertTrue(y.op.name.startswith("random_flip_up_down")) count_flipped = 0 count_unflipped = 0 @@ -1079,6 +1165,50 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): self.assertGreaterEqual(count_flipped, 20) self.assertGreaterEqual(count_unflipped, 20) + def testRandomFlipUpDownWithBatch(self): + batch_size = 16 + seed = 42 + + # create single item of test data + x_np_raw = np.array( + [[1, 2, 3], [4, 5, 6]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + y_np_raw = np.array( + [[4, 5, 6], [1, 2, 3]], dtype=np.uint8 + ).reshape([1, 2, 3, 1]) + + # create batched test data + x_np = np.vstack([x_np_raw for _ in range(batch_size)]) + y_np = np.vstack([y_np_raw for _ in range(batch_size)]) + + with self.test_session(use_gpu=True): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.random_flip_up_down(x_tf, seed=seed) + self.assertTrue(y.op.name.startswith("random_flip_up_down")) + + count_flipped = 0 + count_unflipped = 0 + for _ in range(100): + y_tf = y.eval() + + # check every element of the batch + for i in range(batch_size): + if y_tf[i][0][0] == 1: + self.assertAllEqual(y_tf[i], x_np[i]) + count_unflipped += 1 + else: + self.assertAllEqual(y_tf[i], y_np[i]) + count_flipped += 1 + + # 100 trials, each containing batch_size elements + # Mean: 50 * batch_size + # Std Dev: ~5 * sqrt(batch_size) + # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size)) + # = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680 + six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size) + self.assertGreaterEqual(count_flipped, six_sigma) + self.assertGreaterEqual(count_unflipped, six_sigma) + def testInvolutionTranspose(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) @@ -1156,6 +1286,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): #Ops that support 4D input for op in [ image_ops.flip_left_right, image_ops.flip_up_down, + image_ops.random_flip_left_right, image_ops.random_flip_up_down, image_ops.transpose_image, image_ops.rot90 ]: transformed_unknown_dims_4 = op(p_unknown_dims_4) @@ -1166,14 +1297,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): "must be at least three-dimensional"): op(p_wrong_rank) - for op in [ - image_ops.random_flip_left_right, - image_ops.random_flip_up_down, - ]: - with self.assertRaisesRegexp(ValueError, "must be three-dimensional"): - op(p_wrong_rank) - - def testRot90GroupOrder(self): image = 
np.arange(24, dtype=np.uint8).reshape([2, 4, 3]) with self.test_session(use_gpu=True): @@ -1208,41 +1331,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): y_np = np.rot90(image, k=k, axes=(1, 2)) self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k})) -class RandomFlipTest(test_util.TensorFlowTestCase): - - def testRandomLeftRight(self): - x_np = np.array([0, 1], dtype=np.uint8).reshape([1, 2, 1]) - num_iterations = 500 - - hist = [0, 0] - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_left_right(x_tf) - for _ in xrange(num_iterations): - y_np = y.eval().flatten()[0] - hist[y_np] += 1 - - # Ensure that each entry is observed within 4 standard deviations. - four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) - self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) - - def testRandomUpDown(self): - x_np = np.array([0, 1], dtype=np.uint8).reshape([2, 1, 1]) - num_iterations = 500 - - hist = [0, 0] - with self.test_session(use_gpu=True): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.random_flip_up_down(x_tf) - for _ in xrange(num_iterations): - y_np = y.eval().flatten()[0] - hist[y_np] += 1 - - # Ensure that each entry is observed within 4 standard deviations. - four_stddev = 4.0 * np.sqrt(num_iterations / 2.0) - self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev) - - class AdjustContrastTest(test_util.TensorFlowTestCase): def _testContrast(self, x_np, y_np, contrast_factor): @@ -3880,5 +3968,88 @@ class SobelEdgesTest(test_util.TensorFlowTestCase): self.assertAllClose(expected_batch, actual_sobel) +class DecodeImageTest(test_util.TensorFlowTestCase): + + def testJpegUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/jpeg/testdata" + jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) + image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testPngUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/png/testdata" + png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) + image0 = image_ops.decode_image(png0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype( + image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testGifUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/gif/testdata" + gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) + image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testBmpUint16(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/bmp/testdata" + bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) + image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16) + image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), + dtypes.uint16) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testJpegFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/jpeg/testdata" + jpeg0 = 
io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg")) + image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testPngFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/png/testdata" + png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png")) + image0 = image_ops.decode_image(png0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype( + image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testGifFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/gif/testdata" + gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) + image0 = image_ops.decode_image(gif0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + def testBmpFloat32(self): + with self.test_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/bmp/testdata" + bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp")) + image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32) + image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0), + dtypes.float32) + image0, image1 = sess.run([image0, image1]) + self.assertAllEqual(image0, image1) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 2df230d470..724fcc39cd 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -467,7 +467,8 @@ class VarianceScaling(Initializer): else: scale /= max(1., (fan_in + fan_out) / 2.) if self.distribution == "normal": - stddev = math.sqrt(scale) + # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) + stddev = math.sqrt(scale) / .87962566103423978 return random_ops.truncated_normal( shape, 0.0, stddev, dtype, seed=self.seed) else: diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 222b8ebc9d..8276047cb6 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -35,8 +35,9 @@ from tensorflow.python.util.tf_export import tf_export # Assert and Print are special symbols in python, so we must -# use an upper-case version of them. -@tf_export("Print") +# have an upper-case version of them. For users with Python 3 or Python 2.7 +# with `from __future__ import print_function`, we also allow lowercase. +@tf_export("Print", "print") def Print(input_, data, message=None, first_n=None, summarize=None, name=None): """Prints a list of tensors. diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index e40481f3a7..466d0dadc8 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -125,8 +125,8 @@ def abs(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` or `SparseTensor` of type `float32`, `float64`, `int32`, - `int64`, `complex64` or `complex128`. + x: A `Tensor` or `SparseTensor` of type `float16`, `float32`, `float64`, + `int32`, `int64`, `complex64` or `complex128`. name: A name for the operation (optional). 
Returns: @@ -430,10 +430,10 @@ def pow(x, y, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, - or `complex128`. - y: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`, - or `complex128`. + x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, + `complex64`, or `complex128`. + y: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`, + `complex64`, or `complex128`. name: A name for the operation (optional). Returns: @@ -600,7 +600,7 @@ def round(x, name=None): # pylint: disable=redefined-builtin ``` Args: - x: A `Tensor` of type `float32` or `float64`. + x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, or `int64`. name: A name for the operation (optional). Returns: @@ -1257,7 +1257,7 @@ def reduce_sum(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1397,7 +1397,7 @@ def reduce_mean(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1469,7 +1469,7 @@ def reduce_prod(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1519,7 +1519,7 @@ def reduce_min(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1568,7 +1568,7 @@ def reduce_max(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. Args: @@ -1617,7 +1617,7 @@ def reduce_all(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: @@ -1675,7 +1675,7 @@ def reduce_any(input_tensor, entry in `axis`. If `keepdims` is true, the reduced dimensions are retained with length 1. - If `axis` has no entries, all dimensions are reduced, and a + If `axis` is None, all dimensions are reduced, and a tensor with a single element is returned. For example: diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 783d485892..f47f38e29e 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -621,7 +621,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): """Calculate the mean and variance of based on the sufficient statistics. Args: - counts: A `Tensor` containing a the total count of the data (one value). + counts: A `Tensor` containing the total count of the data (one value). 
mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly shifted) sum of the elements to average over. variance_ss: A `Tensor` containing the variance sufficient statistics: the @@ -689,6 +689,9 @@ def moments( # Compute true mean while keeping the dims for proper broadcasting. mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean") # sample variance, not unbiased variance + # Note: stop_gradient does not change the gradient that gets + # backpropagated to the mean from the variance calculation, + # because that gradient is zero variance = math_ops.reduce_mean( math_ops.squared_difference(y, array_ops.stop_gradient(mean)), axes, diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index a0b55eb077..0c2f5b06c4 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1596,12 +1596,12 @@ def leaky_relu(features, alpha=0.2, name=None): Returns: The activation value. """ - with ops.name_scope(name, "LeakyRelu", [features, alpha]): + with ops.name_scope(name, "LeakyRelu", [features, alpha]) as name: features = ops.convert_to_tensor(features, name="features") if features.dtype.is_integer: features = math_ops.to_float(features) alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha") - return math_ops.maximum(alpha * features, features) + return math_ops.maximum(alpha * features, features, name=name) def _flatten_outer_dims(logits): diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 46a5f4fae6..035b4735af 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -962,6 +962,16 @@ class LeakyReluTest(test_lib.TestCase): self.assertAllClose( outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol) + def testName(self): + np_values = np.array([-2, -1, 0, 1, 2], dtype=np.float64) + outputs_with_name_set = nn_ops.leaky_relu( + constant_op.constant(np_values), + name='test_relu_op') + self.assertEqual(outputs_with_name_set.name, 'test_relu_op:0') + outputs_without_name_set = nn_ops.leaky_relu( + constant_op.constant(np_values)) + self.assertEqual(outputs_without_name_set.name, 'LeakyRelu:0') + class SwishTest(test_lib.TestCase): diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index f8676ccb5f..219562de5d 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -23,6 +23,7 @@ import threading # Used by py_util.cc to get tracebacks. import traceback # pylint: disable=unused-import +import weakref import numpy as np import six @@ -129,11 +130,14 @@ class FuncRegistry(object): def __init__(self): self._lock = threading.Lock() self._unique_id = 0 # GUARDED_BY(self._lock) - self._funcs = {} + # Only store weakrefs to the functions. The strong reference is stored in + # the graph. + self._funcs = weakref.WeakValueDictionary() def insert(self, func): """Registers `func` and returns a unique token for this entry.""" token = self._next_unique_token() + # Store a weakref to the function + self._funcs[token] = func return token @@ -186,7 +190,7 @@ class FuncRegistry(object): Raises: ValueError: if no function is registered for `token`. """ - func = self._funcs[token] + func = self._funcs.get(token, None) if func is None: raise ValueError("callback %s is not found" % token) if isinstance(func, EagerFunc): @@ -228,19 +232,6 @@ _py_funcs = FuncRegistry() pywrap_tensorflow.InitializePyTrampoline(_py_funcs) -class CleanupFunc(object): - """A helper class to remove a registered function from _py_funcs.""" - - def __init__(self, token): - self._token = token - - def __del__(self): - if _py_funcs is not None: - # If _py_funcs is None, the program is most likely in shutdown, and the - # _py_funcs object has been destroyed already. - _py_funcs.remove(self._token) - - def _internal_py_func(func, inp, Tout, @@ -270,17 +261,15 @@ def _internal_py_func(func, # bound to that of the outer graph instead. graph = graph._outer_graph - cleanup = CleanupFunc(token) - # TODO(zhifengc): Consider adding a Graph method to collect # `cleanup` objects in one of its member. - if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"): - graph._cleanup_py_funcs_used_in_graph = [] + if not hasattr(graph, "_py_funcs_used_in_graph"): + graph._py_funcs_used_in_graph = [] - # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph - # will be destroyed and their __del__ will remove the 'token' from - # the funcs registry. - graph._cleanup_py_funcs_used_in_graph.append(cleanup) + # Store a reference to the function in the graph to ensure it stays alive + # as long as the graph lives. When the graph is destroyed, the function + # is left to the garbage collector for destruction as well. + graph._py_funcs_used_in_graph.append(func) # pylint: enable=protected-access if eager:
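The `script_ops.py` change above hinges on `weakref.WeakValueDictionary` semantics: the registry no longer keeps callbacks alive, so an entry disappears as soon as the owning graph drops the last strong reference. A minimal sketch of that lifetime behavior in plain Python (the `Callback` class and the token string are illustrative stand-ins, not part of TensorFlow):

```python
import weakref


class Callback(object):
  """Stand-in for the function object registered by py_func."""

  def __init__(self, name):
    self.name = name


registry = weakref.WeakValueDictionary()

func = Callback("my_py_func")   # strong reference, as the graph would hold
registry["token_0"] = func      # registry holds only a weak reference
assert registry.get("token_0") is func

del func                        # simulate the graph being destroyed
assert registry.get("token_0", None) is None  # entry vanished with it
```

This is why `_internal_py_func` now appends the function itself to `graph._py_funcs_used_in_graph`: the graph supplies the strong reference, and the explicit `CleanupFunc.__del__` bookkeeping becomes unnecessary.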
""" - func = self._funcs[token] + func = self._funcs.get(token, None) if func is None: raise ValueError("callback %s is not found" % token) if isinstance(func, EagerFunc): @@ -228,19 +232,6 @@ _py_funcs = FuncRegistry() pywrap_tensorflow.InitializePyTrampoline(_py_funcs) -class CleanupFunc(object): - """A helper class to remove a registered function from _py_funcs.""" - - def __init__(self, token): - self._token = token - - def __del__(self): - if _py_funcs is not None: - # If _py_funcs is None, the program is most likely in shutdown, and the - # _py_funcs object has been destroyed already. - _py_funcs.remove(self._token) - - def _internal_py_func(func, inp, Tout, @@ -270,17 +261,15 @@ def _internal_py_func(func, # bound to that of the outer graph instead. graph = graph._outer_graph - cleanup = CleanupFunc(token) - # TODO(zhifengc): Consider adding a Graph method to collect # `cleanup` objects in one of its member. - if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"): - graph._cleanup_py_funcs_used_in_graph = [] + if not hasattr(graph, "_py_funcs_used_in_graph"): + graph._py_funcs_used_in_graph = [] - # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph - # will be destroyed and their __del__ will remove the 'token' from - # the funcs registry. - graph._cleanup_py_funcs_used_in_graph.append(cleanup) + # Store a reference to the function in the graph to ensure it stays alive + # as long as the graph lives. When the graph is destroyed, the function + # is left to the garbage collector for destruction as well. + graph._py_funcs_used_in_graph.append(func) # pylint: enable=protected-access if eager: diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 0130233746..c3b16a7bd5 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -84,6 +84,8 @@ def _convert_to_sparse_tensors(sp_inputs): # pylint: disable=protected-access @tf_export("sparse_concat") +@deprecation.deprecated_args( + None, "concat_dim is deprecated, use axis instead", "concat_dim") def sparse_concat(axis, sp_inputs, name=None, @@ -597,6 +599,8 @@ class KeywordRequired(object): @tf_export("sparse_split") +@deprecation.deprecated_args( + None, "split_dim is deprecated, use axis instead", "split_dim") def sparse_split(keyword_required=KeywordRequired(), sp_input=None, num_split=None, diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index ae79c01949..0280c89c10 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -91,6 +91,59 @@ def string_split(source, delimiter=" ", skip_empty=True): # pylint: disable=inv shape.set_shape([2]) return sparse_tensor.SparseTensor(indices, values, shape) +@tf_export("strings.split") +def string_split_v2(source, sep=None, maxsplit=-1): + """Split elements of `source` based on `sep` into a `SparseTensor`. + + Let N be the size of source (typically N will be the batch size). Split each + element of `source` based on `sep` and return a `SparseTensor` + containing the split tokens. Empty tokens are ignored. + + For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c', + then the output will be + + st.indices = [0, 0; + 0, 1; + 1, 0; + 1, 1; + 1, 2] + st.shape = [2, 3] + st.values = ['hello', 'world', 'a', 'b', 'c'] + + If `sep` is given, consecutive delimiters are not grouped together and are + deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and + sep of `"<>"` returns `["1", "2", "", "3"]`. 
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index f49e2d314d..47414c28af 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -1786,6 +1786,23 @@ class variable_scope(object): assert v.name == "foo/bar/v:0" ``` + Simple example of how to reenter a premade variable scope safely: + + ```python + with tf.variable_scope("foo") as vs: + pass + + # Re-enter the variable scope. + with tf.variable_scope(vs, + auxiliary_name_scope=False) as vs1: + # Restore the original name_scope. + with tf.name_scope(vs1.original_name_scope): + v = tf.get_variable("v", [1]) + assert v.name == "foo/v:0" + c = tf.constant([1], name="c") + assert c.name == "foo/c:0" + ``` + Basic example of sharing a variable AUTO_REUSE: ```python @@ -1924,7 +1941,9 @@ class variable_scope(object): (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. auxiliary_name_scope: If `True`, we create an auxiliary name scope with - the scope. If `False`, we don't touch name scope. + the scope. If `False`, we don't create it. Note that the argument is + not inherited, and it only takes effect once, when the scope is created. You + should only use it when re-entering a premade variable scope. Returns: A scope that can be captured and reused.
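For contrast with the docstring example above, here is a short sketch (assuming TF 1.x scoping semantics) of what happens when a premade scope is re-entered *without* `auxiliary_name_scope=False`: the variable scope is reused, but a fresh auxiliary name scope such as `foo_1` is opened, so op names diverge from variable names:

```python
import tensorflow as tf

with tf.variable_scope("foo") as vs:
  pass

# Plain re-entry: the variable scope is "foo" again, but a new
# auxiliary name scope "foo_1" is created for ops.
with tf.variable_scope(vs):
  v = tf.get_variable("v", [1])
  c = tf.constant([1], name="c")
  assert v.name == "foo/v:0"    # variables follow the variable scope
  assert c.name == "foo_1/c:0"  # ops land in the uniquified name scope
```

This mismatch is exactly what the `auxiliary_name_scope=False` plus `original_name_scope` idiom in the new docstring example avoids.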
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py old mode 100755 new mode 100644 diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 1f9fbad0b4..c3bc9ccd45 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -1723,7 +1723,7 @@ def tf_py_build_info_genrule(): name="py_build_info_gen", outs=["platform/build_info.py"], cmd= - "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), + "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"), local=1, tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],) diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py index bca9fa49eb..671b7e387e 100644 --- a/tensorflow/tools/api/generator/create_python_api.py +++ b/tensorflow/tools/api/generator/create_python_api.py @@ -41,7 +41,11 @@ _GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit. # Generated by: tensorflow/tools/api/generator/create_python_api.py script. \"\"\"%s \"\"\" + +from __future__ import print_function + """ +_GENERATED_FILE_FOOTER = "\n\ndel print_function\n" class SymbolExposedTwiceError(Exception): @@ -149,6 +153,7 @@ class _ModuleInitCodeBuilder(object): _names_with_underscore = [%s] __all__ = [_s for _s in dir() if not _s.startswith('_')] __all__.extend([_s for _s in _names_with_underscore]) +__all__.remove('print_function') ''' % underscore_names_str return module_text_map @@ -333,7 +338,8 @@ def create_api_files( if module or not root_init_template: contents = ( _GENERATED_FILE_HEADER % - get_module_docstring(module, package, api_name) + text) + get_module_docstring(module, package, api_name) + + text + _GENERATED_FILE_FOOTER) else: # Read base init file with open(root_init_template, 'r') as root_init_template_file: diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt index 5bb3b3c444..10171b3d60 100644 --- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt @@ -58,7 +58,7 @@ tf_module { } member_method { name: "decode_image" - argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], " } member_method { name: "decode_jpeg" diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index dc2bd40096..3051c4437e 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -1532,6 +1532,10 @@ tf_module { name: "pow" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "print" + argspec: "args=[\'input_\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } member_method { name: "py_func" argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " } diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt index a3fbe95bba..b641c39feb 100644
--- a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt @@ -4,4 +4,8 @@ tf_module { name: "regex_full_match" argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "split" + argspec: "args=[\'source\', \'sep\', \'maxsplit\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], " + } } diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index 5fa75e1d61..883bb93647 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -322,6 +322,10 @@ create_activate_virtualenv_and_install_tensorflow() { pip install -v ${PIP_FLAGS} ${WHL_PATH} || \ die "pip install (forcing to reinstall tensorflow) FAILED" echo "Successfully installed pip package ${TF_WHEEL_PATH}" + + # Force downgrade setuptools. + pip install --upgrade setuptools==39.1.0 + } ################################################################################ diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user index d4bf546d40..b216e3549f 100755 --- a/tensorflow/tools/ci_build/builds/with_the_same_user +++ b/tensorflow/tools/ci_build/builds/with_the_same_user @@ -40,7 +40,7 @@ if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then ADDUSER_OPTS="--force-badname" fi -getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" +getent group "${CI_BUILD_GID}" || addgroup ${ADDUSER_OPTS} --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \ --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \ --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \ diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh index 072dd6ab99..1f0fd0387a 100755 --- a/tensorflow/tools/ci_build/ci_build.sh +++ b/tensorflow/tools/ci_build/ci_build.sh @@ -134,6 +134,12 @@ if [[ $? != "0" ]]; then die "ERROR: docker build failed. Dockerfile is at ${DOCKERFILE_PATH}" fi +# If caller wants the with_the_same_user script to allow bad usernames, +# pass the var to the docker environment +if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then + CI_BUILD_USER_FORCE_BADNAME_ENV="-e CI_BUILD_USER_FORCE_BADNAME=yes" +fi + # Run the command inside the container. echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..." 
mkdir -p ${WORKSPACE}/bazel-ci_build-cache @@ -148,6 +154,7 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_BUILD_GROUP=$(id -g -n)" \ -e "CI_BUILD_GID=$(id -g)" \ -e "CI_TENSORFLOW_SUBMODULE_PATH=${CI_TENSORFLOW_SUBMODULE_PATH}" \ + ${CI_BUILD_USER_FORCE_BADNAME_ENV} \ -v ${WORKSPACE}:/workspace \ -w /workspace \ ${GPU_EXTRA_PARAMS} \ diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py index 420d390d2b..148526492d 100755 --- a/tensorflow/tools/ci_build/copy_binary.py +++ b/tensorflow/tools/ci_build/copy_binary.py @@ -32,7 +32,8 @@ import shutil import tempfile import zipfile -TF_NIGHTLY_REGEX = r"(.+)tf_nightly(|_gpu)-(\d\.\d\.\d.dev[\d]{0,8})-(.+)\.whl" +TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}" + "\.\d.dev[\d]{0,8})-(.+)\.whl") BINARY_STRING_TEMPLATE = "%s-%s-%s.whl" diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 60290df833..88f1d04193 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -115,3 +115,7 @@ pip2 install keras_applications==1.0.2 pip3 install keras_applications==1.0.2 pip2 install keras_preprocessing==1.0.1 pip3 install keras_preprocessing==1.0.1 + +# Install last working version of setuptools. +pip2 install --upgrade setuptools==39.1.0 +pip3 install --upgrade setuptools==39.1.0 diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index edb9d4b929..acd69ef346 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -39,7 +39,6 @@ if [[ -z $pip35_version ]]; then fi set -e -pip3.5 install --upgrade setuptools pip3.5 install --upgrade pip pip3.5 install --upgrade virtualenv @@ -86,4 +85,7 @@ pip3.5 install --upgrade termcolor pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 +# Install last working version of setuptools. +pip3.5 install --upgrade setuptools==39.1.0 + # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh index 5635977731..323b30f48e 100755 --- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh @@ -49,7 +49,6 @@ cd Python-3.6.1 make altinstall ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3 -pip3 install --upgrade setuptools pip3 install --upgrade pip pip3 install --upgrade virtualenv @@ -101,4 +100,8 @@ pip3 install --upgrade termcolor # Keras pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 + +# Install last working version of setuptools. +pip3 install --upgrade setuptools==39.1.0 + # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh new file mode 100755 index 0000000000..10a09a415a --- /dev/null +++ b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Usage: basic_mkl_test.sh + +# Helper function to traverse directories up until given file is found. +function upsearch () { + test / == "$PWD" && return || \ + test -e "$1" && echo "$PWD" && return || \ + cd .. && upsearch "$1" +} + +# Set up WORKSPACE. +WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}" + +BUILD_TAG=mkl-ci-test CI_BUILD_USER_FORCE_BADNAME=yes ${WORKSPACE}/tensorflow/tools/ci_build/ci_build.sh cpu tensorflow/tools/ci_build/linux/cpu/run_mkl.sh diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh index 1bd1852ffc..b8bce57c87 100755 --- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh +++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh @@ -79,6 +79,7 @@ if [[ $1 == "PI_ONE" ]]; then --linkopt=-L${OPENBLAS_INSTALL_PATH}/lib/ --linkopt=-l:libopenblas.a" echo "Building for the Pi One/Zero, with no NEON support" + WHEEL_ARCH=linux_armv6l else PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4 --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR @@ -86,6 +87,7 @@ else --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8' + WHEEL_ARCH=linux_armv7l echo "Building for the Pi Two/Three, with NEON acceleration" fi @@ -100,6 +102,8 @@ bazel build -c opt ${PI_COPTS} \ --copt=-fomit-frame-pointer --cpu=armeabi \ --crosstool_top=@local_config_arm_compiler//:toolchain \ --verbose_failures \ + //tensorflow:libtensorflow.so \ + //tensorflow:libtensorflow_framework.so \ //tensorflow/tools/benchmark:benchmark_model \ //tensorflow/tools/pip_package:build_pip_package @@ -112,10 +116,12 @@ BDIST_OPTS="--universal" \ bazel-bin/tensorflow/tools/pip_package/build_pip_package "${OUTDIR}" OLD_FN=$(ls "${OUTDIR}" | grep -m 1 \.whl) -SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-any.whl/; print' +SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-'${WHEEL_ARCH}'.whl/; print' NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}") mv "${OUTDIR}/${OLD_FN}" "${OUTDIR}/${NEW_FN}" cp bazel-bin/tensorflow/tools/benchmark/benchmark_model "${OUTDIR}" +cp bazel-bin/tensorflow/libtensorflow.so "${OUTDIR}" +cp bazel-bin/tensorflow/libtensorflow_framework.so "${OUTDIR}" echo "Output can be found here:" find "${OUTDIR}" diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl index 47539b2423..f8f63e276c 100644 --- a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl +++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl @@ -31,7 +31,11 @@ def _def_file_filter_configure_impl(repository_ctx): vc_path = find_vc_path(repository_ctx) if vc_path == "visual-studio-not-found": auto_configure_fail("Visual C++ build tools not found on your machine") - undname_bin_path = find_msvc_tool(repository_ctx, vc_path, 
"undname.exe").replace("\\", "\\\\") + + undname = find_msvc_tool(repository_ctx, vc_path, "undname.exe") + if undname == None: + auto_configure_fail("Couldn't find undname.exe under %s, please check your VC installation and set BAZEL_VC environment variable correctly." % vc_path) + undname_bin_path = undname.replace("\\", "\\\\") repository_ctx.template( "def_file_filter.py", diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh index 06c2b997cb..b0114721bd 100755 --- a/tensorflow/tools/dist_test/local_test.sh +++ b/tensorflow/tools/dist_test/local_test.sh @@ -64,9 +64,6 @@ die() { # Configurations DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster" -# Use TensorFlow v1.5.0 for Python 2.7 and CPU only as we set num_gpus to 0 in the below -DEFAULT_WHL_FILE_LOCATION="https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl" - # Parse input arguments LEAVE_CONTAINER_RUNNING=0 MODEL_NAME="" @@ -77,8 +74,7 @@ SYNC_REPLICAS_FLAG="" WHL_FILE_LOCATION=${1} if [[ -z "${WHL_FILE_LOCATION}" ]]; then - WHL_FILE_LOCATION=${DEFAULT_WHL_FILE_LOCATION} - echo "use default whl file location" + echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." fi while true; do @@ -131,7 +127,11 @@ echo "Building in temporary directory: ${BUILD_DIR}" cp -r ${DIR}/* "${BUILD_DIR}"/ || \ die "Failed to copy files to ${BUILD_DIR}" -if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then +# Download whl file into the build context directory. +if [[ -z "${WHL_FILE_LOCATION}" ]]; then + pip2 download --no-deps tf-nightly + cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl +elif [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then # Download whl file into the build context directory. wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \ die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}" diff --git a/tensorflow/tools/dist_test/remote_test.sh b/tensorflow/tools/dist_test/remote_test.sh index 935535312d..e188c88c8f 100755 --- a/tensorflow/tools/dist_test/remote_test.sh +++ b/tensorflow/tools/dist_test/remote_test.sh @@ -108,7 +108,7 @@ fi # Parse command-line arguments. WHL_URL=${1} if [[ -z "${WHL_URL}" ]]; then - die "whl URL is not specified" + echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel." fi # Create docker build context directory. @@ -121,8 +121,13 @@ cp -r ${DIR}/* ${BUILD_DIR}/ || \ die "Failed to copy files to ${BUILD_DIR}" # Download whl file into the build context directory. -wget -P "${BUILD_DIR}" ${WHL_URL} || \ - die "Failed to download tensorflow whl file from URL: ${WHL_URL}" +if [[ -z "${WHL_URL}" ]]; then + pip2 download --no-deps tf-nightly + cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl +else + wget -P "${BUILD_DIR}" ${WHL_URL} || \ + die "Failed to download tensorflow whl file from URL: ${WHL_URL}" +fi # Build docker image for test. docker build ${NO_CACHE_FLAG} \ diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 406d134699..57a491255e 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -76,7 +76,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . 
# TODO(craigcitro): Don't install the pip package, since it makes it # more difficult to experiment with local changes. Instead, just add diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl index a6cd44ced1..6796ad70e5 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl +++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl @@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel LABEL maintainer="Clayne Robison" # These arguments are parameterized. Use --build-args to override. -ARG TF_BRANCH=r1.8 +ARG TF_BRANCH=r1.9 ARG WHL_DIR=/whl RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 2fe47f3356..204b5b4dba 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -13,8 +13,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusparse-dev-9-0 \ curl \ git \ - libcudnn7=7.0.5.15-1+cuda9.0 \ - libcudnn7-dev=7.0.5.15-1+cuda9.0 \ + libcudnn7=7.1.4.18-1+cuda9.0 \ + libcudnn7-dev=7.1.4.18-1+cuda9.0 \ libcurl3-dev \ libfreetype6-dev \ libhdf5-serial-dev \ @@ -85,7 +85,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git . # Configure the build for our CUDA configuration. ENV CI_BUILD_PYTHON python diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu index bff4a20392..9197651ff4 100644 --- a/tensorflow/tools/docker/Dockerfile.gpu +++ b/tensorflow/tools/docker/Dockerfile.gpu @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ cuda-cusolver-9-0 \ cuda-cusparse-9-0 \ curl \ - libcudnn7=7.0.5.15-1+cuda9.0 \ + libcudnn7=7.1.4.18-1+cuda9.0 \ libfreetype6-dev \ libhdf5-serial-dev \ libpng12-dev \ diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index d0fd0fae97..d149365ac1 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -61,6 +61,7 @@ COMMON_PIP_DEPS = [ "//tensorflow/contrib/autograph/core:test_lib", "//tensorflow/contrib/autograph/impl:impl", "//tensorflow/contrib/autograph/lang:lang", + "//tensorflow/contrib/autograph/operators:operators", "//tensorflow/contrib/autograph/pyct:pyct", "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis", "//tensorflow/contrib/boosted_trees:boosted_trees_pip", diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh index 0c4065bc77..f7e42ce536 100755 --- a/tensorflow/tools/pip_package/build_pip_package.sh +++ b/tensorflow/tools/pip_package/build_pip_package.sh @@ -41,51 +41,15 @@ function is_windows() { fi } -function main() { +function prepare_src() { if [ $# -lt 1 ] ; then echo "No destination dir provided" exit 1 fi - DEST=$(real_path $1) - TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX) - - PKG_NAME_FLAG="" - GPU_BUILD=0 - NIGHTLY_BUILD=0 - PROJECT_NAME="" - while true; do - if [[ "$1" == "--nightly_flag" ]]; then - NIGHTLY_BUILD=1 - elif [[ "$1" == "--gpu" ]]; then - GPU_BUILD=1 - elif [[ "$1" == "--gpudirect" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpudirect" - elif [[ "$1" == "--project_name" ]]; then - shift - if [[ -z "$1" ]]; then - break - fi - PROJECT_NAME="$1" - fi - 
shift - - if [[ -z "$1" ]]; then - break - fi - done - - if [[ -n ${PROJECT_NAME} ]]; then - PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" - elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly_gpu" - elif [[ ${NIGHTLY_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tf_nightly" - elif [[ ${GPU_BUILD} == "1" ]]; then - PKG_NAME_FLAG="--project_name tensorflow_gpu" - fi - - echo $(date) : "=== Using tmpdir: ${TMPDIR}" + TMPDIR="$1" + mkdir -p "$TMPDIR" + echo $(date) : "=== Preparing sources in dir: ${TMPDIR}" if [ ! -d bazel-bin/tensorflow ]; then echo "Could not find bazel-bin. Did you run from the root of the build tree?" @@ -155,17 +119,28 @@ function main() { # over so user defined ops can be compiled. mkdir -p ${TMPDIR}/google mkdir -p ${TMPDIR}/third_party - pushd ${RUNFILES%org_tensorflow} + pushd ${RUNFILES%org_tensorflow} > /dev/null for header in $(find protobuf_archive -name \*.h); do mkdir -p "${TMPDIR}/google/$(dirname ${header})" cp "$header" "${TMPDIR}/google/$(dirname ${header})/" done - popd + popd > /dev/null cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR} cp tensorflow/tools/pip_package/README ${TMPDIR} cp tensorflow/tools/pip_package/setup.py ${TMPDIR} +} + +function build_wheel() { + if [ $# -lt 2 ] ; then + echo "No src and dest dir provided" + exit 1 + fi + + TMPDIR="$1" + DEST="$2" + PKG_NAME_FLAG="$3" # Before we leave the top-level directory, make sure we know how to # call python. @@ -173,15 +148,110 @@ function main() { source tools/python_bin_path.sh fi - pushd ${TMPDIR} + pushd ${TMPDIR} > /dev/null rm -f MANIFEST echo $(date) : "=== Building wheel" "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel ${PKG_NAME_FLAG} >/dev/null mkdir -p ${DEST} cp dist/* ${DEST} - popd - rm -rf ${TMPDIR} + popd > /dev/null echo $(date) : "=== Output wheel file is in: ${DEST}" } +function usage() { + echo "Usage:" + echo "$0 [--src srcdir] [--dst dstdir] [options]" + echo "$0 dstdir [options]" + echo "" + echo " --src prepare sources in srcdir" + echo " will use temporary dir if not specified" + echo "" + echo " --dst build wheel in dstdir" + echo " if dstdir is not set do not build, only prepare sources" + echo "" + echo " Options:" + echo " --project_name set project name to name" + echo " --gpu build tensorflow_gpu" + echo " --gpudirect build tensorflow_gpudirect" + echo " --nightly_flag build tensorflow nightly" + echo "" + exit 1 +} + +function main() { + PKG_NAME_FLAG="" + PROJECT_NAME="" + GPU_BUILD=0 + NIGHTLY_BUILD=0 + SRCDIR="" + DSTDIR="" + CLEANSRC=1 + while true; do + if [[ "$1" == "--help" ]]; then + usage + exit 1 + elif [[ "$1" == "--nightly_flag" ]]; then + NIGHTLY_BUILD=1 + elif [[ "$1" == "--gpu" ]]; then + GPU_BUILD=1 + elif [[ "$1" == "--gpudirect" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpudirect" + elif [[ "$1" == "--project_name" ]]; then + shift + if [[ -z "$1" ]]; then + break + fi + PROJECT_NAME="$1" + elif [[ "$1" == "--src" ]]; then + shift + SRCDIR="$(real_path $1)" + CLEANSRC=0 + elif [[ "$1" == "--dst" ]]; then + shift + DSTDIR="$(real_path $1)" + else + DSTDIR="$(real_path $1)" + fi + shift + + if [[ -z "$1" ]]; then + break + fi + done + + if [[ -z "$DSTDIR" ]] && [[ -z "$SRCDIR" ]]; then + echo "No destination dir provided" + usage + exit 1 + fi + + if [[ -z "$SRCDIR" ]]; then + # make temp srcdir if none set + SRCDIR="$(mktemp -d -t tmp.XXXXXXXXXX)" + fi + + prepare_src "$SRCDIR" + + if [[ -z "$DSTDIR" ]]; then 
+ # only want to prepare sources + exit + fi + + if [[ -n ${PROJECT_NAME} ]]; then + PKG_NAME_FLAG="--project_name ${PROJECT_NAME}" + elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly_gpu" + elif [[ ${NIGHTLY_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tf_nightly" + elif [[ ${GPU_BUILD} == "1" ]]; then + PKG_NAME_FLAG="--project_name tensorflow_gpu" + fi + + build_wheel "$SRCDIR" "$DSTDIR" "$PKG_NAME_FLAG" + + if [[ $CLEANSRC -ne 0 ]]; then + rm -rf "${TMPDIR}" + fi +} + main "$@" diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index d25a9e77b1..97f625e7e9 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n') # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.8.0' +_VERSION = '1.9.0-rc0' REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', @@ -54,6 +54,7 @@ REQUIRED_PACKAGES = [ 'numpy >= 1.13.3', 'six >= 1.10.0', 'protobuf >= 3.4.0', + 'setuptools <= 39.1.0', 'tensorboard >= 1.8.0, < 1.9.0', 'termcolor >= 1.1.0', ] diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc index 29add6d5ea..15d7c70281 100644 --- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc +++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc @@ -814,6 +814,9 @@ void Generator::Generate(const FileDescriptor& fd) { // Add header to cc file. SetOutput(&cc_); Print("// GENERATED FILE - DO NOT MODIFY"); + Print(); + Print("#include <algorithm>"); // for `std::stable_sort()` + Print(); headers = {GetProtoTextHeaderName(fd, true /* impl */)}; AddHeadersToCurrentSection(headers); Print(); diff --git a/tensorflow/tools/quantization/quantize_graph_test.py b/tensorflow/tools/quantization/quantize_graph_test.py index df71840b64..92bb5127da 100644 --- a/tensorflow/tools/quantization/quantize_graph_test.py +++ b/tensorflow/tools/quantization/quantize_graph_test.py @@ -119,8 +119,8 @@ def are_tensors_near(a, b, tolerance): flat_a = a.flatten() flat_b = b.flatten() if len(flat_a) != len(flat_b): - print("Tensors are different sizes: " + str(len(flat_a)) + " vs " + str( - len(flat_b))) + tf_logging.info("Tensors are different sizes: " + str(len(flat_a)) + " vs " + + str(len(flat_b))) return False value_count = len(flat_a) how_many_different = 0 @@ -140,10 +140,10 @@ def are_tensors_near(a, b, tolerance): if how_many_different == 0: return True else: - print("Tensors have {0} different values ({1}%), with mean difference" - " {2} and mean absolute difference {3}".format( - how_many_different, proportion_different * 100, mean_difference, - mean_abs_difference)) + tf_logging.info("Tensors have {0} different values ({1}%), with mean" + " difference {2} and mean absolute difference {3}".format( + how_many_different, proportion_different * 100, + mean_difference, mean_abs_difference)) return False diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py index 9c45359ee1..c030575109 100644 --- a/tensorflow/tools/test/upload_test_benchmarks.py +++ b/tensorflow/tools/test/upload_test_benchmarks.py @@ -89,7 +89,6 @@ import shutil from six import text_type from google.cloud import datastore -from six import text_type def is_real_file(dirpath, fname): diff --git a/tensorflow/workspace.bzl
b/tensorflow/workspace.bzl index 161d1dbd06..b4fbbd6c23 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""): mkl_repository( name = "mkl_linux", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz" ], - sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146", - strip_prefix = "mklml_lnx_2018.0.2.20180127", + sha256 = "d2305244fdc9b87db7426ed4496e87a4b3977ad3374d73b8000e8b7a5b7aa725", + strip_prefix = "mklml_lnx_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_windows", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip" ], - sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded", - strip_prefix = "mklml_win_2018.0.2.20180127", + sha256 = "a584a5bf1c8d2ad70b90d12b52652030e9a338217719064fdb84b7ad0d693694", + strip_prefix = "mklml_win_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_darwin", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz" ], - sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943", - strip_prefix = "mklml_mac_2018.0.2.20180127", + sha256 = "094e3dfd61c816136dc8d12a45cc611ce26c5f4828176a3644cd0b0efa15a25b", + strip_prefix = "mklml_mac_2018.0.3.20180406", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) @@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "mkl_dnn", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz", - "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.14.tar.gz", + "https://github.com/intel/mkl-dnn/archive/v0.14.tar.gz", ], - sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f", - strip_prefix = "mkl-dnn-0.13", + sha256 = "efebc53882856afec86457a2da644693f5d59c68772d41d640d6b60a8efc4eb0", + strip_prefix = "mkl-dnn-0.14", build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"), ) @@ -187,11 +187,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "highwayhash", urls = [ - "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", - "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz", + 
"http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", + "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", ], - sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9", - strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b", + sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37", + strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968", build_file = clean_dep("//third_party:highwayhash.BUILD"), ) diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD index 07bb6645eb..e54c1a4501 100644 --- a/third_party/eigen.BUILD +++ b/third_party/eigen.BUILD @@ -64,6 +64,7 @@ cc_library( # This define (mostly) guarantees we don't link any problematic # code. We use it, but we do not rely on it, as evidenced above. "EIGEN_MPL2_ONLY", + "EIGEN_MAX_ALIGN_BYTES=64", ], includes = ["."], visibility = ["//visibility:public"], diff --git a/third_party/highwayhash.BUILD b/third_party/highwayhash.BUILD index 1b8e40765e..08cb84ea2c 100644 --- a/third_party/highwayhash.BUILD +++ b/third_party/highwayhash.BUILD @@ -10,6 +10,7 @@ cc_library( srcs = ["highwayhash/sip_hash.cc"], hdrs = [ "highwayhash/sip_hash.h", + "highwayhash/endianess.h", "highwayhash/state_helpers.h", ], visibility = ["//visibility:public"], diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD index 4418ac32fc..663a218733 100644 --- a/third_party/jpeg/jpeg.BUILD +++ b/third_party/jpeg/jpeg.BUILD @@ -291,8 +291,10 @@ cc_library( "jchuff.h", "jconfig.h", "jdct.h", + "jerror.h", "jinclude.h", "jmorecfg.h", + "jpegint.h", "jpeglib.h", "jsimd.h", "jsimddct.h", diff --git a/third_party/png.BUILD b/third_party/png.BUILD index 76ab32d69c..17c5449cc0 100644 --- a/third_party/png.BUILD +++ b/third_party/png.BUILD @@ -28,7 +28,14 @@ cc_library( "pngwrite.c", "pngwtran.c", "pngwutil.c", - ], + ] + select({ + "@org_tensorflow//tensorflow:linux_ppc64le": [ + "powerpc/powerpc_init.c", + "powerpc/filter_vsx_intrinsics.c", + ], + "//conditions:default": [ + ], + }), hdrs = [ "png.h", "pngconf.h", diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index 954f21f5f8..3c7e5c8469 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -6,6 +6,7 @@ * `PYTHON_LIB_PATH`: Location of python libraries. 
""" +_BAZEL_SH = "BAZEL_SH" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" _PYTHON_LIB_PATH = "PYTHON_LIB_PATH" _TF_PYTHON_CONFIG_REPO = "TF_PYTHON_CONFIG_REPO" @@ -152,6 +153,22 @@ def _get_python_bin(repository_ctx): _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", ""))) +def _get_bash_bin(repository_ctx): + """Gets the bash bin path.""" + bash_bin = repository_ctx.os.environ.get(_BAZEL_SH) + if bash_bin != None: + return bash_bin + else: + bash_bin_path = repository_ctx.which("bash") + if bash_bin_path != None: + return str(bash_bin_path) + else: + _fail("Cannot find bash in PATH, please make sure " + + "bash is installed and add its directory in PATH, or --define " + + "%s='/path/to/bash'.\nPATH=%s" % ( + _BAZEL_SH, repository_ctx.os.environ.get("PATH", ""))) + + def _get_python_lib(repository_ctx, python_bin): """Gets the python lib path.""" python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH) @@ -184,14 +201,14 @@ def _get_python_lib(repository_ctx, python_bin): " print(paths[0])\n" + "END") cmd = '%s - %s' % (python_bin, print_lib) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) return result.stdout.strip('\n') def _check_python_lib(repository_ctx, python_lib): """Checks the python lib path.""" cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) if result.return_code == 1: _fail("Invalid python library path: %s" % python_lib) @@ -199,7 +216,7 @@ def _check_python_lib(repository_ctx, python_lib): def _check_python_bin(repository_ctx, python_bin): """Checks the python bin path.""" cmd = '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin) - result = repository_ctx.execute(["bash", "-c", cmd]) + result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd]) if result.return_code == 1: _fail("--define %s='%s' is not executable. Is it the python binary?" % ( _PYTHON_BIN_PATH, python_bin)) @@ -294,6 +311,7 @@ def _python_autoconf_impl(repository_ctx): python_configure = repository_rule( implementation = _python_autoconf_impl, environ = [ + _BAZEL_SH, _PYTHON_BIN_PATH, _PYTHON_LIB_PATH, _TF_PYTHON_CONFIG_REPO, diff --git a/third_party/repo.bzl b/third_party/repo.bzl index 36f5aa5bde..cb67d3e961 100644 --- a/third_party/repo.bzl +++ b/third_party/repo.bzl @@ -17,7 +17,6 @@ _SINGLE_URL_WHITELIST = depset([ "arm_compiler", "ortools_archive", - "gemmlowp", ]) def _is_windows(ctx): @@ -88,7 +87,9 @@ def _tf_http_archive(ctx): if ctx.attr.patch_file != None: _apply_patch(ctx, ctx.attr.patch_file) if ctx.attr.build_file != None: - ctx.template("BUILD", ctx.attr.build_file, { + # Use BUILD.bazel to avoid conflict with third party projects with + # BUILD or build (directory) underneath. + ctx.template("BUILD.bazel", ctx.attr.build_file, { "%prefix%": ".." if _repos_are_siblings() else "external", }, False) -- cgit v1.2.3 From aa191c15adf00ac6e5fdb55996746af7468b5d8b Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Fri, 22 Jun 2018 10:04:46 -0700 Subject: Avoid double linking tf framework when building op_gen_wrappers. 
PiperOrigin-RevId: 201701040 --- tensorflow/cc/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 079e063d3e..a98f0b00b2 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -530,7 +530,7 @@ cc_library_with_android_deps( "//tensorflow/core/api_def:base_api_def", ], deps = [ - "//tensorflow/core:framework", + "//tensorflow/core:framework_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:op_gen_lib", -- cgit v1.2.3 From eb61daae91432be0b07bb2f6854887bedfa6fc95 Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Tue, 26 Jun 2018 00:57:33 -0700 Subject: [C API]: Bugfix for TF_AddGradients. TF_AddGradients could create nodes in the graph with names that conflicted with other nodes in the graph. This would most clearly happen if TF_AddGradients() was called twice on the same graph, and could also happen if there were other nodes in the graph that happened to have "gradients" as a prefix of their name. Fix that. The added test in c_api_test.cc would fail in the call to TF_SessionRun() with the error "Node 'gradients/OnesLike' is not unique" without the changes to c_api.cc and c_api_internal.h. While at it, also fixed a possible name collision bug when using the C++ API to construct graphs (using Scope). Thanks @karllessard for pointing this out. PiperOrigin-RevId: 202087996 --- tensorflow/c/c_api.cc | 13 ++++++- tensorflow/c/c_api_test.cc | 65 +++++++++++++++++++++++++++++--- tensorflow/c/c_test_util.cc | 7 ++++ tensorflow/c/c_test_util.h | 3 ++ tensorflow/cc/framework/scope.cc | 30 +++++++++++---- tensorflow/cc/framework/scope_internal.h | 3 +- tensorflow/cc/framework/scope_test.cc | 10 +++++ 7 files changed, 116 insertions(+), 15 deletions(-) (limited to 'tensorflow/cc') diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 09a03639d6..37c8302e08 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -2414,7 +2414,18 @@ void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx, for (int i = first_new_node_id; i < g->graph.num_node_ids(); ++i) { Node* n = g->graph.FindNodeId(i); if (n == nullptr) continue; - g->name_map[n->name()] = n; + // We have a convoluted scheme here: using the C++ graph construction API + // to add potentially many nodes to the graph without running the checks + // (such as uniqueness of node names) that we run with other functions + // that add a node to the graph (like TF_FinishOperation). + if (!g->name_map.insert(std::make_pair(n->name(), n)).second) { + status->status = tensorflow::errors::Internal( + "BUG: The API allowed construction of a graph with duplicate node " + "names (", + n->name(), + "). This is a bug. 
Please file an issue at " + "https://github.com/tensorflow/tensorflow/issues."); + } } } diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 577f10c5e6..bc04b53fbb 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -1160,7 +1160,7 @@ TEST(CAPI, GetOpDef) { } void StringVectorToArrays(const std::vector& v, - std::unique_ptr* ptrs, + std::unique_ptr* ptrs, std::unique_ptr* lens) { ptrs->reset(new const void*[v.size()]); lens->reset(new size_t[v.size()]); @@ -1196,7 +1196,7 @@ class CApiColocationTest : public ::testing::Test { void SetViaStringList(TF_OperationDescription* desc, const std::vector& list) { - std::unique_ptr list_ptrs; + std::unique_ptr list_ptrs; std::unique_ptr list_lens; StringVectorToArrays(list, &list_ptrs, &list_lens); TF_SetAttrStringList(desc, tensorflow::kColocationAttrName, list_ptrs.get(), @@ -1700,6 +1700,61 @@ TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) { TestGradientsError(false); } +void ScalarFloatFromTensor(const TF_Tensor* t, float* f) { + ASSERT_TRUE(t != nullptr); + ASSERT_EQ(TF_FLOAT, TF_TensorType(t)); + ASSERT_EQ(0, TF_NumDims(t)); + ASSERT_EQ(4, TF_TensorByteSize(t)); + float* p = static_cast(TF_TensorData(t)); + *f = *p; +} + +TEST_F(CApiGradientsTest, MultipleCallsToAddGradients) { + const float X = 3.0f, Y = 7.0f; + TF_Operation* x = Placeholder(graph_, s_, "x", TF_FLOAT); + TF_Operation* y = Placeholder(graph_, s_, "y", TF_FLOAT); + TF_Operation* xy = Mul(x, y, graph_, s_, "xy"); + TF_Output dxy_dx, dxy_dy; + + TF_Output outputs[1] = {{xy, 0}}; + TF_Output inputs[1] = {{x, 0}}; + TF_AddGradients(graph_, outputs, 1, inputs, 1, nullptr, s_, &dxy_dx); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + inputs[0] = {y, 0}; + TF_AddGradients(graph_, outputs, 1, inputs, 1, nullptr, s_, &dxy_dy); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_SessionOptions* opts = TF_NewSessionOptions(); + TF_Session* sess = TF_NewSession(graph_, opts, s_); + TF_DeleteSessionOptions(opts); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_Output feeds[] = {{x, 0}, {y, 0}}; + TF_Tensor* feedValues[] = {FloatTensor(X), FloatTensor(Y)}; + TF_Output fetches[] = {dxy_dx, dxy_dy}; + TF_Tensor* fetchValues[] = {nullptr, nullptr}; + + TF_SessionRun(sess, nullptr /* run_options */, feeds, feedValues, 2, fetches, + fetchValues, 2, nullptr /* target_opers */, 0, + nullptr /* run_metadata */, s_); + TF_DeleteTensor(feedValues[0]); + TF_DeleteTensor(feedValues[1]); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_DeleteSession(sess, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + float dxy_dxValue = 0.0f, dxy_dyValue = 0.0f; + ScalarFloatFromTensor(fetchValues[0], &dxy_dxValue); + EXPECT_EQ(Y, dxy_dxValue); + + ScalarFloatFromTensor(fetchValues[1], &dxy_dyValue); + EXPECT_EQ(X, dxy_dyValue); + + TF_DeleteTensor(fetchValues[0]); + TF_DeleteTensor(fetchValues[1]); +} + // REGISTER_OP for CApiAttributesTest test cases. // Registers two ops, each with a single attribute called 'v'. 
// The attribute in one op will have a type 'type', the other @@ -1784,7 +1839,7 @@ TEST_F(CApiAttributesTest, String) { TEST_F(CApiAttributesTest, StringList) { std::vector list = {"bugs", "bunny", "duck"}; - std::unique_ptr list_ptrs; + std::unique_ptr list_ptrs; std::unique_ptr list_lens; StringVectorToArrays(list, &list_ptrs, &list_lens); int list_total_size = 0; @@ -1800,7 +1855,7 @@ TEST_F(CApiAttributesTest, StringList) { ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); EXPECT_TF_META("v", list.size(), TF_ATTR_STRING, list_total_size); - std::unique_ptr values(new void*[list.size()]); + std::unique_ptr values(new void*[list.size()]); std::unique_ptr lens(new size_t[list.size()]); std::unique_ptr storage(new char[list_total_size]); TF_OperationGetAttrStringList(oper, "v", values.get(), lens.get(), @@ -2025,7 +2080,7 @@ TEST_F(CApiAttributesTest, TensorShapeProtoList) { tensorflow::PartialTensorShape(pts2).AsProto(&proto); proto.SerializeToString(&bytes2); - std::unique_ptr list_ptrs; + std::unique_ptr list_ptrs; std::unique_ptr list_lens; const std::vector list = {bytes1, bytes2}; StringVectorToArrays(list, &list_ptrs, &list_lens); diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc index f3b28c1708..24eb6c069b 100644 --- a/tensorflow/c/c_test_util.cc +++ b/tensorflow/c/c_test_util.cc @@ -216,6 +216,13 @@ TF_Operation* Min(TF_Operation* l, TF_Operation* r, TF_Graph* graph, return MinWithDevice(l, r, graph, /*op_device=*/"", s, name); } +TF_Operation* Mul(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* name) { + TF_Operation* op; + BinaryOpHelper("Mul", l, r, graph, s, name, &op, "", true); + return op; +} + TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s, const char* name) { TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name); diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h index c16aba666e..38313d647c 100644 --- a/tensorflow/c/c_test_util.h +++ b/tensorflow/c/c_test_util.h @@ -80,6 +80,9 @@ TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s, TF_Operation* Min(TF_Operation* l, TF_Operation* r, TF_Graph* graph, TF_Status* s, const char* name = "min"); +TF_Operation* Mul(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* name = "mul"); + // If `op_device` is non-empty, set the created op on that device. 
TF_Operation* MinWithDevice(TF_Operation* l, TF_Operation* r, TF_Graph* graph, const string& op_device, TF_Status* s, diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc index 62a889181e..8c886f3171 100644 --- a/tensorflow/cc/framework/scope.cc +++ b/tensorflow/cc/framework/scope.cc @@ -37,6 +37,11 @@ Scope& Scope::operator=(const Scope& other) { return *this; } +namespace { +const char kScopeSeparator[] = "/"; +const char kSuffixSeparator[] = "_"; +} // namespace + Scope::Impl::Impl(Graph* graph, Status* status, NameMap* name_map, ShapeRefiner* refiner, bool disable_shape_inference) : graph_(graph), @@ -308,19 +313,23 @@ string Scope::Impl::GetUniqueName(const string& prefix, return prefix; } auto entry = name_map_->find(prefix); - string unique_name = prefix; if (entry == name_map_->end()) { name_map_->insert({prefix, 0}); - } else { - unique_name = strings::StrCat(unique_name, "_", ++entry->second); + return prefix; } + string unique_name; + do { + unique_name = strings::StrCat(prefix, kSuffixSeparator, ++entry->second); + } while (name_map_->find(unique_name) != name_map_->end()); + name_map_->insert({unique_name, 0}); return unique_name; } string Scope::Impl::GetNameForOp(const string& default_name) const { const string unique_name = GetUniqueName(default_name, true /* check_single_use */); - const string sep = name_.empty() || unique_name.empty() ? "" : "/"; + const string sep = + name_.empty() || unique_name.empty() ? "" : kScopeSeparator; return strings::StrCat(name_, sep, unique_name); } @@ -345,7 +354,8 @@ Scope Scope::NewSubScope(const string& child_scope_name) const { } const string unique_name = impl()->GetUniqueName(child_scope_name, false /* check_single_use */); - const string sep = impl()->name_.empty() || unique_name.empty() ? "" : "/"; + const string sep = + impl()->name_.empty() || unique_name.empty() ? "" : kScopeSeparator; return Scope(new Impl(*this, Impl::Tags::ScopeName(), strings::StrCat(impl()->name_, sep, unique_name), false /* copy_names */)); @@ -412,7 +422,7 @@ CompositeOpScopes Scope::GetCompositeOpScopes( if (!impl()->single_use_scope()) { Scope child = NewSubScope(impl()->op_name_.empty() ? composite_op_name : impl()->op_name_); - const string child_op_sep = impl()->name_.empty() ? "" : "_"; + const string child_op_sep = impl()->name_.empty() ? "" : kSuffixSeparator; const string child_name = strings::StrCat(impl()->name_, child_op_sep, child.impl()->name_); return {child, @@ -435,7 +445,13 @@ class InternalScope { static Scope NewScope(Graph* graph, Status* status, ShapeRefiner* refiner) { Scope::Impl::NameMap* name_map = new Scope::Impl::NameMap; for (const Node* node : graph->nodes()) { - (*name_map)[node->name()] = 0; + const string& name = node->name(); + (*name_map)[name] = 0; + // Add all name prefixes ('/' separated). + size_t idx = -1; + while ((idx = name.find(kScopeSeparator, idx + 1)) != string::npos) { + (*name_map)[name.substr(0, idx)] = 0; + } } // We provide null destructors for these shared ptrs (except for name_map) // since the caller owns them and doesn't want the scope to destroy them. diff --git a/tensorflow/cc/framework/scope_internal.h b/tensorflow/cc/framework/scope_internal.h index 8efcfed20d..58adaef2e9 100644 --- a/tensorflow/cc/framework/scope_internal.h +++ b/tensorflow/cc/framework/scope_internal.h @@ -34,8 +34,7 @@ class Scope::Impl { // name that has not been used so far in a scope will get no suffix. Later // uses of the same name will get suffixes _1, _2, _3, etc. 
Multiple scopes // can share the same NameMap. For instance, a new scope created using - // WithControlDependencies() should would share the same NameMap with the - // parent. + // WithControlDependencies() would share the same NameMap with the parent. typedef std::unordered_map NameMap; Impl(const std::shared_ptr& graph, diff --git a/tensorflow/cc/framework/scope_test.cc b/tensorflow/cc/framework/scope_test.cc index 9eca9d3fac..b40b345eb8 100644 --- a/tensorflow/cc/framework/scope_test.cc +++ b/tensorflow/cc/framework/scope_test.cc @@ -26,6 +26,16 @@ TEST(ScopeTest, BasicNames) { EXPECT_EQ(root.GetUniqueNameForOp("mul"), "mul"); } +TEST(ScopeTest, OpAndScopeNameCollision) { + Scope root = Scope::NewRootScope(); + EXPECT_EQ(root.GetUniqueNameForOp("foo"), "foo"); + EXPECT_EQ(root.GetUniqueNameForOp("foo"), "foo_1"); + EXPECT_EQ(root.GetUniqueNameForOp("foo_1"), "foo_1_1"); + EXPECT_EQ(root.GetUniqueNameForOp("foo_2"), "foo_2"); + EXPECT_EQ(root.GetUniqueNameForOp("foo"), "foo_3"); + EXPECT_EQ(root.GetUniqueNameForOp("foo_2"), "foo_2_1"); +} + TEST(ScopeTest, HierarchicalNames) { Scope root = Scope::NewRootScope(); Scope child = root.NewSubScope("child"); -- cgit v1.2.3 From 37c79b0d807e12067bc0d732e074a829757adb05 Mon Sep 17 00:00:00 2001 From: KB Sriram Date: Tue, 26 Jun 2018 12:55:37 -0700 Subject: C++ gradient for Slice (#17592) See https://github.com/tensorflow/tensorflow/issues/9645 --- tensorflow/cc/gradients/array_grad.cc | 52 ++++++++++++++++++++++++++++++ tensorflow/cc/gradients/array_grad_test.cc | 7 ++++ 2 files changed, 59 insertions(+) (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index ff348fadb2..b353accddc 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -421,6 +421,58 @@ Status StridedSliceGradHelper(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("StridedSlice", StridedSliceGradHelper); +Status SliceGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // Propagate the incoming gradient along all the selected values, + // and zero everywhere else. Use the Pad operator for this. + // + // First create an Nx2 padding where N is the number of input + // dimensions. The first column is the number of prepended zeros + // for each dimension, and the second column is the number of + // appended zeros. + // + // The first column is just the begin vector. 
+ // The second column is the shape of the input element-wise + // subtracted by begin+size + + // Running example: + // input.shape = [3, 5, 3] + // begin = [1, 2, 1], size = [1, 3, 2] + Input input = op.input(0); + Input begin = op.input(1); + // input_rank = 3 + auto input_rank = Rank(scope, input); + // slice_size = [1, 3, 2] + auto slice_size = Shape(scope, op.output(0)); + // padding_shape = [3, 1] + auto padding_shape = Stack(scope, {input_rank, 1}); + // before_padding = [[1] + // [2] + // [1]] + Input before_padding = Reshape(scope, begin, padding_shape); + // after_padding_sizes = shape(input) - slice_size - begin + // = [3, 5, 3] - [1, 3, 2] - [1, 2, 1] + // = [1, 0, 0] + auto after_padding_sizes = + Sub(scope, Sub(scope, Shape(scope, input), slice_size), begin); + // after_padding = [[1] + // [0] + // [0]] + Input after_padding = Reshape(scope, after_padding_sizes, padding_shape); + // paddings = [[1 1] + // [2 0] + // [1 0]] + auto paddings = + Concat(scope, {before_padding, after_padding}, Const(scope, 1)); + grad_outputs->push_back(Pad(scope, grad_inputs[0], paddings)); + // Nothing propagated for "begin" and "size" inputs + grad_outputs->push_back(NoGradient()); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("Slice", SliceGrad); + } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc index de3bd0fc9e..d09275b648 100644 --- a/tensorflow/cc/gradients/array_grad_test.cc +++ b/tensorflow/cc/gradients/array_grad_test.cc @@ -378,5 +378,12 @@ TEST_F(ArrayGradTest, StridedSliceGrad) { RunTest(x, x_shape, y, {1, 2, 2, 2}); } +TEST_F(ArrayGradTest, SliceGrad) { + TensorShape x_shape({3, 5, 3}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto y = Slice(scope_, x, {1, 2, 1}, {1, 3, 2}); + RunTest(x, x_shape, y, {1, 3, 2}); +} + } // namespace } // namespace tensorflow -- cgit v1.2.3 From 9249690173fe4bf46f2fc170f3e68f5496d7609c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 4 Jun 2018 15:59:36 +0000 Subject: Remove TODO and enable complex for tan in test case. 
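For reference, the gradient identity the test exercises carries over to complex arguments unchanged (a sketch of the math, not a claim about the kernel internals):

    \frac{d}{dz}\tan z = \frac{1}{\cos^2 z} = 1 + \tan^2 z

valid for every complex z with cos z != 0, so the test can reuse the existing TestCWiseGrad machinery as-is once the Tan kernel accepts complex inputs.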
Signed-off-by: Yong Tang --- tensorflow/cc/gradients/math_grad_test.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc index fd7b6fe662..1c9bdff5e1 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -475,11 +475,7 @@ TEST_F(CWiseUnaryGradTest, Tan_Complex) { auto x_fn = [this](const int i) { return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}}); }; - // TODO(kbsriram) - // Enable when tan kernel supports complex inputs - if (false) { - TestCWiseGrad(TAN, x_fn); - } + TestCWiseGrad(TAN, x_fn); } TEST_F(CWiseUnaryGradTest, Atan) { -- cgit v1.2.3 From 7fe0ae12ec42eca1ea07d93bbd63de394743a018 Mon Sep 17 00:00:00 2001 From: Pradeep Banavara Date: Thu, 12 Jul 2018 21:52:13 -0400 Subject: Fix: #12686 SoftmaxCrossEntropyWithLogits Committing in a new PR as the old PR has too many commit files --- tensorflow/cc/gradients/nn_grad.cc | 94 +++++++++++++++++++++++++++++---- tensorflow/cc/gradients/nn_grad_test.cc | 29 ++++++++-- 2 files changed, 109 insertions(+), 14 deletions(-) (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index c73482d5f4..dc6477e59d 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -47,6 +47,81 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad); +bool IsZero(const Scope& scope, Output grad) { + std::array zero_op_type_names{{"ZerosLike", "Zeros"}}; + string op_type_name = grad.op().node()->type_string(); + for (auto& zero_op_type_name : zero_op_type_names) { + if (op_type_name == zero_op_type_name) { + return true; + } + } + // The Operation we were provided is not named something obvious, so + // we need to actually look at its contents. + // The original Python code did this by calling a utility function called + // tensor_util.constant_value. When you dig into tensor_util.constant_value, + // it is a large number of 'if' statements that handle certain edge cases + // where it is possible to get the value of the tensor without actually + // evaluating it. There are many kinds of tensors for which this cannot be + // done. + // There is no C++ equivalent to tensor_util.constant_value, so we do nothing + // for the moment. + return false; +} + +Output BroadcastMul(const Scope& scope, Output vec, Output mat) { + /* Multiply after broadcasting vec to match dimensions of mat. + Args: + vec: A 1-D tensor of dimension [D0] + mat: A 2-D tensor of dimension [D0, D1] + + Returns: + A tensor of dimension [D0, D1], the result of vec * mat; + we use an element-for-element multiply here. + */ + auto reshaped = ExpandDims(scope, vec, -1); + return Multiply(scope, reshaped, mat); +} + +Status SoftmaxCrossEntropyWithLogitsGrad(const Scope& scope, + const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // Softmax gradient with cross-entropy logits function. + // We multiply the backprop for cost with the gradients - op.output[1]. + // There is no gradient for labels. + auto logits = + op.input(0); // The outputs of the network are at + // input index 0. The "truth" labels are at index 1. 
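+  // A sketch of the underlying math, assuming the standard softmax
+  // cross-entropy formulation: with p = Softmax(logits) and, per row,
+  // loss = -sum_j labels_j * log(p_j), the first-order gradient w.r.t.
+  // logits is p - labels. That is exactly what the kernel exposes as
+  // op.output(1) ("backprop"), so the code below only needs to scale it by
+  // grad_loss and, when grad_grad is nonzero, add a second-order correction.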
+ auto softmax_grad = op.output(1); + + // The documentation for ops::SoftmaxCrossEntropyWithLogits says + // loss is the output at index 0, and backprop is the output at index 1 + auto grad_loss = grad_inputs[0]; + auto grad_grad = grad_inputs[1]; + + auto grad = BroadcastMul(scope, grad_loss, softmax_grad); + if (!IsZero(scope, grad_grad)) { + std::vector axis; + auto logitsSoftmax = Softmax(scope, logits); + + auto grad_gradExpand = ExpandDims(scope, grad_grad, 1); + auto logitsSoftMaxExpand = ExpandDims(scope, logitsSoftmax, 2); + auto matMulResult = + BatchMatMul(scope, grad_gradExpand, logitsSoftMaxExpand); + axis.push_back(1); + auto squeezeResult = Squeeze(scope, matMulResult, Squeeze::Axis(axis)); + auto subtractionResult = Subtract(scope, grad_grad, squeezeResult); + auto multiplyResult = Multiply(scope, subtractionResult, logitsSoftmax); + grad = Add(scope, grad, multiplyResult); + } + auto minusLogSoftmax = Multiply(scope, LogSoftmax(scope, logits), -1.0f); + grad_outputs->push_back(grad); + grad_outputs->push_back(BroadcastMul(scope, grad_loss, minusLogSoftmax)); + return scope.status(); +} +REGISTER_GRADIENT_OP("SoftmaxCrossEntropyWithLogits", + SoftmaxCrossEntropyWithLogitsGrad); + Status LogSoftmaxGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { @@ -195,9 +270,9 @@ Status MaxPool3DGradHelper(const Scope& scope, const Operation& op, TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); MaxPool3DGrad::Attrs grad_attrs; - auto dx = MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0], - ksize, strides, padding, - grad_attrs.DataFormat(data_format)); + auto dx = + MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0], ksize, + strides, padding, grad_attrs.DataFormat(data_format)); grad_outputs->push_back(dx); return scope.status(); } @@ -216,10 +291,9 @@ Status AvgPoolGradHelper(const Scope& scope, const Operation& op, TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); internal::AvgPoolGrad::Attrs grad_attrs; - auto dx = - internal::AvgPoolGrad(scope, Shape(scope, op.input(0)), grad_inputs[0], - ksize, strides, padding, - grad_attrs.DataFormat(data_format)); + auto dx = internal::AvgPoolGrad(scope, Shape(scope, op.input(0)), + grad_inputs[0], ksize, strides, padding, + grad_attrs.DataFormat(data_format)); grad_outputs->push_back(dx); return scope.status(); } @@ -238,9 +312,9 @@ Status AvgPool3DGradHelper(const Scope& scope, const Operation& op, TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); AvgPool3DGrad::Attrs grad_attrs; - auto dx = AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0], - ksize, strides, padding, - grad_attrs.DataFormat(data_format)); + auto dx = + AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0], ksize, + strides, padding, grad_attrs.DataFormat(data_format)); grad_outputs->push_back(dx); return scope.status(); } diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index b4d457a9d1..f26a7e99e6 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -25,6 +25,8 @@ limitations under the License. 
namespace tensorflow { namespace { +using ops::AvgPool; +using ops::AvgPool3D; using ops::BiasAdd; using ops::Conv2D; using ops::Elu; @@ -33,11 +35,9 @@ using ops::FractionalMaxPool; using ops::L2Loss; using ops::LogSoftmax; using ops::LRN; -using ops::AvgPool; -using ops::AvgPool3D; using ops::MaxPool; -using ops::MaxPoolV2; using ops::MaxPool3D; +using ops::MaxPoolV2; using ops::Placeholder; using ops::Relu; using ops::Relu6; @@ -111,6 +111,27 @@ TEST_F(NNGradTest, SoftmaxGrad) { RunTest(x, shape, y, shape); } +TEST_F(NNGradTest, SoftmaxCrossEntropyWithLogitsGrad) { + TensorShape logitsShape( + {5, 3}); // batch size of 5, 3 possible labels (classes); + // logits are what the network produces; they are compared + // to the labels, which are the ground truth + TensorShape lossShape( + {5}); // batch size of 5, one loss value per entry in the batch; + // the loss is the cross-entropy computed from logits and labels + + auto logits = Placeholder(scope_, DT_FLOAT, + Placeholder::Shape(logitsShape)); // estimation + auto labels = + Placeholder(scope_, DT_FLOAT, Placeholder::Shape(logitsShape)); // truth + auto y = + tensorflow::ops::SoftmaxCrossEntropyWithLogits(scope_, logits, labels); + // Please note the reversal of the backprop and loss orders. A separate issue + // #18734 has been opened for this. + RunTest({logits, labels}, {logitsShape, logitsShape}, {y.backprop, y.loss}, + {logitsShape, lossShape}); +} + TEST_F(NNGradTest, LogSoftmaxGrad) { TensorShape shape({5, 3}); auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); @@ -253,7 +274,7 @@ TEST_F(NNGradTest, AvgPool3DGradHelper) { RunTest(x, x_shape, y, y_shape); } -TEST_F(NNGradTest, LRN){ +TEST_F(NNGradTest, LRN) { TensorShape x_shape({1, 1, 2, 1}); auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); auto y = LRN(scope_, x); -- cgit v1.2.3 From 98010279f40e4963512ba2f2f39c3d732aef7b93 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 13 Jul 2018 07:21:37 -0700 Subject: TF SavedModel: Split off a reader from the loader module PiperOrigin-RevId: 204468340 --- tensorflow/cc/saved_model/BUILD | 30 ++++++ tensorflow/cc/saved_model/loader.cc | 70 ++----------- tensorflow/cc/saved_model/reader.cc | 88 +++++++++++++++++ tensorflow/cc/saved_model/reader.h | 39 ++++++++ tensorflow/cc/saved_model/reader_test.cc | 108 +++++++++++++++++++++ .../java/org/tensorflow/SavedModelBundleTest.java | 2 +- 6 files changed, 272 insertions(+), 65 deletions(-) create mode 100644 tensorflow/cc/saved_model/reader.cc create mode 100644 tensorflow/cc/saved_model/reader.h create mode 100644 tensorflow/cc/saved_model/reader_test.cc (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 06a3be18e0..730b1b669b 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -33,6 +33,35 @@ cc_library( hdrs = ["tag_constants.h"], ) +cc_library( + name = "reader", + srcs = ["reader.cc"], + hdrs = ["reader.h"], + deps = [ + ":constants", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], +) + +tf_cc_test( + name = "reader_test", + srcs = ["reader_test.cc"], + data = [ + ":saved_model_half_plus_two", + ], + linkstatic = 1, + deps = [ + ":constants", + ":reader", + ":tag_constants", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + cc_library( name = "loader", hdrs = ["loader.h"], @@ -54,6 +83,7 @@ cc_library( hdrs = ["loader.h"], deps = [ ":constants", + ":reader", ] + if_not_mobile([ "//tensorflow/core:core_cpu", "//tensorflow/core:framework", diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index faa1e378d0..07807ed2f3 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -18,8 +18,10 @@ limitations under the License. 
#include #include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/reader.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/monitoring/counter.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/protobuf_internal.h" @@ -43,56 +45,6 @@ auto* load_latency = monitoring::Counter<1>::New( constexpr char kLoadAttemptFail[] = "fail"; constexpr char kLoadAttemptSuccess[] = "success"; -Status ReadSavedModel(const string& export_dir, SavedModel* saved_model_proto) { - const string saved_model_pb_path = - io::JoinPath(export_dir, kSavedModelFilenamePb); - if (Env::Default()->FileExists(saved_model_pb_path).ok()) { - return ReadBinaryProto(Env::Default(), saved_model_pb_path, - saved_model_proto); - } - const string saved_model_pbtxt_path = - io::JoinPath(export_dir, kSavedModelFilenamePbTxt); - if (Env::Default()->FileExists(saved_model_pbtxt_path).ok()) { - return ReadTextProto(Env::Default(), saved_model_pbtxt_path, - saved_model_proto); - } - return Status(error::Code::NOT_FOUND, - "Could not find SavedModel .pb or .pbtxt at supplied export " - "directory path: " + - export_dir); -} - -string GetTagsAsString(const std::unordered_set& tags) { - string tags_as_string = "{ "; - for (const string& tag : tags) { - tags_as_string = strings::StrCat(tags_as_string, tag, " "); - } - tags_as_string = strings::StrCat(tags_as_string, "}"); - return tags_as_string; -} - -Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto, - const std::unordered_set& tags, - MetaGraphDef* meta_graph_def_to_load) { - for (const MetaGraphDef& meta_graph_def : saved_model_proto.meta_graphs()) { - // Get tags from the meta_graph_def. - std::unordered_set graph_tags; - for (const string& tag : meta_graph_def.meta_info_def().tags()) { - graph_tags.insert(tag); - } - // Match with the set of tags provided. - if (graph_tags == tags) { - *meta_graph_def_to_load = meta_graph_def; - return Status::OK(); - } - } - return Status(error::Code::NOT_FOUND, - "Could not find meta graph def matching supplied tags: " + - GetTagsAsString(tags) + - ". 
To inspect available tag-sets in the SavedModel, please " - "use the SavedModel CLI: `saved_model_cli`"); -} - Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def, const SessionOptions& session_options, std::unique_ptr* session) { @@ -235,18 +187,8 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, const string& export_dir, const std::unordered_set& tags, SavedModelBundle* const bundle) { - if (!MaybeSavedModelDirectory(export_dir)) { - return Status(error::Code::NOT_FOUND, - "SavedModel not found in export directory: " + export_dir); - } - LOG(INFO) << "Loading SavedModel with tags: " << GetTagsAsString(tags) - << "; from: " << export_dir; - - SavedModel saved_model_proto; - TF_RETURN_IF_ERROR(ReadSavedModel(export_dir, &saved_model_proto)); - - TF_RETURN_IF_ERROR( - FindMetaGraphDefToLoad(saved_model_proto, tags, &bundle->meta_graph_def)); + TF_RETURN_IF_ERROR(ReadMetaGraphDefFromSavedModel(export_dir, tags, + &bundle->meta_graph_def)); TF_RETURN_IF_ERROR(LoadMetaGraphIntoSession( bundle->meta_graph_def, session_options, &bundle->session)); @@ -288,8 +230,8 @@ Status LoadSavedModel(const SessionOptions& session_options, return end_microseconds - start_microseconds; }(); auto log_and_count = [&](const string& status_str) { - LOG(INFO) << "SavedModel load for tags " << GetTagsAsString(tags) - << "; Status: " << status_str << ". Took " + LOG(INFO) << "SavedModel load for tags { " << str_util::Join(tags, " ") + << " }; Status: " << status_str << ". Took " << load_latency_microsecs << " microseconds."; load_attempt_count->GetCell(export_dir, status_str)->IncrementBy(1); }; diff --git a/tensorflow/cc/saved_model/reader.cc b/tensorflow/cc/saved_model/reader.cc new file mode 100644 index 0000000000..2146c8a197 --- /dev/null +++ b/tensorflow/cc/saved_model/reader.cc @@ -0,0 +1,88 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/saved_model/reader.h" + +#include + +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" + +namespace tensorflow { +namespace { + +Status ReadSavedModel(const string& export_dir, SavedModel* saved_model_proto) { + LOG(INFO) << "Reading SavedModel from: " << export_dir; + + const string saved_model_pb_path = + io::JoinPath(export_dir, kSavedModelFilenamePb); + if (Env::Default()->FileExists(saved_model_pb_path).ok()) { + return ReadBinaryProto(Env::Default(), saved_model_pb_path, + saved_model_proto); + } + const string saved_model_pbtxt_path = + io::JoinPath(export_dir, kSavedModelFilenamePbTxt); + if (Env::Default()->FileExists(saved_model_pbtxt_path).ok()) { + return ReadTextProto(Env::Default(), saved_model_pbtxt_path, + saved_model_proto); + } + return Status(error::Code::NOT_FOUND, + "Could not find SavedModel .pb or .pbtxt at supplied export " + "directory path: " + + export_dir); +} + +Status FindMetaGraphDef(const SavedModel& saved_model_proto, + const std::unordered_set& tags, + MetaGraphDef* meta_graph_def) { + LOG(INFO) << "Reading meta graph with tags { " << str_util::Join(tags, " ") + << " }"; + for (const MetaGraphDef& graph_def : saved_model_proto.meta_graphs()) { + // Get tags from the graph_def. + std::unordered_set graph_tags; + for (const string& tag : graph_def.meta_info_def().tags()) { + graph_tags.insert(tag); + } + // Match with the set of tags provided. + if (graph_tags == tags) { + *meta_graph_def = graph_def; + return Status::OK(); + } + } + return Status( + error::Code::NOT_FOUND, + strings::StrCat( + "Could not find meta graph def matching supplied tags: { ", + str_util::Join(tags, " "), + " }. To inspect available tag-sets in the SavedModel, please " + "use the SavedModel CLI: `saved_model_cli`")); +} + +} // namespace + +Status ReadMetaGraphDefFromSavedModel(const string& export_dir, + const std::unordered_set& tags, + MetaGraphDef* const meta_graph_def) { + SavedModel saved_model_proto; + TF_RETURN_IF_ERROR(ReadSavedModel(export_dir, &saved_model_proto)); + TF_RETURN_IF_ERROR(FindMetaGraphDef(saved_model_proto, tags, meta_graph_def)); + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/cc/saved_model/reader.h b/tensorflow/cc/saved_model/reader.h new file mode 100644 index 0000000000..5815108df2 --- /dev/null +++ b/tensorflow/cc/saved_model/reader.h @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/// Functions to read the SavedModel proto, or parts of it. 
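+///
+/// A minimal usage sketch (the export directory path is a placeholder):
+///
+///   MetaGraphDef meta_graph_def;
+///   TF_RETURN_IF_ERROR(ReadMetaGraphDefFromSavedModel(
+///       "/path/to/export_dir", {"serve"}, &meta_graph_def));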
+ +#ifndef TENSORFLOW_CC_SAVED_MODEL_READER_H_ +#define TENSORFLOW_CC_SAVED_MODEL_READER_H_ + +#include +#include + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { + +// Reads the SavedModel proto from saved_model.pb(txt) in the given directory, +// finds the MetaGraphDef that matches the given set of tags and writes it to +// the `meta_graph_def` parameter. Returns a failure status when the SavedModel +// file does not exist or no MetaGraphDef matches the tags. +Status ReadMetaGraphDefFromSavedModel(const string& export_dir, + const std::unordered_set& tags, + MetaGraphDef* const meta_graph_def); + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_READER_H_ diff --git a/tensorflow/cc/saved_model/reader_test.cc b/tensorflow/cc/saved_model/reader_test.cc new file mode 100644 index 0000000000..620e9c2eec --- /dev/null +++ b/tensorflow/cc/saved_model/reader_test.cc @@ -0,0 +1,108 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/saved_model/reader.h" + +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/tag_constants.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +constexpr char kTestDataPbTxt[] = + "cc/saved_model/testdata/half_plus_two_pbtxt/00000123"; +constexpr char kTestDataSharded[] = + "cc/saved_model/testdata/half_plus_two/00000123"; + +class ReaderTest : public ::testing::Test { + protected: + ReaderTest() {} + + void CheckMetaGraphDef(const MetaGraphDef& meta_graph_def) { + const auto& tags = meta_graph_def.meta_info_def().tags(); + EXPECT_TRUE(std::find(tags.begin(), tags.end(), kSavedModelTagServe) != + tags.end()); + EXPECT_NE(meta_graph_def.meta_info_def().tensorflow_version(), ""); + EXPECT_EQ( + meta_graph_def.signature_def().at("serving_default").method_name(), + "tensorflow/serving/predict"); + } +}; + +TEST_F(ReaderTest, TagMatch) { + MetaGraphDef meta_graph_def; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, + &meta_graph_def)); + CheckMetaGraphDef(meta_graph_def); +} + +TEST_F(ReaderTest, NoTagMatch) { + MetaGraphDef meta_graph_def; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + Status st = ReadMetaGraphDefFromSavedModel(export_dir, {"missing-tag"}, + &meta_graph_def); + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(str_util::StrContains( + st.error_message(), + "Could not find meta graph def matching supplied tags: { missing-tag }")) + << st.error_message(); +} + +TEST_F(ReaderTest, NoTagMatchMultiple) 
{ + MetaGraphDef meta_graph_def; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + Status st = ReadMetaGraphDefFromSavedModel( + export_dir, {kSavedModelTagServe, "missing-tag"}, &meta_graph_def); + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(str_util::StrContains( + st.error_message(), + "Could not find meta graph def matching supplied tags: ")) + << st.error_message(); +} + +TEST_F(ReaderTest, PbtxtFormat) { + MetaGraphDef meta_graph_def; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPbTxt); + TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, + &meta_graph_def)); + CheckMetaGraphDef(meta_graph_def); +} + +TEST_F(ReaderTest, InvalidExportPath) { + MetaGraphDef meta_graph_def; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), "missing-path"); + Status st = ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, + &meta_graph_def); + EXPECT_FALSE(st.ok()); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/java/src/test/java/org/tensorflow/SavedModelBundleTest.java b/tensorflow/java/src/test/java/org/tensorflow/SavedModelBundleTest.java index 7922f3329c..b063b6f1cd 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/SavedModelBundleTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/SavedModelBundleTest.java @@ -47,7 +47,7 @@ public class SavedModelBundleTest { fail("not expected"); } catch (org.tensorflow.TensorFlowException e) { // expected exception - assertTrue(e.getMessage().contains("SavedModel not found")); + assertTrue(e.getMessage().contains("Could not find SavedModel")); } } } -- cgit v1.2.3 From e9e48b963b1ad1274ad8a0ad7d07d7fa990fe6b9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Jul 2018 08:52:49 -0700 Subject: Update `reader` dependencies such that the SavedModel loader still works on mobile. PiperOrigin-RevId: 205248073 --- tensorflow/cc/saved_model/BUILD | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 730b1b669b..3d3895c8fa 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -39,9 +39,20 @@ cc_library( hdrs = ["reader.h"], deps = [ ":constants", + ] + if_not_mobile([ + # TODO(b/111634734): :lib and :protos_all contain dependencies that + # cannot be built on mobile platforms. Instead, include the appropriate + # tf_lib depending on the build platform. "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - ], + ]) + if_mobile([ + # Mobile-friendly SavedModel proto. See go/portable-proto for more info. + "//tensorflow/core:saved_model_portable_proto", + ]) + if_android([ + "//tensorflow/core:android_tensorflow_lib", + ]) + if_ios([ + "//tensorflow/core:ios_tensorflow_lib", + ]), ) tf_cc_test( -- cgit v1.2.3 From cf94a46c34f8568608d78b77e9a1c4369ebcafa2 Mon Sep 17 00:00:00 2001 From: Karmel Allison Date: Mon, 23 Jul 2018 14:47:30 -0700 Subject: The SavedModel legacy_init_op and main_op are functionally equivalent. Here, we remove duplicated code paths by mapping legacy_init_op into main_op in the SavedModelBuilder, and we deprecate the legacy_init_op arg. Note that the loader will still look for both, so old SavedModels will still load without trouble. 
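With the two collections handled by one helper, the loader's initialization step reduces to picking a collection key, roughly (a sketch; the full change is in the diff below):

    if (HasMainOp(bundle->meta_graph_def)) {
      RunMainOp(..., kSavedModelMainOpKey);
    } else {
      RunMainOp(..., kSavedModelLegacyInitOpKey);
    }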
PiperOrigin-RevId: 205728344 --- tensorflow/cc/saved_model/loader.cc | 39 +++--------- tensorflow/python/saved_model/builder_impl.py | 76 ++++++++++++----------- tensorflow/python/saved_model/loader_impl.py | 42 ++++--------- tensorflow/python/saved_model/saved_model_test.py | 28 ++++++--- 4 files changed, 79 insertions(+), 106 deletions(-) (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 07807ed2f3..d47b025743 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -86,10 +86,11 @@ bool HasMainOp(const MetaGraphDef& meta_graph_def) { Status RunMainOp(const RunOptions& run_options, const string& export_dir, const MetaGraphDef& meta_graph_def, const std::vector& asset_file_defs, - Session* session) { - LOG(INFO) << "Running MainOp on SavedModel bundle."; + Session* session, const string& main_op_key) { + LOG(INFO) << "Running MainOp with key " << main_op_key + << " on SavedModel bundle."; const auto& collection_def_map = meta_graph_def.collection_def(); - const auto main_op_it = collection_def_map.find(kSavedModelMainOpKey); + const auto main_op_it = collection_def_map.find(main_op_key); if (main_op_it != collection_def_map.end()) { if (main_op_it->second.node_list().value_size() != 1) { return errors::FailedPrecondition( @@ -141,30 +142,6 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, nullptr /* outputs */, &run_metadata); } -Status RunLegacyInitOp(const RunOptions& run_options, const string& export_dir, - const MetaGraphDef& meta_graph_def, - const std::vector& asset_file_defs, - Session* session) { - LOG(INFO) << "Running LegacyInitOp on SavedModel bundle."; - const auto& collection_def_map = meta_graph_def.collection_def(); - const auto init_op_it = collection_def_map.find(kSavedModelLegacyInitOpKey); - if (init_op_it != collection_def_map.end()) { - if (init_op_it->second.node_list().value_size() != 1) { - return errors::FailedPrecondition(strings::StrCat( - "Expected exactly one serving init op in : ", export_dir)); - } - std::vector> inputs; - AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); - RunMetadata run_metadata; - const StringPiece legacy_init_op_name = - init_op_it->second.node_list().value(0); - return session->Run(run_options, inputs, {}, - {legacy_init_op_name.ToString()}, nullptr /* outputs */, - &run_metadata); - } - return Status::OK(); -} - Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, std::vector* asset_file_defs) { const auto& collection_def_map = meta_graph_def.collection_def(); @@ -204,11 +181,11 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, if (HasMainOp(bundle->meta_graph_def)) { TF_RETURN_IF_ERROR(RunMainOp(run_options, export_dir, bundle->meta_graph_def, asset_file_defs, - bundle->session.get())); + bundle->session.get(), kSavedModelMainOpKey)); } else { - TF_RETURN_IF_ERROR(RunLegacyInitOp(run_options, export_dir, - bundle->meta_graph_def, asset_file_defs, - bundle->session.get())); + TF_RETURN_IF_ERROR(RunMainOp( + run_options, export_dir, bundle->meta_graph_def, asset_file_defs, + bundle->session.get(), kSavedModelLegacyInitOpKey)); } return Status::OK(); } diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py index e58be804c2..8c985a7c2f 100644 --- a/tensorflow/python/saved_model/builder_impl.py +++ b/tensorflow/python/saved_model/builder_impl.py @@ -34,6 +34,7 @@ from tensorflow.python.platform import 
tf_logging from tensorflow.python.saved_model import constants from tensorflow.python.training import saver as tf_saver from tensorflow.python.util import compat +from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.tf_export import tf_export @@ -133,39 +134,32 @@ class SavedModelBuilder(object): tf_logging.info("Assets written to: %s", compat.as_text(assets_destination_dir)) - def _maybe_add_legacy_init_op(self, legacy_init_op=None): - """Add legacy init op to the SavedModel. + def _maybe_add_main_op(self, main_op): + """Adds main op to the SavedModel. Args: - legacy_init_op: Optional legacy init op to support backward compatibility. + main_op: Main op to run as part of graph initialization. If None, no + main op will be added to the graph. Raises: - TypeError if legacy init op is not of type `Operation`. - AssertionError if the graph already contains one or more legacy init ops. + TypeError: if main op is provided but is not of type `Operation`. + ValueError: if the Graph already contains an init op. """ - if legacy_init_op is not None: - if not isinstance(legacy_init_op, ops.Operation): - raise TypeError("legacy_init_op needs to be an Operation: %r" % - legacy_init_op) - if ops.get_collection(constants.LEGACY_INIT_OP_KEY): - raise AssertionError( - "graph already contains one or more legacy init ops under the " - "collection {}.".format(constants.LEGACY_INIT_OP_KEY)) - ops.add_to_collection(constants.LEGACY_INIT_OP_KEY, legacy_init_op) - - def _add_main_op(self, main_op): - """Add main op to the SavedModel. + if main_op is None: + return - Args: - main_op: Main op to run as part of graph initialization. + if not isinstance(main_op, ops.Operation): + raise TypeError("main_op needs to be an Operation: %r" % main_op) - Raises: - TypeError if main op is not of type `Operation`. - """ - if main_op is not None: - if not isinstance(main_op, ops.Operation): - raise TypeError("main_op needs to be an Operation: %r" % main_op) - ops.add_to_collection(constants.MAIN_OP_KEY, main_op) + # Validate that no other init ops have been added to this graph already. + # We check main_op and legacy_init_op for thoroughness and explicitness. + for init_op_key in (constants.MAIN_OP_KEY, constants.LEGACY_INIT_OP_KEY): + if ops.get_collection(init_op_key): + raise ValueError( + "Graph already contains one or more main ops under the " + "collection {}.".format(init_op_key)) + + ops.add_to_collection(constants.MAIN_OP_KEY, main_op) def _add_train_op(self, train_op): """Add train op to the SavedModel. @@ -257,16 +251,12 @@ class SavedModelBuilder(object): self._validate_tensor_info(outputs[outputs_key]) def _add_collections( - self, assets_collection, legacy_init_op, main_op, train_op): + self, assets_collection, main_op, train_op): """Add asset and op collections to be saved.""" # Save asset files and write them to disk, if any. self._save_and_write_assets(assets_collection) - if main_op is None: - # Add legacy init op to the SavedModel. 
- self._maybe_add_legacy_init_op(legacy_init_op) - else: - self._add_main_op(main_op) + self._maybe_add_main_op(main_op) self._add_train_op(train_op) @@ -282,6 +272,9 @@ class SavedModelBuilder(object): allow_empty=True) return saver + @deprecated_args(None, + "Pass your op to the equivalent parameter main_op instead.", + "legacy_init_op") def add_meta_graph(self, tags, signature_def_map=None, @@ -306,7 +299,7 @@ class SavedModelBuilder(object): that this collection should be a subset of the assets saved as part of the first meta graph in the SavedModel. legacy_init_op: Legacy support for op or group of ops to execute after the - restore op upon a load. + restore op upon a load. Deprecated; please use main_op instead. clear_devices: Set to true if the device info on the default graph should be cleared. main_op: Op or group of ops to execute when the graph is loaded. Note @@ -333,8 +326,12 @@ class SavedModelBuilder(object): # properly populated. self._validate_signature_def_map(signature_def_map) + # legacy_init_op is deprecated, and going away in TF 2.0. + # Re-mapping to main_op, as treatment is identical regardless. + main_op = main_op or legacy_init_op + # Add assets and ops - self._add_collections(assets_collection, legacy_init_op, main_op, None) + self._add_collections(assets_collection, main_op, None) saver = self._maybe_create_saver(saver) @@ -351,6 +348,9 @@ class SavedModelBuilder(object): # Tag the meta graph def and add it to the SavedModel. self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map) + @deprecated_args(None, + "Pass your op to the equivalent parameter main_op instead.", + "legacy_init_op") def add_meta_graph_and_variables(self, sess, tags, @@ -378,7 +378,7 @@ class SavedModelBuilder(object): def. assets_collection: Assets collection to be saved with SavedModel. legacy_init_op: Legacy support for op or group of ops to execute after the - restore op upon a load. + restore op upon a load. Deprecated; please use main_op instead. clear_devices: Set to true if the device info on the default graph should be cleared. main_op: Op or group of ops to execute when the graph is loaded. Note @@ -402,8 +402,12 @@ class SavedModelBuilder(object): # properly populated. self._validate_signature_def_map(signature_def_map) + # legacy_init_op is deprecated, and going away in TF 2.0. + # Re-mapping to main_op, as treatment is identical regardless. + main_op = main_op or legacy_init_op + # Add assets and ops - self._add_collections(assets_collection, legacy_init_op, main_op, None) + self._add_collections(assets_collection, main_op, None) # Create the variables sub-directory, if it does not exist. variables_dir = os.path.join( diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py index e5f649fdab..fb70c91c29 100644 --- a/tensorflow/python/saved_model/loader_impl.py +++ b/tensorflow/python/saved_model/loader_impl.py @@ -116,11 +116,14 @@ def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None): return asset_tensor_dict -def _get_main_op_tensor(meta_graph_def_to_load): +def _get_main_op_tensor( + meta_graph_def_to_load, init_op_key=constants.MAIN_OP_KEY): """Gets the main op tensor, if one exists. Args: meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded. + init_op_key: name of collection to check; should be one of MAIN_OP_KEY + or the deprecated LEGACY_INIT_OP_KEY Returns: The main op tensor, if it exists and `None` otherwise. 
@@ -131,38 +134,15 @@ def _get_main_op_tensor(meta_graph_def_to_load): """ collection_def = meta_graph_def_to_load.collection_def main_op_tensor = None - if constants.MAIN_OP_KEY in collection_def: - main_ops = collection_def[constants.MAIN_OP_KEY].node_list.value + if init_op_key in collection_def: + main_ops = collection_def[init_op_key].node_list.value if len(main_ops) != 1: - raise RuntimeError("Expected exactly one SavedModel main op.") - main_op_tensor = ops.get_collection(constants.MAIN_OP_KEY)[0] + raise RuntimeError("Expected exactly one SavedModel main op. " + "Found: {}".format(main_ops)) + main_op_tensor = ops.get_collection(init_op_key)[0] return main_op_tensor -def _get_legacy_init_op_tensor(meta_graph_def_to_load): - """Gets the legacy init op tensor, if one exists. - - Args: - meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded. - - Returns: - The legacy init op tensor, if it exists and `None` otherwise. - - Raises: - RuntimeError: If the collection def corresponding to the legacy init op key - has other than exactly one tensor. - """ - collection_def = meta_graph_def_to_load.collection_def - legacy_init_op_tensor = None - if constants.LEGACY_INIT_OP_KEY in collection_def: - legacy_init_ops = collection_def[ - constants.LEGACY_INIT_OP_KEY].node_list.value - if len(legacy_init_ops) != 1: - raise RuntimeError("Expected exactly one legacy serving init op.") - legacy_init_op_tensor = ops.get_collection(constants.LEGACY_INIT_OP_KEY)[0] - return legacy_init_op_tensor - - @tf_export("saved_model.loader.maybe_saved_model_directory") def maybe_saved_model_directory(export_dir): """Checks whether the provided export directory could contain a SavedModel. @@ -340,8 +320,8 @@ class SavedModelLoader(object): self._export_dir, meta_graph_def, import_scope=import_scope) main_op_tensor = ( - _get_main_op_tensor(meta_graph_def) or - (_get_legacy_init_op_tensor(meta_graph_def))) + _get_main_op_tensor(meta_graph_def, constants.MAIN_OP_KEY) or + _get_main_op_tensor(meta_graph_def, constants.LEGACY_INIT_OP_KEY)) if main_op_tensor is not None: sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary) diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py index fb4732aca2..00b669fc97 100644 --- a/tensorflow/python/saved_model/saved_model_test.py +++ b/tensorflow/python/saved_model/saved_model_test.py @@ -846,9 +846,19 @@ class SavedModelTest(test.TestCase): def testLegacyInitOpWithNonEmptyCollection(self): export_dir = self._get_export_dir( "test_legacy_init_op_with_non_empty_collection") + self._testInitOpsWithNonEmptyCollection( + export_dir, constants.LEGACY_INIT_OP_KEY) + + def testMainOpWithNonEmptyCollection(self): + export_dir = self._get_export_dir( + "test_main_op_with_non_empty_collection") + self._testInitOpsWithNonEmptyCollection(export_dir, constants.MAIN_OP_KEY) + + def _testInitOpsWithNonEmptyCollection(self, export_dir, key): builder = saved_model_builder.SavedModelBuilder(export_dir) - with self.test_session(graph=ops.Graph()) as sess: + g = ops.Graph() + with self.test_session(graph=g) as sess: # Initialize variable `v1` to 1. v1 = variables.Variable(1, name="v1") ops.add_to_collection("v", v1) @@ -857,19 +867,21 @@ class SavedModelTest(test.TestCase): v2 = variables.Variable(42, name="v2", trainable=False, collections=[]) ops.add_to_collection("v", v2) - # Set up an assignment op to be run as part of the legacy_init_op. + # Set up an assignment op to be run as part of the init op. 
assign_v2 = state_ops.assign(v2, v1) - legacy_init_op = control_flow_ops.group(assign_v2, name="legacy_init_op") + init_op = control_flow_ops.group(assign_v2, name="init_op") sess.run(variables.global_variables_initializer()) - ops.add_to_collection(constants.LEGACY_INIT_OP_KEY, - control_flow_ops.no_op()) - # AssertionError should be raised since the LEGACY_INIT_OP_KEY collection + ops.add_to_collection(key, control_flow_ops.no_op()) + # ValueError should be raised since the LEGACY_INIT_OP_KEY collection # is not empty and we don't support multiple init ops. - with self.assertRaises(AssertionError): + with self.assertRaisesRegexp(ValueError, "Graph already contains"): builder.add_meta_graph_and_variables( - sess, ["foo"], legacy_init_op=legacy_init_op) + sess, ["foo"], legacy_init_op=init_op) + # We shouldn't be able to add as MAIN_OP, either. + with self.assertRaisesRegexp(ValueError, "Graph already contains"): + builder.add_meta_graph_and_variables(sess, ["foo"], main_op=init_op) def testTrainOp(self): export_dir = self._get_export_dir("test_train_op") -- cgit v1.2.3 From 7087243b8594faa92b274b92d586cbb2d3b24bfe Mon Sep 17 00:00:00 2001 From: Karmel Allison Date: Mon, 23 Jul 2018 16:40:25 -0700 Subject: Automated rollback of commit cf94a46c34f8568608d78b77e9a1c4369ebcafa2 PiperOrigin-RevId: 205746329 --- tensorflow/cc/saved_model/loader.cc | 39 +++++++++--- tensorflow/python/saved_model/builder_impl.py | 76 +++++++++++------------ tensorflow/python/saved_model/loader_impl.py | 42 +++++++++---- tensorflow/python/saved_model/saved_model_test.py | 28 +++------ 4 files changed, 106 insertions(+), 79 deletions(-) (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index d47b025743..07807ed2f3 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -86,11 +86,10 @@ bool HasMainOp(const MetaGraphDef& meta_graph_def) { Status RunMainOp(const RunOptions& run_options, const string& export_dir, const MetaGraphDef& meta_graph_def, const std::vector& asset_file_defs, - Session* session, const string& main_op_key) { - LOG(INFO) << "Running MainOp with key " << main_op_key - << " on SavedModel bundle."; + Session* session) { + LOG(INFO) << "Running MainOp on SavedModel bundle."; const auto& collection_def_map = meta_graph_def.collection_def(); - const auto main_op_it = collection_def_map.find(main_op_key); + const auto main_op_it = collection_def_map.find(kSavedModelMainOpKey); if (main_op_it != collection_def_map.end()) { if (main_op_it->second.node_list().value_size() != 1) { return errors::FailedPrecondition( @@ -142,6 +141,30 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, nullptr /* outputs */, &run_metadata); } +Status RunLegacyInitOp(const RunOptions& run_options, const string& export_dir, + const MetaGraphDef& meta_graph_def, + const std::vector& asset_file_defs, + Session* session) { + LOG(INFO) << "Running LegacyInitOp on SavedModel bundle."; + const auto& collection_def_map = meta_graph_def.collection_def(); + const auto init_op_it = collection_def_map.find(kSavedModelLegacyInitOpKey); + if (init_op_it != collection_def_map.end()) { + if (init_op_it->second.node_list().value_size() != 1) { + return errors::FailedPrecondition(strings::StrCat( + "Expected exactly one serving init op in : ", export_dir)); + } + std::vector> inputs; + AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); + RunMetadata run_metadata; + const StringPiece legacy_init_op_name 
= + init_op_it->second.node_list().value(0); + return session->Run(run_options, inputs, {}, + {legacy_init_op_name.ToString()}, nullptr /* outputs */, + &run_metadata); + } + return Status::OK(); +} + Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, std::vector* asset_file_defs) { const auto& collection_def_map = meta_graph_def.collection_def(); @@ -181,11 +204,11 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, if (HasMainOp(bundle->meta_graph_def)) { TF_RETURN_IF_ERROR(RunMainOp(run_options, export_dir, bundle->meta_graph_def, asset_file_defs, - bundle->session.get(), kSavedModelMainOpKey)); + bundle->session.get())); } else { - TF_RETURN_IF_ERROR(RunMainOp( - run_options, export_dir, bundle->meta_graph_def, asset_file_defs, - bundle->session.get(), kSavedModelLegacyInitOpKey)); + TF_RETURN_IF_ERROR(RunLegacyInitOp(run_options, export_dir, + bundle->meta_graph_def, asset_file_defs, + bundle->session.get())); } return Status::OK(); } diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py index 8c985a7c2f..e58be804c2 100644 --- a/tensorflow/python/saved_model/builder_impl.py +++ b/tensorflow/python/saved_model/builder_impl.py @@ -34,7 +34,6 @@ from tensorflow.python.platform import tf_logging from tensorflow.python.saved_model import constants from tensorflow.python.training import saver as tf_saver from tensorflow.python.util import compat -from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.tf_export import tf_export @@ -134,32 +133,39 @@ class SavedModelBuilder(object): tf_logging.info("Assets written to: %s", compat.as_text(assets_destination_dir)) - def _maybe_add_main_op(self, main_op): - """Adds main op to the SavedModel. + def _maybe_add_legacy_init_op(self, legacy_init_op=None): + """Add legacy init op to the SavedModel. Args: - main_op: Main op to run as part of graph initialization. If None, no - main op will be added to the graph. + legacy_init_op: Optional legacy init op to support backward compatibility. Raises: - TypeError: if main op is provided but is not of type `Operation`. - ValueError: if the Graph already contains an init op. + TypeError if legacy init op is not of type `Operation`. + AssertionError if the graph already contains one or more legacy init ops. """ - if main_op is None: - return - - if not isinstance(main_op, ops.Operation): - raise TypeError("main_op needs to be an Operation: %r" % main_op) + if legacy_init_op is not None: + if not isinstance(legacy_init_op, ops.Operation): + raise TypeError("legacy_init_op needs to be an Operation: %r" % + legacy_init_op) + if ops.get_collection(constants.LEGACY_INIT_OP_KEY): + raise AssertionError( + "graph already contains one or more legacy init ops under the " + "collection {}.".format(constants.LEGACY_INIT_OP_KEY)) + ops.add_to_collection(constants.LEGACY_INIT_OP_KEY, legacy_init_op) + + def _add_main_op(self, main_op): + """Add main op to the SavedModel. - # Validate that no other init ops have been added to this graph already. - # We check main_op and legacy_init_op for thoroughness and explicitness. - for init_op_key in (constants.MAIN_OP_KEY, constants.LEGACY_INIT_OP_KEY): - if ops.get_collection(init_op_key): - raise ValueError( - "Graph already contains one or more main ops under the " - "collection {}.".format(init_op_key)) + Args: + main_op: Main op to run as part of graph initialization. 
- ops.add_to_collection(constants.MAIN_OP_KEY, main_op) + Raises: + TypeError if main op is not of type `Operation`. + """ + if main_op is not None: + if not isinstance(main_op, ops.Operation): + raise TypeError("main_op needs to be an Operation: %r" % main_op) + ops.add_to_collection(constants.MAIN_OP_KEY, main_op) def _add_train_op(self, train_op): """Add train op to the SavedModel. @@ -251,12 +257,16 @@ class SavedModelBuilder(object): self._validate_tensor_info(outputs[outputs_key]) def _add_collections( - self, assets_collection, main_op, train_op): + self, assets_collection, legacy_init_op, main_op, train_op): """Add asset and op collections to be saved.""" # Save asset files and write them to disk, if any. self._save_and_write_assets(assets_collection) - self._maybe_add_main_op(main_op) + if main_op is None: + # Add legacy init op to the SavedModel. + self._maybe_add_legacy_init_op(legacy_init_op) + else: + self._add_main_op(main_op) self._add_train_op(train_op) @@ -272,9 +282,6 @@ class SavedModelBuilder(object): allow_empty=True) return saver - @deprecated_args(None, - "Pass your op to the equivalent parameter main_op instead.", - "legacy_init_op") def add_meta_graph(self, tags, signature_def_map=None, @@ -299,7 +306,7 @@ class SavedModelBuilder(object): that this collection should be a subset of the assets saved as part of the first meta graph in the SavedModel. legacy_init_op: Legacy support for op or group of ops to execute after the - restore op upon a load. Deprecated; please use main_op instead. + restore op upon a load. clear_devices: Set to true if the device info on the default graph should be cleared. main_op: Op or group of ops to execute when the graph is loaded. Note @@ -326,12 +333,8 @@ class SavedModelBuilder(object): # properly populated. self._validate_signature_def_map(signature_def_map) - # legacy_init_op is deprecated, and going away in TF 2.0. - # Re-mapping to main_op, as treatment is identical regardless. - main_op = main_op or legacy_init_op - # Add assets and ops - self._add_collections(assets_collection, main_op, None) + self._add_collections(assets_collection, legacy_init_op, main_op, None) saver = self._maybe_create_saver(saver) @@ -348,9 +351,6 @@ class SavedModelBuilder(object): # Tag the meta graph def and add it to the SavedModel. self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map) - @deprecated_args(None, - "Pass your op to the equivalent parameter main_op instead.", - "legacy_init_op") def add_meta_graph_and_variables(self, sess, tags, @@ -378,7 +378,7 @@ class SavedModelBuilder(object): def. assets_collection: Assets collection to be saved with SavedModel. legacy_init_op: Legacy support for op or group of ops to execute after the - restore op upon a load. Deprecated; please use main_op instead. + restore op upon a load. clear_devices: Set to true if the device info on the default graph should be cleared. main_op: Op or group of ops to execute when the graph is loaded. Note @@ -402,12 +402,8 @@ class SavedModelBuilder(object): # properly populated. self._validate_signature_def_map(signature_def_map) - # legacy_init_op is deprecated, and going away in TF 2.0. - # Re-mapping to main_op, as treatment is identical regardless. - main_op = main_op or legacy_init_op - # Add assets and ops - self._add_collections(assets_collection, main_op, None) + self._add_collections(assets_collection, legacy_init_op, main_op, None) # Create the variables sub-directory, if it does not exist. 
variables_dir = os.path.join( diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py index fb70c91c29..e5f649fdab 100644 --- a/tensorflow/python/saved_model/loader_impl.py +++ b/tensorflow/python/saved_model/loader_impl.py @@ -116,14 +116,11 @@ def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None): return asset_tensor_dict -def _get_main_op_tensor( - meta_graph_def_to_load, init_op_key=constants.MAIN_OP_KEY): +def _get_main_op_tensor(meta_graph_def_to_load): """Gets the main op tensor, if one exists. Args: meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded. - init_op_key: name of collection to check; should be one of MAIN_OP_KEY - or the deprecated LEGACY_INIT_OP_KEY Returns: The main op tensor, if it exists and `None` otherwise. @@ -134,15 +131,38 @@ def _get_main_op_tensor( """ collection_def = meta_graph_def_to_load.collection_def main_op_tensor = None - if init_op_key in collection_def: - main_ops = collection_def[init_op_key].node_list.value + if constants.MAIN_OP_KEY in collection_def: + main_ops = collection_def[constants.MAIN_OP_KEY].node_list.value if len(main_ops) != 1: - raise RuntimeError("Expected exactly one SavedModel main op. " - "Found: {}".format(main_ops)) - main_op_tensor = ops.get_collection(init_op_key)[0] + raise RuntimeError("Expected exactly one SavedModel main op.") + main_op_tensor = ops.get_collection(constants.MAIN_OP_KEY)[0] return main_op_tensor +def _get_legacy_init_op_tensor(meta_graph_def_to_load): + """Gets the legacy init op tensor, if one exists. + + Args: + meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded. + + Returns: + The legacy init op tensor, if it exists and `None` otherwise. + + Raises: + RuntimeError: If the collection def corresponding to the legacy init op key + has other than exactly one tensor. + """ + collection_def = meta_graph_def_to_load.collection_def + legacy_init_op_tensor = None + if constants.LEGACY_INIT_OP_KEY in collection_def: + legacy_init_ops = collection_def[ + constants.LEGACY_INIT_OP_KEY].node_list.value + if len(legacy_init_ops) != 1: + raise RuntimeError("Expected exactly one legacy serving init op.") + legacy_init_op_tensor = ops.get_collection(constants.LEGACY_INIT_OP_KEY)[0] + return legacy_init_op_tensor + + @tf_export("saved_model.loader.maybe_saved_model_directory") def maybe_saved_model_directory(export_dir): """Checks whether the provided export directory could contain a SavedModel. 
@@ -320,8 +340,8 @@ class SavedModelLoader(object): self._export_dir, meta_graph_def, import_scope=import_scope) main_op_tensor = ( - _get_main_op_tensor(meta_graph_def, constants.MAIN_OP_KEY) or - _get_main_op_tensor(meta_graph_def, constants.LEGACY_INIT_OP_KEY)) + _get_main_op_tensor(meta_graph_def) or + (_get_legacy_init_op_tensor(meta_graph_def))) if main_op_tensor is not None: sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary) diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py index 00b669fc97..fb4732aca2 100644 --- a/tensorflow/python/saved_model/saved_model_test.py +++ b/tensorflow/python/saved_model/saved_model_test.py @@ -846,19 +846,9 @@ class SavedModelTest(test.TestCase): def testLegacyInitOpWithNonEmptyCollection(self): export_dir = self._get_export_dir( "test_legacy_init_op_with_non_empty_collection") - self._testInitOpsWithNonEmptyCollection( - export_dir, constants.LEGACY_INIT_OP_KEY) - - def testMainOpWithNonEmptyCollection(self): - export_dir = self._get_export_dir( - "test_main_op_with_non_empty_collection") - self._testInitOpsWithNonEmptyCollection(export_dir, constants.MAIN_OP_KEY) - - def _testInitOpsWithNonEmptyCollection(self, export_dir, key): builder = saved_model_builder.SavedModelBuilder(export_dir) - g = ops.Graph() - with self.test_session(graph=g) as sess: + with self.test_session(graph=ops.Graph()) as sess: # Initialize variable `v1` to 1. v1 = variables.Variable(1, name="v1") ops.add_to_collection("v", v1) @@ -867,21 +857,19 @@ class SavedModelTest(test.TestCase): v2 = variables.Variable(42, name="v2", trainable=False, collections=[]) ops.add_to_collection("v", v2) - # Set up an assignment op to be run as part of the init op. + # Set up an assignment op to be run as part of the legacy_init_op. assign_v2 = state_ops.assign(v2, v1) - init_op = control_flow_ops.group(assign_v2, name="init_op") + legacy_init_op = control_flow_ops.group(assign_v2, name="legacy_init_op") sess.run(variables.global_variables_initializer()) - ops.add_to_collection(key, control_flow_ops.no_op()) - # ValueError should be raised since the LEGACY_INIT_OP_KEY collection + ops.add_to_collection(constants.LEGACY_INIT_OP_KEY, + control_flow_ops.no_op()) + # AssertionError should be raised since the LEGACY_INIT_OP_KEY collection # is not empty and we don't support multiple init ops. - with self.assertRaisesRegexp(ValueError, "Graph already contains"): + with self.assertRaises(AssertionError): builder.add_meta_graph_and_variables( - sess, ["foo"], legacy_init_op=init_op) - # We shouldn't be able to add as MAIN_OP, either. - with self.assertRaisesRegexp(ValueError, "Graph already contains"): - builder.add_meta_graph_and_variables(sess, ["foo"], main_op=init_op) + sess, ["foo"], legacy_init_op=legacy_init_op) def testTrainOp(self): export_dir = self._get_export_dir("test_train_op") -- cgit v1.2.3 From 96c76b296768852dac94aaf006beab2e637cbbb6 Mon Sep 17 00:00:00 2001 From: Karmel Allison Date: Wed, 25 Jul 2018 11:33:12 -0700 Subject: The SavedModel legacy_init_op and main_op are functionally equivalent. Here, we remove duplicated code paths by mapping legacy_init_op into main_op in the SavedModelBuilder, and we deprecate the legacy_init_op arg. Note that the loader will still look for both, so old SavedModels will still load without trouble. 
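For callers, the migration is a one-word change. A minimal sketch of the intended usage, assuming the usual TF 1.x export flow (the export path, variable names, "init_op" name, and the "foo" tag are illustrative only, not taken from this commit):

  import tensorflow as tf

  with tf.Session(graph=tf.Graph()) as sess:
    v1 = tf.Variable(1, name="v1")
    v2 = tf.Variable(0, name="v2")
    # Op to run once at load time: initialize v2 from v1.
    init_op = tf.group(tf.assign(v2, v1), name="init_op")
    sess.run(tf.global_variables_initializer())

    builder = tf.saved_model.builder.SavedModelBuilder("/tmp/example_model")
    # Deprecated: legacy_init_op=init_op (still accepted; re-mapped to main_op).
    # Preferred: pass the same op as main_op; treatment is identical.
    builder.add_meta_graph_and_variables(sess, ["foo"], main_op=init_op)
    builder.save()
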
PiperOrigin-RevId: 206026743 --- tensorflow/cc/saved_model/loader.cc | 39 +++--------- tensorflow/python/saved_model/builder_impl.py | 76 ++++++++++++----------- tensorflow/python/saved_model/loader_impl.py | 42 ++++--------- tensorflow/python/saved_model/saved_model_test.py | 28 ++++++--- 4 files changed, 79 insertions(+), 106 deletions(-) (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 07807ed2f3..d47b025743 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -86,10 +86,11 @@ bool HasMainOp(const MetaGraphDef& meta_graph_def) { Status RunMainOp(const RunOptions& run_options, const string& export_dir, const MetaGraphDef& meta_graph_def, const std::vector& asset_file_defs, - Session* session) { - LOG(INFO) << "Running MainOp on SavedModel bundle."; + Session* session, const string& main_op_key) { + LOG(INFO) << "Running MainOp with key " << main_op_key + << " on SavedModel bundle."; const auto& collection_def_map = meta_graph_def.collection_def(); - const auto main_op_it = collection_def_map.find(kSavedModelMainOpKey); + const auto main_op_it = collection_def_map.find(main_op_key); if (main_op_it != collection_def_map.end()) { if (main_op_it->second.node_list().value_size() != 1) { return errors::FailedPrecondition( @@ -141,30 +142,6 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, nullptr /* outputs */, &run_metadata); } -Status RunLegacyInitOp(const RunOptions& run_options, const string& export_dir, - const MetaGraphDef& meta_graph_def, - const std::vector& asset_file_defs, - Session* session) { - LOG(INFO) << "Running LegacyInitOp on SavedModel bundle."; - const auto& collection_def_map = meta_graph_def.collection_def(); - const auto init_op_it = collection_def_map.find(kSavedModelLegacyInitOpKey); - if (init_op_it != collection_def_map.end()) { - if (init_op_it->second.node_list().value_size() != 1) { - return errors::FailedPrecondition(strings::StrCat( - "Expected exactly one serving init op in : ", export_dir)); - } - std::vector> inputs; - AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); - RunMetadata run_metadata; - const StringPiece legacy_init_op_name = - init_op_it->second.node_list().value(0); - return session->Run(run_options, inputs, {}, - {legacy_init_op_name.ToString()}, nullptr /* outputs */, - &run_metadata); - } - return Status::OK(); -} - Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, std::vector* asset_file_defs) { const auto& collection_def_map = meta_graph_def.collection_def(); @@ -204,11 +181,11 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, if (HasMainOp(bundle->meta_graph_def)) { TF_RETURN_IF_ERROR(RunMainOp(run_options, export_dir, bundle->meta_graph_def, asset_file_defs, - bundle->session.get())); + bundle->session.get(), kSavedModelMainOpKey)); } else { - TF_RETURN_IF_ERROR(RunLegacyInitOp(run_options, export_dir, - bundle->meta_graph_def, asset_file_defs, - bundle->session.get())); + TF_RETURN_IF_ERROR(RunMainOp( + run_options, export_dir, bundle->meta_graph_def, asset_file_defs, + bundle->session.get(), kSavedModelLegacyInitOpKey)); } return Status::OK(); } diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py index e58be804c2..8c985a7c2f 100644 --- a/tensorflow/python/saved_model/builder_impl.py +++ b/tensorflow/python/saved_model/builder_impl.py @@ -34,6 +34,7 @@ from tensorflow.python.platform import 
tf_logging from tensorflow.python.saved_model import constants from tensorflow.python.training import saver as tf_saver from tensorflow.python.util import compat +from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.tf_export import tf_export @@ -133,39 +134,32 @@ class SavedModelBuilder(object): tf_logging.info("Assets written to: %s", compat.as_text(assets_destination_dir)) - def _maybe_add_legacy_init_op(self, legacy_init_op=None): - """Add legacy init op to the SavedModel. + def _maybe_add_main_op(self, main_op): + """Adds main op to the SavedModel. Args: - legacy_init_op: Optional legacy init op to support backward compatibility. + main_op: Main op to run as part of graph initialization. If None, no + main op will be added to the graph. Raises: - TypeError if legacy init op is not of type `Operation`. - AssertionError if the graph already contains one or more legacy init ops. + TypeError: if main op is provided but is not of type `Operation`. + ValueError: if the Graph already contains an init op. """ - if legacy_init_op is not None: - if not isinstance(legacy_init_op, ops.Operation): - raise TypeError("legacy_init_op needs to be an Operation: %r" % - legacy_init_op) - if ops.get_collection(constants.LEGACY_INIT_OP_KEY): - raise AssertionError( - "graph already contains one or more legacy init ops under the " - "collection {}.".format(constants.LEGACY_INIT_OP_KEY)) - ops.add_to_collection(constants.LEGACY_INIT_OP_KEY, legacy_init_op) - - def _add_main_op(self, main_op): - """Add main op to the SavedModel. + if main_op is None: + return - Args: - main_op: Main op to run as part of graph initialization. + if not isinstance(main_op, ops.Operation): + raise TypeError("main_op needs to be an Operation: %r" % main_op) - Raises: - TypeError if main op is not of type `Operation`. - """ - if main_op is not None: - if not isinstance(main_op, ops.Operation): - raise TypeError("main_op needs to be an Operation: %r" % main_op) - ops.add_to_collection(constants.MAIN_OP_KEY, main_op) + # Validate that no other init ops have been added to this graph already. + # We check main_op and legacy_init_op for thoroughness and explicitness. + for init_op_key in (constants.MAIN_OP_KEY, constants.LEGACY_INIT_OP_KEY): + if ops.get_collection(init_op_key): + raise ValueError( + "Graph already contains one or more main ops under the " + "collection {}.".format(init_op_key)) + + ops.add_to_collection(constants.MAIN_OP_KEY, main_op) def _add_train_op(self, train_op): """Add train op to the SavedModel. @@ -257,16 +251,12 @@ class SavedModelBuilder(object): self._validate_tensor_info(outputs[outputs_key]) def _add_collections( - self, assets_collection, legacy_init_op, main_op, train_op): + self, assets_collection, main_op, train_op): """Add asset and op collections to be saved.""" # Save asset files and write them to disk, if any. self._save_and_write_assets(assets_collection) - if main_op is None: - # Add legacy init op to the SavedModel. 
- self._maybe_add_legacy_init_op(legacy_init_op) - else: - self._add_main_op(main_op) + self._maybe_add_main_op(main_op) self._add_train_op(train_op) @@ -282,6 +272,9 @@ class SavedModelBuilder(object): allow_empty=True) return saver + @deprecated_args(None, + "Pass your op to the equivalent parameter main_op instead.", + "legacy_init_op") def add_meta_graph(self, tags, signature_def_map=None, @@ -306,7 +299,7 @@ class SavedModelBuilder(object): that this collection should be a subset of the assets saved as part of the first meta graph in the SavedModel. legacy_init_op: Legacy support for op or group of ops to execute after the - restore op upon a load. + restore op upon a load. Deprecated; please use main_op instead. clear_devices: Set to true if the device info on the default graph should be cleared. main_op: Op or group of ops to execute when the graph is loaded. Note @@ -333,8 +326,12 @@ class SavedModelBuilder(object): # properly populated. self._validate_signature_def_map(signature_def_map) + # legacy_init_op is deprecated, and going away in TF 2.0. + # Re-mapping to main_op, as treatment is identical regardless. + main_op = main_op or legacy_init_op + # Add assets and ops - self._add_collections(assets_collection, legacy_init_op, main_op, None) + self._add_collections(assets_collection, main_op, None) saver = self._maybe_create_saver(saver) @@ -351,6 +348,9 @@ class SavedModelBuilder(object): # Tag the meta graph def and add it to the SavedModel. self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map) + @deprecated_args(None, + "Pass your op to the equivalent parameter main_op instead.", + "legacy_init_op") def add_meta_graph_and_variables(self, sess, tags, @@ -378,7 +378,7 @@ class SavedModelBuilder(object): def. assets_collection: Assets collection to be saved with SavedModel. legacy_init_op: Legacy support for op or group of ops to execute after the - restore op upon a load. + restore op upon a load. Deprecated; please use main_op instead. clear_devices: Set to true if the device info on the default graph should be cleared. main_op: Op or group of ops to execute when the graph is loaded. Note @@ -402,8 +402,12 @@ class SavedModelBuilder(object): # properly populated. self._validate_signature_def_map(signature_def_map) + # legacy_init_op is deprecated, and going away in TF 2.0. + # Re-mapping to main_op, as treatment is identical regardless. + main_op = main_op or legacy_init_op + # Add assets and ops - self._add_collections(assets_collection, legacy_init_op, main_op, None) + self._add_collections(assets_collection, main_op, None) # Create the variables sub-directory, if it does not exist. variables_dir = os.path.join( diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py index e5f649fdab..fb70c91c29 100644 --- a/tensorflow/python/saved_model/loader_impl.py +++ b/tensorflow/python/saved_model/loader_impl.py @@ -116,11 +116,14 @@ def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None): return asset_tensor_dict -def _get_main_op_tensor(meta_graph_def_to_load): +def _get_main_op_tensor( + meta_graph_def_to_load, init_op_key=constants.MAIN_OP_KEY): """Gets the main op tensor, if one exists. Args: meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded. + init_op_key: name of collection to check; should be one of MAIN_OP_KEY + or the deprecated LEGACY_INIT_OP_KEY Returns: The main op tensor, if it exists and `None` otherwise. 
@@ -131,38 +134,15 @@ def _get_main_op_tensor(meta_graph_def_to_load): """ collection_def = meta_graph_def_to_load.collection_def main_op_tensor = None - if constants.MAIN_OP_KEY in collection_def: - main_ops = collection_def[constants.MAIN_OP_KEY].node_list.value + if init_op_key in collection_def: + main_ops = collection_def[init_op_key].node_list.value if len(main_ops) != 1: - raise RuntimeError("Expected exactly one SavedModel main op.") - main_op_tensor = ops.get_collection(constants.MAIN_OP_KEY)[0] + raise RuntimeError("Expected exactly one SavedModel main op. " + "Found: {}".format(main_ops)) + main_op_tensor = ops.get_collection(init_op_key)[0] return main_op_tensor -def _get_legacy_init_op_tensor(meta_graph_def_to_load): - """Gets the legacy init op tensor, if one exists. - - Args: - meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded. - - Returns: - The legacy init op tensor, if it exists and `None` otherwise. - - Raises: - RuntimeError: If the collection def corresponding to the legacy init op key - has other than exactly one tensor. - """ - collection_def = meta_graph_def_to_load.collection_def - legacy_init_op_tensor = None - if constants.LEGACY_INIT_OP_KEY in collection_def: - legacy_init_ops = collection_def[ - constants.LEGACY_INIT_OP_KEY].node_list.value - if len(legacy_init_ops) != 1: - raise RuntimeError("Expected exactly one legacy serving init op.") - legacy_init_op_tensor = ops.get_collection(constants.LEGACY_INIT_OP_KEY)[0] - return legacy_init_op_tensor - - @tf_export("saved_model.loader.maybe_saved_model_directory") def maybe_saved_model_directory(export_dir): """Checks whether the provided export directory could contain a SavedModel. @@ -340,8 +320,8 @@ class SavedModelLoader(object): self._export_dir, meta_graph_def, import_scope=import_scope) main_op_tensor = ( - _get_main_op_tensor(meta_graph_def) or - (_get_legacy_init_op_tensor(meta_graph_def))) + _get_main_op_tensor(meta_graph_def, constants.MAIN_OP_KEY) or + _get_main_op_tensor(meta_graph_def, constants.LEGACY_INIT_OP_KEY)) if main_op_tensor is not None: sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary) diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py index fb4732aca2..00b669fc97 100644 --- a/tensorflow/python/saved_model/saved_model_test.py +++ b/tensorflow/python/saved_model/saved_model_test.py @@ -846,9 +846,19 @@ class SavedModelTest(test.TestCase): def testLegacyInitOpWithNonEmptyCollection(self): export_dir = self._get_export_dir( "test_legacy_init_op_with_non_empty_collection") + self._testInitOpsWithNonEmptyCollection( + export_dir, constants.LEGACY_INIT_OP_KEY) + + def testMainOpWithNonEmptyCollection(self): + export_dir = self._get_export_dir( + "test_main_op_with_non_empty_collection") + self._testInitOpsWithNonEmptyCollection(export_dir, constants.MAIN_OP_KEY) + + def _testInitOpsWithNonEmptyCollection(self, export_dir, key): builder = saved_model_builder.SavedModelBuilder(export_dir) - with self.test_session(graph=ops.Graph()) as sess: + g = ops.Graph() + with self.test_session(graph=g) as sess: # Initialize variable `v1` to 1. v1 = variables.Variable(1, name="v1") ops.add_to_collection("v", v1) @@ -857,19 +867,21 @@ class SavedModelTest(test.TestCase): v2 = variables.Variable(42, name="v2", trainable=False, collections=[]) ops.add_to_collection("v", v2) - # Set up an assignment op to be run as part of the legacy_init_op. + # Set up an assignment op to be run as part of the init op. 
assign_v2 = state_ops.assign(v2, v1) - legacy_init_op = control_flow_ops.group(assign_v2, name="legacy_init_op") + init_op = control_flow_ops.group(assign_v2, name="init_op") sess.run(variables.global_variables_initializer()) - ops.add_to_collection(constants.LEGACY_INIT_OP_KEY, - control_flow_ops.no_op()) - # AssertionError should be raised since the LEGACY_INIT_OP_KEY collection + ops.add_to_collection(key, control_flow_ops.no_op()) + # ValueError should be raised since the LEGACY_INIT_OP_KEY collection # is not empty and we don't support multiple init ops. - with self.assertRaises(AssertionError): + with self.assertRaisesRegexp(ValueError, "Graph already contains"): builder.add_meta_graph_and_variables( - sess, ["foo"], legacy_init_op=legacy_init_op) + sess, ["foo"], legacy_init_op=init_op) + # We shouldn't be able to add as MAIN_OP, either. + with self.assertRaisesRegexp(ValueError, "Graph already contains"): + builder.add_meta_graph_and_variables(sess, ["foo"], main_op=init_op) def testTrainOp(self): export_dir = self._get_export_dir("test_train_op") -- cgit v1.2.3 From 19defc68c99483049e14ec18718cb037f1ca050b Mon Sep 17 00:00:00 2001 From: Christopher Olston Date: Thu, 26 Jul 2018 21:04:48 -0700 Subject: Have the SavedModel loader use Session's Make/Run/ReleaseCallable() API instead of Run(), to avoid leaving behind non-GC'ed state after model initialization. PiperOrigin-RevId: 206266841 --- tensorflow/cc/saved_model/loader.cc | 56 ++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'tensorflow/cc') diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index d47b025743..98be66a6ad 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -74,6 +74,54 @@ void AddAssetsTensorsToInputs(const StringPiece export_dir, } } +// Like Session::Run(), but uses the Make/Run/ReleaseCallable() API to avoid +// leaving behind non-GC'ed state. +// +// Detailed motivation behind this approach, from ashankar@: +// +// Each call to Session::Run() that identifies a new subgraph (based on feeds +// and fetches) creates some datastructures that live as long as the session +// (the partitioned graph, associated executors etc.). +// +// A pathological case of this would be if say the initialization op +// (main_op/legacy_init_op) involves the use of a large constant. Then we +// allocate memory for that large constant that will just stick around till the +// session dies. With this Callable mechanism, that memory will be released +// right after ReleaseCallable returns. +// +// However, the resource manager state remains. 
+Status RunOnce(const RunOptions& run_options, + const std::vector>& inputs, + const std::vector& output_tensor_names, + const std::vector& target_node_names, + std::vector* outputs, RunMetadata* run_metadata, + Session* session) { + CallableOptions callable_options; + std::vector feed_tensors; + *callable_options.mutable_run_options() = run_options; + for (const auto& input : inputs) { + const string& name = input.first; + const Tensor& tensor = input.second; + callable_options.add_feed(name); + feed_tensors.push_back(tensor); + } + for (const string& output_tensor_name : output_tensor_names) { + callable_options.add_fetch(output_tensor_name); + } + for (const string& target_node_name : target_node_names) { + callable_options.add_target(target_node_name); + } + + Session::CallableHandle callable_handle; + TF_RETURN_IF_ERROR(session->MakeCallable(callable_options, &callable_handle)); + const Status run_status = session->RunCallable(callable_handle, feed_tensors, + outputs, run_metadata); + // Be sure to call ReleaseCallable() regardless of the outcome of + // RunCallable(). + session->ReleaseCallable(callable_handle).IgnoreError(); + return run_status; +} + bool HasMainOp(const MetaGraphDef& meta_graph_def) { const auto& collection_def_map = meta_graph_def.collection_def(); if (collection_def_map.find(kSavedModelMainOpKey) != @@ -100,8 +148,8 @@ Status RunMainOp(const RunOptions& run_options, const string& export_dir, AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); RunMetadata run_metadata; const StringPiece main_op_name = main_op_it->second.node_list().value(0); - return session->Run(run_options, inputs, {}, {main_op_name.ToString()}, - nullptr /* outputs */, &run_metadata); + return RunOnce(run_options, inputs, {}, {main_op_name.ToString()}, + nullptr /* outputs */, &run_metadata, session); } return Status::OK(); } @@ -138,8 +186,8 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); RunMetadata run_metadata; - return session->Run(run_options, inputs, {}, {restore_op_name.ToString()}, - nullptr /* outputs */, &run_metadata); + return RunOnce(run_options, inputs, {}, {restore_op_name.ToString()}, + nullptr /* outputs */, &run_metadata, session); } Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, -- cgit v1.2.3
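For reference, the Callable lifecycle that the new RunOnce() wraps looks roughly like this in client code. This is a minimal sketch under the usual TensorFlow core headers, not the committed implementation; RunTargetOnce and its target_node argument are hypothetical names, and error handling is reduced to status propagation:

  #include <string>
  #include <vector>

  #include "tensorflow/core/protobuf/config.pb.h"
  #include "tensorflow/core/public/session.h"

  // Runs a single target node via the Callable API, releasing the
  // per-callable state (partitioned subgraph, executors) immediately after.
  Status RunTargetOnce(Session* session, const std::string& target_node) {
    CallableOptions callable_options;
    callable_options.add_target(target_node);

    Session::CallableHandle handle;
    TF_RETURN_IF_ERROR(session->MakeCallable(callable_options, &handle));
    std::vector<Tensor> outputs;
    RunMetadata run_metadata;
    const Status run_status = session->RunCallable(
        handle, /*feed_tensors=*/{}, &outputs, &run_metadata);
    // Release regardless of the RunCallable() outcome, so the cached
    // subgraph state is freed even when the run fails.
    session->ReleaseCallable(handle).IgnoreError();
    return run_status;
  }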