Automated g4 rollback of changelist 201011811

PiperOrigin-RevId: 201033171
author: Akshay Modi <nareshmodi@google.com> 2018-06-18 11:48:36 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-06-18 11:55:03 -0700
commit: 148b4381fd0259cae441e459ec8ebe2c5d557722 (patch)
tree: c66c96ea6c60c63385b528dce195af802b8acf3b /tensorflow
parent: fc03fbff3dd7a58fa4f16226df4ada1f21f8b53f (diff)
221 files changed, 888 insertions, 3235 deletions
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 6d134dbb80..a73c4ca3aa 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -475,7 +475,7 @@ tf_cc_shared_object(
 # excludes all but a subset of function names.
 # On MacOS, the linker does not support version_script, but has an
 # an "-exported_symbols_list" command.  -z defs disallows undefined
-# symbols in object files.
+# symbols in object files and -s strips the output.
 
 tf_cc_shared_object(
     name = "libtensorflow.so",
@@ -489,6 +489,7 @@ tf_cc_shared_object(
         "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-z defs",
+            "-s",
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
             "$(location //tensorflow/c:version_script.lds)",
         ],
@@ -514,6 +515,7 @@ tf_cc_shared_object(
         "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-z defs",
+            "-s",
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
             "$(location //tensorflow:tf_version_script.lds)",
         ],
diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh
index 7184ad68fb..02a6a58b61 100755
--- a/tensorflow/c/generate-pc.sh
+++ b/tensorflow/c/generate-pc.sh
@@ -15,12 +15,10 @@
 # ==============================================================================
 
 TF_PREFIX='/usr/local'
-LIBDIR='lib'
 
 usage() {
     echo "Usage: $0 OPTIONS"
     echo -e "-p, --prefix\tset installation prefix (default: /usr/local)"
-    echo -e "-l, --libdir\tset lib directory (default: lib)"
     echo -e "-v, --version\tset TensorFlow version"
     echo -e "-h, --help\tdisplay this message"
 }
@@ -28,7 +26,7 @@ usage() {
 [ $# == 0 ] && usage && exit 0
 
 # read the options
-ARGS=$(getopt -o p:l:v:h --long prefix:,libdir:,version:,help -n $0 -- "$@")
+ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@")
 eval set -- "$ARGS"
 
 # extract options and their arguments into variables.
@@ -40,11 +38,6 @@ while true ; do
                 "") shift 2 ;;
                 *) TF_PREFIX=$2 ; shift 2 ;;
             esac ;;
-        -l|--libdir)
-            case "$2" in
-                "") shift 2 ;;
-                *) LIBDIR=$2 ; shift 2 ;;
-            esac ;;
         -v|--version)
             case "$2" in
                 "") shift 2 ;;
@@ -62,7 +55,7 @@ echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX"
 cat << EOF > tensorflow.pc
 prefix=${TF_PREFIX}
 exec_prefix=\${prefix}
-libdir=\${exec_prefix}/${LIBDIR}
+libdir=\${exec_prefix}/lib
 includedir=\${prefix}/include
 
 Name: TensorFlow
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index 35a01e0341..52c177212a 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -38,7 +38,6 @@ REGISTER_NO_GRADIENT_OP("NotEqual");
 REGISTER_NO_GRADIENT_OP("LogicalAnd");
 REGISTER_NO_GRADIENT_OP("LogicalOr");
 REGISTER_NO_GRADIENT_OP("LogicalNot");
-REGISTER_NO_GRADIENT_OP("Floor");
 
 // Conjugate helper function returns the conjugate of an Output if it
 // is complex valued.
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index c73482d5f4..0cb3132e94 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -255,53 +255,6 @@ Status LRNGradHelper(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("LRN", LRNGradHelper);
 
-Status SoftplusGradHelper(const Scope& scope, const Operation& op,
-                          const std::vector<Output>& grad_inputs,
-                          std::vector<Output>* grad_outputs) {
-  auto dx = internal::SoftplusGrad(scope, grad_inputs[0], op.input(0));
-  grad_outputs->push_back(dx);
-  return scope.status();
-}
-REGISTER_GRADIENT_OP("Softplus", SoftplusGradHelper);
-
-Status SoftsignGradHelper(const Scope& scope, const Operation& op,
-                          const std::vector<Output>& grad_inputs,
-                          std::vector<Output>* grad_outputs) {
-  auto dx = internal::SoftsignGrad(scope, grad_inputs[0], op.input(0));
-  grad_outputs->push_back(dx);
-  return scope.status();
-}
-REGISTER_GRADIENT_OP("Softsign", SoftsignGradHelper);
-
-Status FractionalAvgPoolGradHelper(const Scope& scope, const Operation& op,
-                                   const std::vector<Output>& grad_inputs,
-                                   std::vector<Output>* grad_outputs) {
-  bool overlapping;
-  TF_RETURN_IF_ERROR(
-      GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping));
-  auto dx = internal::FractionalAvgPoolGrad(
-      scope, Shape(scope, op.input(0), Shape::OutType(DT_INT64)),
-      grad_inputs[0], op.output(1), op.output(2),
-      internal::FractionalAvgPoolGrad::Overlapping(overlapping));
-  grad_outputs->push_back(dx);
-  return scope.status();
-}
-REGISTER_GRADIENT_OP("FractionalAvgPool", FractionalAvgPoolGradHelper);
-
-Status FractionalMaxPoolGradHelper(const Scope& scope, const Operation& op,
-                                   const std::vector<Output>& grad_inputs,
-                                   std::vector<Output>* grad_outputs) {
-  bool overlapping;
-  TF_RETURN_IF_ERROR(
-      GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping));
-  auto dx = internal::FractionalMaxPoolGrad(
-      scope, op.input(0), op.output(0), grad_inputs[0], op.output(1),
-      op.output(2), internal::FractionalMaxPoolGrad::Overlapping(overlapping));
-  grad_outputs->push_back(dx);
-  return scope.status();
-}
-REGISTER_GRADIENT_OP("FractionalMaxPool", FractionalMaxPoolGradHelper);
-
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index b4d457a9d1..c4eba7ecb0 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -28,8 +28,6 @@ namespace {
 using ops::BiasAdd;
 using ops::Conv2D;
 using ops::Elu;
-using ops::FractionalAvgPool;
-using ops::FractionalMaxPool;
 using ops::L2Loss;
 using ops::LogSoftmax;
 using ops::LRN;
@@ -43,8 +41,6 @@ using ops::Relu;
 using ops::Relu6;
 using ops::Selu;
 using ops::Softmax;
-using ops::Softplus;
-using ops::Softsign;
 
 class NNGradTest : public ::testing::Test {
  protected:
@@ -75,30 +71,22 @@ class NNGradTest : public ::testing::Test {
     EXPECT_LT(max_error, 1e-3);
   }
 
-  // Sets tensor with random values, ensuring that every pair of elements are at
-  // least a reasonable amount apart.
-  // This is an issue for max pooling operations, in which perturbations by the
-  // numeric gradient computation in the gradient checker can change the max
-  // value if a pool has values that are too close together.
+  // Sets tensor with random values, ensuring that the max value is largest by
+  // a reasonable amount.
+  // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which
+  // perturbations by the numeric gradient computation in the gradient checker
+  // can change the max value if values are too close together.
   template <typename T>
-  void SetRandomValuesForMaxPooling(Tensor* tensor) {
+  void SetRandomValuesWithBumpedMax(Tensor* tensor) {
     auto tensor_flat = tensor->flat<T>();
-    // First set the array to an increasing sequence of values spaced
-    // a reasonable amount apart
-    T cur = 0;
-    for (size_t i = 0; i < tensor->NumElements(); i++) {
-      tensor_flat(i) = cur;
-      cur += 5e-2;
-    }
-    // Fischer-Yates shuffle the array
-    for (size_t i = tensor->NumElements() - 1; i >= 1; i--) {
-      // j <- random integer 0 <= j <= i
-      size_t j = random::New64() % (i + 1);
-      // swap values at i, j
-      T tmp = tensor_flat(i);
-      tensor_flat(i) = tensor_flat(j);
-      tensor_flat(j) = tmp;
+    tensor_flat.setRandom();
+    int32 max_index = 0;
+    for (size_t i = 1; i < tensor->NumElements(); i++) {
+      if (tensor_flat(i) > tensor_flat(max_index)) {
+        max_index = i;
+      }
     }
+    tensor_flat(max_index) += 1e-2;
   }
 
   Scope scope_;
@@ -201,7 +189,7 @@ TEST_F(NNGradTest, MaxPoolGradHelper) {
   const std::vector<int> strides{1, 2, 2, 1};
   auto y = MaxPool(scope_, x, ksize, strides, "VALID");
   Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
-  SetRandomValuesForMaxPooling<float>(&x_init_value);
+  SetRandomValuesWithBumpedMax<float>(&x_init_value);
   RunTest(x, x_init_value, y, y_shape);
 }
 
@@ -214,7 +202,7 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) {
   Tensor strides = test::AsTensor<int>({1, 2, 2, 1}, {4});
   auto y = MaxPoolV2(scope_, x, ksize, strides, "VALID");
   Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
-  SetRandomValuesForMaxPooling<float>(&x_init_value);
+  SetRandomValuesWithBumpedMax<float>(&x_init_value);
   RunTest(x, x_init_value, y, y_shape);
 }
 
@@ -227,7 +215,7 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) {
   const std::vector<int> strides{1, 3, 3, 3, 1};
   auto y = MaxPool3D(scope_, x, ksize, strides, "VALID");
   Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
-  SetRandomValuesForMaxPooling<float>(&x_init_value);
+  SetRandomValuesWithBumpedMax<float>(&x_init_value);
   RunTest(x, x_init_value, y, y_shape);
 }
 
@@ -260,45 +248,5 @@ TEST_F(NNGradTest, LRN){
   RunTest(x, x_shape, y, x_shape);
 }
 
-TEST_F(NNGradTest, SoftplusGrad) {
-  TensorShape shape({3, 7});
-  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
-  auto y = Softplus(scope_, x);
-  RunTest(x, shape, y, shape);
-}
-
-TEST_F(NNGradTest, SoftsignGrad) {
-  TensorShape shape({3, 7});
-  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
-  auto y = Softsign(scope_, x);
-  RunTest(x, shape, y, shape);
-}
-
-TEST_F(NNGradTest, FractionalAvgPoolGradHelper) {
-  TensorShape x_shape({1, 3, 7, 1});
-  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
-  // Force consistent pooling regions for unit testing.
-  auto y = FractionalAvgPool(
-      scope_, x, {1, 1.2, 1.9, 1},
-      FractionalAvgPool::Deterministic(true).Overlapping(true).Seed(1).Seed2(
-          2));
-  TensorShape y_shape({1, 2, 3, 1});
-  RunTest(x, x_shape, y.output, y_shape);
-}
-
-TEST_F(NNGradTest, FractionalMaxPoolGradHelper) {
-  TensorShape x_shape({1, 3, 7, 1});
-  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
-  // Force consistent pooling regions for unit testing.
-  auto y = FractionalMaxPool(
-      scope_, x, {1, 1.2, 1.9, 1},
-      FractionalMaxPool::Deterministic(true).Overlapping(true).Seed(1).Seed2(
-          2));
-  Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
-  SetRandomValuesForMaxPooling<float>(&x_init_value);
-  TensorShape y_shape({1, 2, 3, 1});
-  RunTest(x, x_init_value, y.output, y_shape);
-}
-
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 6641d45e83..6e050cf564 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -56,9 +56,9 @@ namespace bar {
 //
 // Memory stats:
 //   arg bytes total:    104
-//   arg bytes aligned:  192
+//   arg bytes aligned:  128
 //   temp bytes total:   126
-//   temp bytes aligned: 320
+//   temp bytes aligned: 224
 class MyClass : public tensorflow::XlaCompiledCpuFunction {
  public:
   // Number of input arguments for the compiled computation.
diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h
index 4e194a6aba..ebfe4806c2 100644
--- a/tensorflow/compiler/aot/embedded_protocol_buffers.h
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h
@@ -71,7 +71,7 @@ struct ProtobufToEmbed {
   const ::tensorflow::protobuf::MessageLite* message;
 };
 
-// Embeds a sequence of protocol buffers into an object file.
+// Embeds a a sequence of protocol buffers into an object file.
 //
 // `target_triple` is the target triple for the target architecture for the
 // generated object file.
diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h
index d1a669ceb1..d085864f00 100644
--- a/tensorflow/compiler/aot/runtime.h
+++ b/tensorflow/compiler/aot/runtime.h
@@ -25,8 +25,8 @@ namespace tensorflow {
 namespace tfcompile {
 namespace runtime {
 
-// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
-static constexpr size_t kAlign = 64;
+// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
+static constexpr size_t kAlign = 32;
 
 // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1
 // values.  There are `n` entries in `sizes`.  Each buffer is aligned to kAlign
diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc
index 06ec623eb2..6d603a02eb 100644
--- a/tensorflow/compiler/aot/runtime_test.cc
+++ b/tensorflow/compiler/aot/runtime_test.cc
@@ -24,7 +24,7 @@ namespace runtime {
 namespace {
 
 TEST(Runtime, AlignmentValue) {
-  // We've chosen 64 byte alignment for the tfcompile runtime to mimic the
+  // We've chosen 32 byte alignment for the tfcompile runtime to mimic the
   // regular tensorflow allocator, which was chosen to play nicely with Eigen.
   // The tfcompile runtime also has a requirement that comes from the xla
   // generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8
@@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) {
   EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0);
 
   static constexpr intptr_t sizesB[1] = {3};
-  EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64);
+  EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32);
 
   static constexpr intptr_t sizesC[1] = {32};
-  EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64);
+  EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32);
 
   static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3};
-  EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320);
+  EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192);
 }
 
 void* add_ptr(void* base, uintptr_t delta) {
@@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) {
   EXPECT_NE(base, nullptr);
   EXPECT_EQ(bufD[0], add_ptr(base, 0));
   EXPECT_EQ(bufD[1], nullptr);
-  EXPECT_EQ(bufD[2], add_ptr(base, 64));
+  EXPECT_EQ(bufD[2], add_ptr(base, 32));
   EXPECT_EQ(bufD[3], nullptr);
-  EXPECT_EQ(bufD[4], add_ptr(base, 128));
-  EXPECT_EQ(bufD[5], add_ptr(base, 192));
-  EXPECT_EQ(bufD[6], add_ptr(base, 256));
+  EXPECT_EQ(bufD[4], add_ptr(base, 64));
+  EXPECT_EQ(bufD[5], add_ptr(base, 128));
+  EXPECT_EQ(bufD[6], add_ptr(base, 160));
   for (int i = 0; i < 7; ++i) {
     const intptr_t size = sizesD[i];
     if (size != -1) {
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 1067b38f93..d82922a359 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -178,7 +178,6 @@ cc_library(
         ":runtime_matmul",
         ":runtime_matmul_mkl",
         ":runtime_single_threaded_conv2d",
-        ":runtime_single_threaded_fft",
         ":runtime_single_threaded_matmul",
         "@llvm//:execution_engine",
         "@llvm//:core",
@@ -517,6 +516,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:framework",
         "//tensorflow/core:framework_lite",
         "//third_party/eigen3",
     ],
@@ -579,22 +579,6 @@ cc_library(
 )
 
 cc_library(
-    name = "runtime_single_threaded_fft",
-    srcs = [
-        "runtime_fft_impl.h",
-        "runtime_single_threaded_fft.cc",
-    ],
-    hdrs = ["runtime_single_threaded_fft.h"],
-    copts = runtime_copts(),
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:framework_lite",
-        "//third_party/eigen3",
-    ],
-)
-
-cc_library(
     name = "runtime_single_threaded_matmul",
     srcs = ["runtime_single_threaded_matmul.cc"],
     hdrs = ["runtime_single_threaded_matmul.h"],
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 54c52bc08f..215405f680 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -51,8 +51,6 @@ extern const char* const kEigenConvF16SymbolName =
 extern const char* const kEigenConvF32SymbolName =
     "__xla_cpu_runtime_EigenConvF32";
 extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft";
-extern const char* const kEigenSingleThreadedFftSymbolName =
-    "__xla_cpu_runtime_EigenSingleThreadedFft";
 extern const char* const kEigenSingleThreadedMatMulF16SymbolName =
     "__xla_cpu_runtime_EigenSingleThreadedMatMulF16";
 extern const char* const kEigenSingleThreadedMatMulF32SymbolName =
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index aa0e967123..1dce6efa5c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -52,7 +52,6 @@ extern const char* const kMKLSingleThreadedMatMulF64SymbolName;
 extern const char* const kEigenConvF16SymbolName;
 extern const char* const kEigenConvF32SymbolName;
 extern const char* const kEigenFftSymbolName;
-extern const char* const kEigenSingleThreadedFftSymbolName;
 extern const char* const kEigenSingleThreadedMatMulF16SymbolName;
 extern const char* const kEigenSingleThreadedMatMulF32SymbolName;
 extern const char* const kEigenSingleThreadedMatMulF64SymbolName;
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 758b8c62b4..2c20be155f 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -1172,13 +1172,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
       {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type,
        int64_type, int64_type, int64_type, int64_type},
       /*isVarArg=*/false);
-
-  bool multi_threaded_eigen =
-      hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
-  const char* fn_name = multi_threaded_eigen
-                            ? runtime::kEigenFftSymbolName
-                            : runtime::kEigenSingleThreadedFftSymbolName;
-
+  const char* fn_name = runtime::kEigenFftSymbolName;
   llvm::Function* fft_func = llvm::cast<llvm::Function>(
       module_->getOrInsertFunction(fn_name, fft_type));
   fft_func->setCallingConv(llvm::CallingConv::C);
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h
index 0bf693edd0..984cb0616e 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h
@@ -21,6 +21,8 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/types.h"
 
 // 'tensorflow' namespace is used so that int64 and other types don't require
@@ -69,9 +71,11 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand,
   in_dims[0] = input_batch;
   Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> out_dims;
   out_dims[0] = input_batch;
+  TensorShape temp_shape{input_batch};
   for (int i = 0; i < FFTRank; i++) {
     in_dims[i + 1] = fft_shape[i];
     out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i];
+    temp_shape.AddDim(fft_shape[i]);
   }
   const Eigen::TensorMap<Eigen::Tensor<float, FFTRank + 1, Eigen::RowMajor>,
                          Eigen::Aligned>
@@ -84,8 +88,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand,
   const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank);
 
   // Compute the full FFT using a temporary tensor.
-  Eigen::Tensor<complex64, FFTRank + 1, Eigen::RowMajor> full_fft(in_dims);
-
+  Tensor temp(DataTypeToEnum<complex64>::v(), temp_shape);
+  auto full_fft = temp.flat_inner_dims<complex64, FFTRank + 1>();
   const Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> zero_start_indices;
   full_fft.device(device) =
       input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(axes);
@@ -108,9 +112,11 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand,
   in_dims[0] = input_batch;
   Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> out_dims;
   out_dims[0] = input_batch;
+  TensorShape temp_shape{input_batch};
   for (int i = 0; i < FFTRank; i++) {
     in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i];
     out_dims[i + 1] = fft_shape[i];
+    temp_shape.AddDim(fft_shape[i]);
   }
   const Eigen::TensorMap<Eigen::Tensor<complex64, FFTRank + 1, Eigen::RowMajor>,
                          Eigen::Aligned>
@@ -123,7 +129,8 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand,
   // region we will slice from input given fft_shape. We slice input to
   // fft_shape on its inner-most dimensions, except the last (which we
   // slice to fft_shape[-1] / 2 + 1).
-  Eigen::Tensor<complex64, FFTRank + 1, Eigen::RowMajor> full_fft(out_dims);
+  Tensor temp(DataTypeToEnum<complex64>::v(), temp_shape);
+  auto full_fft = temp.flat_inner_dims<complex64, FFTRank + 1>();
 
   // Calculate the starting point and range of the source of
   // negative frequency part.
@@ -172,6 +179,7 @@ template <int FFTRank, typename EigenDevice>
 void EigenFftWithRank(const EigenDevice& device, void* out, void* operand,
                       int32 fft_type, int64 input_batch, int64 fft_length0,
                       int64 fft_length1, int64 fft_length2) {
+  CHECK(::xla::FftType_IsValid(fft_type)) << fft_type;
   switch (fft_type) {
     case ::xla::FftType::FFT:
       EigenFftC2C<true, FFTRank, EigenDevice>(
@@ -196,8 +204,7 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand,
           input_batch, fft_length0, fft_length1, fft_length2);
       break;
     default:
-      // Unsupported FFT type
-      abort();
+      LOG(FATAL) << "Unsupported FFT type: " << fft_type;
   }
 }
 
@@ -223,8 +230,7 @@ void EigenFftImpl(const EigenDevice& device, void* out, void* operand,
                                                  fft_length1, fft_length2);
       break;
     default:
-      // Unsupported FFT rank
-      abort();
+      LOG(FATAL) << "Unsupported FFT rank " << fft_rank;
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc
deleted file mode 100644
index 2613ddb127..0000000000
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h"
-
-#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h"
-#include "tensorflow/core/platform/dynamic_annotations.h"
-#include "tensorflow/core/platform/types.h"
-
-using tensorflow::int32;
-using tensorflow::int64;
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft(
-    const void* run_options_ptr, void* out, void* operand, int32 fft_type,
-    int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1,
-    int64 fft_length2) {
-  tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, fft_type,
-                                fft_rank, input_batch, fft_length0, fft_length1,
-                                fft_length2);
-}
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h
deleted file mode 100644
index dcd133d012..0000000000
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_
-
-#include "tensorflow/core/platform/types.h"
-
-extern "C" {
-
-extern void __xla_cpu_runtime_EigenSingleThreadedFft(
-    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out,
-    void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank,
-    tensorflow::int64 input_batch, tensorflow::int64 fft_length0,
-    tensorflow::int64 fft_length1, tensorflow::int64 fft_length2);
-
-}  // extern "C"
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index c4c90515ac..8d8c5e4c44 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -38,7 +38,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h"
-#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -203,7 +202,6 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
-  REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedFft);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64);
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index 2515222cf2..d3bc47e61e 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -204,7 +204,7 @@ class LayoutPattern {
   // Modifies the pattern to match only if the layout equals the given proto.
   // The layout must outlive the returned pattern.
   constexpr LayoutPattern<LayoutType, LayoutPatternEqualImpl<Impl>> EqualTo(
-      const ::xla::Layout* layout) const {
+      const Layout* layout) const {
     return LayoutPattern<LayoutType, LayoutPatternEqualImpl<Impl>>(
         LayoutPatternEqualImpl<Impl>(impl_, layout), matched_layout_);
   }
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc
index 77bdcc9de0..e536c8afbf 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc
@@ -30,17 +30,10 @@ limitations under the License.
 
 namespace xla {
 
-TupleSimplifier::TupleSimplifier(bool exclude_entry_computation) :
-    exclude_entry_computation_(exclude_entry_computation) {}
-
 StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
   // Initially add all GTE and Tuple instructions to the worklist.
   std::queue<HloInstruction*> worklist;
   for (auto* computation : module->computations()) {
-    if (exclude_entry_computation_ &&
-        computation == module->entry_computation()) {
-      continue;
-    }
     for (auto* instruction : computation->instructions()) {
       if (instruction->opcode() == HloOpcode::kTuple ||
           instruction->opcode() == HloOpcode::kGetTupleElement) {
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.h b/tensorflow/compiler/xla/service/tuple_simplifier.h
index 7509501883..e5e9b10b5b 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier.h
+++ b/tensorflow/compiler/xla/service/tuple_simplifier.h
@@ -27,20 +27,13 @@ namespace xla {
 // the module.
 class TupleSimplifier : public HloPassInterface {
  public:
-  TupleSimplifier() : TupleSimplifier(/*exclude_entry_computation=*/false) {}
-  explicit TupleSimplifier(bool exclude_entry_computation);
+  TupleSimplifier() {}
   ~TupleSimplifier() override {}
   tensorflow::StringPiece name() const override { return "tuple-simplifier"; }
 
   // Run tuple simplification on the given computation. Returns whether the
   // computation was changed.
   StatusOr<bool> Run(HloModule* module) override;
-
- private:
-  // When set, this pipeline stage will perform optimization of all computations
-  // apart from the module's entry computation. This is used by Graphcore's
-  // backend.
-  bool exclude_entry_computation_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
index d3635eae81..ca9ae91281 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
@@ -42,12 +42,6 @@ class TupleSimplifierTest : public HloTestBase {
     TF_ASSERT_OK(changed_status.status());
     EXPECT_EQ(change_expected, changed_status.ValueOrDie());
   }
-  void Run(HloModule* module, bool change_expected, bool exclude_entry) {
-    TupleSimplifier simplifier(exclude_entry);
-    auto changed_status = simplifier.Run(module);
-    TF_ASSERT_OK(changed_status.status());
-    EXPECT_EQ(change_expected, changed_status.ValueOrDie());
-  }
 
   const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
   const Shape tuple_shape_ = ShapeUtil::MakeTupleShape(
@@ -217,76 +211,5 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) {
   EXPECT_THAT(computation->root_instruction(), tuple);
 }
 
-TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) {
-  //  Verify that the root computation can be excluded
-  auto module = CreateNewModule();
-
-  HloInstruction* p0;
-  HloInstruction* p1;
-  HloComputation* c0;
-  HloComputation* c1;
-  HloComputation* entry;
-
-  {
-    HloComputation::Builder builder(TestName() + "_1");
-    p0 = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, tuple_shape_, "param"));
-    HloInstruction* gte0 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 0));
-    HloInstruction* gte1 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 1));
-    HloInstruction* gte2 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 2));
-
-    builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2}));
-
-    c0 = module->AddEmbeddedComputation(builder.Build());
-  }
-  {
-    HloComputation::Builder builder(TestName() + "_2");
-    p1 = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, tuple_shape_, "param"));
-    HloInstruction* gte0 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 0));
-    HloInstruction* gte1 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 1));
-    HloInstruction* gte2 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 2));
-
-    builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2}));
-
-    c1 = module->AddEmbeddedComputation(builder.Build());
-  }
-  {
-    HloComputation::Builder builder(TestName() + "_Entry");
-    HloInstruction* tuple_param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, tuple_shape_, "param"));
-    HloInstruction* call0 = builder.AddInstruction(
-        HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c0));
-    HloInstruction* call1 = builder.AddInstruction(
-        HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c1));
-    HloInstruction* gte0 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_shape_, call0, 0));
-    HloInstruction* gte1 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_shape_, call1, 1));
-    HloInstruction* tuple0 =
-        builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
-    HloInstruction* gte2 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 0));
-    HloInstruction* gte3 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 1));
-
-    builder.AddInstruction(HloInstruction::CreateTuple({gte2, gte3}));
-
-    entry = module->AddEntryComputation(builder.Build());
-  }
-
-  Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/ true);
-
-  EXPECT_THAT(c0->root_instruction(), p0);
-  EXPECT_THAT(c1->root_instruction(), p1);
-  EXPECT_THAT(entry->instruction_count(), 9);
-}
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py
index dbdbad8f4c..637e49c082 100644
--- a/tensorflow/contrib/autograph/__init__.py
+++ b/tensorflow/contrib/autograph/__init__.py
@@ -23,7 +23,6 @@ from __future__ import print_function
 
 # TODO(mdan): Bring only the relevant symbols to the top level.
 from tensorflow.contrib.autograph import utils
-from tensorflow.contrib.autograph import operators
 from tensorflow.contrib.autograph.impl.api import convert
 from tensorflow.contrib.autograph.impl.api import converted_call
 from tensorflow.contrib.autograph.impl.api import do_not_convert
@@ -44,8 +43,6 @@ _allowed_symbols = [
     'do_not_convert',
     'to_code',
     'to_graph',
-    # Overloaded operators
-    'operators',
     # Special functions and directives
     'set_element_type',
     'set_loop_options',
diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake
index 2e0a2fcef4..bda5e26f43 100644
--- a/tensorflow/contrib/cmake/tf_c.cmake
+++ b/tensorflow/contrib/cmake/tf_c.cmake
@@ -37,15 +37,13 @@ add_dependencies(
   tf_core_lib
   tf_protos_cc)
 
-if(tensorflow_BUILD_PYTHON_BINDINGS)
-  add_library(tf_c_python_api OBJECT
-    "${tensorflow_source_dir}/tensorflow/c/python_api.cc"
-    "${tensorflow_source_dir}/tensorflow/c/python_api.h"
-  )
-  add_dependencies(
-    tf_c_python_api
-    tf_c
-    tf_core_lib
-    tf_core_framework
-    tf_protos_cc)
-endif()
+add_library(tf_c_python_api OBJECT
+  "${tensorflow_source_dir}/tensorflow/c/python_api.cc"
+  "${tensorflow_source_dir}/tensorflow/c/python_api.h"
+)
+add_dependencies(
+  tf_c_python_api
+  tf_c
+  tf_core_lib
+  tf_core_framework
+  tf_protos_cc)
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index 6c90cf398c..f73da0b8ab 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -155,7 +155,7 @@ if (WIN32)
     set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib")
   endif()
 else (WIN32)
-  set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}")
+  set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so")
 endif (WIN32)
 add_custom_target(tf_extension_ops)
 
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 9244604489..a0c3ddd28b 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -715,7 +715,7 @@ if(WIN32)
   endif()
 else()
   add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
                                      ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
 endif()
 
@@ -832,6 +832,7 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
 add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/)
+
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index 4f957f1e0b..cffe069aa3 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -44,8 +44,7 @@ UNDNAME = "undname.exe"
 DUMPBIN = "dumpbin.exe"
 
 # Exclude if matched
-EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::|Internal|"
-                        r"python_op_gen_internal|grappler")
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::")
 
 # Include if matched before exclude
 INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|"
@@ -57,10 +56,6 @@ INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|"
                            r"tensorflow::ops::internal::Enter|"
                            r"tensorflow::strings::internal::AppendPieces|"
                            r"tensorflow::strings::internal::CatPieces|"
-                           r"tensorflow::errors::Internal|"
-                           r"tensorflow::Tensor::CopyFromInternal|"
-                           r"tensorflow::kernel_factory::"
-                           r"OpKernelRegistrar::InitInternal|"
                            r"tensorflow::io::internal::JoinPathImpl")
 
 # Include if matched after exclude
@@ -69,7 +64,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
                         r"tensorflow::|"
                         r"functor::|"
                         r"\?nsync_|"
-                        r"stream_executor::")
+                        r"perftools::gputools")
 
 # We want to identify data members explicitly in the DEF file, so that no one
 # can implicitly link against the DLL if they use one of the variables exported
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
index 795f1993ba..45760a29ee 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
@@ -151,24 +151,16 @@ class SinhArcsinhBijectorTest(test.TestCase):
         self.assertAllClose(y, bijector.forward(x).eval(), rtol=1e-4, atol=0.)
         self.assertAllClose(x, bijector.inverse(y).eval(), rtol=1e-4, atol=0.)
 
-        # On IBM PPC systems, longdouble (np.float128) is same as double except that it can have more precision.
-        # Type double being of 8 bytes, can't hold square of max of float64 (which is also 8 bytes) and
-        # below test fails due to overflow error giving inf. So this check avoids that error by skipping square
-        # calculation and corresponding assert.
-
-        if np.amax(y) <= np.sqrt(np.finfo(np.float128).max) and \
-           np.fabs(np.amin(y)) <= np.sqrt(np.fabs(np.finfo(np.float128).min)):
-
-          # Do the numpy calculation in float128 to avoid inf/nan.
-          y_float128 = np.float128(y)
-          self.assertAllClose(
-              np.log(np.cosh(
-                  np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt(
-                      y_float128**2 + 1)) -
-              np.log(tailweight),
-              bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
-              rtol=1e-4,
-              atol=0.)
+        # Do the numpy calculation in float128 to avoid inf/nan.
+        y_float128 = np.float128(y)
+        self.assertAllClose(
+            np.log(np.cosh(
+                np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt(
+                    y_float128**2 + 1)) -
+            np.log(tailweight),
+            bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+            rtol=1e-4,
+            atol=0.)
         self.assertAllClose(
             -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
             bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index adf92c27ea..d7909dd5a2 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -106,8 +106,7 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
             target_device=target,
             buffer_size=10,
             container="",
-            shared_name=_generate_shared_name(
-                "contrib_eager_iterator_function_buffer_resource"))
+            shared_name=_generate_shared_name("function_buffer_resource"))
         self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
             handle=self._buffer_resource_handle,
             handle_device=self._device)
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
index 5749f22ac5..4fe3a0e3f3 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
@@ -68,7 +68,7 @@
         "# simply construct the object. Most layers take as a first argument the number\n",
         "# of output dimensions / channels.\n",
         "layer = tf.keras.layers.Dense(100)\n",
-        "# The number of input dimensions is often unnecessary, as it can be inferred\n",
+        "# The number of input dimensions is often unnecessary, as it can be inferred\n",
         "# the first time the layer is used, but it can be provided if you want to \n",
         "# specify it manually, which is useful in some complex models.\n",
         "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))"
@@ -267,7 +267,7 @@
         "  * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n",
         "  * `call`, where you do the forward computation\n",
         "\n",
-        "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified."
+        "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified."
       ]
     },
     {
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index 05bcdac2ca..84a413c791 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -346,8 +346,7 @@ def sequence_numeric_column(
     key,
     shape=(1,),
     default_value=0.,
-    dtype=dtypes.float32,
-    normalizer_fn=None):
+    dtype=dtypes.float32):
   """Returns a feature column that represents sequences of numeric data.
 
   Example:
@@ -371,12 +370,6 @@ def sequence_numeric_column(
     default_value: A single value compatible with `dtype` that is used for
       padding the sparse data into a dense `Tensor`.
     dtype: The type of values.
-    normalizer_fn: If not `None`, a function that can be used to normalize the
-      value of the tensor after `default_value` is applied for parsing.
-      Normalizer function takes the input `Tensor` as its argument, and returns
-      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
-      even though the most common use case of this function is normalization, it
-      can be used for any kind of Tensorflow transformations.
 
   Returns:
     A `_SequenceNumericColumn`.
@@ -390,16 +383,12 @@ def sequence_numeric_column(
   if not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype must be convertible to float. '
                      'dtype: {}, key: {}'.format(dtype, key))
-  if normalizer_fn is not None and not callable(normalizer_fn):
-    raise TypeError(
-        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
 
   return _SequenceNumericColumn(
       key,
       shape=shape,
       default_value=default_value,
-      dtype=dtype,
-      normalizer_fn=normalizer_fn)
+      dtype=dtype)
 
 
 def _assert_all_equal_and_return(tensors, name=None):
@@ -418,7 +407,7 @@ class _SequenceNumericColumn(
     fc._SequenceDenseColumn,
     collections.namedtuple(
         '_SequenceNumericColumn',
-        ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])):
+        ['key', 'shape', 'default_value', 'dtype'])):
   """Represents sequences of numeric data."""
 
   @property
@@ -430,10 +419,7 @@ class _SequenceNumericColumn(
     return {self.key: parsing_ops.VarLenFeature(self.dtype)}
 
   def _transform_feature(self, inputs):
-    input_tensor = inputs.get(self.key)
-    if self.normalizer_fn is not None:
-      input_tensor = self.normalizer_fn(input_tensor)
-    return input_tensor
+    return inputs.get(self.key)
 
   @property
   def _variable_shape(self):
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
index 45d7b74046..ee74cf56dc 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
@@ -28,7 +28,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import monitored_session
 
@@ -948,7 +947,6 @@ class SequenceNumericColumnTest(test.TestCase):
     self.assertEqual((1,), a.shape)
     self.assertEqual(0., a.default_value)
     self.assertEqual(dtypes.float32, a.dtype)
-    self.assertIsNone(a.normalizer_fn)
 
   def test_shape_saved_as_tuple(self):
     a = sfc.sequence_numeric_column('aaa', shape=[1, 2])
@@ -967,10 +965,6 @@ class SequenceNumericColumnTest(test.TestCase):
         ValueError, 'dtype must be convertible to float'):
       sfc.sequence_numeric_column('aaa', dtype=dtypes.string)
 
-  def test_normalizer_fn_must_be_callable(self):
-    with self.assertRaisesRegexp(TypeError, 'must be a callable'):
-      sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable')
-
   def test_get_sequence_dense_tensor(self):
     sparse_input = sparse_tensor.SparseTensorValue(
         # example 0, values [[0.], [1]]
@@ -991,41 +985,6 @@ class SequenceNumericColumnTest(test.TestCase):
       self.assertAllEqual(
           expected_dense_tensor, dense_tensor.eval(session=sess))
 
-  def test_get_sequence_dense_tensor_with_normalizer_fn(self):
-
-    def _increment_two(input_sparse_tensor):
-      return sparse_ops.sparse_add(
-          input_sparse_tensor,
-          sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2))
-      )
-
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0.], [1]]
-        # example 1, [[10.]]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0., 1., 10.),
-        dense_shape=(2, 2))
-
-    # Before _increment_two:
-    #   [[0.], [1.]],
-    #   [[10.], [0.]],
-    # After _increment_two:
-    #   [[2.], [1.]],
-    #   [[10.], [2.]],
-    expected_dense_tensor = [
-        [[2.], [1.]],
-        [[10.], [2.]],
-    ]
-    numeric_column = sfc.sequence_numeric_column(
-        'aaa', normalizer_fn=_increment_two)
-
-    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
-
   def test_get_sequence_dense_tensor_with_shape(self):
     """Tests get_sequence_dense_tensor with shape !=(1,)."""
     sparse_input = sparse_tensor.SparseTensorValue(
diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py
index 484ffee3e7..daba965a98 100644
--- a/tensorflow/contrib/ffmpeg/__init__.py
+++ b/tensorflow/contrib/ffmpeg/__init__.py
@@ -28,6 +28,7 @@ from __future__ import print_function
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio
+from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
 
 from tensorflow.python.util.all_util import remove_undocumented
 
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
index b1b5126d9e..020b5c99c6 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py
 from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
 from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py
+from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import resource_loader
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index dc49383c5c..10d1ecc738 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -119,13 +119,14 @@ from tensorflow.python.framework.smart_cond import smart_cond
 from tensorflow.python.framework.smart_cond import smart_constant_value
 from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
 from tensorflow.python.framework.tensor_spec import TensorSpec
+from tensorflow.python.ops.array_ops import broadcast_to
 from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['nest']
+_allowed_symbols = ['nest', 'broadcast_to']
 _nest_allowed_symbols = [
     'assert_same_structure',
     'is_sequence',
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
index a955e21b72..65cb94b5a4 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -301,8 +301,8 @@ class FusedConv2DBiasActivationTest(test.TestCase):
           conv = tensors[i]
           value = values[i]
           ref_value = ref_values[i]
-          tf_logging.info("expected = ", ref_value)
-          tf_logging.info("actual = ", value)
+          print("expected = ", ref_value)
+          print("actual = ", value)
           tol = 1e-5
           if value.dtype == np.float16:
             tol = 1e-3
@@ -843,8 +843,7 @@ class FusedConvInt8Tests(test.TestCase):
                                                 vertical_stride, padding_type)
     output_width = CalculateConvolvedOutputDim(input_width, filter_width,
                                                horizontal_stride, padding_type)
-    tf_logging.info("output_height=", output_height, ", output_width=", 
-			                 output_width)
+    print("output_height=", output_height, ", output_width=", output_width)
 
     side_input, _, _ = gen_array_ops.quantize_v2(
         random_ops.random_uniform(
@@ -881,8 +880,8 @@ class FusedConvInt8Tests(test.TestCase):
     with self.test_session(
         use_gpu=True, config=NoMemoryOptimizationConfig()) as sess:
       actual_y, expected_y = sess.run([actual, expected])
-      tf_logging.info("actual_y = ", actual_y)
-      tf_logging.info("expected_y = ", expected_y)
+      print("actual_y = ", actual_y)
+      print("expected_y = ", expected_y)
       self.assertTrue(np.array_equal(actual_y, expected_y))
 
   def testFusedConvInt8(self):
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
index 2e5c84704f..6a5d982dc8 100644
--- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
@@ -19,7 +19,7 @@ limitations under the License.
 
 #include "hexagon_controller.h"
 
-#include <stdlib.h>
+#include <malloc.h>
 #include <stdio.h>
 
 #include "adspmsgd.h"
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index 840015a7fa..436c3e1d4c 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -30,7 +30,9 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc
index 8b0ace96cc..106e3b0270 100644
--- a/tensorflow/contrib/lite/examples/minimal/minimal.cc
+++ b/tensorflow/contrib/lite/examples/minimal/minimal.cc
@@ -38,7 +38,7 @@ using namespace tflite;
 
 int main(int argc, char *argv[]) {
   if(argc != 2) {
-    fprintf(stderr, "minimal <tflite model>\n");
+    fprintf(stderr, "Usage: %s <model>\n", argv[0]);
     return 1;
   }
   const char* filename = argv[1];
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 965273f0f0..bb2e615eac 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -128,6 +128,7 @@ TensorFlow operation not listed above are likely unsupported. Notably, the
 following common ops are not supported at the moment:
 
 *   [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space)
+*   [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather)
 *   [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear)
 *   [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh)
 
@@ -305,19 +306,6 @@ Options {
 }
 ```
 
-**GATHER**
-
-```
-Inputs {
-  0: params tensor
-  1: indices tensor
-  2: axis tensor (optional)
-}
-Outputs {
-  0: a tensor with same type as the params tensor.
-}
-```
-
 **GREATER**
 
 ```
diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md
index 26349347fa..5efa70987e 100644
--- a/tensorflow/contrib/lite/java/ovic/README.md
+++ b/tensorflow/contrib/lite/java/ovic/README.md
@@ -2,7 +2,7 @@
 
 This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018)
 
-## Pre-requisite
+## Prerequisites
 
 Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK.
 
@@ -49,7 +49,7 @@ Once you have a submission that follows the instructions from the [competition s
 You can call the validator binary below to verify that your model fits the format requirements. This often helps you to catch size mismatches (e.g. output should be [1, 1001] instead of [1,1,1,1001]). Let say the submission file is located at `/path/to/my_model.lite`, then call:
 
 ```sh
-bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all
+bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all
 bazel-bin/tensorflow/contrib/lite/java/ovic/ovic_validator /path/to/my_model.lite
 ```
 
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 1908f7fa6c..a2f192bbc2 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1934,7 +1934,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
 // The quantization of the input, output arrays is as follows:
 //  - The input activations are quantized as uint8 on the interval
 //    [-1, 127/128].
-//    The rationale for that is that is the natural interval for output
+//    The rationale for that is that that is the natural interval for output
 //    activations (see next point) and these need to be concatenated together.
 //    We could accommodate different ranges by re-scaling, but we empirically
 //    found that setting the input activations range to be [-1, 127/128] in the
@@ -1999,7 +1999,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
 // However, for a fixed-point implementation in 16-bit integers, using 5
 // integer bits to represent the [-16, 16] range would leave only 11
 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
-// representable values. Notice that is higher than the
+// representable values. Notice that that is higher than the
 // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
 // Using [-8, 8] thus seems like the better compromise overall, enjoying
 // an increment of 2.4e-4 between representable values and a worst-case
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index fd90823425..9400e757b9 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -55,7 +55,7 @@ class Interpreter(object):
     elif model_content and not model_path:
       self._interpreter = (
           _interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer(
-              model_content))
+              model_content, len(model_content)))
       if not self._interpreter:
         raise ValueError(
             'Failed to create model from {} bytes'.format(len(model_content)))
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index b283551c45..f705551fcb 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -397,14 +397,9 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile(
 }
 
 InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
-    PyObject* data) {
-  char * buf = nullptr;
-  Py_ssize_t length;
-  if (PY_TO_CPPSTRING(data, &buf, &length) == -1) {
-    return nullptr;
-  }
+    const char* data, size_t len) {
   std::unique_ptr<tflite::FlatBufferModel> model =
-      tflite::FlatBufferModel::BuildFromBuffer(buf, length);
+      tflite::FlatBufferModel::BuildFromBuffer(data, len);
   return model ? new InterpreterWrapper(std::move(model)) : nullptr;
 }
 
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
index cbeb53bee7..b0ed7c4559 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -40,7 +40,8 @@ class InterpreterWrapper {
   static InterpreterWrapper* CreateWrapperCPPFromFile(const char* model_path);
 
   // SWIG caller takes ownership of pointer.
-  static InterpreterWrapper* CreateWrapperCPPFromBuffer(PyObject* data);
+  static InterpreterWrapper* CreateWrapperCPPFromBuffer(const char* data,
+                                                        size_t len);
 
   ~InterpreterWrapper();
   bool AllocateTensors();
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 88dda7290b..0913cd2c5c 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -34,8 +34,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from six import PY3
-
 from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
 from tensorflow.contrib.lite.python import lite_constants as constants
@@ -56,7 +54,6 @@ from tensorflow.python.framework.importer import import_graph_def
 from tensorflow.python.ops.variables import global_variables_initializer
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
-# from tensorflow.python.util.all_util import remove_undocumented
 
 
 class TocoConverter(object):
@@ -206,12 +203,6 @@ class TocoConverter(object):
       except (_text_format.ParseError, DecodeError):
         try:
           print("Ignore 'tcmalloc: large alloc' warnings.")
-
-          if not isinstance(file_content, str):
-            if PY3:
-              file_content = file_content.decode('utf-8')
-            else:
-              file_content = file_content.encode('utf-8')
           _text_format.Merge(file_content, graph_def)
         except (_text_format.ParseError, DecodeError):
           raise ValueError(
@@ -391,5 +382,3 @@ def _freeze_graph(sess, output_tensors):
                                                         output_arrays)
   else:
     return sess.graph_def
-
-# remove_undocumented(__name__)
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 5c7fa09891..e33b430937 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -178,7 +178,7 @@ ArrayDataType ConvertDataType(tensorflow::DataType dtype) {
   else if (dtype == DT_STRING)
     return ArrayDataType::kString;
   else
-    LOG(INFO) << "Unsupported data type in placeholder op: " << dtype;
+    LOG(INFO) << "Unsupported data type in placehoder op: " << dtype;
   return ArrayDataType::kNone;
 }
 
diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc
index de76fd4032..1b21c8bc60 100644
--- a/tensorflow/contrib/lite/toco/toco_port.cc
+++ b/tensorflow/contrib/lite/toco/toco_port.cc
@@ -20,12 +20,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
-namespace std {
-double round(double x) { return ::round(x); }
-}  // namespace std
-#endif
-
 namespace toco {
 namespace port {
 void CopyToBuffer(const string& src, char* dest) {
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
index 17f82b9dd7..5c019cb2bf 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -34,24 +34,6 @@ limitations under the License.
 #define TFLITE_PROTO_NS google::protobuf
 #endif
 
-#ifdef __ANDROID__
-#include <sstream>
-namespace std {
-
-template <typename T>
-std::string to_string(T value)
-{
-    std::ostringstream os ;
-    os << value ;
-    return os.str() ;
-}
-
-#ifdef __ARM_ARCH_7A__
-double round(double x);
-#endif
-}
-#endif
-
 namespace toco {
 namespace port {
 
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index a28fc3a87f..e8c6edd7ba 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -270,7 +270,7 @@ for arch in $archs; do
                         PLATFORM_LDFLAGS=-pthread
                         MKDEP=${CC} -M -std=c++11
                         PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \
-                                   ../../platform/posix/src/per_thread_waiter.c \
+                                   ../../platform/c++11/src/per_thread_waiter.cc \
                                    ../../platform/c++11/src/yield.cc \
                                    ../../platform/c++11/src/time_rep_timespec.cc \
                                    ../../platform/c++11/src/nsync_panic.cc
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 48953e2e38..eff9081e35 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -27,7 +27,9 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index a6be2084aa..2ed99d50a4 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -2503,7 +2503,7 @@ def _compute_recall_at_precision(tp, fp, fn, precision, name):
     name: An optional variable_scope name.
 
   Returns:
-    The recall at a given `precision`.
+    The recall at the given `precision`.
   """
   precisions = math_ops.div(tp, tp + fp + _EPSILON)
   tf_index = math_ops.argmin(
diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.h b/tensorflow/contrib/mpi_collectives/kernels/ring.h
index c001615d3f..1d56d588bc 100644
--- a/tensorflow/contrib/mpi_collectives/kernels/ring.h
+++ b/tensorflow/contrib/mpi_collectives/kernels/ring.h
@@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI();
  *  has the fully accumulated Segment 1; and so on. The scatter-reduce is
  * complete.
  *
- *  Next, the allgather distributes these fully accumulated chunks across all
+ *  Next, the allgather distributes these fully accumulated chunks across all
  * nodes. Communication proceeds in the same ring, once again in N-1 steps. At
  * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i).
  * For example, at the first iteration, the following transfers will occur:
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index 915e6504e1..21bf3f5313 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -224,10 +224,8 @@ class AdaMaxOptimizerTest(test.TestCase):
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0),
-                                             rtol=1e-2)
-          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1),
-                                             rtol=1e-2)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
           if use_resource:
             self.assertEqual("var0_%d/AdaMax:0" % (i,),
                              opt.get_slot(var=var0, name="m").name)
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
index b6b10e500b..a7c97a1da2 100644
--- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
@@ -62,7 +62,7 @@ class ModelAverageCustomGetter(object):
   """
 
   def __init__(self, worker_device):
-    """Create a new `ModelAverageCustomGetter`.
+    """Create a new `ModelAverageCustomGetter`.
 
     Args:
       worker_device: String.  Name of the `worker` job.
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
index aad1ca04c5..6ca7fe8b6e 100644
--- a/tensorflow/contrib/periodic_resample/BUILD
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -6,13 +6,12 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
+    "py_test",
     "tf_gen_op_libs",
     "tf_custom_op_library",
     "tf_custom_op_py_library",
     "tf_gen_op_wrapper_py",
 )
-load("//tensorflow:tensorflow.bzl", "py_test")
 
 cc_library(
     name = "all_ops",
@@ -85,23 +84,6 @@ py_test(
         ":init_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradient_checker",
-    ],
-)
-
-tf_cc_test(
-    name = "periodic_resample_op_cc_test",
-    size = "small",
-    srcs = [
-        "ops/array_ops_test.cc",
-    ],
-    deps = [
-        ":all_ops",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_proto",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
     ],
 )
 
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
index 514689cf45..e18923c8aa 100644
--- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
@@ -22,9 +22,4 @@ namespace tensorflow {
 REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU),
                         PeriodicResampleOp);
 
-
-REGISTER_KERNEL_BUILDER(Name("PeriodicResampleOpGrad")
-                            .Device(DEVICE_CPU),
-                        PeriodicResampleOpGrad);
-
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
index 42fba81a5c..3ab588c458 100644
--- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
@@ -25,202 +25,92 @@
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/util/work_sharder.h"
 
 namespace {
 
-// Computes input tensor index for given output index during forward
-// propagation through periodic_resample operation.
-class InputIndexer {
- public:
-  InputIndexer(const std::vector<tensorflow::int64>& output_dimensions,
-               const tensorflow::TensorShape& input_shape,
-               int adjustable_dimension)
-      : output_dimensions_(output_dimensions),
-        adjustable_dimension_(adjustable_dimension),
-        rank_(input_shape.dims()),
-        linear_output_index_(0),
-        linear_input_index_(0),
-        adjustable_dimension_carriage_sum_(0) {
-    auto input_dimensions = TensorShapeToVector(input_shape);
-    // factors by which input_dimensions increases/decreases w.r.t.
-    // output_dimensions
-    dimension_ceiling_ =
-        ComputeDimensionCeiling(output_dimensions, input_dimensions);
-    cumulative_dimensions_ = ComputeCumulativeDimensions();
-
-    output_indices_.resize(output_dimensions_.size());
-    input_indices_.resize(output_dimensions_.size());
-
-    // Compute index_factors
-    index_factors_.resize(rank_);
-    tensorflow::int64 last_index_factor = 1;
-    for (auto r = rank_ - 1; r >= 0; --r) {
-      index_factors_[r] = last_index_factor;
-      last_index_factor *= input_dimensions[r];
-    }
-  }
-
-  tensorflow::int64 linear_input_index() const { return linear_input_index_; }
-
-  void MoveToOutputIndex(tensorflow::int64 output_index);
-  void IncrementOutputIndex();
-
- private:
-  void RecomputeInputAdjustableDimensionIndex() {
-    tensorflow::int64 index = adjustable_dimension_carriage_sum_;
-    index *= output_dimensions_[adjustable_dimension_];
-    index += output_indices_[adjustable_dimension_];
-    input_indices_[adjustable_dimension_] = index;
-  }
-
-  std::vector<tensorflow::int64> TensorShapeToVector(
-      const tensorflow::TensorShape& tensor_shape);
-
-  std::vector<tensorflow::int64> ComputeDimensionCeiling(
-      const std::vector<tensorflow::int64>& output_dimensions,
-      const std::vector<tensorflow::int64>& input_dimensions);
-
-  std::vector<tensorflow::int64> ComputeCumulativeDimensions();
-
-  const std::vector<tensorflow::int64> output_dimensions_;
-  std::vector<tensorflow::int64> dimension_ceiling_;
-  std::vector<tensorflow::int64> index_factors_;
-  std::vector<tensorflow::int64> cumulative_dimensions_;
-  std::vector<tensorflow::int64> output_indices_;
-  std::vector<tensorflow::int64> input_indices_;
-
-  const int adjustable_dimension_;
-  const int rank_;
-  tensorflow::int64 linear_output_index_;
-  tensorflow::int64 linear_input_index_;
-  tensorflow::int64 adjustable_dimension_carriage_sum_;
-};
-
-void InputIndexer::MoveToOutputIndex(tensorflow::int64 output_index) {
-  linear_output_index_ = output_index;
-  linear_input_index_ = 0;
+template <class IndexVecT, class IndexT>
+IndexT compute_input_index(
+    IndexVecT* target_dimensions, const IndexT& output_index,
+    const IndexVecT& original_dimensions, const int& adjustable_dimension,
+    const std::vector<tensorflow::int64>& dimension_ceiling,
+    const std::vector<tensorflow::int64>& cumulative_dimensions, IndexT* result,
+    std::vector<IndexT>* output_indices, const int& rank) {
+  *result = 0;
+  output_indices->clear();
 
   // un-rasterize the output index
   auto last_reduced_i = output_index;
-  for (auto r = rank_ - 1; r >= 0; --r) {
-    output_indices_[r] = last_reduced_i % output_dimensions_[r];
+  for (auto r = rank - 1; r >= 0; --r) {
+    (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r];
     last_reduced_i =
-        (last_reduced_i - output_indices_[r]) / output_dimensions_[r];
+        (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r];
   }
 
-  tensorflow::int64 carriage_sum = 0;
-  for (int qi = 0; qi < rank_; ++qi) {
-    if (qi == adjustable_dimension_) continue;
-    carriage_sum += cumulative_dimensions_[qi] *
-                    (output_indices_[qi] % dimension_ceiling_[qi]);
-  }
-  adjustable_dimension_carriage_sum_ = carriage_sum;
-
   // rasterize the input index
-  for (auto r = rank_ - 1; r >= 0; --r) {
-    if (r != adjustable_dimension_) {
-      input_indices_[r] = output_indices_[r] / dimension_ceiling_[r];
-    } else {
-      RecomputeInputAdjustableDimensionIndex();
-    }
-  }
-  for (auto r = rank_ - 1; r >= 0; --r) {
-    linear_input_index_ += index_factors_[r] * input_indices_[r];
-  }
-}
-
-void InputIndexer::IncrementOutputIndex() {
-  linear_output_index_++;
-  for (auto r = rank_ - 1; r >= 0; --r) {
-    auto old_carriage_sum_increment =
-        cumulative_dimensions_[r] *
-        (output_indices_[r] % dimension_ceiling_[r]);
-    output_indices_[r] = (output_indices_[r] + 1) % output_dimensions_[r];
-    if (r != adjustable_dimension_) {
-      auto new_input_index = output_indices_[r] / dimension_ceiling_[r];
-      linear_input_index_ +=
-          (new_input_index - input_indices_[r]) * index_factors_[r];
-
-      input_indices_[r] = new_input_index;
-
-      auto new_carriage_sum_increment =
-          cumulative_dimensions_[r] *
-          (output_indices_[r] % dimension_ceiling_[r]);
-
-      adjustable_dimension_carriage_sum_ = adjustable_dimension_carriage_sum_ -
-                                           old_carriage_sum_increment +
-                                           new_carriage_sum_increment;
-    }
-
-    if (output_indices_[r] != 0) {
-      // No more carries to higher indices.
-      break;
+  IndexT last_index_factor = 1;
+  for (auto r = rank - 1; r >= 0; --r) {
+    IndexT index = 0;
+    if (r != adjustable_dimension)
+      index = (*output_indices)[r] / dimension_ceiling[r];
+    else {
+      for (int qi = 0; qi < rank; ++qi) {
+        if (qi == adjustable_dimension) continue;
+        index += cumulative_dimensions[qi] *
+                 ((*output_indices)[qi] % dimension_ceiling[qi]);
+      }
+      index *= (*target_dimensions)[adjustable_dimension];
+      index += (*output_indices)[r];
     }
+    *result += last_index_factor * index;
+    last_index_factor *= original_dimensions[r];
   }
-  auto old_adjustable_dimension_input_index =
-      input_indices_[adjustable_dimension_];
-  RecomputeInputAdjustableDimensionIndex();
-  linear_input_index_ += (input_indices_[adjustable_dimension_] -
-                           old_adjustable_dimension_input_index) *
-                          index_factors_[adjustable_dimension_];
-}
 
-std::vector<tensorflow::int64> InputIndexer::TensorShapeToVector(
-    const tensorflow::TensorShape& tensor_shape) {
-  std::vector<tensorflow::int64> result(tensor_shape.dims());
-  int count = 0;
-  for (const auto dim_info : tensor_shape) {
-    result[count] = dim_info.size;
-    ++count;
-  }
-  return result;
+  return *result;
 }
 
-std::vector<tensorflow::int64> InputIndexer::ComputeDimensionCeiling(
-    const std::vector<tensorflow::int64>& output_dimensions,
-    const std::vector<tensorflow::int64>& input_dimensions) {
-  std::vector<tensorflow::int64> dimension_ceiling(input_dimensions.size());
-  for (size_t i = 0; i < input_dimensions.size(); ++i) {
-    dimension_ceiling[i] = (output_dimensions[i] + input_dimensions[i] - 1) /
-        input_dimensions[i];
-  }
-  return dimension_ceiling;
-}
+template <class InputDataT,
+          class IndexVecT>  // both types are needed here b/c IndexVecT and
+                            // InputDataT are not related
+                            void
+                            fill_periodic_tensor(
+                                tensorflow::OpKernelContext* context,
+                                const IndexVecT& desired_shape,
+                                const tensorflow::Tensor& input_tensor) {
+  // input is a strided array (last index is fastest, C-ordered)
+  auto input = input_tensor.flat<InputDataT>();
+  const int rank = input_tensor.dims();
+  // original and target dimensions
+  std::vector<tensorflow::int64> original_dimensions(rank),
+      target_dimensions(rank);
+  tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1);
+  // factors by which original_dimensions increases/decreases w.r.t.
+  // target_dimensions
+  std::vector<tensorflow::int64> dimension_ceiling(rank),
+      cumulative_dimensions(rank);
+  // index of adjustable dimension
+  int adjustable_dimension;
+  tensorflow::TensorShape output_shape;
 
-std::vector<tensorflow::int64> InputIndexer::ComputeCumulativeDimensions() {
-  std::vector<tensorflow::int64> cumulative_dimensions(rank_);
-  int count = 0;
-  for (int i = 0; i < rank_; ++i) {
-    if (count == 0) {
-      cumulative_dimensions[count] = 1;
-    } else {
-      cumulative_dimensions[count] =
-          cumulative_dimensions[count - 1] * dimension_ceiling_[count - 1];
-    }
-    ++count;
-  }
-  return cumulative_dimensions;
-}
+  // requires that the rank of the input tensor and length of the desired shape
+  // are equal
+  OP_REQUIRES(context, rank == desired_shape.size(),
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample expects the rank of the input tensor, ",
+                  rank, ", to be the same as the length of the desired shape, ",
+                  desired_shape.size(), "."));
 
-template <typename IndexVecT>
-void process_desired_shape(tensorflow::OpKernelContext* context,
-                           const tensorflow::TensorShape& input_tensor_shape,
-                           const IndexVecT& desired_shape,
-                           int* adjustable_dimension,
-                           std::vector<tensorflow::int64>* target_dimensions,
-                           tensorflow::int64* output_size) {
-  tensorflow::int64 new_sliced_size = 1;
   bool found = false;
-  const int rank = input_tensor_shape.dims();
+  const auto& input_tensor_shape = input_tensor.shape();
+
   for (int i = 0; i < rank; ++i) {
+    // if (desired_shape(i) < 1) {
     if (desired_shape[i] < 1) {
       // only one index can be adjustable
       OP_REQUIRES(context, !found,
                   tensorflow::errors::InvalidArgument(
                       "periodic_resample expects only "
                       "one index to be marked as adjustable."));
-      *adjustable_dimension = i;
+      adjustable_dimension = i;
       found = true;
     } else {
       OP_REQUIRES(
@@ -232,8 +122,9 @@ void process_desired_shape(tensorflow::OpKernelContext* context,
               i, " input tensor has size ", input_tensor_shape.dim_size(i),
               ", desired shape has size ", desired_shape[i], "."));
 
-      (*target_dimensions)[i] = desired_shape[i];
-      new_sliced_size *= (*target_dimensions)[i];
+      // target_dimensions[i] = desired_shape(i);
+      target_dimensions[i] = desired_shape[i];
+      new_sliced_size *= target_dimensions[i];
     }
   }
   // at least one index needs to be adjustable
@@ -241,50 +132,26 @@ void process_desired_shape(tensorflow::OpKernelContext* context,
               tensorflow::errors::InvalidArgument(
                   "periodic_resample expects at least "
                   "one index to be marked as adjustable."));
-  (*target_dimensions)[*adjustable_dimension] =
-      input_tensor_shape.num_elements() / new_sliced_size;
-
-  *output_size = new_sliced_size * (*target_dimensions)[*adjustable_dimension];
-}
-
-// Heuristic number based on measurements on
-// Intel(R) Core(TM) i7-4930K CPU @ 3.40GHz
-const tensorflow::int64 costPerFillIndex = 35;
 
-enum class Mode {
-  kForward,
-  kGradient
-};
-
-// Computes either periodic_resample operation output or gradients for it,
-// depending on |mode|.
-// |original_shape| is always shape of input to periodic_resample operation.
-// |source_tensor| is either source for periodic_resample (for forward mode)
-//     or gradients tensor.
-// |desired_shape| is always shape, provided by user, to which forward
-//     propagation attempts resample input tensor.
-template <class InputDataT, Mode mode>
-void
-do_periodic_resample_op(tensorflow::OpKernelContext* context,
-                        const tensorflow::TensorShape& original_shape,
-                        const tensorflow::PartialTensorShape& desired_shape,
-                        const tensorflow::Tensor& source_tensor) {
-  const int rank = source_tensor.dims();
+  int count = 0;
+  for (const auto dim_info : input_tensor.shape()) {
+    original_dimensions[count] = dim_info.size;
+    ++count;
+  }
 
-  // requires that the rank of the input tensor and length of the desired shape
-  // are equal
-  OP_REQUIRES(context, rank == desired_shape.dims(),
-              tensorflow::errors::InvalidArgument(
-                  "periodic_resample expects the rank of the input tensor, ",
-                  rank, ", to be the same as the length of the desired shape, ",
-                  desired_shape.dims(), "."));
+  target_dimensions[adjustable_dimension] = total_size / new_sliced_size;
 
-  std::vector<tensorflow::int64> target_dimensions(rank);
-  tensorflow::int64 new_size = 0;
-  // index of adjustable dimension
-  int adjustable_dimension = 0;
-  process_desired_shape(context, original_shape, desired_shape.dim_sizes(),
-                        &adjustable_dimension, &target_dimensions, &new_size);
+  count = 0;
+  for (int i = 0; i < input_tensor.shape().dims(); ++i) {
+    dimension_ceiling[count] = tensorflow::int64(std::ceil(
+        float(target_dimensions[count]) / float(original_dimensions[count])));
+    if (count == 0)
+      cumulative_dimensions[count] = 1;
+    else
+      cumulative_dimensions[count] =
+          cumulative_dimensions[count - 1] * dimension_ceiling[count - 1];
+    ++count;
+  }
 
   // ensure that the new dimension is greater than zero
   OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0,
@@ -293,14 +160,11 @@ do_periodic_resample_op(tensorflow::OpKernelContext* context,
                   "adjustable dimension, ",
                   adjustable_dimension, ", isn't greater than zero, ",
                   target_dimensions[adjustable_dimension], "."));
-  tensorflow::TensorShape output_shape;
-  if (mode == Mode::kForward) {
-    for (int i = 0; i < rank; ++i) {
-      output_shape.AddDim(target_dimensions[i]);
-    }
-  } else {
-    output_shape = original_shape;
+  for (int i = 0; i < rank; ++i) {
+    output_shape.AddDim(target_dimensions[i]);
   }
+  const auto new_size =
+      new_sliced_size * target_dimensions[adjustable_dimension];
 
   // Create an output tensor and attach it to the current context
   tensorflow::Tensor* output_tensor = nullptr;
@@ -308,73 +172,47 @@ do_periodic_resample_op(tensorflow::OpKernelContext* context,
                  context->allocate_output(0, output_shape, &output_tensor));
   auto output = output_tensor->flat<InputDataT>();
 
-  // input is a strided array (last index is fastest, C-ordered)
-  auto input = source_tensor.flat<InputDataT>();
+  // memory is allocated for these variables outside the inner loop for
+  // efficiency (although, I could create a separate class scope for
+  // this purpose instead)
+  tensorflow::int64 result = 0;
+  std::vector<tensorflow::int64> output_indices(target_dimensions.size());
 
   // Fill output tensor with periodically resampled input tensor values
-  InputIndexer input_indexer(target_dimensions, original_shape,
-                             adjustable_dimension);
-
-  auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-  auto fill_output_tensor = [&input_indexer, &output, &input](
-      tensorflow::int64 start, tensorflow::int64 limit) {
-    InputIndexer local_indexer(input_indexer);
-    local_indexer.MoveToOutputIndex(start);
-    for (tensorflow::int64 output_index = start; output_index < limit;
-         ++output_index) {
-      if (mode == Mode::kForward) {
-        output(output_index) = input(local_indexer.linear_input_index());
-      } else {
-        output(local_indexer.linear_input_index()) = input(output_index);
-      }
-      local_indexer.IncrementOutputIndex();
-    }
-  };
-  ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
-                      new_size, costPerFillIndex, fill_output_tensor);
-}
-
-#define DATA_TYPE_SWITCH(data_type, context, CASE)                            \
-  switch (data_type) {                                                        \
-    CASE(float)                                                               \
-    CASE(double)                                                              \
-    CASE(tensorflow::int32)                                                   \
-    CASE(tensorflow::int64)                                                   \
-    default:                                                                  \
-      context->CtxFailure(__FILE__, __LINE__,                                 \
-          tensorflow::errors::InvalidArgument(                                \
-              "Unsuppored tensor elements type"));                            \
-      break;                                                                  \
+  for (tensorflow::int64 output_index = 0; output_index < new_size;
+       ++output_index) {
+    output(output_index) = input(compute_input_index(
+        &target_dimensions, output_index, original_dimensions,
+        adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result,
+        &output_indices, rank));
   }
+}
 
 void create_output_tensor(
     tensorflow::OpKernelContext* context,
     const tensorflow::Tensor& input_tensor,
     const tensorflow::DataType& input_tensor_type,
-    const tensorflow::PartialTensorShape& desired_shape) {
-#define CASE(type)                                                            \
-    case tensorflow::DataTypeToEnum<type>::value:                             \
-      do_periodic_resample_op<type, Mode::kForward>(                          \
-          context, input_tensor.shape(), desired_shape, input_tensor);        \
-      break;
+    const tensorflow::PartialTensorShape& desired_shape_tensor) {
+  auto desired_shape = desired_shape_tensor.dim_sizes();
 
-  DATA_TYPE_SWITCH(input_tensor_type, context, CASE);
-#undef CASE
-}
-
-void create_grad_tensor(tensorflow::OpKernelContext* context,
-                        const tensorflow::Tensor& grad_tensor,
-                        const tensorflow::DataType& grad_tensor_type,
-                        const tensorflow::TensorShape& original_shape,
-                        const tensorflow::PartialTensorShape& desired_shape) {
-#define CASE(type)                                                            \
-    case tensorflow::DataTypeToEnum<type>::value:                             \
-      do_periodic_resample_op<type, Mode::kGradient>(                         \
-          context, original_shape, desired_shape, grad_tensor);               \
+  // obligatory type switch
+  switch (input_tensor_type) {
+    case tensorflow::DataTypeToEnum<float>::value:
+      fill_periodic_tensor<float>(context, desired_shape, input_tensor);
       break;
-
-  DATA_TYPE_SWITCH(grad_tensor_type, context, CASE);
-#undef CASE
+    case tensorflow::DataTypeToEnum<double>::value:
+      fill_periodic_tensor<double>(context, desired_shape, input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<tensorflow::int32>::value:
+      fill_periodic_tensor<tensorflow::int32>(context, desired_shape,
+                                              input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<tensorflow::int64>::value:
+      fill_periodic_tensor<tensorflow::int64>(context, desired_shape,
+                                              input_tensor);
+      break;
+    default:;
+  }
 }
 
 }  // namespace
@@ -400,25 +238,4 @@ class PeriodicResampleOp : public tensorflow::OpKernel {
   tensorflow::PartialTensorShape desired_shape;
 };
 
-class PeriodicResampleOpGrad : public tensorflow::OpKernel {
- public:
-  explicit PeriodicResampleOpGrad(tensorflow::OpKernelConstruction* context)
-      : tensorflow::OpKernel(context) {
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("original_shape", &original_shape));
-    OP_REQUIRES_OK(context, context->GetAttr("desired_shape", &desired_shape));
-  }
-
-  void Compute(tensorflow::OpKernelContext* context) override {
-    const tensorflow::Tensor& grad_tensor = context->input(0);
-    const tensorflow::DataType grad_tensor_type = context->input_dtype(0);
-    create_grad_tensor(context, grad_tensor, grad_tensor_type, original_shape,
-                       desired_shape);
-  }
-
- private:
-  tensorflow::TensorShape original_shape;
-  tensorflow::PartialTensorShape desired_shape;
-};
-
 #endif  // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
index fd38cd09b4..82bd796956 100644
--- a/tensorflow/contrib/periodic_resample/ops/array_ops.cc
+++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
@@ -26,42 +26,7 @@ REGISTER_OP("PeriodicResample")
     .Input("values: T")
     .Attr("shape: shape")
     .Output("output: T")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      tensorflow::PartialTensorShape desired_shape;
-      TF_RETURN_IF_ERROR(c->GetAttr("shape", &desired_shape));
-      shape_inference::ShapeHandle input_tensor_shape = c->input(0);
-      shape_inference::DimensionHandle num_input_elements =
-          c->NumElements(input_tensor_shape);
-      shape_inference::ShapeHandle result_shape_handle;
-      if (!shape_inference::InferenceContext::ValueKnown(num_input_elements)) {
-        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
-            desired_shape, &result_shape_handle));
-      } else {
-        const int rank = c->Rank(input_tensor_shape);
-        std::vector<tensorflow::int64> target_dimensions(rank);
-        tensorflow::int64 new_sliced_size = 1;
-        int adjustable_dimension = 0;
-        for (int i = 0; i < rank; ++i) {
-          if (desired_shape.dim_size(i) < 1) {
-            adjustable_dimension = i;
-          } else {
-            target_dimensions[i] = desired_shape.dim_size(i);
-            new_sliced_size *= target_dimensions[i];
-          }
-        }
-        target_dimensions[adjustable_dimension] =
-            shape_inference::InferenceContext::Value(
-                num_input_elements) / new_sliced_size;
-        tensorflow::TensorShape result_shape;
-        for (int i = 0; i < rank; ++i) {
-          result_shape.AddDim(target_dimensions[i]);
-        }
-        TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(
-            result_shape, &result_shape_handle));
-      }
-      c->set_output(0, result_shape_handle);
-      return Status::OK();
-    })
+    .SetShapeFn(shape_inference::ExplicitShape)
     .Doc(R"doc(
 Periodically resample elements of a tensor to conform to `shape`.
 
@@ -136,20 +101,4 @@ output: Periodically resampled tensor that has dimensions specified as in
 
 )doc");
 
-
-REGISTER_OP("PeriodicResampleOpGrad")
-    .Attr("T: numbertype")
-    .Input("grad: T")
-    .Attr("original_shape: shape")
-    .Attr("desired_shape: shape")
-    .Output("grad_values: T")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      tensorflow::TensorShape original_shape;
-      TF_RETURN_IF_ERROR(c->GetAttr("original_shape", &original_shape));
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(original_shape, &s));
-      c->set_output(0, s);
-      return Status::OK();
-});
-
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc
deleted file mode 100644
index 43b7c1799f..0000000000
--- a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/node_def_builder.h"
-#include "tensorflow/core/framework/shape_inference_testutil.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-
-TEST(ArrayOpsTest, PeriodicResample_ShapeFn) {
-  ShapeInferenceTestOp op("PeriodicResample");
-  // Case 1: output shape can be fully inferreed.
-  PartialTensorShape shape({4, 4, -1});
-  TensorShapeProto shape_proto;
-  shape.AsProto(&shape_proto);
-
-  TF_ASSERT_OK(NodeDefBuilder("test", "PeriodicResample")
-                   .Input({"values", 0, DT_INT32})
-                   .Attr("shape", shape_proto)
-                   .Finalize(&op.node_def));
-  INFER_OK(op, "[2,2,4]", "[4,4,1]");
-  // Case 2: output shape can not be inferred - report desired shape.
-  INFER_OK(op, "[2,2,?]", "[4,4,?]");
-}
-
-}  // end namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
index 31a6fe1d94..a25de55e18 100644
--- a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
+++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
@@ -21,11 +21,8 @@ from __future__ import print_function
 import numpy
 
 from tensorflow.contrib.periodic_resample import periodic_resample
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
@@ -96,6 +93,7 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase):
   def testPeriodicResampleErrors(self):
     input_tensor = numpy.zeros(shape=[1, 2, 2, 4])
     with self.test_session():
+      variables.global_variables_initializer().run()
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
           'Dimension 3 input tensor has size 4, desired shape has size 1'):
@@ -105,29 +103,6 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase):
           '4, to be the same as the length of the desired shape, 3'):
         periodic_resample(input_tensor, [None, 4, 4]).eval()
 
-  def testPeriodicResampleGradient(self):
-    desired_shape = numpy.array([4, 4, None])
-    result_shape = (4, 4, 1)
-    input_shape = (2, 2, 4)
-    with self.test_session() as sess:
-      x = array_ops.placeholder(dtypes.float32, shape=input_shape)
-      output = periodic_resample(x, desired_shape)
-      error = gradient_checker.compute_gradient_error(
-          x, input_shape, output, result_shape)
-      self.assertLess(error, 1e-4)
-
-  def testPeriodicResampleShapeInference(self):
-    with self.test_session() as sess:
-      # Case 1: output shape can be fully inferreed.
-      x = array_ops.placeholder(dtypes.float32, shape=(2, 2, 4))
-      output = periodic_resample(x, [4, 4, None])
-      self.assertEqual(output.shape, [4, 4, 1])
-      # Case 2: output shape can not be inferred - report desired shape.
-      x = array_ops.placeholder(dtypes.float32, shape=(2, 2, None))
-      output = periodic_resample(x, [4, 4, None])
-      self.assertTrue(output.shape.is_compatible_with([4, 4, None]))
-      self.assertEqual(output.shape[2].value, None)
-
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
index 470e300ccb..348623d8f8 100644
--- a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
+++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
@@ -21,17 +21,11 @@ from __future__ import print_function
 # pylint: disable=unused-import
 from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op
 
-from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample, periodic_resample_op_grad
+from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample
 
 from tensorflow.contrib.util import loader
-from tensorflow.python.framework import ops
 from tensorflow.python.platform import resource_loader
 # pylint: enable=unused-import
 
 _periodic_resample_op = loader.load_op_library(
     resource_loader.get_path_to_datafile('_periodic_resample_op.so'))
-
-@ops.RegisterGradient("PeriodicResample")
-def _periodic_resample_grad_cc(op, grad):
-  return periodic_resample_op_grad(
-      grad, op.inputs[0].shape, op.get_attr('shape'))
diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor.py b/tensorflow/contrib/predictor/contrib_estimator_predictor.py
index af3b2ad1b5..b7a98c68e2 100644
--- a/tensorflow/contrib/predictor/contrib_estimator_predictor.py
+++ b/tensorflow/contrib/predictor/contrib_estimator_predictor.py
@@ -34,8 +34,7 @@ class ContribEstimatorPredictor(predictor.Predictor):
                prediction_input_fn,
                input_alternative_key=None,
                output_alternative_key=None,
-               graph=None,
-               config=None):
+               graph=None):
     """Initialize a `ContribEstimatorPredictor`.
 
     Args:
@@ -49,7 +48,6 @@ class ContribEstimatorPredictor(predictor.Predictor):
         multi-headed models.
       graph: Optional. The Tensorflow `graph` in which prediction should be
         done.
-      config: `ConfigProto` proto used to configure the session.
     """
     self._graph = graph or ops.Graph()
     with self._graph.as_default():
@@ -60,7 +58,6 @@ class ContribEstimatorPredictor(predictor.Predictor):
       checkpoint_path = saver.latest_checkpoint(estimator.model_dir)
       self._session = monitored_session.MonitoredSession(
           session_creator=monitored_session.ChiefSessionCreator(
-              config=config,
               checkpoint_filename_with_path=checkpoint_path))
 
     input_alternative_key = (
diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py
index a725072e72..d78d94c269 100644
--- a/tensorflow/contrib/predictor/core_estimator_predictor.py
+++ b/tensorflow/contrib/predictor/core_estimator_predictor.py
@@ -51,8 +51,7 @@ class CoreEstimatorPredictor(predictor.Predictor):
                estimator,
                serving_input_receiver_fn,
                output_key=None,
-               graph=None,
-               config=None):
+               graph=None):
     """Initialize a `CoreEstimatorPredictor`.
 
     Args:
@@ -63,7 +62,6 @@ class CoreEstimatorPredictor(predictor.Predictor):
         `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used.
       graph: Optional. The Tensorflow `graph` in which prediction should be
         done.
-      config: `ConfigProto` proto used to configure the session.
     """
     self._graph = graph or ops.Graph()
     with self._graph.as_default():
@@ -73,7 +71,6 @@ class CoreEstimatorPredictor(predictor.Predictor):
       checkpoint_dir = estimator.model_dir
       self._session = monitored_session.MonitoredSession(
           session_creator=monitored_session.ChiefSessionCreator(
-              config=config,
               checkpoint_dir=checkpoint_dir))
 
     feed_tensor_info = signature_def.inputs
diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py
index f275bc15ad..6e77e934fe 100644
--- a/tensorflow/contrib/predictor/predictor_factories.py
+++ b/tensorflow/contrib/predictor/predictor_factories.py
@@ -30,8 +30,7 @@ def from_contrib_estimator(estimator,
                            prediction_input_fn,
                            input_alternative_key=None,
                            output_alternative_key=None,
-                           graph=None,
-                           config=None):
+                           graph=None):
   """Constructs a `Predictor` from a `tf.contrib.learn.Estimator`.
 
   Args:
@@ -45,7 +44,6 @@ def from_contrib_estimator(estimator,
       multi-headed models.
     graph: Optional. The Tensorflow `graph` in which prediction should be
       done.
-    config: `ConfigProto` proto used to configure the session.
 
   Returns:
     An initialized `Predictor`.
@@ -64,15 +62,13 @@ def from_contrib_estimator(estimator,
       prediction_input_fn,
       input_alternative_key=input_alternative_key,
       output_alternative_key=output_alternative_key,
-      graph=graph,
-      config=config)
+      graph=graph)
 
 
 def from_estimator(estimator,
                    serving_input_receiver_fn,
                    output_key=None,
-                   graph=None,
-                   config=None):
+                   graph=None):
   """Constructs a `Predictor` from a `tf.python.estimator.Estimator`.
 
   Args:
@@ -83,7 +79,6 @@ def from_estimator(estimator,
       `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used.
     graph: Optional. The Tensorflow `graph` in which prediction should be
       done.
-    config: `ConfigProto` proto used to configure the session.
 
   Returns:
     An initialized `Predictor`.
@@ -98,19 +93,14 @@ def from_estimator(estimator,
                     'tf.contrib.learn.Estimator. You likely want to call '
                     'from_contrib_estimator.')
   return core_estimator_predictor.CoreEstimatorPredictor(
-      estimator,
-      serving_input_receiver_fn,
-      output_key=output_key,
-      graph=graph,
-      config=config)
+      estimator, serving_input_receiver_fn, output_key=output_key, graph=graph)
 
 
 def from_saved_model(export_dir,
                      signature_def_key=None,
                      signature_def=None,
                      tags=None,
-                     graph=None,
-                     config=None):
+                     graph=None):
   """Constructs a `Predictor` from a `SavedModel` on disk.
 
   Args:
@@ -125,7 +115,6 @@ def from_saved_model(export_dir,
       `SignatureDef`. Defaults to `DEFAULT_TAGS`.
     graph: Optional. The Tensorflow `graph` in which prediction should be
       done.
-    config: `ConfigProto` proto used to configure the session.
 
   Returns:
     An initialized `Predictor`.
@@ -139,5 +128,4 @@ def from_saved_model(export_dir,
       signature_def_key=signature_def_key,
       signature_def=signature_def,
       tags=tags,
-      graph=graph,
-      config=config)
+      graph=graph)
diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py
index a2ef1dc3af..578d9424b2 100644
--- a/tensorflow/contrib/predictor/predictor_factories_test.py
+++ b/tensorflow/contrib/predictor/predictor_factories_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.contrib.predictor import predictor_factories
 from tensorflow.contrib.predictor import testing_common
-from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import test
 
 MODEL_DIR_NAME = 'contrib/predictor/test_export_dir'
@@ -42,11 +41,6 @@ class PredictorFactoriesTest(test.TestCase):
     """Test loading from_saved_model with tags."""
     predictor_factories.from_saved_model(self._export_dir, tags='serve')
 
-  def testFromSavedModelWithSessionConfig(self):
-    """Test loading from_saved_model with session config."""
-    predictor_factories.from_saved_model(
-        self._export_dir, config=config_pb2.ConfigProto())
-
   def testFromSavedModelWithBadTags(self):
     """Test that loading fails for bad tags."""
     bad_tags_regex = ('.*? could not be found in SavedModel')
@@ -59,13 +53,6 @@ class PredictorFactoriesTest(test.TestCase):
     predictor_factories.from_contrib_estimator(
         estimator, input_fn, output_alternative_key='sum')
 
-  def testFromContribEstimatorWithSessionConfig(self):
-    estimator = testing_common.get_arithmetic_estimator(core=False)
-    input_fn = testing_common.get_arithmetic_input_fn(core=False)
-    predictor_factories.from_contrib_estimator(
-        estimator, input_fn, output_alternative_key='sum',
-        config=config_pb2.ConfigProto())
-
   def testFromContribEstimatorWithCoreEstimatorRaises(self):
     estimator = testing_common.get_arithmetic_estimator(core=True)
     input_fn = testing_common.get_arithmetic_input_fn(core=True)
@@ -77,12 +64,6 @@ class PredictorFactoriesTest(test.TestCase):
     input_fn = testing_common.get_arithmetic_input_fn(core=True)
     predictor_factories.from_estimator(estimator, input_fn)
 
-  def testFromCoreEstimatorWithSessionConfig(self):
-    estimator = testing_common.get_arithmetic_estimator(core=True)
-    input_fn = testing_common.get_arithmetic_input_fn(core=True)
-    predictor_factories.from_estimator(
-        estimator, input_fn, config=config_pb2.ConfigProto())
-
   def testFromCoreEstimatorWithContribEstimatorRaises(self):
     estimator = testing_common.get_arithmetic_estimator(core=False)
     input_fn = testing_common.get_arithmetic_input_fn(core=False)
diff --git a/tensorflow/contrib/predictor/saved_model_predictor.py b/tensorflow/contrib/predictor/saved_model_predictor.py
index 95da6d04ed..0dbca0f813 100644
--- a/tensorflow/contrib/predictor/saved_model_predictor.py
+++ b/tensorflow/contrib/predictor/saved_model_predictor.py
@@ -121,8 +121,7 @@ class SavedModelPredictor(predictor.Predictor):
                input_names=None,
                output_names=None,
                tags=None,
-               graph=None,
-               config=None):
+               graph=None):
     """Initialize a `CoreEstimatorPredictor`.
 
     Args:
@@ -143,7 +142,6 @@ class SavedModelPredictor(predictor.Predictor):
         the correct `SignatureDef`. Defaults to `DEFAULT_TAGS`.
       graph: Optional. The Tensorflow `graph` in which prediction should be
         done.
-      config: `ConfigProto` proto used to configure the session.
     Raises:
       ValueError: If more than one of signature_def_key OR signature_def OR
         (input_names AND output_names) is specified.
@@ -154,7 +152,7 @@ class SavedModelPredictor(predictor.Predictor):
     self._graph = graph or ops.Graph()
 
     with self._graph.as_default():
-      self._session = session.Session(config=config)
+      self._session = session.Session()
       loader.load(self._session, tags.split(','), export_dir)
 
     if input_names is None:
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index 27a933c0f9..c83623ec94 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -6,7 +6,7 @@ inference. The details of the transformation implemented in this package is
 described here [1].
 
 This is done using the
-[fake quantization op](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization).
+[fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization).
 
 Literature has shown that fixed point networks provide comparable performance to
 floating point networks [2]. This is achieved by modeling the quantization
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index 3d0308aaf3..94fc12ca81 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -26,6 +26,7 @@ import time
 import numpy as np
 
 from tensorflow.contrib.framework.python.ops import variables as variables_lib
+from tensorflow.contrib.metrics.python.ops import metric_ops
 from tensorflow.contrib.slim.python.slim import evaluation
 from tensorflow.contrib.training.python.training import evaluation as evaluation_lib
 from tensorflow.core.protobuf import saver_pb2
@@ -36,7 +37,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import flags
 from tensorflow.python.platform import gfile
@@ -89,8 +89,8 @@ class EvaluationTest(test.TestCase):
     self._predictions, self._scale = TestModel(self._inputs)
 
   def testFinalOpsOnEvaluationLoop(self):
-    value_op, update_op = metrics.accuracy(
-        labels=self._labels, predictions=self._predictions)
+    value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
+                                                        self._labels)
     init_op = control_flow_ops.group(variables.global_variables_initializer(),
                                      variables.local_variables_initializer())
     # Create checkpoint and log directories:
@@ -136,10 +136,9 @@ class EvaluationTest(test.TestCase):
     self.assertTrue(obj.hook_was_run)
 
   def _create_names_to_metrics(self, predictions, labels):
-    accuracy0, update_op0 = metrics.accuracy(
-        labels=labels, predictions=predictions)
-    accuracy1, update_op1 = metrics.accuracy(
-        labels=labels, predictions=predictions + 1)
+    accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy1, update_op1 = metric_ops.streaming_accuracy(predictions + 1,
+                                                          labels)
 
     names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1}
     names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1}
@@ -199,8 +198,8 @@ class EvaluationTest(test.TestCase):
     predictions_limited = input.limit_epochs(self._predictions, num_epochs=1)
     labels_limited = input.limit_epochs(self._labels, num_epochs=1)
 
-    value_op, update_op = metrics.accuracy(
-        labels=labels_limited, predictions=predictions_limited)
+    value_op, update_op = metric_ops.streaming_accuracy(
+        predictions_limited, labels_limited)
 
     init_op = control_flow_ops.group(variables.global_variables_initializer(),
                                      variables.local_variables_initializer())
@@ -261,8 +260,8 @@ class SingleEvaluationTest(test.TestCase):
     self._prepareCheckpoint(checkpoint_path)
 
     # Next, determine the metric to evaluate:
-    value_op, update_op = metrics.accuracy(
-        labels=self._labels, predictions=self._predictions)
+    value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
+                                                        self._labels)
 
     # Run the evaluation and verify the results:
     accuracy_value = evaluation.evaluate_once(
@@ -277,8 +276,8 @@ class SingleEvaluationTest(test.TestCase):
     self._prepareCheckpoint(checkpoint_path)
 
     # Next, determine the metric to evaluate:
-    value_op, update_op = metrics.accuracy(
-        labels=self._labels, predictions=self._predictions)
+    value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
+                                                        self._labels)
 
     dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir')
     dumping_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False)
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index d22b80ac88..99ced53e11 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -21,7 +21,6 @@ from @{tf.summary.merge_all} to @{tf.summary.FileWriter}.
 
 To use with eager execution enabled, write your code as follows:
 
-```python
 global_step = tf.train.get_or_create_global_step()
 summary_writer = tf.contrib.summary.create_file_writer(
     train_dir, flush_millis=10000)
@@ -31,11 +30,9 @@ with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
   tf.contrib.summary.scalar("loss", my_loss)
   # In this case every call to tf.contrib.summary.scalar will generate a record
   # ...
-```
 
 To use it with graph execution, write your code as follows:
 
-```python
 global_step = tf.train.get_or_create_global_step()
 summary_writer = tf.contrib.summary.create_file_writer(
     train_dir, flush_millis=10000)
@@ -56,7 +53,7 @@ with tf.Session(...) as sess:
   while not_done_training:
     sess.run([train_op, tf.contrib.summary.all_summary_ops()])
     # ...
-```
+
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index d8236a0a6f..e893e1d1c8 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -21,10 +21,10 @@ import numpy as np
 
 from tensorflow.contrib import losses
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
+from tensorflow.contrib.metrics.python.ops import metric_ops
 
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
 
 INFERENCE_PROB_NAME = prediction_key.PredictionKey.PROBABILITIES
@@ -38,13 +38,12 @@ def _top_k_generator(k):
     targets = math_ops.to_int32(targets)
     if targets.get_shape().ndims > 1:
       targets = array_ops.squeeze(targets, axis=[1])
-    return metrics.mean(nn.in_top_k(probabilities, targets, k))
+    return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k))
   return _top_k
 
 
 def _accuracy(predictions, targets, weights=None):
-  return metrics.accuracy(
-      labels=targets, predictions=predictions, weights=weights)
+  return metric_ops.streaming_accuracy(predictions, targets, weights=weights)
 
 
 def _r2(probabilities, targets, weights=None):
@@ -54,7 +53,7 @@ def _r2(probabilities, targets, weights=None):
   squares_residuals = math_ops.reduce_sum(
       math_ops.square(targets - probabilities), 0)
   score = 1 - math_ops.reduce_sum(squares_residuals / squares_total)
-  return metrics.mean(score, weights=weights)
+  return metric_ops.streaming_mean(score, weights=weights)
 
 
 def _squeeze_and_onehot(targets, depth):
@@ -63,7 +62,7 @@ def _squeeze_and_onehot(targets, depth):
 
 
 def _sigmoid_entropy(probabilities, targets, weights=None):
-  return metrics.mean(
+  return metric_ops.streaming_mean(
       losses.sigmoid_cross_entropy(probabilities,
                                    _squeeze_and_onehot(
                                        targets,
@@ -72,7 +71,7 @@ def _sigmoid_entropy(probabilities, targets, weights=None):
 
 
 def _softmax_entropy(probabilities, targets, weights=None):
-  return metrics.mean(
+  return metric_ops.streaming_mean(
       losses.sparse_softmax_cross_entropy(probabilities,
                                           math_ops.to_int32(targets)),
       weights=weights)
@@ -83,7 +82,7 @@ def _predictions(predictions, unused_targets, **unused_kwargs):
 
 
 def _class_log_loss(probabilities, targets, weights=None):
-  return metrics.mean(
+  return metric_ops.streaming_mean(
       losses.log_loss(probabilities,
                       _squeeze_and_onehot(targets,
                                           array_ops.shape(probabilities)[1])),
@@ -91,36 +90,34 @@ def _class_log_loss(probabilities, targets, weights=None):
 
 
 def _precision(predictions, targets, weights=None):
-  return metrics.precision(
-      labels=targets, predictions=predictions, weights=weights)
+  return metric_ops.streaming_precision(predictions, targets, weights=weights)
 
 
 def _precision_at_thresholds(predictions, targets, weights=None):
-  return metrics.precision_at_thresholds(
-      labels=targets,
-      predictions=array_ops.slice(predictions, [0, 1], [-1, 1]),
-      thresholds=np.arange(0, 1, 0.01, dtype=np.float32),
+  return metric_ops.streaming_precision_at_thresholds(
+      array_ops.slice(predictions, [0, 1], [-1, 1]),
+      targets,
+      np.arange(
+          0, 1, 0.01, dtype=np.float32),
       weights=weights)
 
 
 def _recall(predictions, targets, weights=None):
-  return metrics.recall(
-      labels=targets, predictions=predictions, weights=weights)
+  return metric_ops.streaming_recall(predictions, targets, weights=weights)
 
 
 def _recall_at_thresholds(predictions, targets, weights=None):
-  return metrics.recall_at_thresholds(
-      labels=targets,
-      predictions=array_ops.slice(predictions, [0, 1], [-1, 1]),
-      thresholds=np.arange(0, 1, 0.01, dtype=np.float32),
+  return metric_ops.streaming_recall_at_thresholds(
+      array_ops.slice(predictions, [0, 1], [-1, 1]),
+      targets,
+      np.arange(
+          0, 1, 0.01, dtype=np.float32),
       weights=weights)
 
 
 def _auc(probs, targets, weights=None):
-  return metrics.auc(
-      labels=targets,
-      predictions=array_ops.slice(probs, [0, 1], [-1, 1]),
-      weights=weights)
+  return metric_ops.streaming_auc(array_ops.slice(probs, [0, 1], [-1, 1]),
+                                  targets, weights=weights)
 
 
 _EVAL_METRICS = {
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index 6f62cd11a9..7a35a70bbe 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -295,7 +295,7 @@ def get_epoch_variable():
 
 
 # A simple container to hold the training variables for a single tree.
-class TreeVariables(object):
+class TreeTrainingVariables(object):
   """Stores tf.Variables for training a single random tree.
 
   Uses tf.get_variable to get tree-specific names so that this can be used
@@ -303,7 +303,7 @@ class TreeVariables(object):
   then relies on restoring that model to evaluate).
   """
 
-  def __init__(self, params, tree_num, training, tree_config='', tree_stat=''):
+  def __init__(self, params, tree_num, training):
     if (not hasattr(params, 'params_proto') or
         not isinstance(params.params_proto,
                        _params_proto.TensorForestParams)):
@@ -315,28 +315,27 @@ class TreeVariables(object):
       # TODO(gilberth): Manually shard this to be able to fit it on
       # multiple machines.
       self.stats = stats_ops.fertile_stats_variable(
-          params, tree_stat, self.get_tree_name('stats', tree_num))
+          params, '', self.get_tree_name('stats', tree_num))
     self.tree = model_ops.tree_variable(
-        params, tree_config, self.stats, self.get_tree_name('tree', tree_num))
+        params, '', self.stats, self.get_tree_name('tree', tree_num))
 
   def get_tree_name(self, name, num):
     return '{0}-{1}'.format(name, num)
 
 
-class ForestVariables(object):
+class ForestTrainingVariables(object):
   """A container for a forests training data, consisting of multiple trees.
 
-  Instantiates a TreeVariables object for each tree. We override the
+  Instantiates a TreeTrainingVariables object for each tree. We override the
   __getitem__ and __setitem__ function so that usage looks like this:
 
-    forest_variables = ForestVariables(params)
+    forest_variables = ForestTrainingVariables(params)
 
     ... forest_variables.tree ...
   """
 
   def __init__(self, params, device_assigner, training=True,
-               tree_variables_class=TreeVariables,
-               tree_configs=None, tree_stats=None):
+               tree_variables_class=TreeTrainingVariables):
     self.variables = []
     # Set up some scalar variables to run through the device assigner, then
     # we can use those to colocate everything related to a tree.
@@ -348,13 +347,7 @@ class ForestVariables(object):
 
     for i in range(params.num_trees):
       with ops.device(self.device_dummies[i].device):
-        kwargs = {}
-        if tree_configs is not None:
-          kwargs.update(dict(tree_config=tree_configs[i]))
-        if tree_stats is not None:
-          kwargs.update(dict(tree_stat=tree_stats[i]))
-        self.variables.append(tree_variables_class(
-            params, i, training, **kwargs))
+        self.variables.append(tree_variables_class(params, i, training))
 
   def __setitem__(self, t, val):
     self.variables[t] = val
@@ -368,11 +361,9 @@ class RandomForestGraphs(object):
 
   def __init__(self,
                params,
-               tree_configs=None,
-               tree_stats=None,
                device_assigner=None,
                variables=None,
-               tree_variables_class=TreeVariables,
+               tree_variables_class=TreeTrainingVariables,
                tree_graphs=None,
                training=True):
     self.params = params
@@ -380,10 +371,9 @@ class RandomForestGraphs(object):
         device_assigner or framework_variables.VariableDeviceChooser())
     logging.info('Constructing forest with params = ')
     logging.info(self.params.__dict__)
-    self.variables = variables or ForestVariables(
+    self.variables = variables or ForestTrainingVariables(
         self.params, device_assigner=self.device_assigner, training=training,
-        tree_variables_class=tree_variables_class,
-        tree_configs=tree_configs, tree_stats=tree_stats)
+        tree_variables_class=tree_variables_class)
     tree_graph_class = tree_graphs or RandomTreeGraphs
     self.trees = [
         tree_graph_class(self.variables[i], self.params, i)
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
index 1c9c81827e..bbe627b157 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
@@ -18,14 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from google.protobuf.json_format import ParseDict
-from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2 as _tree_proto
 from tensorflow.contrib.tensor_forest.python import tensor_forest
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import resources
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
@@ -114,47 +110,6 @@ class TensorForestTest(test_util.TensorFlowTestCase):
     self.assertTrue(isinstance(paths, ops.Tensor))
     self.assertTrue(isinstance(var, ops.Tensor))
 
-  def testInfrenceFromRestoredModel(self):
-    input_data = [[-1., 0.], [-1., 2.],  # node 1
-                  [1., 0.], [1., -2.]]  # node 2
-    expected_prediction = [[0.0, 1.0], [0.0, 1.0],
-                           [0.0, 1.0], [0.0, 1.0]]
-    hparams = tensor_forest.ForestHParams(
-        num_classes=2,
-        num_features=2,
-        num_trees=1,
-        max_nodes=1000,
-        split_after_samples=25).fill()
-    tree_weight = {'decisionTree':
-                       {'nodes':
-                        [{'binaryNode':
-                          {'rightChildId': 2,
-                           'leftChildId': 1,
-                           'inequalityLeftChildTest':
-                           {'featureId': {'id': '0'},
-                            'threshold': {'floatValue': 0}}}},
-                         {'leaf': {'vector':
-                                   {'value': [{'floatValue': 0.0},
-                                              {'floatValue': 1.0}]}},
-                          'nodeId': 1},
-                         {'leaf': {'vector':
-                                   {'value': [{'floatValue': 0.0},
-                                              {'floatValue': 1.0}]}},
-                          'nodeId': 2}]}}
-    restored_tree_param = ParseDict(tree_weight,
-                                    _tree_proto.Model()).SerializeToString()
-    graph_builder = tensor_forest.RandomForestGraphs(hparams,
-                                                     [restored_tree_param])
-    probs, paths, var = graph_builder.inference_graph(input_data)
-    self.assertTrue(isinstance(probs, ops.Tensor))
-    self.assertTrue(isinstance(paths, ops.Tensor))
-    self.assertTrue(isinstance(var, ops.Tensor))
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      resources.initialize_resources(resources.shared_resources()).run()
-      self.assertEquals(probs.eval().shape, (4, 2))
-      self.assertEquals(probs.eval().tolist(), expected_prediction)
-
   def testTrainingConstructionClassificationSparse(self):
     input_data = sparse_tensor.SparseTensor(
         indices=[[0, 0], [0, 3], [1, 0], [1, 7], [2, 1], [3, 9]],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index da4dd5a14c..b7b26cfb1c 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -91,11 +91,8 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
       if (!subgraph_node_ids.count(edge->src()->id()) &&
           !edge->src()->IsSource() && !edge->IsControlEdge()) {
         incoming_edges->insert(edge);
-        VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name()
-                << " Y, ";
       } else {
-        VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name()
-                << " N, ";
+        VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, ";
       }
     }
   }
@@ -109,12 +106,10 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
     for (const tensorflow::Edge* edge : node->out_edges()) {
       if (!subgraph_node_ids.count(edge->dst()->id()) &&
           !edge->dst()->IsSink() && !edge->IsControlEdge()) {
-        VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name()
-                << " Y, ";
+        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, ";
         outgoing_edges->insert(edge);
       } else {
-        VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name()
-                << " N, ";
+        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, ";
       }
     }
   }
@@ -186,27 +181,29 @@ struct ConvertGraphParams {
 static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
   GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids,
                            &p->subgraph_incoming_edges);
-
-  std::set<std::pair<int, int>> unique_tensors;
-  // Add only unique input source nodes. If output of an outside node is shared
-  // between multiple nodes inside the engine, only one edge should be created
   for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) {
-    unique_tensors.insert({edge->src()->id(), edge->src_output()});
+    p->subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
+  }
+  auto output_name_to_index_map = BuildTensorNameMap(p->output_names);
+  std::set<std::pair<int, int>> subgraph_outputs_set;
+  // Collect outputs referenced from output_names
+  for (int node_id : p->subgraph_node_ids) {
+    tensorflow::Node* node = p->graph.FindNodeId(node_id);
+    if (output_name_to_index_map.count(node->name())) {
+      for (int index : output_name_to_index_map.at(node->name())) {
+        subgraph_outputs_set.insert({node_id, index});
+      }
+    }
   }
-  p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(),
-                            unique_tensors.end());
   GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids,
                            &p->subgraph_outgoing_edges);
-  unique_tensors.clear();
-  // Similar to above, if multiple ouside nodes are sharing the output of an
-  // internal node only one output port should be created and shared between
-  // outputs
   for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) {
-    unique_tensors.insert({edge->src()->id(), edge->src_output()});
+    subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
   }
-  p->subgraph_outputs.reserve(unique_tensors.size());
+  p->subgraph_outputs.reserve(subgraph_outputs_set.size());
   p->subgraph_outputs.insert(p->subgraph_outputs.begin(),
-                             unique_tensors.begin(), unique_tensors.end());
+                             subgraph_outputs_set.begin(),
+                             subgraph_outputs_set.end());
   return tensorflow::Status::OK();
 }
 
@@ -228,6 +225,7 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
   for (auto in_edge :
        params->subgraph_incoming_edges) {  // loop over incoming edges and
                                            // attach them to calib node
+    // tensorflow::Node* src_node = in_edge->src();
     auto src_output = in_edge->src_output();
     auto dst_node = in_edge->dst();
     auto dst_input = in_edge->dst_input();
@@ -259,24 +257,19 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
   for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) {
     subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i});
   }
-  std::set<std::pair<int, int>> unique_tensors;
   for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
-    if (unique_tensors.count(old_src)) continue;
-    unique_tensors.insert(old_src);
     int new_src_output = subgraph_edge_to_input_map.at(old_src);
     params->graph.AddEdge(edge->src(), edge->src_output(), trt_node,
                           new_src_output);
-    VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output()
-            << " -> " << trt_node->name() << ":" << new_src_output;
     params->graph.RemoveEdge(edge);
   }
-  if (VLOG_IS_ON(2)) {
-    VLOG(2) << "new edge count: " << trt_node->in_edges().size();
-    for (const tensorflow::Edge* edge : trt_node->in_edges()) {
-      VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
-    }
+
+  VLOG(2) << "new wiring edges: " << trt_node->in_edges().size();
+  for (const tensorflow::Edge* edge : trt_node->in_edges()) {
+    VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
   }
+
   TF_RETURN_IF_ERROR(status);
 
   // Re-map outgoing edges to use the new TRT node instead of the orig subgraph
@@ -290,8 +283,6 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
     TF_RETURN_IF_ERROR(params->graph.UpdateEdge(
         trt_node, new_src_output, edge->dst(), edge->dst_input()));
-    VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> "
-            << edge->dst()->name() << ":" << edge->dst_input();
   }
   // Remove the original subgraph
   for (int node_id : params->subgraph_node_ids) {
@@ -326,12 +317,9 @@ tensorflow::Status ConvertCalibGraphToInferGraph(
       tensorflow::GraphConstructorOptions(), graph_def, &graph));
   //  get calib nodes
   std::vector<tensorflow::Node*> calib_nodes;
-  std::vector<tensorflow::Node*> topo_order;
-  tensorflow::GetPostOrder(graph, &topo_order);
-  for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) {
-    auto node = *rit;
+  for (auto node : graph.op_nodes()) {
     if (node->type_string() == "TRTCalibOp") {
-      VLOG(1) << "Found Calib Node " << node->name();
+      VLOG(1) << "Found Calib Node";
       calib_nodes.push_back(node);
     }
   }
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 4e4d295538..96e0700862 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -362,11 +362,10 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
       break;
     }
     case tensorflow::DataType::DT_HALF: {
-      Reorder2(
-          {k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
-          istrides,
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
-          ostrides);
+      Reorder2({k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
+               istrides, static_cast<Eigen::half*>(
+                             const_cast<void*>(oweights->GetValues())),
+               ostrides);
       break;
     }
     default:
@@ -1180,9 +1179,9 @@ tensorflow::Status BinaryTensorOpTensor(
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented(
-        "binary op: " + node_def.op() +
-        " not supported at: " + node_def.name());
+    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+                                             " not supported at: " +
+                                             node_def.name());
 
   nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
       *const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -2139,7 +2138,9 @@ void Converter::register_op_converters() {
 }
 
 }  // namespace
-
+tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) {
+  return tensorflow::errors::Unimplemented("Not implemented yet");
+}
 tensorflow::Status ConvertCalibrationNodeToEngineNode(
     tensorflow::Graph& graph, tensorflow::Node* c_node) {
   const auto ndef = c_node->def();
@@ -2163,23 +2164,9 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   for (auto n : graph.op_nodes()) {
     node_maps.insert({n->name(), n});
   }
-  std::set<int> subgraph_ids;
-  for (const auto internal_node : segment_nodes) {
-    subgraph_ids.insert(node_maps.at(internal_node)->id());
-  }
-  if (VLOG_IS_ON(2)) {
-    string node_names = StrCat(c_node->name(), " segment nodes= ");
-
-    for (const auto& node_name : segment_nodes) {
-      StrAppend(&node_names, node_name, ", ");
-    }
-    VLOG(2) << node_names;
-  }
-
   VLOG(1) << "Output Nodes:";
   std::vector<tensorflow::DataType> out_types;
   std::vector<const tensorflow::Edge*> out_edges;
-
   for (auto& i : output_nodes) {
     auto node_port = tensorflow::str_util::Split(i, ":");
     VLOG(1) << " " << i << " in graph " << node_maps.count(i);
@@ -2199,24 +2186,18 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
         out_types.push_back(out_node->output_type(0));
       }
       for (auto out_edge : out_node->out_edges()) {
-        if (subgraph_ids.count(out_edge->dst()->id()))
-          continue;  // skip internal edges;
         if (out_edge->src_output() == port) {
           out_edges.push_back(out_edge);
-          VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":"
-                  << out_edge->src_output() << " -> " << out_edge->dst()->name()
-                  << ":" << out_edge->dst_input();
+          break;
         }
       }
     } else {
       LOG(WARNING) << " couldn't find output node " << out_node_name;
     }
   }
-  if (VLOG_IS_ON(1)) {
-    VLOG(1) << c_node->name() << " Input Nodes:";
-    for (auto& i : input_names) {
-      VLOG(1) << " Input " << i << " in graph " << node_maps.count(i);
-    }
+  VLOG(1) << "Input Nodes:";
+  for (auto& i : input_names) {
+    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
   }
   auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance();
   auto resmgr = trt_rm->getManager("TRTCalibOps");
@@ -2250,24 +2231,14 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   calib_res->builder_ = nullptr;
   tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
   std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  income_edges.resize(c_node->num_inputs());
   for (const auto in_edge : c_node->in_edges()) {
     auto src = in_edge->src();
     int dest_port = in_edge->dst_input();
-    VLOG(1) << "Incoming connection " << src->name() << ":"
-            << in_edge->src_output() << " -> " << c_node->name() << ":"
-            << dest_port;
-    income_edges.at(dest_port) = {src->name(), in_edge->src_output(),
-                                  c_node->input_type(dest_port)};
+    income_edges.emplace_back(src->name(), in_edge->src_output(),
+                              c_node->input_type(dest_port));
   }
   tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
       income_edges);
-  if (VLOG_IS_ON(2)) {
-    for (const auto& inp : input_list) {
-      VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " "
-              << tensorflow::DataTypeString(inp.data_type);
-    }
-  }
   op_builder.Input(input_list);
   tensorflow::NodeDef engine_node;
   const char* engine_plan_data = static_cast<const char*>(engine_plan->data());
@@ -2284,26 +2255,13 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   }
   auto trt_engine_node = graph.AddNode(engine_node, &status);
   TF_RETURN_IF_ERROR(status);
-  std::map<string, int> port_map;
-  for (size_t t = 0; t < output_nodes.size(); t++) {
-    port_map.insert({output_nodes.at(t), t});
-  }
-  for (auto& i : out_edges) {
-    string s(i->src()->name());
-    if (i->src_output()) StrAppend(&s, ":", i->src_output());
-    int out_port = port_map.at(s);
-    VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port
-            << " -> " << i->dst()->name() << ":" << i->dst_input();
-    TF_RETURN_IF_ERROR(
-        graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input()));
-  }
-  for (const auto ed : trt_engine_node->in_edges()) {
-    VLOG(1) << "In Edge  " << ed->src()->name() << ":" << ed->src_output()
-            << " -> " << ed->dst()->name() << ":" << ed->dst_input();
-  }
-  for (const auto ed : trt_engine_node->out_edges()) {
-    VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output()
-            << " -> " << ed->dst()->name() << ":" << ed->dst_input();
+  for (size_t i = 0; i < out_edges.size(); i++) {
+    VLOG(1) << "Connecting trt_engine_node output " << i << " with "
+            << out_edges.at(i)->dst()->name() << " port "
+            << out_edges.at(i)->dst_input();
+    TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i,
+                                        out_edges.at(i)->dst(),
+                                        out_edges.at(i)->dst_input()));
   }
   VLOG(1) << "Segment nodes:";
   for (auto& i : segment_nodes) {
@@ -2374,7 +2332,6 @@ tensorflow::Status ConvertSubgraph(
     std::vector<string>* output_names,
     std::vector<tensorflow::DataType>* output_dtypes,
     const string& engine_name) {
-  std::set<string> added_tensors;
   for (const std::pair<int, int>& input : s.input_inds) {
     VLOG(2) << "parsing input. Node id= " << input.first;
     int node_id = input.first;
@@ -2417,6 +2374,7 @@ tensorflow::Status ConvertSubgraph(
 
     auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
+    input_dtypes->push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
     auto type_status = ConvertDType(tf_dtype, &dtype);
@@ -2452,10 +2410,8 @@ tensorflow::Status ConvertSubgraph(
     if (output_idx != 0) {
       input_tensor_name = StrCat(node_name, ":", output_idx);
     }
-    if (added_tensors.count(input_tensor_name)) continue;
-    added_tensors.insert(input_tensor_name);
+
     input_names->push_back(input_tensor_name);
-    input_dtypes->push_back(tf_dtype);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
         input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
 
@@ -2479,7 +2435,6 @@ tensorflow::Status ConvertSubgraph(
 
   // Gather output metadata
   int trt_engine_op_output_idx = 0;
-  added_tensors.clear();
   for (const std::pair<int, int>& output : s.output_inds) {
     int node_id = output.first;
     int output_idx = output.second;
@@ -2496,8 +2451,6 @@ tensorflow::Status ConvertSubgraph(
     if (output_idx != 0)
       tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
     VLOG(2) << "Output tensor name: " << tensor_name;
-    if (added_tensors.count(tensor_name)) continue;
-    added_tensors.insert(tensor_name);
     output_names->push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index d879170b68..2e472a2805 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -166,21 +166,11 @@ def StreamingFilesDataset(files,
     return remote_iterator.get_next()
 
   def MapFn(unused_input):
-    if isinstance(source_dataset.output_types, dtypes.DType):
-      output_types = [source_dataset.output_types]
-    elif isinstance(source_dataset.output_types, (list, tuple)):
-      output_types = source_dataset.output_types
-    else:
-      raise ValueError('source dataset has invalid output types')
-    remote_calls = functional_ops.remote_call(
+    return functional_ops.remote_call(
         args=[source_handle],
-        Tout=output_types,
+        Tout=[dtypes.string],
         f=LoadingFunc,
-        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
-    if len(remote_calls) == 1:
-      return remote_calls[0]
-    else:
-      return remote_calls
+        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0]
 
   with ops.device('/job:%s' % worker_job):
     output_dataset = dataset_ops.Dataset.range(2).repeat().map(
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
index b58d05eac5..918cf0ed8e 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
@@ -26,8 +26,6 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
@@ -164,30 +162,6 @@ class DatasetsTest(test.TestCase):
 
     self.assertEqual(set(all_contents), set(retrieved_values))
 
-  def testArbitraryReaderFuncFromDatasetGenerator(self):
-
-    def my_generator():
-      yield (1, [1] * 10)
-
-    def gen_dataset(dummy):
-      return dataset_ops.Dataset.from_generator(
-          my_generator, (dtypes.int64, dtypes.int64),
-          (tensor_shape.TensorShape([]), tensor_shape.TensorShape([10])))
-
-    dataset = datasets.StreamingFilesDataset(
-        dataset_ops.Dataset.range(10), filetype=gen_dataset)
-
-    iterator = dataset.make_initializable_iterator()
-    self._sess.run(iterator.initializer)
-    get_next = iterator.get_next()
-
-    retrieved_values = self._sess.run(get_next)
-
-    self.assertIsInstance(retrieved_values, (list, tuple))
-    self.assertEqual(len(retrieved_values), 2)
-    self.assertEqual(retrieved_values[0], 1)
-    self.assertItemsEqual(retrieved_values[1], [1] * 10)
-
   def testUnexpectedFiletypeString(self):
     with self.assertRaises(ValueError):
       datasets.StreamingFilesDataset(
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index b1c224a345..d89633199d 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -699,9 +699,7 @@ cc_library(
     srcs = ["platform/stacktrace_handler.cc"],
     hdrs = ["platform/stacktrace_handler.h"],
     deps = [
-        ":abi",
         ":lib_platform",
-        ":stacktrace",
     ],
 )
 
@@ -3091,8 +3089,6 @@ cc_library(
         # we now need at least "str_util".
         ":lib",
         ":lib_platform",
-        ":stacktrace_handler",
-        ":test_lite",
         "//tensorflow/core/platform/default/build_config:test_lite_main",
     ],
     alwayslink = 1,
@@ -3573,10 +3569,7 @@ tf_cc_tests_gpu(
 tf_cc_test_mkl(
     name = "mkl_runtime_tests",
     size = "small",
-    srcs = [
-        "common_runtime/mkl_cpu_allocator_test.cc",
-        "common_runtime/mkl_threadpool_device_test.cc",
-    ],
+    srcs = ["common_runtime/mkl_cpu_allocator_test.cc"],
     linkstatic = 1,
     deps = [
         ":core",
diff --git a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
index 985f09312f..cbe76de415 100644
--- a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
@@ -4,10 +4,6 @@ op {
   description: <<END
 if < 0, `scale * features` otherwise.
 
-To be used together with
-`initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
-For correct dropout, use `tf.contrib.nn.alpha_dropout`.
-
 See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt
deleted file mode 100644
index 6e13d0d049..0000000000
--- a/tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt
+++ /dev/null
@@ -1,48 +0,0 @@
-op {
-  graph_op_name: "StringSplitV2"
-  in_arg {
-    name: "input"
-    description: <<END
-`1-D` string `Tensor`, the strings to split.
-END
-  }
-  in_arg {
-    name: "sep"
-    description: <<END
-`0-D` string `Tensor`, the delimiter character.
-END
-  }
-  attr {
-    name: "maxsplit"
-    description: <<END
-An `int`. If `maxsplit > 0`, limit of the split of the result.
-END
-  }
-  summary: "Split elements of `source` based on `sep` into a `SparseTensor`."
-  description: <<END
-Let N be the size of source (typically N will be the batch size). Split each
-element of `source` based on `sep` and return a `SparseTensor`
-containing the split tokens. Empty tokens are ignored.
-
-For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
-then the output will be
-```
-st.indices = [0, 0;
-              0, 1;
-              1, 0;
-              1, 1;
-              1, 2]
-st.shape = [2, 3]
-st.values = ['hello', 'world', 'a', 'b', 'c']
-```
-
-If `sep` is given, consecutive delimiters are not grouped together and are
-deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
-sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
-string, consecutive whitespace are regarded as a single separator, and the
-result will contain no empty strings at the startor end if the string has
-leading or trailing whitespace.
-
-Note that the above mentioned behavior matches python's str.split.
-END
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt
deleted file mode 100644
index 0e8576fb01..0000000000
--- a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StringSplitV2"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 9cda17867b..8f2a419756 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -86,7 +86,7 @@ BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
   return &(chunks_[h]);
 }
 
-bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
+bool BFCAllocator::Extend(size_t rounded_bytes) {
   size_t available_bytes = memory_limit_ - total_region_allocated_bytes_;
   // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
   available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
@@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
 
   // Try allocating.
   size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
-  void* mem_addr = suballocator_->Alloc(alignment, bytes);
+  void* mem_addr = suballocator_->Alloc(32, bytes);
   if (mem_addr == nullptr && !started_backpedal_) {
     // Only backpedal once.
     started_backpedal_ = true;
@@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
     while (mem_addr == nullptr) {
       bytes = RoundedBytes(bytes * kBackpedalFactor);
       if (bytes < rounded_bytes) break;
-      mem_addr = suballocator_->Alloc(alignment, bytes);
+      mem_addr = suballocator_->Alloc(32, bytes);
     }
   }
 
@@ -261,7 +261,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
   }
 
   // Try to extend
-  if (Extend(unused_alignment, rounded_bytes)) {
+  if (Extend(rounded_bytes)) {
     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
     if (ptr != nullptr) {
       return ptr;
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 52aedb1e9c..ba5a3eea3a 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -305,8 +305,7 @@ class BFCAllocator : public VisitableAllocator {
   // Try to add a new memory region that can satisfy an allocation of
   // 'rounded_bytes' bytes.  Returns true on success and false on
   // failure.
-  bool Extend(size_t alignment, size_t rounded_bytes)
-      EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   // Returns a pointer to an underlying allocated chunk of size
   // 'rounded_bytes'.
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 9028e6298c..c21a1ea9f2 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -102,25 +102,9 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
         EXPECT_EQ(2, shape.dim(0).size());
         EXPECT_EQ(1, shape.dim(1).size());
         if (node->name() == y->name()) {
-#ifdef INTEL_MKL
-          // if MKL is used, it goes through various additional 
-          // graph rewrite pass. In TF, everytime a graph pass 
-          // happens, "constant" nodes are allocated
-          // and deallocated. Each allocation calls the
-          // (FindChunkPtr of BFCAllocator),
-          // which increments the value of AllocationId. 
-          // Thus AllocationId becomes more than 3 and 4 if 
-          // MKL is used. Now they are 9 and 10 for MKL. 
-          EXPECT_EQ(19, cm->AllocationId(node, 0));
-#else
           EXPECT_EQ(21, cm->AllocationId(node, 0));
-#endif 
         } else {
-#ifdef INTEL_MKL
-          EXPECT_EQ(20, cm->AllocationId(node, 0));
-#else
           EXPECT_EQ(22, cm->AllocationId(node, 0));
-#endif 
         }
       }
       EXPECT_LE(0, cm->MaxExecutionTime(node));
diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
deleted file mode 100644
index 5d583a8360..0000000000
--- a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef INTEL_MKL
-
-#include "tensorflow/core/common_runtime/threadpool_device.h"
-
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/public/session_options.h"
-
-namespace tensorflow {
-
-#ifdef _OPENMP
-TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) {
-  SessionOptions options;
-  unsetenv("OMP_NUM_THREADS");
-
-  ThreadPoolDevice* tp = new ThreadPoolDevice(
-      options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator());
-
-  const int ht = port::NumHyperthreadsPerCore();
-  EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht);
-}
-
-TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) {
-  SessionOptions options;
-  setenv("OMP_NUM_THREADS", "314", 1);
-
-  ThreadPoolDevice* tp = new ThreadPoolDevice(
-      options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator());
-
-  EXPECT_EQ(omp_get_max_threads(), 314);
-}
-#endif  // _OPENMP
-
-}  // namespace tensorflow
-
-#endif  // INTEL_MKL
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index a5d31b75c7..21912236d0 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -16,10 +16,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/process_util.h"
 
 #ifdef INTEL_MKL
-#ifdef _OPENMP
 #include <omp.h>
-#endif  // _OPENMP
-#endif  // INTEL_MKL
+#endif
 #include <string.h>
 
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -59,10 +57,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
   // MKL library executes ops in parallel using OMP threads
   // Set inter_op conservatively to avoid thread oversubscription that could
   // lead to severe perf degradations and OMP resource exhaustion
-  int mkl_intra_op = 1;
-#ifdef _OPENMP
-  mkl_intra_op = omp_get_max_threads();
-#endif  // _OPENMP
+  const int mkl_intra_op = omp_get_max_threads();
   CHECK_GE(mkl_intra_op, 1);
   const int32 mkl_inter_op = std::max(
       (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
@@ -73,7 +68,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
 #else
   // Default to using the number of cores available in the process.
   return port::NumSchedulableCPUs();
-#endif  // INTEL_MKL
+#endif
 }
 
 thread::ThreadPool* NewThreadPoolFromSessionOptions(
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 74a87215e1..f7a07fe503 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -31,11 +31,7 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 
 #ifdef INTEL_MKL
-#ifdef _OPENMP
-#include <omp.h>
-#endif
 #include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
-#include "tensorflow/core/platform/cpu_info.h"
 #endif
 
 namespace tensorflow {
@@ -47,26 +43,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
     : LocalDevice(options, Device::BuildDeviceAttributes(
                                name, DEVICE_CPU, memory_limit, locality)),
       allocator_(allocator),
-      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {
-#ifdef INTEL_MKL
-#ifdef _OPENMP
-  const char* user_omp_threads = getenv("OMP_NUM_THREADS");
-  if (user_omp_threads == nullptr) {
-    // OMP_NUM_THREADS controls MKL's intra-op parallelization
-    // Default to available physical cores
-    const int mkl_intra_op = port::NumSchedulableCPUs();
-    const int ht = port::NumHyperthreadsPerCore();
-    omp_set_num_threads((mkl_intra_op + ht - 1) / ht);
-  } else {
-    uint64 user_val = 0;
-    if (strings::safe_strtou64(user_omp_threads, &user_val)) {
-      // Superflous but triggers OpenMP loading
-      omp_set_num_threads(user_val);
-    }
-  }
-#endif  // _OPENMP
-#endif  // INTEL_MKL
-}
+      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {}
 
 ThreadPoolDevice::~ThreadPoolDevice() {}
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
index 770a0fcf14..1cea1b1462 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
@@ -147,9 +147,7 @@ MasterService::Stub::Stub(
 }
 
 MasterService::AsyncService::AsyncService() {
-  int method_len = sizeof(grpcMasterService_method_names) / 
-                    sizeof(grpcMasterService_method_names[0]);
-  for (int i = 0; i < method_len; ++i) {
+  for (int i = 0; i < 10; ++i) {
     AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcMasterService_method_names[i],
         ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
index a8508d2d4f..89f83f9f24 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_session.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
@@ -51,14 +50,9 @@ Status TestCluster::MakeTestCluster(const SessionOptions& options, int n,
   }
 
   for (int i = 0; i < n; ++i) {
-    string server_file =
-        strings::StrCat(testing::TensorFlowSrcRoot(),
-                        "/core/distributed_runtime/rpc/grpc_testlib_server");
-    if (!options.env->FileExists(server_file).ok()) {
-      return errors::Internal("Could not find grpc_testlib_server");
-    }
     const std::vector<string> argv(
-        {server_file,
+        {strings::StrCat(testing::TensorFlowSrcRoot(),
+                         "/core/distributed_runtime/rpc/grpc_testlib_server"),
          /* see grpc_testlib_server.cc for flags */
          tf_jobs, "--tf_job=localhost", strings::StrCat("--tf_task=", i),
          strings::StrCat("--num_cpus=", num_cpus),
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2bb4d32d57..2c87156dca 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -67,8 +67,13 @@ struct AllocatorStats {
 // device memory.
 class Allocator {
  public:
+#ifdef EIGEN_VECTORIZE_AVX512
   // Align to 64 byte boundary.
   static constexpr size_t kAllocatorAlignment = 64;
+#else
+  // Align to 32 byte boundary.
+  static constexpr size_t kAllocatorAlignment = 32;
+#endif
 
   virtual ~Allocator();
 
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 4b56d807df..3d7920a6e2 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op_gen_lib.h"
 
-#include <algorithm>
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto
index 10072724d2..eb689ec1e6 100644
--- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto
+++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "RemoteFusedGraphExecuteInfoProto";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
+// The go_package option is added externally.
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 80e168df97..b613effd18 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) {
 
 // On the alignment.
 //
-// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte
+// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte
 // alignment. Tensor::tensor/flat/vec/matrix methods requires the
 // buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually,
-// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires
-// the caller to ensure its result is aligned if the caller intends
-// to use those methods. In this test case, we simply make sure each
-// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576.  576 % 64 = 0.
+// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure
+// its result is aligned if the caller intends to use those methods.
+// In this test case, we simply make sure each slice is 32-byte
+// aligned: sizeof(float) * 4 * 34 = 544.  544 % 32 = 0.
 TEST(Tensor, Slice_Basic) {
   Tensor saved;
   {  // General
-    Tensor x(DT_FLOAT, TensorShape({10, 4, 36}));
+    Tensor x(DT_FLOAT, TensorShape({10, 4, 34}));
     // Fills in known values.
     for (int i = 0; i < 10; ++i) {
       x.Slice(i, i + 1).flat<float>().setConstant(i * 1.f);
     }
     // A simple slice along dim0.
     Tensor y = x.Slice(4, 8);
-    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36})));
+    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34})));
     auto tx = x.tensor<float, 3>();
     auto ty = y.tensor<float, 3>();
     for (int i = 0; i < 4; ++i) {
       for (int j = 0; j < 4; ++j) {
-        for (int k = 0; k < 36; ++k) {
+        for (int k = 0; k < 34; ++k) {
           EXPECT_EQ(ty(i, j, k), 4.0 + i);
           EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k));
         }
@@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) {
     auto tz = z.tensor<float, 3>();
     EXPECT_EQ(1, z.dim_size(0));
     for (int j = 0; j < 4; ++j) {
-      for (int k = 0; k < 36; ++k) {
+      for (int k = 0; k < 34; ++k) {
         EXPECT_EQ(tz(0, j, k), 6.0);
       }
     }
@@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) {
     EXPECT_EQ(1, saved.dim_size(0));
     auto tsaved = saved.tensor<float, 3>();
     for (int j = 0; j < 4; ++j) {
-      for (int k = 0; k < 36; ++k) {
+      for (int k = 0; k < 34; ++k) {
         EXPECT_EQ(tsaved(0, j, k), 6.0);
       }
     }
   }
   {  // Empty
-    Tensor x(DT_FLOAT, TensorShape({10, 0, 36}));
+    Tensor x(DT_FLOAT, TensorShape({10, 0, 34}));
     x.flat<float>().setRandom();
     Tensor y = x.Slice(4, 8);
-    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36})));
+    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34})));
   }
 
   {
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index b9667998d6..72a13d4da7 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // If Op has been specifically assigned to a non-CPU device, then No.
     if (!n->assigned_device_name().empty() &&
-        !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) {
       result = false;
       reason = "Op has been assigned a runtime device that is not CPU.";
     }
 
     // If user has specifically assigned this op to a non-CPU device, then No.
     if (!n->def().device().empty() &&
-        !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) {
       result = false;
       reason = "User has assigned a device that is not CPU.";
     }
@@ -2865,9 +2865,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return false;
   }
 
-  // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized
-  // path. The unoptimized path is slow. Thus we dont rewrite the node
-  // and use default Eigen. But for depth_radius=2, MKL DNN optimized
+  // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized
+  // path. The unoptimized path is slow. Thus we don't rewrite the node
+  // and use default Eigen. But for depth_radius=2, MKL DNN optimized
   // path is taken, i.e., eigen node is rewritten by MKl DNN node.
   static bool LrnRewrite(const Node* n) {
     CHECK_NOTNULL(n);
@@ -2876,13 +2876,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     CHECK_EQ(GetNodeAttr(n->def(), "depth_radius", &depth_radius).ok(), true);
 
     // if the depth_radius of LRN is not 2, don't rewrite the node by MKL DNN
-    // and use eigen node instead
+    // and use eigen node instead 
     if (depth_radius == 2) {
       return true;
     }
     VLOG(1) << "LrnRewrite: The model sets depth_radius as not 2 which"
             << "case is not optimized by Intel MKL, thus using Eigen op"
-            << "for LRN ";
+            << "for LRN " ; 
 
     return false;
   }
@@ -3015,35 +3015,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                                 std::vector<NodeBuilder::NodeOut>* ws_tensors,
                                 bool* are_ws_tensors_added);
 
-  // Helper function used by FixMklMetaDataEdges. Fixes the metadata edge
-  // pointed by 'e_metadata' corresponding to the data edge 'e_data' in graph
-  // 'g'. Returns true is fixup was done; otherwise, it returns false.
-  bool FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g,
-    const Edge* e_data, const Edge* e_metadata);
-
-  // Are the input Mkl metadata edges for node 'n' in graph 'g' correctly
-  // connected? If not, then fix them. This is needed because a graph may have
-  // some input Mkl metadata edges incorrectly setup after node merge and
-  // rewrite passes. This could happen because GetReversePostOrder function may
-  // not provide topologically sorted order if a graph contains cycles. The
-  // function returns true if at least one Mkl metadata edge for node 'n' was
-  // fixed. Otherwise, it returns false.
-  //
-  // Example:
-  //
-  // X = MklConv2D(_, _, _)
-  // Y = MklConv2DWithBias(_, _, _, _, _, _)
-  // Z = MklAdd(X, Y, DummyMklTensor, Y:1)
-  //
-  // For a graph such as shown above, note that 3rd argument of MklAdd contains
-  // DummyMklTensor. Actually, it should be getting the Mkl metadata from
-  // MklConv2D op (specifically, X:2). This incorrect plumbing could be possible
-  // (although rare) if the Mkl NodeMerge + NodeRewrite passes visit Z before X
-  // (possible if X, Y, Z are part of a loop.) This function fixes the Mkl
-  // metadata edges only - it does not rewrite nodes nor does it modify the Mkl
-  // data edges (1st and 2nd arguments of MklAdd).
-  bool FixMklMetaDataEdges(std::unique_ptr<Graph>* g, Node* n);
-
   // Functions specific to operators to copy attributes
   // We need operator-specific function to copy attributes because the framework
   // does not provide any generic function for it.
@@ -4271,92 +4242,6 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-//              Post-rewrite Mkl metadata fixup pass
-///////////////////////////////////////////////////////////////////////////////
-bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g,
-    const Edge* e_data, const Edge* e_metadata) {
-  if (g == nullptr || e_data == nullptr || e_metadata == nullptr) {
-    return false;
-  }
-
-  Node* n_data = e_data->src();
-  int n_data_op_slot = e_data->src_output();
-  int n_metadata_op_slot = GetTensorMetaDataIndex(n_data_op_slot,
-                                                  n_data->num_outputs());
-
-  // If the source of meta edge is a constant node (producing dummy Mkl metadata
-  // tensor), then we will need to fix.
-  if (IsConstant(e_metadata->src())) {
-    Node* e_metadata_dst = e_metadata->dst();
-    int e_metadata_in_slot = e_metadata->dst_input();
-    CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot,
-                  e_metadata_dst, e_metadata_in_slot));
-
-    (*g)->RemoveEdge(e_metadata);
-    return true;
-  }
-
-  return false;
-}
-
-bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr<Graph>* g,
-    Node* n) {
-  bool result = false;
-
-  // If graph node is not Mkl node, then return.
-  DataType T = DT_INVALID;
-  if (!GetNodeAttr(n->def(), "T", &T).ok() ||
-      !mkl_op_registry::IsMklOp(n->type_string(), T)) {
-    return result;
-  }
-
-  // If it is Mkl node, then check if the input edges to this node that carry
-  // Mkl metadata are linked up correctly with the source node.
-
-  // For Mkl nodes, we generate twice the number of input tensors (n for Mkl
-  // data tensors + n for Mkl metadata tensors). We need to check for correct
-  // connection of n metadata tensors only.
-  int num_data_inputs = n->num_inputs() / 2;
-  for (int idx = 0; idx < num_data_inputs; idx++) {
-    // Get the edge connecting input slot with index (idx).
-    const Edge* e = nullptr;
-    TF_CHECK_OK(n->input_edge(idx, &e));
-
-    // If e is control edge, then skip.
-    if (e->IsControlEdge()) {
-      continue;
-    }
-
-    // Check that the source node for edge 'e' is Mkl node. If it is not an Mkl
-    // node, then we don't need to do anything.
-    Node* e_src = e->src();
-    if (GetNodeAttr(e_src->def(), "T", &T).ok() &&
-        mkl_op_registry::IsMklOp(e_src->type_string(), T)) {
-      // Source node for edge 'e' is Mkl node.
-      // Destination node and destination input slot of e is node 'n' and 'idx'
-      // resp.
-      CHECK_EQ(e->dst(), n);
-      CHECK_EQ(e->dst_input(), idx);
-
-      // Let's get edge that carries Mkl metadata corresponding to Mkl data edge
-      // 'e'. For that, let's first get the input slot of 'n' where the meta
-      // edge will feed the value.
-      int e_meta_in_slot = GetTensorMetaDataIndex(e->dst_input(),
-                                                  n->num_inputs());
-      const Edge* e_meta = nullptr;
-      TF_CHECK_OK(n->input_edge(e_meta_in_slot, &e_meta));
-
-      // Let's check if we need to fix this meta edge.
-      if (FixMklMetaDataEdgeIfNeeded(g, e, e_meta)) {
-        result = true;
-      }
-    }
-  }
-
-  return result;
-}
-
-///////////////////////////////////////////////////////////////////////////////
 //              Run function for the pass
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -4422,25 +4307,6 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
 
   DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g);
 
-  order.clear();
-  GetReversePostOrder(**g, &order);  // This will give us topological sort.
-  for (Node* n : order) {
-    // If node is not an op or it cannot run on CPU device, then skip.
-    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
-      continue;
-    }
-    if (FixMklMetaDataEdges(g, n)) {
-      string node_name = n->name();
-      string op_name = n->type_string();
-
-      VLOG(1) << "MklLayoutRewritePass: fixed metadata edges for node "
-              << node_name << " with op " << op_name;
-      result = true;
-    }
-  }
-  DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite+Fixup)",
-            &**g);
-
   return result;
 }
 
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 7645b4a7f0..029cdcf94a 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -3519,37 +3519,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
 }
 
 /////////////////////////////////////////////////////////////////////
-//         Post-rewrite fixup pass test
-
-TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_UINT8 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_UINT8 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'E' op: '_MklAdd'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A', 'D', 'D']}");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Const);E(_MklAdd);"
-            "M(_MklInput);N(_MklInput)|A->C;A->E:1;B->C:1;C->E;C:2->E:2;"
-            "D->E:3;M->C:2;N->C:3");
-}
-
-/////////////////////////////////////////////////////////////////////
 
 static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
   testing::StopTiming();
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 0c02876ac5..6749a7c571 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -610,6 +610,7 @@ class SymbolicShapeRefiner {
     }
   };
 
+  // Compute the shape of the tensors output by node 'node' at output port
   // 'port_index' as the union of shape1 and shape2.
   ShapeHandle OutputAsUnion(const NodeDef* node, int port_index,
                             ShapeHandle shape1, ShapeHandle shape2) {
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 8ca726df0b..1b18087cdf 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -679,7 +679,6 @@ cc_library(
     deps = [
         ":constant_folding",
         ":graph_optimizer",
-        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
@@ -781,6 +780,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:scoped_allocator_ops_op_lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index 03e36a7b9c..4dde7ed1b4 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -201,7 +200,8 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
         }
       }
       if (optimizable) {
-        VLOG(1) << "Optimizing fused batch norm node " << node.DebugString();
+        VLOG(2) << "Optimizing fused batch norm node " << node.DebugString()
+                << std::endl;
         AddBatchNormNodes(optimized_graph, node);
         continue;
       }
diff --git a/tensorflow/core/kernels/as_string_op.cc b/tensorflow/core/kernels/as_string_op.cc
index a7757d1361..66c4aff3e3 100644
--- a/tensorflow/core/kernels/as_string_op.cc
+++ b/tensorflow/core/kernels/as_string_op.cc
@@ -73,7 +73,6 @@ class AsStringOp : public OpKernel {
     }
     switch (dtype) {
       case DT_INT8:
-      case DT_INT16:
       case DT_INT32:
         strings::Appendf(&format_, "d");
         break;
@@ -130,7 +129,6 @@ class AsStringOp : public OpKernel {
       ENCODE_TYPE(DT_FLOAT, float, format_);
       ENCODE_TYPE(DT_DOUBLE, double, format_);
       ENCODE_TYPE(DT_INT8, int8, format_);
-      ENCODE_TYPE(DT_INT16, int16, format_);
       case (DT_BOOL): {
         const auto& input_flat = input_tensor->flat<bool>();
         for (int i = 0; i < input_flat.size(); ++i) {
diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
index 49b90e855b..14d889e8e3 100644
--- a/tensorflow/core/kernels/cwise_op_clip.cc
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -33,41 +33,52 @@ class ClipOp : public OpKernel {
     const Tensor& in0 = ctx->input(0);
     const Tensor& in1 = ctx->input(1);
     const Tensor& in2 = ctx->input(2);
-    OP_REQUIRES(ctx, (in0.shape() == in1.shape() ||
-                      TensorShapeUtils::IsScalar(in1.shape())) &&
-                     (in0.shape() == in2.shape() ||
-                      TensorShapeUtils::IsScalar(in2.shape())),
-                errors::InvalidArgument(
-                    "clip_value_min and clip_value_max must be either of "
-                    "the same shape as input, or a scalar. ",
-                    "input shape: ", in0.shape().DebugString(),
-                    "clip_value_min shape: ", in1.shape().DebugString(),
-                    "clip_value_max shape: ", in2.shape().DebugString()));
-
-    Tensor* out = nullptr;
-    OP_REQUIRES_OK(
-        ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out));
-    if (out->NumElements() == 0) return;  // Nothing to do for empty output
 
     auto in0_flat = in0.flat<T>();
     auto in1_flat = in1.flat<T>();
     auto in2_flat = in2.flat<T>();
-    auto out_flat = out->flat<T>();
     const Device& d = ctx->eigen_device<Device>();
 
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out));
+    auto out_flat = out->flat<T>();
     if (in1.shape() == in2.shape()) {
       if (in0.shape() == in1.shape()) {
         functor::TernaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                             out_flat);
       } else {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
         functor::UnaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                           out_flat);
       }
     } else {
       if (in0.shape() == in1.shape()) {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
         functor::BinaryLeftClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                                out_flat);
       } else {
+        OP_REQUIRES(ctx,
+                    (in0.shape() == in2.shape() &&
+                     TensorShapeUtils::IsScalar(in1.shape())),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
         functor::BinaryRightClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                                 out_flat);
       }
diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
index 17a85d9773..9a3b2303a3 100644
--- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
@@ -57,7 +57,6 @@ struct DenseUpdate<GPUDevice, T, SUB> {
   template struct functor::DenseUpdate<GPUDevice, T, ADD>; \
   template struct functor::DenseUpdate<GPUDevice, T, SUB>;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
-TF_CALL_int32(DEFINE_GPU_KERNELS);
 TF_CALL_int64(DEFINE_GPU_KERNELS);
 #undef DEFINE_GPU_KERNELS
 
diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc
index 5cd8e04927..e6fefe643b 100644
--- a/tensorflow/core/kernels/gather_functor.cc
+++ b/tensorflow/core/kernels/gather_functor.cc
@@ -37,7 +37,6 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX(T, int32); \
   DECLARE_GPU_SPECS_INDEX(T, int64)
 
-TF_CALL_int64(DECLARE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_complex64(DECLARE_GPU_SPECS);
 TF_CALL_complex128(DECLARE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc
index 4563fc6353..39b6924d74 100644
--- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc
@@ -31,7 +31,6 @@ typedef Eigen::GpuDevice GPUDevice;
   DEFINE_GPU_SPECS_INDEX(T, int32); \
   DEFINE_GPU_SPECS_INDEX(T, int64);
 
-TF_CALL_int64(DEFINE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
 TF_CALL_complex64(DEFINE_GPU_SPECS);
 TF_CALL_complex128(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index 4e53291b7f..7e5a9e1ec5 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -228,8 +228,6 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX(T, int32); \
   DECLARE_GPU_SPECS_INDEX(T, int64)
 
-TF_CALL_int32(DECLARE_GPU_SPECS);
-TF_CALL_int64(DECLARE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_complex64(DECLARE_GPU_SPECS);
 TF_CALL_complex128(DECLARE_GPU_SPECS);
@@ -241,8 +239,6 @@ TF_CALL_complex128(DECLARE_GPU_SPECS);
 // Registration of the GPU implementations.
 #define REGISTER_GATHER_ND_GPU(type) REGISTER_GATHER_ND_ALL_INDICES(GPU, type)
 
-TF_CALL_int32(REGISTER_GATHER_ND_GPU);
-TF_CALL_int64(REGISTER_GATHER_ND_GPU);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_ND_GPU);
 TF_CALL_complex64(REGISTER_GATHER_ND_GPU);
 TF_CALL_complex128(REGISTER_GATHER_ND_GPU);
diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
index da8d2e9e3c..b03efc684f 100644
--- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
@@ -119,8 +119,6 @@ struct GatherNdSlice<GPUDevice, T, Index, IXDIM> {
   DEFINE_GPU_SPECS_INDEX(T, int32); \
   DEFINE_GPU_SPECS_INDEX(T, int64);
 
-TF_CALL_int32(DEFINE_GPU_SPECS);
-TF_CALL_int64(DEFINE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
 TF_CALL_complex64(DEFINE_GPU_SPECS);
 TF_CALL_complex128(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 094504d6b9..ef332ebee3 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -153,7 +153,6 @@ TF_CALL_uint64(REGISTER_GATHER_CPU);
 // Registration of the GPU implementations.
 #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type)
 
-TF_CALL_int64(REGISTER_GATHER_GPU);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU);
 TF_CALL_complex64(REGISTER_GATHER_GPU);
 TF_CALL_complex128(REGISTER_GATHER_GPU);
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 31d1b949ef..5eeb23d810 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -14,7 +14,6 @@ limitations under the License.
 
 #include <limits>
 #include <vector>
-#include <unordered_map>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -591,8 +590,8 @@ class MklConcatOp : public OpKernel {
       const int N = input_tensors.size();
 
       // Get Tensor shapes.
-      std::vector<MklDnnShape> mkl_input_shapes(N);
-      GetMklShapeList(context, "values", &mkl_input_shapes);
+      std::vector<MklDnnShape> input_shapes(N);
+      GetMklShapeList(context, "values", &input_shapes);
 
       const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM)
                                             ? MklGetInput(context, 0)
@@ -611,14 +610,19 @@ class MklConcatOp : public OpKernel {
       int i = 0;
       bool invoke_eigen = false;
       bool are_all_mkl_inputs = true, are_all_tf_inputs = true;
-      const TensorShape expected_shape = mkl_input_shapes[0].IsMklTensor()
-                                       ? mkl_input_shapes[0].GetTfShape()
-                                       : input_tensors[0].shape();
+      const TensorShape expected_shape = input_shapes[0].IsMklTensor()
+                                             ? input_shapes[0].GetTfShape()
+                                             : input_tensors[0].shape();
       size_t expected_dims = expected_shape.dims();
 
       if (concat_dim < 0) concat_dim = expected_dims + concat_dim;
 
-      for (auto& s : mkl_input_shapes) {
+      for (auto& s : input_shapes) {
+        if (s == expected_shape) {
+          ++i;
+          continue;
+        }
+
         TensorShape s_shape =
             s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
         size_t s_dims = s_shape.dims();
@@ -661,14 +665,21 @@ class MklConcatOp : public OpKernel {
 
       // Call Eigen library
       if (invoke_eigen) {
-        CallEigenVersion(context, input_tensors, mkl_input_shapes);
+        TensorShapeList tf_input_shapes;
+        i = 0;
+        for (auto& s : input_shapes) {
+          TensorShape s_shape =
+              s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
+          tf_input_shapes.push_back(s_shape);
+          ++i;
+        }
+        CallEigenVersion(context, input_tensors, tf_input_shapes);
         return;
       }
 
       memory::dims dst_dims;
-
       if (are_all_mkl_inputs)
-        dst_dims = TFShapeToMklDnnDims(mkl_input_shapes[0].GetTfShape());
+        dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape());
       else
         // When all the inputs are in Tensorflow format, we don't know
         // what is the input data format. In that case, we just use
@@ -678,61 +689,26 @@ class MklConcatOp : public OpKernel {
       std::vector<memory::primitive_desc> srcs_pd;
       std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine));
       int64 dst_concat_dim_size = 0;
-
-      bool isMklReorderNeeded = false;
-      memory::format mkl_common_format = memory::format::any;
-      if (are_all_mkl_inputs) {
-        mkl_common_format =
-            FindMklCommonFormat(mkl_input_shapes, concat_dim,
-               &isMklReorderNeeded, &dst_concat_dim_size);
-
-        if (!isMklReorderNeeded) {
-          // All MKL tensors have a same format. Reorder is not needed.
-          for (int k = 0; k < N; k++) {
-            if (input_tensors[k].NumElements() == 0)
-              continue;
-
-            auto src_md = mkl_input_shapes[k].GetMklLayout();
-            srcs[k].SetUsrMem(src_md, &input_tensors[k]);
-            auto src_mpd = srcs[k].GetUsrMemPrimDesc();
-            srcs_pd.push_back(src_mpd);
-          }
-        } else {
-          // MKL tensors have different formats.
-          // Reorder them to most common format.
-          for (int k = 0; k < N; k++) {
-            if (input_tensors[k].NumElements() == 0)
-              continue;
-
-            auto src_dims = TFShapeToMklDnnDims(
-                mkl_input_shapes[k].GetTfShape());
-            auto src_md = mkl_input_shapes[k].GetMklLayout();
-            srcs[k].SetUsrMem(src_md, &input_tensors[k]);
-
-            if (src_md.data.format != mkl_common_format)
-              src_md = memory::desc(src_dims, MklDnnType<T>(),
-                           mkl_common_format);
-
-            srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine));
-          }
-        }
-      } else {  // All TF inputs
-        for (int k = 0; k < N; k++) {
-          if (input_tensors[k].NumElements() == 0)
-            continue;
-
-          memory::dims src_dims = TFShapeToMklDnnDims(input_tensors[k].shape());
-          dst_concat_dim_size += src_dims[concat_dim];
-
-          // It does not matter what data format to be used (NHWC versus NCHW).
-          // We just need to ensure that output uses same data format as inputs.
-          auto src_md =
-              memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
-
-          srcs[k].SetUsrMem(src_md, &input_tensors[k]);
-          auto src_mpd = srcs[k].GetUsrMemPrimDesc();
-          srcs_pd.push_back(src_mpd);
-        }
+      for (int k = 0; k < N; k++) {
+        bool is_mkl_tensor = input_shapes[k].IsMklTensor();
+        memory::dims src_dims;
+
+        // Same comment as dst_dims for src_dims.
+        src_dims = (is_mkl_tensor)
+                       ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape())
+                       : TFShapeToMklDnnDims(input_tensors[k].shape());
+
+        dst_concat_dim_size += src_dims[concat_dim];
+        auto src_md =
+            is_mkl_tensor ? input_shapes[k].GetMklLayout() :
+                          // It does not matter what data format we use here
+                          // (NHWC or NCHW). We just need to ensure that output
+                          // of Concat uses same data format as input.
+                memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
+
+        srcs[k].SetUsrMem(src_md, &input_tensors[k]);
+        auto src_mpd = srcs[k].GetUsrMemPrimDesc();
+        srcs_pd.push_back(src_mpd);
       }
       dst_dims[concat_dim] = dst_concat_dim_size;
 
@@ -742,33 +718,25 @@ class MklConcatOp : public OpKernel {
       if (are_all_mkl_inputs) {
         // Since we are passing a specific format for destination,
         // we need to have dst_dims in MklDnn order (NCHW).
-        auto orig_tf_format = mkl_input_shapes[0].GetTfDataFormat();
+        auto orig_tf_format = input_shapes[0].GetTfDataFormat();
         dst_dims_in_nchw = MklDnnDimsInNCHW(
             dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format));
-        // Set the output format same as the most common format of inputs
-        // to avoid layout conversions.
+        // Set the output format to that of the first input to avoid layout
+        // conversions.
+        // TODO: The inputs may not all share one format; see if we can make
+        // this choice in a better way.
         dst_md = memory::desc(
-            dst_dims_in_nchw, MklDnnType<T>(), mkl_common_format);
+            dst_dims_in_nchw, MklDnnType<T>(),
+            (memory::format)input_shapes[0].GetMklLayout().data.format);
       } else {
-        // All inputs are TF tensors.
-        // Set the output format same as input format (nchw).
+        // Again, format does not matter here. We just need to make it same as
+        // input format.
         dst_md = memory::desc(dst_dims, MklDnnType<T>(), memory::format::nchw);
       }
 
       std::vector<primitive::at> inputs;
-      std::vector<primitive> net;
-      if (isMklReorderNeeded) {
-        for (int k = 0; k < input_tensors.size(); k++) {
-          if (input_tensors[k].NumElements() > 0) {
-            srcs[k].CheckReorderToOpMem(srcs_pd[k], &net);
-          }
-        }
-      }
-      for (int k = 0; k < input_tensors.size(); k++) {
-        if (input_tensors[k].NumElements() > 0) {
-          inputs.push_back(srcs[k].GetOpMem());
-        }
-      }
+      for (int k = 0; k < input_tensors.size(); k++)
+        inputs.push_back(srcs[k].GetOpMem());
 
       // If all inputs are in MKL format, then meaning of concat_dim needs to
       // change. Value of concat_dim is tied to input Tensorflow data format
@@ -777,8 +745,7 @@ class MklConcatOp : public OpKernel {
       // But ifinput tensors are in NHWC order, then semantics need to change.
       // E.g., if we are concatinating over Channel (dimension 3 for NHWC),
       // then since MklDnn order is NCHW, concat_dim needs to be 1.
-      if (are_all_mkl_inputs)
-         concat_dim = mkl_input_shapes[0].TfDimIdx(concat_dim);
+      if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim);
 
       auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd);
 
@@ -791,7 +758,7 @@ class MklConcatOp : public OpKernel {
         dnn_shape_dst.SetMklLayout(&dst_pd);
         dnn_shape_dst.SetElemType(MklDnnType<T>());
         dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw,
-                                  mkl_input_shapes[0].GetTfDataFormat());
+                                  input_shapes[0].GetTfDataFormat());
         tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T)));
       } else {
         dnn_shape_dst.SetMklTensor(false);
@@ -806,6 +773,7 @@ class MklConcatOp : public OpKernel {
       dst.SetUsrMem(dst_md, dst_tensor);
 
       auto concat_op = concat(concat_pd, inputs, dst.GetOpMem());
+      std::vector<primitive> net;
       net.push_back(concat_op);
       stream(stream::kind::eager).submit(net).wait();
     } catch (mkldnn::error& e) {
@@ -819,27 +787,15 @@ class MklConcatOp : public OpKernel {
   }
 
   void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
-                        const MklDnnShapeList& mkl_input_shapes) {
-    CHECK_EQ(values.size(), mkl_input_shapes.size());
+                        const TensorShapeList& input_shapes) {
+    CHECK_EQ(values.size(), input_shapes.size());
 
     std::vector<Tensor> converted_values;
-    TensorShapeList tf_input_shapes;
-    for (int i = 0; i < mkl_input_shapes.size(); i++) {
-      if (mkl_input_shapes[i].IsMklTensor()) {
-        // do conversion from MKL to TF
-        Tensor tmp_tensor =
-            ConvertMklToTF<T>(context, values[i], mkl_input_shapes[i]);
-        converted_values.push_back(tmp_tensor);
-        tf_input_shapes.push_back(mkl_input_shapes[i].GetTfShape());
-      } else {
-        // no conversion since it is TF tensor already
-        converted_values.push_back(values[i]);
-        tf_input_shapes.push_back(values[i].shape());
-      }
-    }
+    for (int i = 0; i < input_shapes.size(); i++)
+      converted_values.push_back(values[i]);
 
     // Call Eigen concat.
-    eigen_concat_op_.Compute(context, converted_values, tf_input_shapes);
+    eigen_concat_op_.Compute(context, converted_values, input_shapes);
 
     // Set output Mkl tensor for this op.
     MklDnnShape dnn_shape_output;
@@ -856,55 +812,6 @@ class MklConcatOp : public OpKernel {
         output_tensor->flat<uint8>().data(),
         output_tensor->flat<uint8>().size() * sizeof(uint8));
   }
-
-  // This method finds the most commom format accross all MKL inputs
-  // Inputs:
-  //   1. input_shapes: shapes of input (MKL) tensors.
-  //   2. concat_dim: concat dimension.
-  // Outputs:
-  //   1. is_reorder_needed is set to true if inputs have difference formats
-  //      It is set to false otherwise.
-  //   2. concat_dim_size is the size of concat_dim.
-  // Return:
-  //   return the common MKL format.
-  memory::format FindMklCommonFormat(const MklDnnShapeList& input_shapes,
-      int concat_dim, bool* is_reorder_needed, int64* concat_dim_size) {
-    *is_reorder_needed = false;
-    *concat_dim_size = 0;
-    std::unordered_map<int, int> occurrence_map;
-    if (input_shapes.size() == 0)
-      return memory::format::any;
-
-    // Compute ocurrences of each format of all inputs.
-    for (int k=0; k <input_shapes.size(); k++) {
-      auto src_dims = TFShapeToMklDnnDims(input_shapes[k].GetTfShape());
-      *concat_dim_size += src_dims[concat_dim];
-      int fmt = static_cast<int>(
-          input_shapes[k].GetMklLayout().data.format);
-      occurrence_map[fmt] += 1;
-    }
-
-    if (occurrence_map.size() == 1) {
-       // this means that all inputs have a same format
-       // return it with is_reorder_needed set false.
-       return static_cast<memory::format>(
-           input_shapes[0].GetMklLayout().data.format);
-    }
-
-    // Input tensors have different formats. Thus, reorder is needed.
-    // We pick up the most common format to minimize the total
-    // number of input reorder.
-    memory::format commonest_format = memory::format::any;
-    int max_occurrence = 0;
-    *is_reorder_needed = true;
-    for (auto item : occurrence_map) {
-      if (item.second > max_occurrence) {
-        commonest_format = static_cast<memory::format>(item.first);
-        max_occurrence = item.second;
-      }
-    }
-    return commonest_format;
-  }
 };
 
 #endif
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index f857be6c32..c1da0ded1d 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -18,7 +18,6 @@ limitations under the License.
 // bias.
 
 #ifdef INTEL_MKL
-#ifdef INTEL_MKL_ML
 
 #define USE_EIGEN_TENSOR
 #define EIGEN_USE_THREADS
@@ -265,5 +264,4 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel {
 TF_CALL_float(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 } /* namespace tensorflow */
-#endif /* INTEL_MKL_ML */
 #endif /* INTEL_MKL */
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index c0dfed7d7d..279167aba2 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -199,15 +199,13 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
     CHECK_NOTNULL(pool_params);
     CHECK_NOTNULL(dnn_data_input);
     TensorShape input_tensor_shape = input_tensor.shape();
-    if (input_tensor.NumElements() != 0) {
-      memory::desc input_md =
+    memory::desc input_md =
         input_mkl_shape.IsMklTensor()
             ? input_mkl_shape.GetMklLayout()
             : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
                                                      this->data_format_tf_),
                            MklDnnType<T>(), this->data_format_mkldnn_);
-      dnn_data_input->SetUsrMem(input_md, &input_tensor);
-    }
+    dnn_data_input->SetUsrMem(input_md, &input_tensor);
     this->InitMklPoolParameters(context, pool_params, input_mkl_shape,
                                 input_tensor_shape);
   }
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index e1fc2ea128..43c5b29509 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -292,7 +292,6 @@ TF_CALL_string(REGISTER_SCATTER_ND_CPU);
   REGISTER_SCATTER_ND_UPDATE_GPU(type);   \
   REGISTER_SCATTER_ND_GPU(type);
 
-TF_CALL_int32(REGISTER_SCATTER_ND_ALL_GPU);
 // TODO(b/66916790): Support half types in ScatterNd.
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ALL_GPU);
 TF_CALL_complex64(REGISTER_SCATTER_ND_ALL_GPU);
@@ -307,8 +306,6 @@ TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU);
 #define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \
   REGISTER_SCATTER_ND_UPDATE(type, SYCL);
 
-TF_CALL_int32(REGISTER_SCATTER_ND_ADD_SUB_SYCL);
-TF_CALL_int32(REGISTER_SCATTER_ND_UPDATE_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL);
 #undef REGISTER_SCATTER_ND_ADD_SUB_SYCL
@@ -579,7 +576,6 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX(T, int32); \
   DECLARE_GPU_SPECS_INDEX(T, int64)
 
-TF_CALL_int32(DECLARE_GPU_SPECS);
 // TODO(b/66916790): Support half types in ScatterNd.
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_complex64(DECLARE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
index 08b657f4c3..a3c21edc15 100644
--- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
@@ -170,7 +170,6 @@ struct ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM> {
   DECLARE_GPU_SPECS_INDEX(T, int32); \
   DECLARE_GPU_SPECS_INDEX(T, int64)
 
-TF_CALL_int32(DECLARE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_complex64(DECLARE_GPU_SPECS);
 TF_CALL_complex128(DECLARE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
index 634f9ba887..bb0129fa6f 100644
--- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc
+++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
@@ -216,13 +216,8 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) {
 }
 
 TEST_F(ScopedAllocatorConcatOpTest, Reshape) {
-  MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2);
-
-  // The elements of the third parameter to ExecOp must be multiples of
-  // Allocator::kAllocatorAlignment in size.  If they are not, the backing
-  // tensor allocated by PrepOp will have too many elements and reshaping
-  // will fail.
-  ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}});
+  MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2);
+  ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}});
 }
 
 TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) {
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index d65692a552..7796bf3587 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -16,14 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 
-
-// This file requires the following include because it uses CudaAtomicMax:
-// #include "tensorflow/core/util/cuda_kernel_helper.h"
-
-// Unfortunately we can't add the #include, since it breaks compilation for
-// non-GPU targets. This only breaks in clang, because it's more strict for
-// template code and CudaAtomicMax is used in template context.
-
 // This file requires the following include because it uses CudaAtomicMax:
 // #include "tensorflow/core/util/cuda_kernel_helper.h"
 
@@ -138,4 +130,4 @@ struct Highest {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 866c5dcd52..a1f9667b78 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -1490,7 +1490,7 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
 
 #endif  // TENSORFLOW_USE_LIBXSMM
 
-// Here is an overview of the SparseMatMul code. Note that we assume that the
+// Here is an overview of the SparseMatMul code. Note that we assume that the
 // left matrix is sparse.
 //
 // The matrix "left" is divided into a grid with blocksize of (M, KL). Each
diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc
index 26ab72f12e..4c2b312c34 100644
--- a/tensorflow/core/kernels/string_split_op.cc
+++ b/tensorflow/core/kernels/string_split_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
@@ -44,63 +43,6 @@ std::vector<string> Split(const string& str, const string& delimiter,
   return char_vector;
 }
 
-std::vector<string> SplitV2(const string& str, StringPiece sep, int maxsplit) {
-  // This SplitV2 method matches the behavior of python's str.split:
-  //   If sep is given, consecutive delimiters are not grouped together
-  //   and are deemed to delimit empty strings (for example, '1,,2'.split(',')
-  //   returns ['1', '', '2']). The sep argument may consist of multiple
-  //   characters (for example, '1<>2<>3'.split('<>') returns ['1', '2', '3']).
-  //   Splitting an empty string with a specified separator returns [''].
-  //
-  //   If sep is not specified or is None, a different splitting algorithm is
-  //   applied: runs of consecutive whitespace are regarded as a single
-  //   separator, and the result will contain no empty strings at the start or
-  //   end if the string has leading or trailing whitespace. Consequently,
-  //   splitting an empty string or a string consisting of just whitespace
-  //   with a None separator returns [].
-
-  std::vector<string> result;
-
-  StringPiece text(str);
-  if (maxsplit == 0) {
-    result.emplace_back(std::string(text));
-    return result;
-  }
-
-  if (sep.empty()) {
-    StringPiece token;
-    // Remove leading whitespaces.
-    str_util::RemoveLeadingWhitespace(&text);
-    int split = 0;
-    while (str_util::ConsumeNonWhitespace(&text, &token)) {
-      result.emplace_back(std::string(token));
-      str_util::RemoveLeadingWhitespace(&text);
-      ++split;
-      if (maxsplit > 0 && split == maxsplit) {
-        result.emplace_back(std::string(text));
-        return result;
-      }
-    }
-    return result;
-  }
-  auto p = std::search(text.begin(), text.end(), sep.begin(), sep.end());
-  int split = 0;
-  while (p != text.end()) {
-    StringPiece token = text.substr(0, p - text.begin());
-    result.emplace_back(std::string(token));
-    text.remove_prefix(token.size());
-    text.remove_prefix(sep.size());
-    ++split;
-    if (maxsplit > 0 && split == maxsplit) {
-      result.emplace_back(std::string(text));
-      return result;
-    }
-    p = std::search(text.begin(), text.end(), sep.begin(), sep.end());
-  }
-  result.emplace_back(std::string(text));
-  return result;
-}
-
 }  // namespace
 
 class StringSplitOp : public OpKernel {
@@ -180,78 +122,6 @@ class StringSplitOp : public OpKernel {
   bool skip_empty_;
 };
 
-class StringSplitV2Op : public OpKernel {
- public:
-  explicit StringSplitV2Op(OpKernelConstruction* context)
-      : OpKernel(context), maxsplit_(-1) {
-    OP_REQUIRES_OK(context, context->GetAttr("maxsplit", &maxsplit_));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor* input_tensor;
-    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input_tensor->shape()),
-                errors::InvalidArgument("input must be a vector, got shape: ",
-                                        input_tensor->shape().DebugString()));
-
-    const auto input_vec = input_tensor->vec<string>();
-    const int64 batch_size = input_vec.dimension(0);
-
-    const Tensor* sep_tensor;
-    OP_REQUIRES_OK(ctx, ctx->input("sep", &sep_tensor));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sep_tensor->shape()),
-                errors::InvalidArgument("sep must be a scalar, got shape: ",
-                                        sep_tensor->shape().DebugString()));
-    const auto sep_vec = sep_tensor->flat<string>();
-    StringPiece sep(sep_vec(0));
-    std::vector<string> tokens;
-    // Guess that we'll be unpacking a handful of tokens per example.
-    static constexpr int kReserveSize = 4;
-    tokens.reserve(batch_size * kReserveSize);
-
-    int64 output_size = 0;
-    int64 max_num_entries = 0;
-    std::vector<int64> num_indices(batch_size);
-    for (int64 i = 0; i < batch_size; ++i) {
-      std::vector<string> parts = SplitV2(input_vec(i), sep, maxsplit_);
-      int64 n_entries = parts.size();
-      num_indices[i] = n_entries;
-      output_size += n_entries;
-      max_num_entries = std::max(max_num_entries, n_entries);
-      tokens.insert(tokens.end(), parts.begin(), parts.end());
-    }
-
-    Tensor* sp_indices_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({output_size, 2}),
-                                             &sp_indices_t));
-    Tensor* sp_tokens_t;
-    OP_REQUIRES_OK(
-        ctx, ctx->allocate_output(1, TensorShape({output_size}), &sp_tokens_t));
-    Tensor* sp_shape_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({2}), &sp_shape_t));
-
-    auto sp_indices = sp_indices_t->matrix<int64>();
-    auto sp_tokens = sp_tokens_t->vec<string>();
-    auto sp_shape = sp_shape_t->vec<int64>();
-    sp_shape(0) = batch_size;
-    sp_shape(1) = max_num_entries;
-    size_t c = 0;
-    for (size_t i = 0; i < batch_size; ++i) {
-      for (size_t j = 0; j < num_indices[i]; ++j) {
-        sp_indices(c, 0) = i;
-        sp_indices(c, 1) = j;
-        sp_tokens(c) = tokens[c];
-        ++c;
-      }
-    }
-  }
-
- private:
-  int maxsplit_;
-};
-
 REGISTER_KERNEL_BUILDER(Name("StringSplit").Device(DEVICE_CPU), StringSplitOp);
-REGISTER_KERNEL_BUILDER(Name("StringSplitV2").Device(DEVICE_CPU),
-                        StringSplitV2Op);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc
index 6e589c8d1c..6e4d100b04 100644
--- a/tensorflow/core/ops/candidate_sampling_ops.cc
+++ b/tensorflow/core/ops/candidate_sampling_ops.cc
@@ -145,15 +145,12 @@ REGISTER_OP("ComputeAccidentalHits")
       int64 num_true;
       TF_RETURN_IF_ERROR(c->GetAttr("num_true", &num_true));
 
-      // Validate true_classes, must be a matrix.
+      // Validate true_classes.
       ShapeHandle true_classes;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &true_classes));
       DimensionHandle unused;
       TF_RETURN_IF_ERROR(
           c->WithValue(c->Dim(true_classes, 1), num_true, &unused));
-      // Validate sampled_candidates, must be a vector.
-      ShapeHandle sampled_candidates;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sampled_candidates));
 
       // All three outputs are the same shape.
       ShapeHandle v = c->Vector(InferenceContext::kUnknownDim);
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 9dca5f53ce..15e0ca8af9 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -218,17 +218,7 @@ REGISTER_OP("MapAndBatchDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      // Use index from the end to retrieve the Input shapes,
-      // so that to avoid guessing the length of "other_arguments".
-      // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars.
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
-
-      return shape_inference::ScalarShape(c);
-    });
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("MapAndBatchDatasetV2")
     .Input("input_dataset: variant")
@@ -241,17 +231,7 @@ REGISTER_OP("MapAndBatchDatasetV2")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      // Use index from the end to retrieve the Input shapes,
-      // so that to avoid guessing the length of "other_arguments".
-      // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars.
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
-
-      return shape_inference::ScalarShape(c);
-    });
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("PrefetchDataset")
     .Input("input_dataset: variant")
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 87f4991134..d949e70c66 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -454,9 +454,7 @@ REGISTER_OP("DrawBoundingBoxes")
       DimensionHandle unused;
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused));
 
-      // The rank of the input image (rank = 4) has already been restricted
-      // above, and the output is of the same shape as the input.
-      return shape_inference::UnchangedShape(c);
+      return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     });
 
 // --------------------------------------------------------------------------
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index b3487122e2..1740fa152c 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1084,7 +1084,7 @@ REGISTER_OP("UnsortedSegmentProd")
     .Input("segment_ids: Tindices")
     .Input("num_segments: Tnumsegments")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: realnumbertype")
     .Attr("Tindices: {int32,int64}")
     .Attr("Tnumsegments: {int32,int64} = DT_INT32")
     .SetShapeFn(UnsortedSegmentReductionShapeFn);
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 41efa49ce3..fc60e807b9 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1453,7 +1453,6 @@ REGISTER_OP("QuantizedReluX")
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 4423062362..1d5c743a56 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -78,7 +78,7 @@ REGISTER_OP("ReduceJoin")
 REGISTER_OP("AsString")
     .Input("input: T")
     .Output("output: string")
-    .Attr("T: {int8, int16, int32, int64, complex64, float, double, bool}")
+    .Attr("T: {int32, int64, complex64, float, double, bool, int8}")
     .Attr("precision: int = -1")
     .Attr("scientific: bool = false")
     .Attr("shortest: bool = false")
@@ -134,24 +134,6 @@ REGISTER_OP("StringSplit")
       return Status::OK();
     });
 
-REGISTER_OP("StringSplitV2")
-    .Input("input: string")
-    .Input("sep: string")
-    .Output("indices: int64")
-    .Output("values: string")
-    .Output("shape: int64")
-    .Attr("maxsplit: int = -1")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-
-      c->set_output(0, c->Matrix(InferenceContext::kUnknownDim, 2));
-      c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
-      c->set_output(2, c->Vector(2));
-      return Status::OK();
-    });
-
 REGISTER_OP("StringStrip")
     .Input("input: string")
     .Output("output: string")
diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc
index e9da3d8e32..99de364042 100644
--- a/tensorflow/core/platform/cpu_info.cc
+++ b/tensorflow/core/platform/cpu_info.cc
@@ -344,28 +344,5 @@ int CPUModelNum() {
 #endif
 }
 
-int CPUIDNumSMT() {
-#ifdef PLATFORM_IS_X86
-  // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration
-  // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A)
-  // Section: Detecting Hardware Multi-threads Support and Topology
-  // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures
-  // Other cases not supported
-  uint32 eax, ebx, ecx, edx;
-  // Check if system supports Leaf 11
-  GETCPUID(eax, ebx, ecx, edx, 0, 0);
-  if (eax >= 11) {
-    // 1) Leaf 11 available? CPUID.(EAX=11, ECX=0):EBX != 0
-    // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11,
-    // ECX=0):ECX[15:8] is 1
-    GETCPUID(eax, ebx, ecx, edx, 11, 0);
-    if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) {
-      return 1 << (eax & 0x1f);  // 2 ^ SMT_Mask_Width
-    }
-  }
-#endif  // PLATFORM_IS_X86
-  return 0;
-}
-
 }  // namespace port
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index 175c9ae8b1..b5be7e8b54 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -35,10 +35,6 @@ namespace port {
 // software can change it dynamically.
 int NumSchedulableCPUs();
 
-// Returns an estimate of the number of hyperthreads per physical core
-// on the CPU
-int NumHyperthreadsPerCore();
-
 // Mostly ISA related features that we care about
 enum CPUFeature {
   // Do not change numeric assignments.
@@ -111,9 +107,6 @@ int CPUModelNum();
 // Returns nominal core processor cycles per second of each processor.
 double NominalCPUFrequency();
 
-// Returns num of hyperthreads per physical core
-int CPUIDNumSMT();
-
 }  // namespace port
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index a319ccbdbe..ae81f9b5b3 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -71,8 +71,6 @@ def pyx_library(
         name = filename + "_cython_translation",
         srcs = [filename],
         outs = [filename.split(".")[0] + ".cpp"],
-        # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3
-        # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH.
         cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)",
         tools = ["@cython//:cython_binary"] + pxd_srcs,
     )
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index ff4b4436bb..72c12318ca 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -115,17 +115,18 @@ class LibHDFS {
     const char* kLibHdfsDso = "libhdfs.so";
 #endif
     char* hdfs_home = getenv("HADOOP_HDFS_HOME");
-    if (hdfs_home != nullptr) {
-      string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso);
-      status_ = TryLoadAndBind(path.c_str(), &handle_);
-      if (status_.ok()) {
-        return;
-      }
+    if (hdfs_home == nullptr) {
+      status_ = errors::FailedPrecondition(
+          "Environment variable HADOOP_HDFS_HOME not set");
+      return;
+    }
+    string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso);
+    status_ = TryLoadAndBind(path.c_str(), &handle_);
+    if (!status_.ok()) {
+      // Try to load libhdfs.so via the dynamic loader's search path in case
+      // libhdfs.so is installed in a non-standard location.
+      status_ = TryLoadAndBind(kLibHdfsDso, &handle_);
     }
-
-    // Try to load the library dynamically in case it has been installed
-    // to a in non-standard location.
-    status_ = TryLoadAndBind(kLibHdfsDso, &handle_);
   }
 
   Status status_;
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 708f32ba80..8e316472fe 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -74,11 +74,6 @@ int NumSchedulableCPUs() {
   return kDefaultCores;
 }
 
-int NumHyperthreadsPerCore() {
-  static const int ht_per_core = tensorflow::port::CPUIDNumSMT();
-  return (ht_per_core > 0) ? ht_per_core : 1;
-}
-
 void* AlignedMalloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__)
   return memalign(minimum_alignment, size);
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index cb1fd09dbb..522a9d84fd 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 9
+#define TF_MINOR_VERSION 8
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc0"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 90b6533690..dffc965b14 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -42,7 +42,6 @@ limitations under the License.
 
 #ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
-#include "tensorflow/core/lib/core/stringpiece.h"
 
 using mkldnn::engine;
 using mkldnn::memory;
@@ -713,48 +712,15 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
   return output_tensor;
 }
 #else
-using mkldnn::stream;
-template <typename T> class MklDnnData;
-
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklDnnShape& mkl_shape) {
   Tensor output_tensor;
-  try {
-    if (!mkl_shape.IsMklTensor())
-      return mkl_tensor;  // return input since it is already TF tensor
-
-    TensorShape output_shape = mkl_shape.GetTfShape();;
-
-    // Allocate output tensor.
-    context->allocate_temp(DataTypeToEnum<T>::v(),
-        output_shape, &output_tensor);
-
-    auto cpu_engine = engine(engine::cpu, 0);
-    MklDnnData<T> input(&cpu_engine);
-
-    // Get Mkl layout of input tensor.
-    auto input_mkl_md = mkl_shape.GetMklLayout();
-    auto output_tf_md = mkl_shape.GetTfLayout();
-    auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine);
-    input.SetUsrMem(input_mkl_md, &mkl_tensor);
-
-    // reorder
-    if (input.IsReorderNeeded(output_tf_pd)) {
-      std::vector<primitive> net;
-      CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net),
-             true);
-      stream(stream::kind::eager).submit(net).wait();
-    } else {
-      // If not, just forward input tensor to output tensor.
-      CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape));
-    }
-  } catch (mkldnn::error& e) {
-    string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) + ", in file " +
-                       string(__FILE__) + ":" + std::to_string(__LINE__);
-    LOG(FATAL) << "Operation received an exception: " << error_msg;
-  }
+  TensorShape output_shape;
+
+  TF_CHECK_OK(
+      Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function"));
+
   return output_tensor;
 }
 #endif
@@ -1877,7 +1843,7 @@ class FactoryKeyCreator {
   template <typename T>
   void AddAsKey(const T data) {
     auto buffer = reinterpret_cast<const char *>(&data);
-    Append(StringPiece(buffer, sizeof(T)));
+    Append(absl::string_view(buffer, sizeof(T)));
   }
 
   std::string GetKey() {
@@ -1888,8 +1854,8 @@ class FactoryKeyCreator {
   string key_;
   const char delimiter = 'x';
   const int kMaxKeyLength = 256;
-  void Append(StringPiece s) {
-    key_.append(s.ToString());
+  void Append(absl::string_view s) {
+    key_.append(string(s));
     key_.append(1, delimiter);
   }
 };
diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md
index 0b07d413da..d92f5775fa 100644
--- a/tensorflow/docs_src/community/groups.md
+++ b/tensorflow/docs_src/community/groups.md
@@ -1,38 +1,17 @@
 # User Groups
 
-TensorFlow has communities around the world. [Submit your community!](https://docs.google.com/forms/d/e/1FAIpQLSc_RQIUYtVgLLihzATaO_WUXkEyBDE_OoRoOXYDPmBEvHuEBA/viewform)
+TensorFlow has communities around the world.
 
 ## Asia
 
-* [TensorFlow China community](https://www.tensorflowers.cn)
-* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/)
-* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/)
-* [Soleil Data Dojo](https://soleildatadojo.connpass.com/)
+* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_
+* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese language)_
+* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_
 * [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/)
-* [TensorFlow Philippines Community](https://www.facebook.com/groups/TensorFlowPH/)
-* [TensorFlow and Deep Learning Singapore](https://www.meetup.com/TensorFlow-and-Deep-Learning-Singapore/)
-* [TensorFlow India](https://www.facebook.com/tensorflowindia)
 
 
 ## Europe
 
 * [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/)
 * [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/)
-* [Tensorflow Belgium](https://www.meetup.com/TensorFlow-Belgium)
-* [TensorFlow x Rome Meetup](https://www.meetup.com/it-IT/TensorFlow-x-Rome-Meetup)
-* [TensorFlow London](https://www.meetup.com/TensorFlow-London/)
-* [TensorFlow Edinburgh](https://www.meetup.com/tensorflow-edinburgh/)
 
-
-## America
-
-* [TensorFlow Buenos Aires](https://www.meetup.com/TensorFlow-Buenos-Aires/)
-
-
-## Oceania
-* [Melbourne TensorFlow Meetup](https://www.meetup.com/Melbourne-TensorFlow-Meetup)
-
-
-## Africa
-
-* [TensorFlow Tunis Meetup](https://www.meetup.com/fr-FR/TensorFlow-Tunis-Meetup/)
diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md
index bbb25e20c6..f08ac74425 100644
--- a/tensorflow/docs_src/get_started/eager.md
+++ b/tensorflow/docs_src/get_started/eager.md
@@ -1,3 +1,3 @@
 # Get Started with Eager Execution
 
-[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.9.0/samples/core/get_started/eager.ipynb)
+[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.8.0/samples/core/get_started/eager.ipynb)
diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
index 232d2f1547..55579d52fb 100644
--- a/tensorflow/docs_src/get_started/index.md
+++ b/tensorflow/docs_src/get_started/index.md
@@ -10,9 +10,9 @@ course prior to diving into TensorFlow documentation:
 TensorFlow is a tool for machine learning. While it contains a wide range of
 functionality, TensorFlow is mainly designed for deep neural network models.
 
-The easiest way to get started with TensorFlow is by using Eager Execution.
+The easiest way to get started with TensorFlow is using Eager Execution.
 
-  * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow.
+  * @{$get_started/eager}, is for anyone new to  machine learning or TensorFlow.
 
 TensorFlow provides many APIs. The remainder of this section focuses on the
 Estimator API which provide scalable, high-performance models. See the
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 2901848745..1abd840ab3 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 55bc0f64e7..52a2a3f8a6 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.9.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 637231da12..1256fb99c4 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.9.0-rc0</version>
+  <version>1.8.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.9.0-rc0</version>
+                 <version>1.8.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -124,12 +124,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.9.0-rc0</version>
+  <version>1.8.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.9.0-rc0</version>
+  <version>1.8.0</version>
 </dependency>
 ```
 
@@ -148,7 +148,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -175,13 +175,13 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.9.0-rc0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip).
   3. Extract this .zip file.
 
-__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. 
+
 
 ### Validate the installation
 
@@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.9.0-rc0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.8.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -241,11 +241,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.9.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.9.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index c8d706cf3c..0ed8160027 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -339,7 +339,9 @@ Docker will download the TensorFlow binary image the first time you launch it.
 
 #### GPU support
 
-To launch a Docker container with NVidia GPU support, enter a command of the following format (this [does not require any local CUDA installation](https://github.com/nvidia/nvidia-docker/wiki/CUDA#requirements)):
+Prior to installing TensorFlow with GPU support, ensure that your system meets all
+[NVIDIA software requirements](#NVIDIARequirements).  To launch a Docker container
+with NVIDIA GPU support, enter a command of the following format:
 
 <pre>
 $ <b>nvidia-docker run -it</b> <i>-p hostPort:containerPort TensorFlowGPUImage</i>
@@ -436,7 +438,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -515,7 +517,7 @@ on your system:
   from source. To use the TensorFlow binaries, version 3.5 or higher is required.
   See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a
   list of supported GPU cards.
-* [GPU drivers](http://nvidia.com/drivers) that support your version of the CUDA
+* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA
   Toolkit.
 * The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This
   library provides advanced profiling support. To install this library,
@@ -682,14 +684,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -701,14 +703,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -720,14 +722,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -739,14 +741,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 9d01271c5a..29a867a9e3 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -522,7 +522,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
 </pre>
 
 
@@ -530,5 +530,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index dc6c1e36fc..5ba522b436 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -81,7 +81,7 @@ or
 [macOS](#PrepareMac)
 
 
-<a name="PrepareLinux"></a>
+<a name="#PrepareLinux"></a>
 ## Prepare environment for Linux
 
 Before building TensorFlow on Linux, install the following build
@@ -328,10 +328,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.9.0rc0 on Linux:
+for TensorFlow 1.8.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -373,9 +373,9 @@ The build and installation problems you encounter typically depend on the
 operating system.  See the "Common installation problems" section
 of one of the following guides:
 
-  * @{$install_linux#common_installation_problems$Installing TensorFlow on Linux}
-  * @{$install_mac#common_installation_problems$Installing TensorFlow on Mac OS}
-  * @{$install_windows#common_installation_problems$Installing TensorFlow on Windows}
+  * @{$install_linux#CommonInstallationProblems$Installing TensorFlow on Linux}
+  * @{$install_mac#CommonInstallationProblems$Installing TensorFlow on Mac OS}
+  * @{$install_windows#CommonInstallationProblems$Installing TensorFlow on Windows}
 
 Beyond the errors documented in those two guides, the following table
 notes additional errors specific to building TensorFlow.  Note that we
@@ -433,8 +433,6 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.9.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.11.0</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.9.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.11.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
@@ -458,7 +456,6 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.9.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.11.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
@@ -475,8 +472,6 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.9.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.9.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.8.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/mobile/linking_libs.md b/tensorflow/docs_src/mobile/linking_libs.md
index efef5dd0da..cf0db59021 100644
--- a/tensorflow/docs_src/mobile/linking_libs.md
+++ b/tensorflow/docs_src/mobile/linking_libs.md
@@ -27,7 +27,7 @@ called `libandroid_tensorflow_inference_java.jar`. There are three ways to
 include this functionality in your program:
 
 1. Include the jcenter AAR which contains it, as in this
- [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/tfmobile/build.gradle#L59-L65)
+ [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/build.gradle#L59-L65)
 
 2. Download the nightly precompiled version from
 [ci.tensorflow.org](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/).
diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md
index 2b84dbb973..8b22c04d87 100644
--- a/tensorflow/docs_src/mobile/prepare_models.md
+++ b/tensorflow/docs_src/mobile/prepare_models.md
@@ -105,8 +105,8 @@ inline constants so everything’s in one file.  To handle the conversion, you’ll
 need the `freeze_graph.py` script, that’s held in
 [`tensorflow/python/tools/freeze_graph.py`](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py). You’ll run it like this:
 
-    bazel build tensorflow/python/tools:freeze_graph
-    bazel-bin/tensorflow/python/tools/freeze_graph \
+    bazel build tensorflow/python/tools:freeze_graph
+    bazel-bin/tensorflow/python/tools/freeze_graph \
     --input_graph=/tmp/model/my_graph.pb \
     --input_checkpoint=/tmp/model/model.ckpt-1000 \
     --output_graph=/tmp/frozen_graph.pb \
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index c97f74139c..2fea02d861 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -227,8 +227,8 @@ of 30.0f, and an 8-bit array, the quantized values represent the following:
   <table>
     <tr><th>Quantized</th><th>Float</th></tr>
     <tr><td>0</td><td>-10.0</td></tr>
-    <tr><td>128</td><td>10.0</td></tr>
     <tr><td>255</td><td>30.0</td></tr>
+    <tr><td>128</td><td>10.0</td></tr>
   </table>
   <figcaption>
     <b>Table 2</b>: Example quantized value range
diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md
index b13b47184d..c4aae1d9d6 100644
--- a/tensorflow/docs_src/programmers_guide/estimators.md
+++ b/tensorflow/docs_src/programmers_guide/estimators.md
@@ -21,17 +21,18 @@ Note: TensorFlow also includes a deprecated `Estimator` class at
 
 Estimators provide the following benefits:
 
-*   You can run Estimator-based models on a local host or on a
+*   You can run Estimator-based models on a local host or on a
     distributed multi-server environment without changing your model.
-    Furthermore, you can run Estimator-based models on CPUs, GPUs,
+    Furthermore, you can run Estimator-based models on CPUs, GPUs,
     or TPUs without recoding your model.
 *   Estimators simplify sharing implementations between model developers.
-*   You can develop a state of the art model with high-level intuitive code.
+*   You can develop a state of the art model with high-level intuitive code.
     In short, it is generally much easier to create models with Estimators
     than with the low-level TensorFlow APIs.
-*   Estimators are themselves built on @{tf.layers}, which
+*   Estimators are themselves built on @{tf.layers}, which
     simplifies customization.
-*   Estimators build the graph for you.
+*   Estimators build the graph for you.  In other words, you don't have to
+    build the graph.
 *   Estimators provide a safe distributed training loop that controls how and
     when to:
     *   build the graph
@@ -56,7 +57,7 @@ the "plumbing" for you.  That is, pre-made Estimators create and manage
 pre-made Estimators let you experiment with different model architectures by
 making only minimal code changes.  @{tf.estimator.DNNClassifier$`DNNClassifier`},
 for example, is a pre-made Estimator class that trains classification models
-based on dense, feed-forward neural networks.
+through dense, feed-forward neural networks.
 
 
 ### Structure of a pre-made Estimators program
@@ -78,7 +79,7 @@ of the following four steps:
     an input function:
 
         def input_fn(dataset):
-           ...  # manipulate dataset, extracting the feature dict and the label
+           ...  # manipulate dataset, extracting the feature dict and the label
            return feature_dict, label
 
     (See @{$programmers_guide/datasets} for full details.)
@@ -95,13 +96,13 @@ of the following four steps:
         population = tf.feature_column.numeric_column('population')
         crime_rate = tf.feature_column.numeric_column('crime_rate')
         median_education = tf.feature_column.numeric_column('median_education',
-                            normalizer_fn=lambda x: x - global_education_mean)
+                            normalizer_fn=lambda x: x - global_education_mean)
 
 3.  **Instantiate the relevant pre-made Estimator.**  For example, here's
     a sample instantiation of a pre-made Estimator named `LinearClassifier`:
 
         # Instantiate an estimator, passing the feature columns.
-        estimator = tf.estimator.LinearClassifier(
+        estimator = tf.estimator.LinearClassifier(
             feature_columns=[population, crime_rate, median_education],
             )
 
diff --git a/tensorflow/docs_src/programmers_guide/feature_columns.md b/tensorflow/docs_src/programmers_guide/feature_columns.md
index 90f5c53a17..845194fe0e 100644
--- a/tensorflow/docs_src/programmers_guide/feature_columns.md
+++ b/tensorflow/docs_src/programmers_guide/feature_columns.md
@@ -528,10 +528,10 @@ suggested by the following snippet:
 categorical_column = ... # Create any categorical column
 
 # Represent the categorical column as an embedding column.
-# This means creating an embedding vector lookup table with one element for each category.
+# This means creating an embedding vector lookup table with one element for each category.
 embedding_column = tf.feature_column.embedding_column(
     categorical_column=categorical_column,
-    dimension=embedding_dimensions)
+    dimension=dimension_of_embedding_vector)
 ```
 
 @{$programmers_guide/embedding$Embeddings} is a significant topic within machine
diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py
index 86f5204ec3..03e60972aa 100644
--- a/tensorflow/examples/learn/iris.py
+++ b/tensorflow/examples/learn/iris.py
@@ -21,8 +21,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-
-from six.moves.urllib.request import urlretrieve
+import urllib
 
 import tensorflow as tf
 
@@ -39,7 +38,9 @@ FEATURE_KEYS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
 def maybe_download_iris_data(file_name, download_url):
   """Downloads the file and returns the number of data."""
   if not os.path.exists(file_name):
-    urlretrieve(download_url, file_name)
+    raw = urllib.urlopen(download_url).read()
+    with open(file_name, 'w') as f:
+      f.write(raw)
 
   # The first line is a comma-separated string. The first one is the number of
   # total data in the file.
diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc
index 9b171f66ec..debd95fc62 100644
--- a/tensorflow/java/src/gen/cc/op_generator.cc
+++ b/tensorflow/java/src/gen/cc/op_generator.cc
@@ -376,6 +376,9 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint,
     }
   }
   // op annotations
+  op_class.add_annotation(
+      Annotation::Create("Generated", "javax.annotation")
+          .attributes("value = \"TensorFlow Java Op Generator\""));
   if (endpoint.deprecated()) {
     op_class.add_annotation(Annotation::Create("Deprecated"));
     string explanation;
@@ -412,12 +415,8 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint,
   SourceFileWriter writer(op_file.get());
   std::list<Type> dependencies;
   CollectOpDependencies(op, mode, &dependencies);
-  writer.Write(kLicense)
-      .EndLine()
-      .Write("// This class has been generated, DO NOT EDIT!")
-      .EndLine()
-      .EndLine()
-      .BeginType(op_class, PUBLIC | FINAL, &dependencies, &op_javadoc);
+  writer.Write(kLicense).EndLine().BeginType(op_class, PUBLIC | FINAL,
+                                             &dependencies, &op_javadoc);
   if (!op.optional_attributes().empty()) {
     RenderOptionsClass(op, op_class, &writer);
   }
diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc
index 941ab2699c..181fd4c5e3 100644
--- a/tensorflow/java/src/gen/cc/op_specs.cc
+++ b/tensorflow/java/src/gen/cc/op_specs.cc
@@ -96,7 +96,6 @@ Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) {
     *iterable_out = true;
     visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int()));
   }
-
   Type type = Type::Wildcard();
   if (arg_def.type() != DataType::DT_INVALID) {
     // resolve type from DataType
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index bd97b181ff..b2e6c60021 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -196,11 +196,11 @@ def implicit_val_and_grad(f):
   # TODO(cais): Remove calls to tf.constant() once the gradients functions
   # accept lists and np.ndarrays.
 
-  def grad_fn(*args, **kwds):
+  def grad_fn(*args):
     """Computes the gradient of the wrapped function."""
     this_tape = tape.push_new_tape()
     try:
-      end_node = f(*args, **kwds)
+      end_node = f(*args)
       if end_node is None:
         raise ValueError("Cannot differentiate a function that returns None; "
                          "did you forget to return a value from {}?".format(
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 20522098b0..9cd17e0407 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -978,10 +978,7 @@ py_test(
     size = "large",
     srcs = ["keras_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",
-        "notsan",
-    ],
+    tags = ["notsan"],
     deps = [
         ":keras",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py
index b18212cfcd..7cdf840c97 100644
--- a/tensorflow/python/estimator/exporter.py
+++ b/tensorflow/python/estimator/exporter.py
@@ -156,7 +156,7 @@ def _loss_smaller(best_eval_result, current_eval_result):
   return best_eval_result[default_key] > current_eval_result[default_key]
 
 
-def _verify_compare_fn_args(compare_fn):
+def _verify_compre_fn_args(compare_fn):
   """Verifies compare_fn arguments."""
   args = set(util.fn_args(compare_fn))
   if 'best_eval_result' not in args:
@@ -265,7 +265,7 @@ class BestExporter(Exporter):
     self._compare_fn = compare_fn
     if self._compare_fn is None:
       raise ValueError('`compare_fn` must not be None.')
-    _verify_compare_fn_args(self._compare_fn)
+    _verify_compre_fn_args(self._compare_fn)
 
     self._saved_model_exporter = _SavedModelExporter(
         name, serving_input_receiver_fn, assets_extra, as_text)
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index a6cefdece2..035c7c148c 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -136,13 +136,11 @@ def numpy_input_fn(x,
       values in `x` have same shape).
     ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict.
     ValueError: if x or y is an empty dict.
-    TypeError: `x` is not a dict or array.
-    ValueError: if 'shuffle' is not provided or a bool.
+    TypeError: `x` is not a dict or array, or if `shuffle` is not bool.
   """
   if not isinstance(shuffle, bool):
-    raise ValueError('shuffle must be provided and explicitly set as boolean '
-                     '(it is recommended to set it as True for training); '
-                     'got {}'.format(shuffle))
+    raise TypeError('shuffle must be explicitly set as boolean; '
+                    'got {}'.format(shuffle))
 
   def input_fn():
     """Numpy input function."""
diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py
index 81b201cc5c..92d057e25d 100644
--- a/tensorflow/python/estimator/inputs/numpy_io_test.py
+++ b/tensorflow/python/estimator/inputs/numpy_io_test.py
@@ -286,9 +286,8 @@ class NumpyIoTest(test.TestCase):
     x = np.arange(32, 36)
     y = np.arange(4)
     with self.test_session():
-      with self.assertRaisesRegexp(ValueError,
-                                   'shuffle must be provided and explicitly '
-                                   'set as boolean'):
+      with self.assertRaisesRegexp(TypeError,
+                                   'shuffle must be explicitly set as boolean'):
         # Default shuffle is None.
         numpy_io.numpy_input_fn(x, y)
 
diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py
index 57f8e5fd6a..938e244fb3 100644
--- a/tensorflow/python/estimator/inputs/pandas_io.py
+++ b/tensorflow/python/estimator/inputs/pandas_io.py
@@ -68,16 +68,15 @@ def pandas_input_fn(x,
   Raises:
     ValueError: if `x` already contains a column with the same name as `y`, or
       if the indexes of `x` and `y` don't match.
-    ValueError: if 'shuffle' is not provided or a bool.
+    TypeError: `shuffle` is not bool.
   """
   if not HAS_PANDAS:
     raise TypeError(
         'pandas_input_fn should not be called without pandas installed')
 
   if not isinstance(shuffle, bool):
-    raise ValueError('shuffle must be provided and explicitly set as boolean '
-                     '(it is recommended to set it as True for training); '
-                     'got {}'.format(shuffle))
+    raise TypeError('shuffle must be explicitly set as boolean; '
+                    'got {}'.format(shuffle))
 
   x = x.copy()
   if y is not None:
diff --git a/tensorflow/python/estimator/inputs/pandas_io_test.py b/tensorflow/python/estimator/inputs/pandas_io_test.py
index dcecf6dd61..e5912a3b28 100644
--- a/tensorflow/python/estimator/inputs/pandas_io_test.py
+++ b/tensorflow/python/estimator/inputs/pandas_io_test.py
@@ -70,9 +70,8 @@ class PandasIoTest(test.TestCase):
       return
     x, _ = self.makeTestDataFrame()
     y_noindex = pd.Series(np.arange(-32, -28))
-    with self.assertRaisesRegexp(ValueError,
-                                 'shuffle must be provided and explicitly '
-                                 'set as boolean'):
+    with self.assertRaisesRegexp(TypeError,
+                                 'shuffle must be explicitly set as boolean'):
       # Default shuffle is None
       pandas_io.pandas_input_fn(x, y_noindex)
 
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index 51a61adb21..8e2ec83020 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -250,7 +250,7 @@ class _PandasFeedFn(object):
                num_epochs=None):
     if len(placeholders) != len(dataframe.columns) + 1:
       raise ValueError("Expected {} placeholders; got {}.".format(
-          len(dataframe.columns) + 1, len(placeholders)))
+          len(dataframe.columns), len(placeholders)))
     self._index_placeholder = placeholders[0]
     self._col_placeholders = placeholders[1:]
     self._dataframe = dataframe
diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py
index 2f439f765e..c80af08fba 100644
--- a/tensorflow/python/estimator/keras.py
+++ b/tensorflow/python/estimator/keras.py
@@ -70,7 +70,7 @@ def _convert_tensor(x):
   return x
 
 
-def _any_variable_initialized():
+def _any_variable_initalized():
   """Check if any variable has been initialized in the Keras model.
 
   Returns:
@@ -511,7 +511,7 @@ def model_to_estimator(keras_model=None,
       keras_model_fn, model_dir=model_dir, config=config)
 
   # Check if we need to call get_weights:
-  if _any_variable_initialized():
+  if _any_variable_initalized():
     keras_weights = keras_model.get_weights()
     # Warn if config passed to estimator tries to update GPUOptions. If a
     # session has already been created, the GPUOptions passed to the first
diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py
index 5e094ae92b..6688a84130 100644
--- a/tensorflow/python/estimator/keras_test.py
+++ b/tensorflow/python/estimator/keras_test.py
@@ -31,10 +31,10 @@ from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.applications import mobilenet
 from tensorflow.python.keras.optimizers import SGD
-from tensorflow.python.ops.parsing_ops import gen_parsing_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -146,13 +146,13 @@ def randomize_io_type(array, name):
 def multi_inputs_multi_outputs_model():
   a = keras.layers.Input(shape=(16,), name='input_a')
   b = keras.layers.Input(shape=(16,), name='input_b')
-  m = keras.layers.Input(shape=(8,), dtype='string', name='input_m')
+  m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m')
   dense = keras.layers.Dense(8, name='dense_1')
 
   a_2 = dense(a)
-  # Read m
-  m_2 = keras.layers.Lambda(gen_parsing_ops.string_to_number)(m)
-  s_2 = keras.layers.Lambda(lambda k: k[0] * k[1])([m_2, a_2])
+  # Apply a mask
+  s_2 = keras.layers.Lambda(lambda k:
+                            K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2])
   b_2 = dense(b)
   merged = keras.layers.concatenate([s_2, b_2], name='merge')
   c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
@@ -372,13 +372,13 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
 
     def train_input_fn():
       input_dict = {'input_a': a_train, 'input_b': b_train,
-                    'input_m': input_m_train.astype(np.str)}
+                    'input_m': input_m_train > 0}
       output_dict = {'dense_2': c_train, 'dense_3': d_train}
       return input_dict, output_dict
 
     def eval_input_fn():
       input_dict = {'input_a': a_test, 'input_b': b_test,
-                    'input_m': input_m_test.astype(np.str)}
+                    'input_m': input_m_test > 0}
       output_dict = {'dense_2': c_test, 'dense_3': d_test}
       return input_dict, output_dict
 
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index af5d709f7e..2d6925d1a8 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -1389,7 +1389,7 @@ class LayoutOptimizerTest(test.TestCase):
       expected_num_transposes = 3
       self.assertEqual(expected_num_transposes, num_transposes)
       self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes)
-      self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes)
+      self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testLoopWithVecAnd4D(self):
@@ -1413,7 +1413,7 @@ class LayoutOptimizerTest(test.TestCase):
       expected_num_transposes = 2
       self.assertEqual(expected_num_transposes, num_transposes)
       self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes)
-      self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes)
+      self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testBinaryOpSecondPort(self):
diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index f608dea430..e487f583be 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -93,8 +93,6 @@ def selu(x):
       - To be used together with the initialization "lecun_normal".
       - To be used together with the dropout variant "AlphaDropout".
 
-  References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
   """
   alpha = 1.6732632423543772848170429916717
   scale = 1.0507009873554804934193349852946
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 9f91368e5b..70b6a8431a 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -724,6 +724,15 @@ class TensorBoard(Callback):
         for weight in layer.weights:
           mapped_weight_name = weight.name.replace(':', '_')
           tf_summary.histogram(mapped_weight_name, weight)
+          if self.write_grads:
+            grads = model.optimizer.get_gradients(model.total_loss, weight)
+
+            def is_indexed_slices(grad):
+              return type(grad).__name__ == 'IndexedSlices'
+
+            grads = [grad.values if is_indexed_slices(grad) else grad
+                     for grad in grads]
+            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
           if self.write_images:
             w_img = array_ops.squeeze(weight)
             shape = K.int_shape(w_img)
@@ -750,18 +759,6 @@ class TensorBoard(Callback):
             assert len(shape) == 4 and shape[-1] in [1, 3, 4]
             tf_summary.image(mapped_weight_name, w_img)
 
-        if self.write_grads:
-          for weight in layer.trainable_weights:
-            mapped_weight_name = weight.name.replace(':', '_')
-            grads = model.optimizer.get_gradients(model.total_loss, weight)
-
-            def is_indexed_slices(grad):
-              return type(grad).__name__ == 'IndexedSlices'
-
-            grads = [grad.values if is_indexed_slices(grad) else grad
-                     for grad in grads]
-            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
-
         if hasattr(layer, 'output'):
           tf_summary.histogram('{}_out'.format(layer.name), layer.output)
     self.merged = tf_summary.merge_all()
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 5062a26580..b355f4a269 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -653,8 +653,6 @@ class KerasCallbacksTest(test.TestCase):
       model.add(
           keras.layers.Dense(
               NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      # non_trainable_weights: moving_variance, moving_mean
-      model.add(keras.layers.BatchNormalization())
       model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
       model.compile(
           loss='categorical_crossentropy',
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 1c9135982e..a4cd017d60 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -123,7 +123,7 @@ class Network(base_layer.Layer):
     # Entries are unique. Includes input and output layers.
     self._layers = []
 
-    # Used in symbolic mode only, only in conjunction with graph-networks
+    # Used in symbolic mode only, only in conjunction with graph-networks
     self._outbound_nodes = []
     self._inbound_nodes = []
 
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index 7e82db028b..6a94986b9c 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -482,7 +482,7 @@ class TestWholeModelSaving(test.TestCase):
       with h5py.File(fname, 'r') as h5file:
         num_names_arrays = len([attr for attr in h5file['model_weights'].attrs
                                 if attr.startswith('layer_names')])
-      # The chunking of layer names array should have happened.
+      # The chunking of layer names array should have happened.
       self.assertGreater(num_names_arrays, 0)
       out2 = model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
@@ -527,7 +527,7 @@ class TestWholeModelSaving(test.TestCase):
         num_weight_arrays = len(
             [attr for attr in h5file['model_weights']['nested_model'].attrs
              if attr.startswith('weight_names')])
-      # The chunking of layer names array should have happened.
+      # The chunking of layer names array should have happened.
       self.assertGreater(num_weight_arrays, 0)
       out2 = model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index fce6cbdb7a..89c1f1a40f 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -24,7 +24,6 @@ import numpy as np
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -410,13 +409,11 @@ class Model(Network):
         else:
           if sample_weight_mode == 'temporal':
             sample_weights.append(array_ops.placeholder_with_default(
-                constant_op.constant([[1.]], dtype=K.floatx()),
-                shape=[None, None], name=name + '_sample_weights'))
+                [[1.]], shape=[None, None], name=name + '_sample_weights'))
             sample_weight_modes.append('temporal')
           else:
             sample_weights.append(array_ops.placeholder_with_default(
-                constant_op.constant([1.], dtype=K.floatx()),
-                shape=[None], name=name + '_sample_weights'))
+                [1.], shape=[None], name=name + '_sample_weights'))
             sample_weight_modes.append(None)
     self.sample_weight_modes = sample_weight_modes
     self._feed_sample_weight_modes = []
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index e8838cd3bc..2ecbff3a1c 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -732,7 +732,7 @@ def slice_arrays(arrays, indices, contiguous=True):
   """Slices batches out of provided arrays (workaround for eager tensors).
 
   Unfortunately eager tensors don't have the same slicing behavior as
-  Numpy arrays (they follow the same slicing behavior as symbolic TF tensors),
+  Numpy arrays (they follow the same slicing behavior as symbolic TF tensors),
   hence we cannot use `generic_utils.slice_arrays` directly
   and we have to implement this workaround based on `concat`. This has a
   performance cost.
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
index c519e194bd..a54d6da839 100644
--- a/tensorflow/python/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -71,7 +71,7 @@ class KerasInitializersTest(test.TestCase):
                                                       stddev=1,
                                                       seed=126),
                    tensor_shape,
-                   target_mean=0., target_max=2, target_min=-2)
+                   target_mean=0., target_std=None, target_max=2)
 
   def test_constant(self):
     tensor_shape = (5, 6, 4)
@@ -83,49 +83,49 @@ class KerasInitializersTest(test.TestCase):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(1. / fan_in)
+      scale = np.sqrt(3. / fan_in)
       self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+                   target_mean=0., target_max=scale, target_min=-scale)
 
   def test_glorot_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, fan_out = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / (fan_in + fan_out))
+      scale = np.sqrt(6. / (fan_in + fan_out))
       self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+                   target_mean=0., target_max=scale, target_min=-scale)
 
   def test_he_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / fan_in)
+      scale = np.sqrt(6. / fan_in)
       self._runner(keras.initializers.he_uniform(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+                   target_mean=0., target_max=scale, target_min=-scale)
 
   def test_lecun_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(1. / fan_in)
+      scale = np.sqrt(1. / fan_in)
       self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+                   target_mean=0., target_std=None, target_max=2 * scale)
 
   def test_glorot_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, fan_out = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / (fan_in + fan_out))
+      scale = np.sqrt(2. / (fan_in + fan_out))
       self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+                   target_mean=0., target_std=None, target_max=2 * scale)
 
   def test_he_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / fan_in)
+      scale = np.sqrt(2. / fan_in)
       self._runner(keras.initializers.he_normal(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+                   target_mean=0., target_std=None, target_max=2 * scale)
 
   def test_orthogonal(self):
     tensor_shape = (20, 20)
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index f60064ed63..5061825d38 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -19,9 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
-import sys
 import types as python_types
-import warnings
 
 import numpy as np
 
@@ -716,7 +714,6 @@ class Lambda(Layer):
     return self.mask
 
   def get_config(self):
-    module = self.function.__module__
     if isinstance(self.function, python_types.LambdaType):
       function = generic_utils.func_dump(self.function)
       function_type = 'lambda'
@@ -724,26 +721,21 @@ class Lambda(Layer):
       function = self.function.__name__
       function_type = 'function'
 
-    output_shape_module = None
     if isinstance(self._output_shape, python_types.LambdaType):
       output_shape = generic_utils.func_dump(self._output_shape)
       output_shape_type = 'lambda'
-      output_shape_module = self._output_shape.__module__
     elif callable(self._output_shape):
       output_shape = self._output_shape.__name__
       output_shape_type = 'function'
-      output_shape_module = self._output_shape.__module__
     else:
       output_shape = self._output_shape
       output_shape_type = 'raw'
 
     config = {
         'function': function,
-        'module': module,
         'function_type': function_type,
         'output_shape': output_shape,
         'output_shape_type': output_shape_type,
-        'output_shape_module': output_shape_module,
         'arguments': self.arguments
     }
     base_config = super(Lambda, self).get_config()
@@ -753,16 +745,8 @@ class Lambda(Layer):
   def from_config(cls, config, custom_objects=None):
     config = config.copy()
     globs = globals()
-    module = config.pop('module', None)
-    if module in sys.modules:
-      globs.update(sys.modules[module].__dict__)
-    elif module is not None:
-      # Note: we don't know the name of the function if it's a lambda.
-      warnings.warn('{} is not loaded, but a Lambda layer uses it. '
-                    'It may cause errors.'.format(module)
-                    , UserWarning)
     if custom_objects:
-      globs.update(custom_objects)
+      globs = dict(list(globs.items()) + list(custom_objects.items()))
     function_type = config.pop('function_type')
     if function_type == 'function':
       # Simple lookup in custom objects
@@ -776,14 +760,6 @@ class Lambda(Layer):
     else:
       raise TypeError('Unknown function type:', function_type)
 
-    output_shape_module = config.pop('output_shape_module', None)
-    if output_shape_module in sys.modules:
-      globs.update(sys.modules[output_shape_module].__dict__)
-    elif output_shape_module is not None:
-      # Note: we don't know the name of the function if it's a lambda.
-      warnings.warn('{} is not loaded, but a Lambda layer uses it. '
-                    'It may cause errors.'.format(output_shape_module)
-                    , UserWarning)
     output_shape_type = config.pop('output_shape_type')
     if output_shape_type == 'function':
       # Simple lookup in custom objects
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index e6e45902a8..c616d8f24f 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -144,19 +144,5 @@ class CheckpointingTests(test.TestCase):
     model.load_weights(save_prefix)
     self.assertEqual(12., self.evaluate(beta1_power))
 
-class TestModelBackend(test.TestCase):
-
-  def test_model_backend_float64_use_cases(self):
-    # Test case for GitHub issue 19318
-    floatx = keras.backend.floatx()
-    keras.backend.set_floatx('float64')
-
-    x = keras.Input((5,))
-    y = keras.layers.Dense(1)(x)
-    model = keras.models.Model(x, y)
-    model.compile('rmsprop', 'mse')
-
-    keras.backend.set_floatx(floatx)
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py
index 94ed8ebd31..9d54add264 100644
--- a/tensorflow/python/kernel_tests/as_string_op_test.py
+++ b/tensorflow/python/kernel_tests/as_string_op_test.py
@@ -130,16 +130,6 @@ class AsStringOpTest(test.TestCase):
       result = output.eval(feed_dict={input_: int_inputs_})
       self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_])
 
-  def testHalfInt(self):
-    s = lambda strs: [x.decode("ascii") for x in strs]
-
-    with self.test_session():
-      input_ = array_ops.placeholder(dtypes.int16)
-      int_inputs_ = [np.iinfo(np.int16).min, np.iinfo(np.int16).max]
-      output = string_ops.as_string(input_)
-      result = output.eval(feed_dict={input_: int_inputs_})
-      self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_])
-
   def testBool(self):
     bool_inputs_ = [False, True]
     s = lambda strs: [x.decode("ascii") for x in strs]
diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py
index 16fdedac41..08b03f8518 100644
--- a/tensorflow/python/kernel_tests/betainc_op_test.py
+++ b/tensorflow/python/kernel_tests/betainc_op_test.py
@@ -172,7 +172,7 @@ class BetaincTest(test.TestCase):
       tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s)
       err = gradient_checker.compute_gradient_error(
           [tf_gx_s], [gx_s.shape], tf_gout_t, gx_s.shape)
-      tf_logging.info("betainc gradient err = %g " % err)
+      print("betainc gradient err = %g " % err)
       self.assertLess(err, err_tolerance)
 
       # Test broadcast gradient
@@ -181,7 +181,7 @@ class BetaincTest(test.TestCase):
       tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s)
       err = gradient_checker.compute_gradient_error(
           [tf_gx_s], [()], tf_gout_t, ga_s.shape)
-      tf_logging.info("betainc gradient err = %g " % err)
+      print("betainc gradient err = %g " % err)
       self.assertLess(err, err_tolerance)
 
 
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index fb52d10475..e08123b041 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -18,12 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
@@ -417,16 +414,6 @@ class ClipTest(test.TestCase):
 
     self.assertAllClose(np_ans, tf_ans)
 
-  def testClipByValueEmptyTensor(self):
-    # Test case for GitHub issue 19337
-    zero = array_ops.placeholder(dtype=dtypes.float32, shape=None)
-    x = clip_ops.clip_by_value(zero, zero, zero)
-    y = clip_ops.clip_by_value(zero, 1.0, 1.0)
-    z = clip_ops.clip_by_value(zero, zero, 1.0)
-    w = clip_ops.clip_by_value(zero, 1.0, zero)
-    with self.test_session(use_gpu=True) as sess:
-      sess.run([x, y, z, w], feed_dict={zero: np.zeros((7, 0))})
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 80ba7dafc9..8699fd5b25 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -312,8 +312,8 @@ class Conv2DTest(test.TestCase):
       expected_values = self.evaluate(expected_results)
       computed_values = self.evaluate(computed_results)
       for e_value, c_value in zip(expected_values, computed_values):
-        tf_logging.info("expected = ", e_value)
-        tf_logging.info("actual = ", c_value)
+        print("expected = ", e_value)
+        print("actual = ", c_value)
         self.assertAllClose(
             e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4)
 
@@ -337,8 +337,8 @@ class Conv2DTest(test.TestCase):
       for i in range(len(tensors)):
         conv = tensors[i]
         value = values[i]
-        tf_logging.info("expected = ", expected)
-        tf_logging.info("actual = ", value)
+        print("expected = ", expected)
+        print("actual = ", value)
         tol = 1e-5
         if value.dtype == np.float16:
           tol = 1e-3
@@ -547,8 +547,8 @@ class Conv2DTest(test.TestCase):
       # "values" consists of two tensors for two backprops
       value = self.evaluate(conv)
       self.assertShapeEqual(value, conv)
-    tf_logging.info("expected = ", expected)
-    tf_logging.info("actual = ", value)
+    print("expected = ", expected)
+    print("actual = ", value)
     self.assertArrayNear(expected, value.flatten(), err)
 
   def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes,
@@ -723,8 +723,8 @@ class Conv2DTest(test.TestCase):
             data_format=data_format)
         value = self.evaluate(conv)
         self.assertShapeEqual(value, conv)
-      tf_logging.info("expected = ", expected)
-      tf_logging.info("actual = ", value)
+      print("expected = ", expected)
+      print("actual = ", value)
       self.assertArrayNear(expected, value.flatten(), 1e-5)
 
   def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes,
@@ -912,8 +912,8 @@ class Conv2DTest(test.TestCase):
         value_2 = sess.run(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
-      tf_logging.info("expected = ", value_2)
-      tf_logging.info("actual = ", value)
+      print("expected = ", value_2)
+      print("actual = ", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   # Testing for backprops
@@ -965,8 +965,8 @@ class Conv2DTest(test.TestCase):
         value_2 = sess.run(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
-      tf_logging.info("expected = ", value_2)
-      tf_logging.info("actual = ", value)
+      print("expected = ", value_2)
+      print("actual = ", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
@@ -1178,7 +1178,7 @@ class Conv2DTest(test.TestCase):
           # since fp16 numerical gradients are too imprecise.
           err = np.fabs(jacob_t - reference_jacob_t).max()
 
-        tf_logging.info("conv_2d gradient error = ", err)
+        print("conv_2d gradient error = ", err)
         self.assertLess(err, 0.002)
 
   def testInputGradientValidPaddingStrideOne(self):
@@ -1546,7 +1546,7 @@ class DepthwiseConv2DTest(test.TestCase):
       conv = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
       value = sess.run(conv)
-    tf_logging.info("value = ", value)
+    print("value = ", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
@@ -1668,7 +1668,7 @@ class SeparableConv2DTest(test.TestCase):
         conv = array_ops.transpose(conv, [0, 2, 3, 1])
 
       value = sess.run(conv)
-    tf_logging.info("value = ", value)
+    print("value = ", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
@@ -1826,7 +1826,7 @@ class Conv2DBenchmark(test.Benchmark):
         wall_time = time.time() - start
         self.report_benchmark(
             name="conv_stack_iter_%d" % iter_index, wall_time=wall_time)
-        tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time))
+        print("conv_stack_iter_%d: %.4f" % (iter_index, wall_time))
 
 
 def GetInceptionFwdTest(input_size, filter_size, stride, padding,
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 58e2a8ac2a..91ebe8de99 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -197,21 +197,7 @@ class GatherNdTest(test.TestCase):
     self.assertEqual(None, shape.ndims)
     self.assertEqual(None, shape[0].value)
 
-  def testBadIndicesCPU(self):
-    with self.test_session(use_gpu=False):
-      params = [0, 1, 2]
-      indices = [[[0], [7]]]  # Make this one higher rank
-      gather_nd = array_ops.gather_nd(params, indices)
-      with self.assertRaisesOpError(
-          r"flat indices\[1, :\] = \[7\] does not index into param "
-          r"\(shape: \[3\]\)"):
-        gather_nd.eval()
-
-  def _disabledTestBadIndicesGPU(self):
-    # TODO disabled due to different behavior on GPU and CPU
-    # On GPU the bad indices do not raise error but fetch 0 values
-    if not test.is_gpu_available():
-      return
+  def testBadIndices(self):
     with self.test_session(use_gpu=True):
       params = [0, 1, 2]
       indices = [[[0], [7]]]  # Make this one higher rank
@@ -221,21 +207,7 @@ class GatherNdTest(test.TestCase):
           r"\(shape: \[3\]\)"):
         gather_nd.eval()
 
-  def testBadIndicesWithSlicesCPU(self):
-    with self.test_session(use_gpu=False):
-      params = [[0, 1, 2]]
-      indices = [[[0], [0], [1]]]  # Make this one higher rank
-      gather_nd = array_ops.gather_nd(params, indices)
-      with self.assertRaisesOpError(
-          r"flat indices\[2, :\] = \[1\] does not index into param "
-          r"\(shape: \[1,3\]\)"):
-        gather_nd.eval()
-
-  def _disabledTestBadIndicesWithSlicesGPU(self):
-    # TODO disabled due to different behavior on GPU and CPU
-    # On GPU the bad indices do not raise error but fetch 0 values
-    if not test.is_gpu_available():
-      return
+  def testBadIndicesWithSlices(self):
     with self.test_session(use_gpu=True):
       params = [[0, 1, 2]]
       indices = [[[0], [0], [1]]]  # Make this one higher rank
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index 033fa95935..a2fcd751df 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -27,8 +27,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.platform import test
 
-_TEST_TYPES = (dtypes.int64, dtypes.float32,
-               dtypes.complex64, dtypes.complex128)
+_TEST_TYPES = (dtypes.float32, dtypes.complex64, dtypes.complex128)
 
 
 class GatherTest(test.TestCase):
@@ -123,9 +122,6 @@ class GatherTest(test.TestCase):
                 gather, [tf_params, tf_indices, tf_axis], gather_grad)
             self.assertEqual(indices_grad, None)
             self.assertEqual(axis_grad, None)
-            if dtype.is_integer:
-              self.assertEqual(params_grad, None)
-              continue
             # For axis 0, we are able to create an efficient IndexedSlices for
             # the gradient.
             if axis == 0:
@@ -181,19 +177,7 @@ class GatherTest(test.TestCase):
     gather_t = array_ops.gather(params, indices, axis=axis)
     self.assertEqual(None, gather_t.shape)
 
-  def testBadIndicesCPU(self):
-    with self.test_session(use_gpu=False):
-      params = [[0, 1, 2], [3, 4, 5]]
-      with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"):
-        array_ops.gather(params, [[7]], axis=0).eval()
-      with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"):
-        array_ops.gather(params, [[7]], axis=1).eval()
-
-  def _disabledTestBadIndicesGPU(self):
-    # TODO disabled due to different behavior on GPU and CPU
-    # On GPU the bad indices do not raise error but fetch 0 values
-    if not test.is_gpu_available():
-      return
+  def testBadIndices(self):
     with self.test_session(use_gpu=True):
       params = [[0, 1, 2], [3, 4, 5]]
       with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"):
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 795aa67248..a9b55854f1 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -362,33 +362,6 @@ class UniformUnitScalingInitializationTest(test.TestCase):
         dtype=dtypes.string)
 
 
-class VarianceScalingInitializationTest(test.TestCase):
-
-  def testNormalDistribution(self):
-    shape = [100, 100]
-    expect_mean = 0.
-    expect_var = 1. / shape[0]
-    init = init_ops.variance_scaling_initializer(distribution='normal')
-
-    with self.test_session(use_gpu=True):
-      x = init(shape).eval()
-
-    self.assertNear(np.mean(x), expect_mean, err=1e-2)
-    self.assertNear(np.var(x), expect_var, err=1e-2)
-
-  def testUniformDistribution(self):
-    shape = [100, 100]
-    expect_mean = 0.
-    expect_var = 1. / shape[0]
-    init = init_ops.variance_scaling_initializer(distribution='uniform')
-
-    with self.test_session(use_gpu=True):
-      x = init(shape).eval()
-
-    self.assertNear(np.mean(x), expect_mean, err=1e-2)
-    self.assertNear(np.var(x), expect_var, err=1e-2)
-
-
 # TODO(vrv): move to sequence_ops_test?
 class RangeTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index e95c729715..a0c372db7d 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -947,7 +947,7 @@ class PoolingTest(test.TestCase):
           output_sizes,
           x_init_value=x_init_value,
           delta=1e-2)
-    tf_logging.info("%s gradient error = " % func_name, err)
+    print("%s gradient error = " % func_name, err)
     self.assertLess(err, err_tolerance)
 
   def _ConstructAndTestSecondGradient(self,
@@ -1024,7 +1024,7 @@ class PoolingTest(test.TestCase):
           input_sizes,
           x_init_value=x_init_value,
           delta=1e-2)
-    tf_logging.info("%s second-order gradient error = " % func_name, err)
+    print("%s second-order gradient error = " % func_name, err)
     self.assertLess(err, err_tolerance)
 
   def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu):
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 253e43920b..677253946e 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import gc
 import re
 
 import numpy as np
@@ -435,29 +434,13 @@ class PyFuncTest(test.TestCase):
 
   # ----- Tests shared by py_func and eager_py_func -----
   def testCleanup(self):
-    # Delete everything created by previous tests to avoid side effects.
-    ops.reset_default_graph()
-    gc.collect()
-    initial_size = script_ops._py_funcs.size()
-    # Encapsulate the graph generation, so locals can be deleted.
-    def make_graphs():
-      for _ in xrange(1000):
-        g = ops.Graph()
-        with g.as_default():
-          c = constant_op.constant([1.], dtypes.float32)
-          _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
-          _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32])
-          # These ops have a reference to 'c' which has a reference to the graph.
-          # Checks if the functions are being deleted though the graph is referenced from them.
-          # (see #18292)
-          _ = script_ops.py_func(lambda x: x + c.shape[0], [c], [dtypes.float32])
-          _ = script_ops.eager_py_func(lambda x: x + c.shape[0], [c], [dtypes.float32])
- 
-    # Call garbage collector to enforce deletion.
-    make_graphs()
-    ops.reset_default_graph()
-    gc.collect()
-    self.assertEqual(initial_size, script_ops._py_funcs.size())
+    for _ in xrange(1000):
+      g = ops.Graph()
+      with g.as_default():
+        c = constant_op.constant([1.], dtypes.float32)
+        _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
+        _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32])
+    self.assertLess(script_ops._py_funcs.size(), 100)
 
   # ----- Tests for eager_py_func -----
   @test_util.run_in_graph_and_eager_modes()
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index faa4b49a8d..79fe927b8a 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -144,9 +144,7 @@ class StatefulScatterNdTest(test.TestCase):
         self.assertAllClose(new, ref_var.eval())
 
   def _VariableRankTests(self, np_scatter, tf_scatter):
-    for vtype in (np.int32,
-                  np.float32, np.float64,
-                  np.complex64, np.complex128):
+    for vtype in (np.float32, np.float64, np.complex64, np.complex128):
       for itype in (np.int32, np.int64):
         self._VariableRankTest(np_scatter, tf_scatter, vtype, itype)
 
@@ -223,7 +221,7 @@ class StatefulScatterNdTest(test.TestCase):
   #   self._VariableRankTests(_NumpyDiv, state_ops.scatter_nd_div)
 
   def _ScatterRepeatIndicesTest(self, np_scatter, tf_scatter):
-    for vtype in (np.int32, np.float32, np.float64):
+    for vtype in (np.float32, np.float64):
       for itype in (np.int32, np.int64):
         self._VariableRankTest(
             np_scatter, tf_scatter, vtype, itype, repeat_indices=True)
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index 1a0fa744ae..c70a4ffce7 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -159,13 +159,7 @@ class ScatterTest(test.TestCase):
 
           # Clips small values to avoid division by zero.
           def clip_small_values(x):
-            threshold = 1e-4
-            sign = np.sign(x)
-
-            if isinstance(x, np.int32):
-              threshold = 1
-              sign = np.random.choice([-1, 1])
-            return threshold * sign if np.abs(x) < threshold else x
+            return 1e-4 * np.sign(x) if np.abs(x) < 1e-4 else x
 
           updates = np.vectorize(clip_small_values)(updates)
           old = _AsType(np.random.randn(*((first_dim,) + extra_shape)), vtype)
@@ -187,11 +181,7 @@ class ScatterTest(test.TestCase):
                          tf_scatter,
                          repeat_indices=False,
                          updates_are_scalar=False):
-    vtypes = [np.float32, np.float64]
-    if tf_scatter != state_ops.scatter_div:
-      vtypes.append(np.int32)
-
-    for vtype in vtypes:
+    for vtype in (np.float32, np.float64):
       for itype in (np.int32, np.int64):
         self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices,
                                updates_are_scalar)
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index a82855dfeb..794be096b7 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -264,9 +264,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
 
     # A subset of ops has been enabled for complex numbers
     self.complex_ops_list = [(np.add, None,
-                              math_ops.unsorted_segment_sum, lambda t: 0),
-                             (np.ndarray.__mul__, None,
-                              math_ops.unsorted_segment_prod, lambda t: 1)]
+                              math_ops.unsorted_segment_sum, lambda t: 0)]
     self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32,
                                   dtypes_lib.float64]
     self.all_dtypes = (self.differentiable_dtypes +
diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py
index e20daccb28..a5bd1b6ee0 100644
--- a/tensorflow/python/kernel_tests/string_split_op_test.py
+++ b/tensorflow/python/kernel_tests/string_split_op_test.py
@@ -146,101 +146,5 @@ class StringSplitOpTest(test.TestCase):
       self.assertAllEqual(shape, [3, 1])
 
 
-class StringSplitV2OpTest(test.TestCase):
-
-  def testSplitV2(self):
-    strings = ["pigs on the wing", "animals"]
-
-    with self.test_session() as sess:
-      tokens = string_ops.string_split_v2(strings)
-      indices, values, shape = sess.run(tokens)
-      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
-      self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
-      self.assertAllEqual(shape, [2, 4])
-
-  def testSplitV2MultiCharSeparator(self):
-    # Match Python behavior:
-    # >>> '1<>2<>3'.split('<>')
-    # ['1', '2', '3']
-    # >>> "<><>4<>5<><>6<>".split("<>")
-    # ['', '', '4', '5', '', '6', '']
-    strings = ["1<>2<>3", "<><>4<>5<><>6<>"]
-
-    with self.test_session() as sess:
-      tokens = string_ops.string_split_v2(strings, sep="<>")
-      indices, values, shape = sess.run(tokens)
-      self.assertAllEqual(
-          indices, [[0, 0], [0, 1], [0, 2],
-                    [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
-      self.assertAllEqual(values, [b"1", b"2", b"3",
-                                   b"", b"", b"4", b"5", b"", b"6", b""])
-      self.assertAllEqual(shape, [2, 7])
-
-  def testSplitV2SimpleSeparator(self):
-    # Match Python behavior:
-    # >>> '1,2,3'.split(',')
-    # ['1', '2', '3']
-    # >>> '1,2,,3,'.split(',')
-    # ['1', '2', '', '3', '']
-    strings = ["1,2,3", "4,5,,6,"]
-
-    with self.test_session() as sess:
-      tokens = string_ops.string_split_v2(strings, sep=',')
-      indices, values, shape = sess.run(tokens)
-      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
-                                    [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]])
-      self.assertAllEqual(values, [b"1", b"2", b"3",
-                                   b"4", b"5", b"", b"6", b""])
-      self.assertAllEqual(shape, [2, 5])
-
-  def testSplitV2EmptySeparator(self):
-    # Match Python behavior:
-    # >>> '1 2 3'.split()
-    # ['1', '2', '3']
-    #>>> '   1   2   3   '.split()
-    #['1', '2', '3']
-    strings = ["1 2 3", "  4  5    6  "]
-
-    with self.test_session() as sess:
-      tokens = string_ops.string_split_v2(strings)
-      indices, values, shape = sess.run(tokens)
-      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
-                                    [1, 0], [1, 1], [1, 2]])
-      self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"])
-      self.assertAllEqual(shape, [2, 3])
-
-  def testSplitV2SimpleSeparatorMaxSplit(self):
-    # Match Python behavior:
-    # >>> '1,2,3'.split(',', maxsplit=1)
-    # ['1', '2,3']
-    # >>> '4,5,,6,'.split(',', maxsplit=1)
-    # ['4', '5,,6,']
-    strings = ["1,2,3", "4,5,,6,"]
-
-    with self.test_session() as sess:
-      tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1)
-      indices, values, shape = sess.run(tokens)
-      self.assertAllEqual(indices, [[0, 0], [0, 1],
-                                    [1, 0], [1, 1]])
-      self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"])
-      self.assertAllEqual(shape, [2, 2])
-
-  def testSplitV2EmptySeparatorMaxSplit(self):
-    # Match Python behavior:
-    # '1 2 3'.split(maxsplit=1)
-    # ['1', '2 3']
-    # >>> "  4  5    6  ".split(maxsplit=1)
-    # ['4', '5    6  ']
-    strings = ["1 2 3", "  4  5    6  "]
-
-    with self.test_session() as sess:
-      tokens = string_ops.string_split_v2(strings, maxsplit=1)
-      indices, values, shape = sess.run(tokens)
-      self.assertAllEqual(indices, [[0, 0], [0, 1],
-                                    [1, 0], [1, 1]])
-      self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5    6  "])
-      self.assertAllEqual(shape, [2, 2])
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index fae63b1132..8129334703 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -2619,10 +2619,6 @@ reverse.__doc__ = gen_array_ops.reverse_v2.__doc__
 
 # pylint: disable=redefined-builtin
 @tf_export("reverse_sequence")
-@deprecation.deprecated_args(
-    None, "seq_dim is deprecated, use seq_axis instead", "seq_dim")
-@deprecation.deprecated_args(
-    None, "batch_dim is deprecated, use batch_axis instead", "batch_dim")
 def reverse_sequence(input,
                      seq_lengths,
                      seq_axis=None,
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 94c8d79335..12afcd0b51 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -283,10 +283,10 @@ def compute_gradient(x,
   numbers.  For example, if `x` is complex with shape `[m]` and `y` is complex
   with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with
 
-      J[::2, ::2] = d(Re y)/d(Re x)
-      J[::2, 1::2] = d(Im y)/d(Re x)
-      J[1::2, ::2] = d(Re y)/d(Im x)
-      J[1::2, 1::2] = d(Im y)/d(Im x)
+      J[:m, :n] = d(Re y)/d(Re x)
+      J[:m, n:] = d(Im y)/d(Re x)
+      J[m:, :n] = d(Re y)/d(Im x)
+      J[m:, n:] = d(Im y)/d(Im x)
 
   Args:
     x: a tensor or list of tensors
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index f27d9224c1..bdcf420980 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -28,7 +28,6 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_image_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
@@ -259,14 +258,14 @@ def random_flip_up_down(image, seed=None):
   dimension, which is `height`.  Otherwise output the image as-is.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: A 3-D tensor of shape `[height, width, channels].`
     seed: A Python integer. Used to create a random seed. See
       @{tf.set_random_seed}
       for behavior.
 
   Returns:
-    A tensor of the same type and shape as `image`.
+    A 3-D tensor of the same type and shape as `image`.
+
   Raises:
     ValueError: if the shape of `image` not supported.
   """
@@ -281,14 +280,13 @@ def random_flip_left_right(image, seed=None):
   second dimension, which is `width`.  Otherwise output the image as-is.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
+    image: A 3-D tensor of shape `[height, width, channels].`
     seed: A Python integer. Used to create a random seed. See
       @{tf.set_random_seed}
       for behavior.
 
   Returns:
-    A tensor of the same type and shape as `image`.
+    A 3-D tensor of the same type and shape as `image`.
 
   Raises:
     ValueError: if the shape of `image` not supported.
@@ -299,8 +297,7 @@ def random_flip_left_right(image, seed=None):
 def _random_flip(image, flip_index, seed, scope_name):
   """Randomly (50% chance) flip an image along axis `flip_index`.
     Args:
-      image: 4-D Tensor of shape `[batch, height, width, channels]` or
-             3-D Tensor of shape `[height, width, channels]`.
+      image: A 3-D tensor of shape `[height, width, channels].`
       flip_index: The dimension along which to flip the image.
                   Vertical: 0, Horizontal: 1
       seed: A Python integer. Used to create a random seed. See
@@ -309,37 +306,22 @@ def _random_flip(image, flip_index, seed, scope_name):
       scope_name: Name of the scope in which the ops are added.
 
     Returns:
-      A tensor of the same type and shape as `image`.
+      A 3-D tensor of the same type and shape as `image`.
 
     Raises:
       ValueError: if the shape of `image` not supported.
   """
   with ops.name_scope(None, scope_name, [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = _AssertAtLeast3DImage(image)
-    shape = image.get_shape()
-    if shape.ndims == 3 or shape.ndims is None:
-      uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-      mirror_cond = math_ops.less(uniform_random, .5)
-      result = control_flow_ops.cond(
-          mirror_cond,
-          lambda: array_ops.reverse(image, [flip_index]),
-          lambda: image,
-          name=scope
-      )
-      return fix_image_flip_shape(image, result)
-    elif shape.ndims == 4:
-      uniform_random = random_ops.random_uniform(
-          [array_ops.shape(image)[0]], 0, 1.0, seed=seed
-      )
-      mirror_cond = math_ops.less(uniform_random, .5)
-      return array_ops.where(
-          mirror_cond,
-          image,
-          functional_ops.map_fn(lambda x: array_ops.reverse(x, [flip_index]), image, dtype=image.dtype)
-      )
-    else:
-      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+    image = _Assert3DImage(image)
+    uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
+    mirror_cond = math_ops.less(uniform_random, .5)
+    result = control_flow_ops.cond(
+        mirror_cond,
+        lambda: array_ops.reverse(image, [flip_index]),
+        lambda: image,
+        name=scope)
+    return fix_image_flip_shape(image, result)
 
 
 @tf_export('image.flip_left_right')
@@ -1652,13 +1634,13 @@ def is_jpeg(contents, name=None):
 
 
 @tf_export('image.decode_image')
-def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None):
+def decode_image(contents, channels=None, name=None):
   """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`,
   and `decode_png`.
 
   Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the
-  appropriate operation to convert the input bytes `string` into a `Tensor`
-  of type `dtype`.
+  appropriate operation to convert the input bytes `string` into a `Tensor` of
+  type `uint8`.
 
   Note: `decode_gif` returns a 4-D array `[num_frames, height, width, 3]`, as
   opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D
@@ -1670,11 +1652,10 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None):
     contents: 0-D `string`. The encoded image bytes.
     channels: An optional `int`. Defaults to `0`. Number of color channels for
       the decoded image.
-    dtype: The desired DType of the returned `Tensor`.
     name: A name for the operation (optional)
 
   Returns:
-    `Tensor` with type `dtype` and shape `[height, width, num_channels]` for
+    `Tensor` with type `uint8` and shape `[height, width, num_channels]` for
       BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for
       GIF images.
 
@@ -1698,7 +1679,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None):
       channels_msg = 'Channels must be in (None, 0, 3) when decoding BMP images'
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
       with ops.control_dependencies([assert_decode, assert_channels]):
-        return convert_image_dtype(gen_image_ops.decode_bmp(contents), dtype)
+        return gen_image_ops.decode_bmp(contents)
 
     def _gif():
       # Create assert to make sure that channels is not set to 1
@@ -1711,7 +1692,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None):
       channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images'
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
       with ops.control_dependencies([assert_channels]):
-        return convert_image_dtype(gen_image_ops.decode_gif(contents), dtype)
+        return gen_image_ops.decode_gif(contents)
 
     def check_gif():
       # Create assert op to check that bytes are GIF decodable
@@ -1720,11 +1701,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None):
 
     def _png():
       """Decodes a PNG image."""
-      return convert_image_dtype(
-          gen_image_ops.decode_png(contents, channels,
-                                   dtype=dtypes.uint8
-                                   if dtype == dtypes.uint8
-                                   else dtypes.uint16), dtype)
+      return gen_image_ops.decode_png(contents, channels)
 
     def check_png():
       """Checks if an image is PNG."""
@@ -1740,8 +1717,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None):
                       'images')
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
       with ops.control_dependencies([assert_channels]):
-        return convert_image_dtype(
-            gen_image_ops.decode_jpeg(contents, channels), dtype)
+        return gen_image_ops.decode_jpeg(contents, channels)
 
     # Decode normal JPEG images (start with \xff\xd8\xff\xe0)
     # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1).
@@ -1902,7 +1878,7 @@ def sample_distorted_bounding_box(image_size,
       width / height within this range.
     area_range: An optional list of `floats`. Defaults to `[0.05, 1]`.
       The cropped area of the image must contain a fraction of the
-      supplied image within this range.
+      supplied image within this range.
     max_attempts: An optional `int`. Defaults to `100`.
       Number of attempts at generating a cropped region of the image
       of the specified constraints. After `max_attempts` failures, return the
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 2a6ab26e96..45499dcce0 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -533,37 +533,6 @@ class FlipImageBenchmark(test.Benchmark):
         iters=benchmark_rounds,
         wall_time=step_time)
 
-  def _benchmarkBatchedRandomFlipLeftRight(self, device, cpu_count):
-    image_shape = [16, 299, 299, 3]
-    warmup_rounds = 100
-    benchmark_rounds = 1000
-    config = config_pb2.ConfigProto()
-    if cpu_count is not None:
-      config.inter_op_parallelism_threads = 1
-      config.intra_op_parallelism_threads = cpu_count
-    with session.Session("", graph=ops.Graph(), config=config) as sess:
-      with ops.device(device):
-        inputs = variables.Variable(
-            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
-            trainable=False,
-            dtype=dtypes.float32)
-        run_op = image_ops.random_flip_left_right(inputs)
-        sess.run(variables.global_variables_initializer())
-        for i in xrange(warmup_rounds + benchmark_rounds):
-          if i == warmup_rounds:
-            start = time.time()
-          sess.run(run_op)
-    end = time.time()
-    step_time = (end - start) / benchmark_rounds
-    tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
-    print("benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s step_time: "
-          "%.2f us" %
-          (tag, step_time * 1e6))
-    self.report_benchmark(
-        name="benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s" % (tag),
-        iters=benchmark_rounds,
-        wall_time=step_time)
-
   def benchmarkFlipLeftRightCpu1(self):
     self._benchmarkFlipLeftRight("/cpu:0", 1)
 
@@ -582,15 +551,6 @@ class FlipImageBenchmark(test.Benchmark):
   def benchmarkRandomFlipLeftRightGpu(self):
     self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None)
 
-  def benchmarkBatchedRandomFlipLeftRightCpu1(self):
-    self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", 1)
-
-  def benchmarkBatchedRandomFlipLeftRightCpuAll(self):
-    self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", None)
-
-  def benchmarkBatchedRandomFlipLeftRightGpu(self):
-    self._benchmarkBatchedRandomFlipLeftRight(test.gpu_device_name(), None)
-
 
 class AdjustHueBenchmark(test.Benchmark):
 
@@ -1027,7 +987,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_left_right(x_tf, seed=seed)
+      y = image_ops.random_flip_left_right(x_tf)
       self.assertTrue(y.op.name.startswith("random_flip_left_right"))
 
       count_flipped = 0
@@ -1048,50 +1008,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertGreaterEqual(count_flipped, 20)
       self.assertGreaterEqual(count_unflipped, 20)
 
-  def testRandomFlipLeftRightWithBatch(self):
-    batch_size = 16
-    seed = 42
-
-    # create single item of test data
-    x_np_raw = np.array(
-        [[1, 2, 3], [1, 2, 3]], dtype=np.uint8
-    ).reshape([1, 2, 3, 1])
-    y_np_raw = np.array(
-        [[3, 2, 1], [3, 2, 1]], dtype=np.uint8
-    ).reshape([1, 2, 3, 1])
-
-    # create batched test data
-    x_np = np.vstack([x_np_raw for _ in range(batch_size)])
-    y_np = np.vstack([y_np_raw for _ in range(batch_size)])
-
-    with self.test_session(use_gpu=True):
-      x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_left_right(x_tf, seed=seed)
-      self.assertTrue(y.op.name.startswith("random_flip_left_right"))
-
-      count_flipped = 0
-      count_unflipped = 0
-      for _ in range(100):
-        y_tf = y.eval()
-
-        # check every element of the batch
-        for i in range(batch_size):
-          if y_tf[i][0][0] == 1:
-            self.assertAllEqual(y_tf[i], x_np[i])
-            count_unflipped += 1
-          else:
-            self.assertAllEqual(y_tf[i], y_np[i])
-            count_flipped += 1
-
-      # 100 trials, each containing batch_size elements
-      # Mean: 50 * batch_size
-      # Std Dev: ~5 * sqrt(batch_size)
-      # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size))
-      #          = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680
-      six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size)
-      self.assertGreaterEqual(count_flipped, six_sigma)
-      self.assertGreaterEqual(count_unflipped, six_sigma)
-
   def testInvolutionUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
@@ -1141,11 +1057,9 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
 
-    seed = 42
-
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_up_down(x_tf, seed=seed)
+      y = image_ops.random_flip_up_down(x_tf, seed=42)
       self.assertTrue(y.op.name.startswith("random_flip_up_down"))
       count_flipped = 0
       count_unflipped = 0
@@ -1165,50 +1079,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertGreaterEqual(count_flipped, 20)
       self.assertGreaterEqual(count_unflipped, 20)
 
-  def testRandomFlipUpDownWithBatch(self):
-    batch_size = 16
-    seed = 42
-
-    # create single item of test data
-    x_np_raw = np.array(
-        [[1, 2, 3], [4, 5, 6]], dtype=np.uint8
-    ).reshape([1, 2, 3, 1])
-    y_np_raw = np.array(
-        [[4, 5, 6], [1, 2, 3]], dtype=np.uint8
-    ).reshape([1, 2, 3, 1])
-
-    # create batched test data
-    x_np = np.vstack([x_np_raw for _ in range(batch_size)])
-    y_np = np.vstack([y_np_raw for _ in range(batch_size)])
-
-    with self.test_session(use_gpu=True):
-      x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_up_down(x_tf, seed=seed)
-      self.assertTrue(y.op.name.startswith("random_flip_up_down"))
-
-      count_flipped = 0
-      count_unflipped = 0
-      for _ in range(100):
-        y_tf = y.eval()
-
-        # check every element of the batch
-        for i in range(batch_size):
-          if y_tf[i][0][0] == 1:
-            self.assertAllEqual(y_tf[i], x_np[i])
-            count_unflipped += 1
-          else:
-            self.assertAllEqual(y_tf[i], y_np[i])
-            count_flipped += 1
-
-      # 100 trials, each containing batch_size elements
-      # Mean: 50 * batch_size
-      # Std Dev: ~5 * sqrt(batch_size)
-      # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size))
-      #          = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680
-      six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size)
-      self.assertGreaterEqual(count_flipped, six_sigma)
-      self.assertGreaterEqual(count_unflipped, six_sigma)
-
   def testInvolutionTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
@@ -1286,7 +1156,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     #Ops that support 4D input
     for op in [
         image_ops.flip_left_right, image_ops.flip_up_down,
-        image_ops.random_flip_left_right, image_ops.random_flip_up_down,
         image_ops.transpose_image, image_ops.rot90
     ]:
       transformed_unknown_dims_4 = op(p_unknown_dims_4)
@@ -1297,6 +1166,14 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
                                    "must be at least three-dimensional"):
         op(p_wrong_rank)
 
+    for op in [
+        image_ops.random_flip_left_right,
+        image_ops.random_flip_up_down,
+    ]:
+      with self.assertRaisesRegexp(ValueError, "must be three-dimensional"):
+        op(p_wrong_rank)
+
+
   def testRot90GroupOrder(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
     with self.test_session(use_gpu=True):
@@ -1331,6 +1208,41 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         y_np = np.rot90(image, k=k, axes=(1, 2))
         self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k}))
 
+class RandomFlipTest(test_util.TensorFlowTestCase):
+
+  def testRandomLeftRight(self):
+    x_np = np.array([0, 1], dtype=np.uint8).reshape([1, 2, 1])
+    num_iterations = 500
+
+    hist = [0, 0]
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.random_flip_left_right(x_tf)
+      for _ in xrange(num_iterations):
+        y_np = y.eval().flatten()[0]
+        hist[y_np] += 1
+
+    # Ensure that each entry is observed within 4 standard deviations.
+    four_stddev = 4.0 * np.sqrt(num_iterations / 2.0)
+    self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev)
+
+  def testRandomUpDown(self):
+    x_np = np.array([0, 1], dtype=np.uint8).reshape([2, 1, 1])
+    num_iterations = 500
+
+    hist = [0, 0]
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.random_flip_up_down(x_tf)
+      for _ in xrange(num_iterations):
+        y_np = y.eval().flatten()[0]
+        hist[y_np] += 1
+
+    # Ensure that each entry is observed within 4 standard deviations.
+    four_stddev = 4.0 * np.sqrt(num_iterations / 2.0)
+    self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev)
+
+
 class AdjustContrastTest(test_util.TensorFlowTestCase):
 
   def _testContrast(self, x_np, y_np, contrast_factor):
@@ -3968,88 +3880,5 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
       self.assertAllClose(expected_batch, actual_sobel)
 
 
-class DecodeImageTest(test_util.TensorFlowTestCase):
-
-  def testJpegUint16(self):
-    with self.test_session(use_gpu=True) as sess:
-      base = "tensorflow/core/lib/jpeg/testdata"
-      jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg"))
-      image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16)
-      image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0),
-                                             dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
-      self.assertAllEqual(image0, image1)
-
-  def testPngUint16(self):
-    with self.test_session(use_gpu=True) as sess:
-      base = "tensorflow/core/lib/png/testdata"
-      png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png"))
-      image0 = image_ops.decode_image(png0, dtype=dtypes.uint16)
-      image1 = image_ops.convert_image_dtype(
-          image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
-      self.assertAllEqual(image0, image1)
-
-  def testGifUint16(self):
-    with self.test_session(use_gpu=True) as sess:
-      base = "tensorflow/core/lib/gif/testdata"
-      gif0 = io_ops.read_file(os.path.join(base, "scan.gif"))
-      image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16)
-      image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0),
-                                             dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
-      self.assertAllEqual(image0, image1)
-
-  def testBmpUint16(self):
-    with self.test_session(use_gpu=True) as sess:
-      base = "tensorflow/core/lib/bmp/testdata"
-      bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp"))
-      image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16)
-      image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0),
-                                             dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
-      self.assertAllEqual(image0, image1)
-
-  def testJpegFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
-      base = "tensorflow/core/lib/jpeg/testdata"
-      jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg"))
-      image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32)
-      image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0),
-                                             dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
-      self.assertAllEqual(image0, image1)
-
-  def testPngFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
-      base = "tensorflow/core/lib/png/testdata"
-      png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png"))
-      image0 = image_ops.decode_image(png0, dtype=dtypes.float32)
-      image1 = image_ops.convert_image_dtype(
-          image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
-      self.assertAllEqual(image0, image1)
-
-  def testGifFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
-      base = "tensorflow/core/lib/gif/testdata"
-      gif0 = io_ops.read_file(os.path.join(base, "scan.gif"))
-      image0 = image_ops.decode_image(gif0, dtype=dtypes.float32)
-      image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0),
-                                             dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
-      self.assertAllEqual(image0, image1)
-
-  def testBmpFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
-      base = "tensorflow/core/lib/bmp/testdata"
-      bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp"))
-      image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32)
-      image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0),
-                                             dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
-      self.assertAllEqual(image0, image1)
-
-
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 724fcc39cd..2df230d470 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -467,8 +467,7 @@ class VarianceScaling(Initializer):
     else:
       scale /= max(1., (fan_in + fan_out) / 2.)
     if self.distribution == "normal":
-      # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
-      stddev = math.sqrt(scale) / .87962566103423978
+      stddev = math.sqrt(scale)
       return random_ops.truncated_normal(
           shape, 0.0, stddev, dtype, seed=self.seed)
     else:
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 8276047cb6..222b8ebc9d 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -35,9 +35,8 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 # Assert and Print are special symbols in python, so we must
-# have an upper-case version of them.  For users with Python 3 or Python 2.7
-# with `from __future__ import print_function`, we also allow lowercase.
-@tf_export("Print", "print")
+# use an upper-case version of them.
+@tf_export("Print")
 def Print(input_, data, message=None, first_n=None, summarize=None,
           name=None):
   """Prints a list of tensors.
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 466d0dadc8..e40481f3a7 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -125,8 +125,8 @@ def abs(x, name=None):  # pylint: disable=redefined-builtin
   ```
 
   Args:
-    x: A `Tensor` or `SparseTensor` of type `float16`, `float32`, `float64`,
-      `int32`, `int64`, `complex64` or `complex128`.
+    x: A `Tensor` or `SparseTensor` of type `float32`, `float64`, `int32`,
+      `int64`, `complex64` or `complex128`.
     name: A name for the operation (optional).
 
   Returns:
@@ -430,10 +430,10 @@ def pow(x, y, name=None):  # pylint: disable=redefined-builtin
   ```
 
   Args:
-    x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`,
-     `complex64`, or `complex128`.
-    y: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`,
-     `complex64`, or `complex128`.
+    x: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`,
+     or `complex128`.
+    y: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`,
+     or `complex128`.
     name: A name for the operation (optional).
 
   Returns:
@@ -600,7 +600,7 @@ def round(x, name=None):  # pylint: disable=redefined-builtin
   ```
 
   Args:
-    x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, or `int64`.
+    x: A `Tensor` of type `float32` or `float64`.
     name: A name for the operation (optional).
 
   Returns:
@@ -1257,7 +1257,7 @@ def reduce_sum(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` is None, all dimensions are reduced, and a
+  If `axis` has no entries, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   For example:
@@ -1397,7 +1397,7 @@ def reduce_mean(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` is None, all dimensions are reduced, and a
+  If `axis` has no entries, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   For example:
@@ -1469,7 +1469,7 @@ def reduce_prod(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` is None, all dimensions are reduced, and a
+  If `axis` has no entries, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   Args:
@@ -1519,7 +1519,7 @@ def reduce_min(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` is None, all dimensions are reduced, and a
+  If `axis` has no entries, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   Args:
@@ -1568,7 +1568,7 @@ def reduce_max(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` is None, all dimensions are reduced, and a
+  If `axis` has no entries, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   Args:
@@ -1617,7 +1617,7 @@ def reduce_all(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` is None, all dimensions are reduced, and a
+  If `axis` has no entries, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   For example:
@@ -1675,7 +1675,7 @@ def reduce_any(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` is None, all dimensions are reduced, and a
+  If `axis` has no entries, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   For example:
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index f47f38e29e..783d485892 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -621,7 +621,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
   """Calculate the mean and variance of based on the sufficient statistics.
 
   Args:
-    counts: A `Tensor` containing the total count of the data (one value).
+    counts: A `Tensor` containing the total count of the data (one value).
     mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly
       shifted) sum of the elements to average over.
     variance_ss: A `Tensor` containing the variance sufficient statistics: the
@@ -689,9 +689,6 @@ def moments(
     # Compute true mean while keeping the dims for proper broadcasting.
     mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean")
     # sample variance, not unbiased variance
-    # Note: stop_gradient does not change the gradient that gets 
-    #       backpropagated to the mean from the variance calculation,
-    #       because that gradient is zero
     variance = math_ops.reduce_mean(
         math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
         axes,
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 0c2f5b06c4..a0b55eb077 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1596,12 +1596,12 @@ def leaky_relu(features, alpha=0.2, name=None):
   Returns:
     The activation value.
   """
-  with ops.name_scope(name, "LeakyRelu", [features, alpha]) as name:
+  with ops.name_scope(name, "LeakyRelu", [features, alpha]):
     features = ops.convert_to_tensor(features, name="features")
     if features.dtype.is_integer:
       features = math_ops.to_float(features)
     alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
-    return math_ops.maximum(alpha * features, features, name=name)
+    return math_ops.maximum(alpha * features, features)
 
 
 def _flatten_outer_dims(logits):
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 035b4735af..46a5f4fae6 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -962,16 +962,6 @@ class LeakyReluTest(test_lib.TestCase):
       self.assertAllClose(
           outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol)
 
-  def testName(self):
-    np_values = np.array([-2, -1, 0, 1, 2], dtype=np.float64)
-    outputs_with_name_set = nn_ops.leaky_relu(
-        constant_op.constant(np_values),
-        name='test_relu_op')
-    self.assertEqual(outputs_with_name_set.name, 'test_relu_op:0')
-    outputs_without_name_set = nn_ops.leaky_relu(
-        constant_op.constant(np_values))
-    self.assertEqual(outputs_without_name_set.name, 'LeakyRelu:0')
-
 
 class SwishTest(test_lib.TestCase):
 
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 219562de5d..f8676ccb5f 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -23,7 +23,6 @@ import threading
 
 # Used by py_util.cc to get tracebacks.
 import traceback  # pylint: disable=unused-import
-import weakref
 
 import numpy as np
 import six
@@ -130,14 +129,11 @@ class FuncRegistry(object):
   def __init__(self):
     self._lock = threading.Lock()
     self._unique_id = 0  # GUARDED_BY(self._lock)
-    # Only store weakrefs to the funtions. The strong reference is stored in
-    # the graph.
-    self._funcs = weakref.WeakValueDictionary()
+    self._funcs = {}
 
   def insert(self, func):
     """Registers `func` and returns a unique token for this entry."""
     token = self._next_unique_token()
-    # Store a weakref to the function
     self._funcs[token] = func
     return token
 
@@ -190,7 +186,7 @@ class FuncRegistry(object):
     Raises:
       ValueError: if no function is registered for `token`.
     """
-    func = self._funcs.get(token, None)
+    func = self._funcs[token]
     if func is None:
       raise ValueError("callback %s is not found" % token)
     if isinstance(func, EagerFunc):
@@ -232,6 +228,19 @@ _py_funcs = FuncRegistry()
 pywrap_tensorflow.InitializePyTrampoline(_py_funcs)
 
 
+class CleanupFunc(object):
+  """A helper class to remove a registered function from _py_funcs."""
+
+  def __init__(self, token):
+    self._token = token
+
+  def __del__(self):
+    if _py_funcs is not None:
+      # If _py_funcs is None, the program is most likely in shutdown, and the
+      # _py_funcs object has been destroyed already.
+      _py_funcs.remove(self._token)
+
+
 def _internal_py_func(func,
                       inp,
                       Tout,
@@ -261,15 +270,17 @@ def _internal_py_func(func,
     # bound to that of the outer graph instead.
     graph = graph._outer_graph
 
+  cleanup = CleanupFunc(token)
+
   # TODO(zhifengc): Consider adding a Graph method to collect
   # `cleanup` objects in one of its member.
-  if not hasattr(graph, "_py_funcs_used_in_graph"):
-    graph._py_funcs_used_in_graph = []
+  if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"):
+    graph._cleanup_py_funcs_used_in_graph = []
 
-  # Store a reference to the function in the graph to ensure it stays alive
-  # as long as the graph lives. When the graph is destroyed, the function
-  # is left to the garbage collector for destruction as well.
-  graph._py_funcs_used_in_graph.append(func)
+  # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph
+  # will be destroyed and their __del__ will remove the 'token' from
+  # the funcs registry.
+  graph._cleanup_py_funcs_used_in_graph.append(cleanup)
   # pylint: enable=protected-access
 
   if eager:
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index c3b16a7bd5..0130233746 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -84,8 +84,6 @@ def _convert_to_sparse_tensors(sp_inputs):
 
 # pylint: disable=protected-access
 @tf_export("sparse_concat")
-@deprecation.deprecated_args(
-    None, "concat_dim is deprecated, use axis instead", "concat_dim")
 def sparse_concat(axis,
                   sp_inputs,
                   name=None,
@@ -599,8 +597,6 @@ class KeywordRequired(object):
 
 
 @tf_export("sparse_split")
-@deprecation.deprecated_args(
-    None, "split_dim is deprecated, use axis instead", "split_dim")
 def sparse_split(keyword_required=KeywordRequired(),
                  sp_input=None,
                  num_split=None,
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 0280c89c10..ae79c01949 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -91,59 +91,6 @@ def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=inv
   shape.set_shape([2])
   return sparse_tensor.SparseTensor(indices, values, shape)
 
-@tf_export("strings.split")
-def string_split_v2(source, sep=None, maxsplit=-1):
-  """Split elements of `source` based on `sep` into a `SparseTensor`.
-
-  Let N be the size of source (typically N will be the batch size). Split each
-  element of `source` based on `sep` and return a `SparseTensor`
-  containing the split tokens. Empty tokens are ignored.
-
-  For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
-  then the output will be
-
-  st.indices = [0, 0;
-                0, 1;
-                1, 0;
-                1, 1;
-                1, 2]
-  st.shape = [2, 3]
-  st.values = ['hello', 'world', 'a', 'b', 'c']
-
-  If `sep` is given, consecutive delimiters are not grouped together and are
-  deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
-  sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
-  string, consecutive whitespace are regarded as a single separator, and the
-  result will contain no empty strings at the startor end if the string has
-  leading or trailing whitespace.
-
-  Note that the above mentioned behavior matches python's str.split.
-
-  Args:
-    source: `1-D` string `Tensor`, the strings to split.
-    sep: `0-D` string `Tensor`, the delimiter character.
-    maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
-
-  Raises:
-    ValueError: If sep is not a string.
-
-  Returns:
-    A `SparseTensor` of rank `2`, the strings split according to the delimiter.
-    The first column of the indices corresponds to the row in `source` and the
-    second column corresponds to the index of the split component in this row.
-  """
-  if sep is None:
-    sep = ''
-  sep = ops.convert_to_tensor(sep, dtype=dtypes.string)
-  source = ops.convert_to_tensor(source, dtype=dtypes.string)
-
-  indices, values, shape = gen_string_ops.string_split_v2(
-      source, sep=sep, maxsplit=maxsplit)
-  indices.set_shape([None, 2])
-  values.set_shape([None])
-  shape.set_shape([2])
-  return sparse_tensor.SparseTensor(indices, values, shape)
-
 
 def _reduce_join_reduction_dims(x, axis, reduction_indices):
   """Returns range(rank(x) - 1, 0, -1) if reduction_indices is None."""
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 47414c28af..f49e2d314d 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -1786,23 +1786,6 @@ class variable_scope(object):
           assert v.name == "foo/bar/v:0"
   ```
 
-  Simple example of how to reenter a premade variable scope safely:
-
-  ```python
-  with tf.variable_scope("foo") as vs:
-    pass
-
-  # Re-enter the variable scope.
-  with tf.variable_scope(vs,
-                         auxiliary_name_scope=False) as vs1:
-    # Restore the original name_scope.
-    with tf.name_scope(vs1.original_name_scope):
-        v = tf.get_variable("v", [1])
-        assert v.name == "foo/v:0"
-        c = tf.constant([1], name="c")
-        assert c.name == "foo/c:0"
-  ```
-
   Basic example of sharing a variable AUTO_REUSE:
 
   ```python
@@ -1941,9 +1924,7 @@ class variable_scope(object):
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
       auxiliary_name_scope: If `True`, we create an auxiliary name scope with
-        the scope. If `False`, we don't create it. Note that the argument is
-        not inherited, and it only takes effect for once when creating. You
-        should only use it for re-entering a premade variable scope.
+        the scope. If `False`, we don't touch name scope.
 
     Returns:
       A scope that can be captured and reused.
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py
index 00de044505..00de044505 100644..100755
--- a/tensorflow/python/tools/import_pb_to_tensorboard.py
+++ b/tensorflow/python/tools/import_pb_to_tensorboard.py
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index b59f8e1f98..522965990b 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1719,7 +1719,7 @@ def tf_py_build_info_genrule():
       name="py_build_info_gen",
       outs=["platform/build_info.py"],
       cmd=
-     "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"),
+      "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"),
       local=1,
       tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],)
 
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 671b7e387e..bca9fa49eb 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -41,11 +41,7 @@ _GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit.
 # Generated by: tensorflow/tools/api/generator/create_python_api.py script.
 \"\"\"%s
 \"\"\"
-
-from __future__ import print_function
-
 """
-_GENERATED_FILE_FOOTER = "\n\ndel print_function\n"
 
 
 class SymbolExposedTwiceError(Exception):
@@ -153,7 +149,6 @@ class _ModuleInitCodeBuilder(object):
 _names_with_underscore = [%s]
 __all__ = [_s for _s in dir() if not _s.startswith('_')]
 __all__.extend([_s for _s in _names_with_underscore])
-__all__.remove('print_function')
 ''' % underscore_names_str
 
     return module_text_map
@@ -338,8 +333,7 @@ def create_api_files(
     if module or not root_init_template:
       contents = (
           _GENERATED_FILE_HEADER %
-          get_module_docstring(module, package, api_name) +
-          text + _GENERATED_FILE_FOOTER)
+          get_module_docstring(module, package, api_name) + text)
     else:
       # Read base init file
       with open(root_init_template, 'r') as root_init_template_file:
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index 10171b3d60..5bb3b3c444 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -58,7 +58,7 @@ tf_module {
   }
   member_method {
     name: "decode_image"
-    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "decode_jpeg"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 3051c4437e..dc2bd40096 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -1533,10 +1533,6 @@ tf_module {
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "print"
-    argspec: "args=[\'input_\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
     name: "py_func"
     argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
index b641c39feb..a3fbe95bba 100644
--- a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
@@ -4,8 +4,4 @@ tf_module {
     name: "regex_full_match"
     argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "split"
-    argspec: "args=[\'source\', \'sep\', \'maxsplit\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], "
-  }
 }
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 883bb93647..5fa75e1d61 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -322,10 +322,6 @@ create_activate_virtualenv_and_install_tensorflow() {
   pip install -v ${PIP_FLAGS} ${WHL_PATH} || \
     die "pip install (forcing to reinstall tensorflow) FAILED"
   echo "Successfully installed pip package ${TF_WHEEL_PATH}"
-
-  # Force downgrade setuptools.
-  pip install --upgrade setuptools==39.1.0
-
 }
 
 ################################################################################
diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user
index b216e3549f..d4bf546d40 100755
--- a/tensorflow/tools/ci_build/builds/with_the_same_user
+++ b/tensorflow/tools/ci_build/builds/with_the_same_user
@@ -40,7 +40,7 @@ if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then
   ADDUSER_OPTS="--force-badname"
 fi
 
-getent group "${CI_BUILD_GID}" || addgroup ${ADDUSER_OPTS} --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
+getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
 getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \
     --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \
     --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 1f0fd0387a..072dd6ab99 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -134,12 +134,6 @@ if [[ $? != "0" ]]; then
   die "ERROR: docker build failed. Dockerfile is at ${DOCKERFILE_PATH}"
 fi
 
-# If caller wants the with_the_same_user script to allow bad usernames, 
-# pass the var to the docker environment
-if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then
-        CI_BUILD_USER_FORCE_BADNAME_ENV="-e CI_BUILD_USER_FORCE_BADNAME=yes"
-fi
-
 # Run the command inside the container.
 echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..."
 mkdir -p ${WORKSPACE}/bazel-ci_build-cache
@@ -154,7 +148,6 @@ ${DOCKER_BINARY} run --rm --pid=host \
     -e "CI_BUILD_GROUP=$(id -g -n)" \
     -e "CI_BUILD_GID=$(id -g)" \
     -e "CI_TENSORFLOW_SUBMODULE_PATH=${CI_TENSORFLOW_SUBMODULE_PATH}" \
-    ${CI_BUILD_USER_FORCE_BADNAME_ENV} \
     -v ${WORKSPACE}:/workspace \
     -w /workspace \
     ${GPU_EXTRA_PARAMS} \
diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py
index 148526492d..420d390d2b 100755
--- a/tensorflow/tools/ci_build/copy_binary.py
+++ b/tensorflow/tools/ci_build/copy_binary.py
@@ -32,8 +32,7 @@ import shutil
 import tempfile
 import zipfile
 
-TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}"
-                    "\.\d.dev[\d]{0,8})-(.+)\.whl")
+TF_NIGHTLY_REGEX = r"(.+)tf_nightly(|_gpu)-(\d\.\d\.\d.dev[\d]{0,8})-(.+)\.whl"
 BINARY_STRING_TEMPLATE = "%s-%s-%s.whl"
 
 
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 88f1d04193..60290df833 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -115,7 +115,3 @@ pip2 install keras_applications==1.0.2
 pip3 install keras_applications==1.0.2
 pip2 install keras_preprocessing==1.0.1
 pip3 install keras_preprocessing==1.0.1
-
-# Install last working version of setuptools.
-pip2 install --upgrade setuptools==39.1.0
-pip3 install --upgrade setuptools==39.1.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index acd69ef346..edb9d4b929 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -39,6 +39,7 @@ if [[ -z $pip35_version ]]; then
 fi
 
 set -e
+pip3.5 install --upgrade setuptools
 pip3.5 install --upgrade pip
 
 pip3.5 install --upgrade virtualenv
@@ -85,7 +86,4 @@ pip3.5 install --upgrade termcolor
 pip3.5 install keras_applications==1.0.2
 pip3.5 install keras_preprocessing==1.0.1
 
-# Install last working version of setuptools.
-pip3.5 install --upgrade setuptools==39.1.0
-
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 323b30f48e..5635977731 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -49,6 +49,7 @@ cd Python-3.6.1
 make altinstall
 ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3
 
+pip3 install --upgrade setuptools
 pip3 install --upgrade pip
 
 pip3 install --upgrade virtualenv
@@ -100,8 +101,4 @@ pip3 install --upgrade termcolor
 # Keras
 pip3.5 install keras_applications==1.0.2
 pip3.5 install keras_preprocessing==1.0.1
-
-# Install last working version of setuptools.
-pip3 install --upgrade setuptools==39.1.0
-
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh
deleted file mode 100755
index 10a09a415a..0000000000
--- a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Usage: basic-mkl-test.sh
-
-# Helper function to traverse directories up until given file is found.
-function upsearch () {
-  test / == "$PWD" && return || \
-      test -e "$1" && echo "$PWD" && return || \
-      cd .. && upsearch "$1"
-}
-
-# Set up WORKSPACE.
-WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}"
-
-BUILD_TAG=mkl-ci-test CI_BUILD_USER_FORCE_BADNAME=yes ${WORKSPACE}/tensorflow/tools/ci_build/ci_build.sh cpu tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index b8bce57c87..1bd1852ffc 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -79,7 +79,6 @@ if [[ $1 == "PI_ONE" ]]; then
   --linkopt=-L${OPENBLAS_INSTALL_PATH}/lib/
   --linkopt=-l:libopenblas.a"
   echo "Building for the Pi One/Zero, with no NEON support"
-  WHEEL_ARCH=linux_armv6l
 else
   PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4
   --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR
@@ -87,7 +86,6 @@ else
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8'
-  WHEEL_ARCH=linux_armv7l
   echo "Building for the Pi Two/Three, with NEON acceleration"
 fi
 
@@ -102,8 +100,6 @@ bazel build -c opt ${PI_COPTS} \
   --copt=-fomit-frame-pointer --cpu=armeabi \
   --crosstool_top=@local_config_arm_compiler//:toolchain \
   --verbose_failures \
-  //tensorflow:libtensorflow.so \
-  //tensorflow:libtensorflow_framework.so \
   //tensorflow/tools/benchmark:benchmark_model \
   //tensorflow/tools/pip_package:build_pip_package
 
@@ -116,12 +112,10 @@ BDIST_OPTS="--universal" \
   bazel-bin/tensorflow/tools/pip_package/build_pip_package "${OUTDIR}"
 
 OLD_FN=$(ls "${OUTDIR}" | grep -m 1 \.whl)
-SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-'${WHEEL_ARCH}'.whl/; print'
+SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-any.whl/; print'
 NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}")
 mv "${OUTDIR}/${OLD_FN}" "${OUTDIR}/${NEW_FN}"
 cp bazel-bin/tensorflow/tools/benchmark/benchmark_model "${OUTDIR}"
-cp bazel-bin/tensorflow/libtensorflow.so "${OUTDIR}"
-cp bazel-bin/tensorflow/libtensorflow_framework.so "${OUTDIR}"
 
 echo "Output can be found here:"
 find "${OUTDIR}"
diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
index f8f63e276c..47539b2423 100644
--- a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
+++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
@@ -31,11 +31,7 @@ def _def_file_filter_configure_impl(repository_ctx):
   vc_path = find_vc_path(repository_ctx)
   if vc_path == "visual-studio-not-found":
     auto_configure_fail("Visual C++ build tools not found on your machine")
-
-  undname = find_msvc_tool(repository_ctx, vc_path, "undname.exe")
-  if undname == None:
-    auto_configure_fail("Couldn't find undname.exe under %s, please check your VC installation and set BAZEL_VC environment variable correctly." % vc_path)
-  undname_bin_path = undname.replace("\\", "\\\\")
+  undname_bin_path = find_msvc_tool(repository_ctx, vc_path, "undname.exe").replace("\\", "\\\\")
 
   repository_ctx.template(
     "def_file_filter.py",
diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh
index b0114721bd..06c2b997cb 100755
--- a/tensorflow/tools/dist_test/local_test.sh
+++ b/tensorflow/tools/dist_test/local_test.sh
@@ -64,6 +64,9 @@ die() {
 # Configurations
 DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster"
 
+# Use the CPU-only TensorFlow v1.5.0 wheel for Python 2.7, as num_gpus is set to 0 below.
+DEFAULT_WHL_FILE_LOCATION="https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl"
+
 # Parse input arguments
 LEAVE_CONTAINER_RUNNING=0
 MODEL_NAME=""
@@ -74,7 +77,8 @@ SYNC_REPLICAS_FLAG=""
 
 WHL_FILE_LOCATION=${1}
 if [[ -z "${WHL_FILE_LOCATION}" ]]; then
-  echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel."
+  WHL_FILE_LOCATION=${DEFAULT_WHL_FILE_LOCATION}
+  echo "use default whl file location"
 fi
 
 while true; do
@@ -127,11 +131,7 @@ echo "Building in temporary directory: ${BUILD_DIR}"
 cp -r ${DIR}/* "${BUILD_DIR}"/ || \
   die "Failed to copy files to ${BUILD_DIR}"
 
-# Download whl file into the build context directory.
-if [[ -z "${WHL_FILE_LOCATION}" ]]; then
-  pip2 download --no-deps tf-nightly
-  cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl
-elif [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
+if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
     # Download whl file into the build context directory.
     wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \
         die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}"
diff --git a/tensorflow/tools/dist_test/remote_test.sh b/tensorflow/tools/dist_test/remote_test.sh
index e188c88c8f..935535312d 100755
--- a/tensorflow/tools/dist_test/remote_test.sh
+++ b/tensorflow/tools/dist_test/remote_test.sh
@@ -108,7 +108,7 @@ fi
 # Parse command-line arguments.
 WHL_URL=${1}
 if [[ -z "${WHL_URL}" ]]; then
-  echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel."
+  die "whl URL is not specified"
 fi
 
 # Create docker build context directory.
@@ -121,13 +121,8 @@ cp -r ${DIR}/* ${BUILD_DIR}/ || \
   die "Failed to copy files to ${BUILD_DIR}"
 
 # Download whl file into the build context directory.
-if [[ -z "${WHL_URL}" ]]; then
-  pip2 download --no-deps tf-nightly
-  cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl
-else
-  wget -P "${BUILD_DIR}" ${WHL_URL} || \
-    die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
-fi
+wget -P "${BUILD_DIR}" ${WHL_URL} || \
+  die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
 
 # Build docker image for test.
 docker build ${NO_CACHE_FLAG} \
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 57a491255e..406d134699 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -76,7 +76,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 6796ad70e5..a6cd44ced1 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.9
+ARG TF_BRANCH=r1.8
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 204b5b4dba..2fe47f3356 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -13,8 +13,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-cusparse-dev-9-0 \
         curl \
         git \
-        libcudnn7=7.1.4.18-1+cuda9.0 \
-        libcudnn7-dev=7.1.4.18-1+cuda9.0 \
+        libcudnn7=7.0.5.15-1+cuda9.0 \
+        libcudnn7-dev=7.0.5.15-1+cuda9.0 \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
@@ -85,7 +85,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 9197651ff4..bff4a20392 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-cusolver-9-0 \
         cuda-cusparse-9-0 \
         curl \
-        libcudnn7=7.1.4.18-1+cuda9.0 \
+        libcudnn7=7.0.5.15-1+cuda9.0 \
         libfreetype6-dev \
         libhdf5-serial-dev \
         libpng12-dev \
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 620fef9363..5910f0625e 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -61,7 +61,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/autograph/core:core",
     "//tensorflow/contrib/autograph/impl:impl",
     "//tensorflow/contrib/autograph/lang:lang",
-    "//tensorflow/contrib/autograph/operators:operators",
     "//tensorflow/contrib/autograph/pyct:pyct",
     "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
     "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index f7e42ce536..0c4065bc77 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -41,15 +41,51 @@ function is_windows() {
   fi
 }
 
-function prepare_src() {
+function main() {
   if [ $# -lt 1 ] ; then
     echo "No destination dir provided"
     exit 1
   fi
 
-  TMPDIR="$1"
-  mkdir -p "$TMPDIR"
-  echo $(date) : "=== Preparing sources in dir: ${TMPDIR}"
+  DEST=$(real_path $1)
+  TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX)
+
+  PKG_NAME_FLAG=""
+  GPU_BUILD=0
+  NIGHTLY_BUILD=0
+  PROJECT_NAME=""
+  while true; do
+    if [[ "$1" == "--nightly_flag" ]]; then
+      NIGHTLY_BUILD=1
+    elif [[ "$1" == "--gpu" ]]; then
+      GPU_BUILD=1
+    elif [[ "$1" == "--gpudirect" ]]; then
+      PKG_NAME_FLAG="--project_name tensorflow_gpudirect"
+    elif [[ "$1" == "--project_name" ]]; then
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      PROJECT_NAME="$1"
+    fi
+    shift
+
+    if [[ -z "$1" ]]; then
+      break
+    fi
+  done
+
+  if [[ -n ${PROJECT_NAME} ]]; then
+    PKG_NAME_FLAG="--project_name ${PROJECT_NAME}"
+  elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then
+    PKG_NAME_FLAG="--project_name tf_nightly_gpu"
+  elif [[ ${NIGHTLY_BUILD} == "1" ]]; then
+    PKG_NAME_FLAG="--project_name tf_nightly"
+  elif [[ ${GPU_BUILD} == "1" ]]; then
+    PKG_NAME_FLAG="--project_name tensorflow_gpu"
+  fi
+
+  echo $(date) : "=== Using tmpdir: ${TMPDIR}"
 
   if [ ! -d bazel-bin/tensorflow ]; then
     echo "Could not find bazel-bin.  Did you run from the root of the build tree?"
@@ -119,28 +155,17 @@ function prepare_src() {
   # over so user defined ops can be compiled.
   mkdir -p ${TMPDIR}/google
   mkdir -p ${TMPDIR}/third_party
-  pushd ${RUNFILES%org_tensorflow} > /dev/null
+  pushd ${RUNFILES%org_tensorflow}
   for header in $(find protobuf_archive -name \*.h); do
     mkdir -p "${TMPDIR}/google/$(dirname ${header})"
     cp "$header" "${TMPDIR}/google/$(dirname ${header})/"
   done
-  popd > /dev/null
+  popd
   cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party
 
   cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR}
   cp tensorflow/tools/pip_package/README ${TMPDIR}
   cp tensorflow/tools/pip_package/setup.py ${TMPDIR}
-}
-
-function build_wheel() {
-  if [ $# -lt 2 ] ; then
-    echo "No src and dest dir provided"
-    exit 1
-  fi
-
-  TMPDIR="$1"
-  DEST="$2"
-  PKG_NAME_FLAG="$3"
 
   # Before we leave the top-level directory, make sure we know how to
   # call python.
@@ -148,110 +173,15 @@ function build_wheel() {
     source tools/python_bin_path.sh
   fi
 
-  pushd ${TMPDIR} > /dev/null
+  pushd ${TMPDIR}
   rm -f MANIFEST
   echo $(date) : "=== Building wheel"
   "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel ${PKG_NAME_FLAG} >/dev/null
   mkdir -p ${DEST}
   cp dist/* ${DEST}
-  popd > /dev/null
+  popd
+  rm -rf ${TMPDIR}
   echo $(date) : "=== Output wheel file is in: ${DEST}"
 }
 
-function usage() {
-  echo "Usage:"
-  echo "$0 [--src srcdir] [--dst dstdir] [options]"
-  echo "$0 dstdir [options]"
-  echo ""
-  echo "    --src                 prepare sources in srcdir"
-  echo "                              will use temporary dir if not specified"
-  echo ""
-  echo "    --dst                 build wheel in dstdir"
-  echo "                              if dstdir is not set do not build, only prepare sources"
-  echo ""
-  echo "  Options:"
-  echo "    --project_name <name> set project name to name"
-  echo "    --gpu                 build tensorflow_gpu"
-  echo "    --gpudirect           build tensorflow_gpudirect"
-  echo "    --nightly_flag        build tensorflow nightly"
-  echo ""
-  exit 1
-}
-
-function main() {
-  PKG_NAME_FLAG=""
-  PROJECT_NAME=""
-  GPU_BUILD=0
-  NIGHTLY_BUILD=0
-  SRCDIR=""
-  DSTDIR=""
-  CLEANSRC=1
-  while true; do
-    if [[ "$1" == "--help" ]]; then
-      usage
-      exit 1
-    elif [[ "$1" == "--nightly_flag" ]]; then
-      NIGHTLY_BUILD=1
-    elif [[ "$1" == "--gpu" ]]; then
-      GPU_BUILD=1
-    elif [[ "$1" == "--gpudirect" ]]; then
-      PKG_NAME_FLAG="--project_name tensorflow_gpudirect"
-    elif [[ "$1" == "--project_name" ]]; then
-      shift
-      if [[ -z "$1" ]]; then
-        break
-      fi
-      PROJECT_NAME="$1"
-    elif [[ "$1" == "--src" ]]; then
-      shift
-      SRCDIR="$(real_path $1)"
-      CLEANSRC=0
-    elif [[ "$1" == "--dst" ]]; then
-      shift
-      DSTDIR="$(real_path $1)"
-    else
-      DSTDIR="$(real_path $1)"
-    fi
-    shift
-
-    if [[ -z "$1" ]]; then
-      break
-    fi
-  done
-
-  if [[ -z "$DSTDIR" ]] && [[ -z "$SRCDIR" ]]; then
-    echo "No destination dir provided"
-    usage
-    exit 1
-  fi
-
-  if [[ -z "$SRCDIR" ]]; then
-    # make temp srcdir if none set
-    SRCDIR="$(mktemp -d -t tmp.XXXXXXXXXX)"
-  fi
-
-  prepare_src "$SRCDIR"
-
-  if [[ -z "$DSTDIR" ]]; then
-      # only want to prepare sources
-      exit
-  fi
-
-  if [[ -n ${PROJECT_NAME} ]]; then
-    PKG_NAME_FLAG="--project_name ${PROJECT_NAME}"
-  elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tf_nightly_gpu"
-  elif [[ ${NIGHTLY_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tf_nightly"
-  elif [[ ${GPU_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tensorflow_gpu"
-  fi
-
-  build_wheel "$SRCDIR" "$DSTDIR" "$PKG_NAME_FLAG"
-
-  if [[ $CLEANSRC -ne 0 ]]; then
-    rm -rf "${TMPDIR}"
-  fi
-}
-
 main "$@"
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 97f625e7e9..d25a9e77b1 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n')
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.9.0-rc0'
+_VERSION = '1.8.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
@@ -54,7 +54,6 @@ REQUIRED_PACKAGES = [
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'setuptools <= 39.1.0',
     'tensorboard >= 1.8.0, < 1.9.0',
     'termcolor >= 1.1.0',
 ]
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
index 15d7c70281..29add6d5ea 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
@@ -814,9 +814,6 @@ void Generator::Generate(const FileDescriptor& fd) {
   // Add header to cc file.
   SetOutput(&cc_);
   Print("// GENERATED FILE - DO NOT MODIFY");
-  Print();
-  Print("#include <algorithm>");  // for `std::stable_sort()`
-  Print();
   headers = {GetProtoTextHeaderName(fd, true /* impl */)};
   AddHeadersToCurrentSection(headers);
   Print();
diff --git a/tensorflow/tools/quantization/quantize_graph_test.py b/tensorflow/tools/quantization/quantize_graph_test.py
index 92bb5127da..df71840b64 100644
--- a/tensorflow/tools/quantization/quantize_graph_test.py
+++ b/tensorflow/tools/quantization/quantize_graph_test.py
@@ -119,8 +119,8 @@ def are_tensors_near(a, b, tolerance):
   flat_a = a.flatten()
   flat_b = b.flatten()
   if len(flat_a) != len(flat_b):
-    tf_logging.info("Tensors are different sizes: " + str(len(flat_a)) + " vs "
-                    + str(len(flat_b)))
+    print("Tensors are different sizes: " + str(len(flat_a)) + " vs " + str(
+        len(flat_b)))
     return False
   value_count = len(flat_a)
   how_many_different = 0
@@ -140,10 +140,10 @@ def are_tensors_near(a, b, tolerance):
   if how_many_different == 0:
     return True
   else:
-    tf_logging.info("Tensors have {0} different values ({1}%), with mean"
-                    " difference {2} and mean absolute difference {3}".format(
-                        how_many_different, proportion_different * 100,
-                        mean_difference, mean_abs_difference))
+    print("Tensors have {0} different values ({1}%), with mean difference"
+          " {2} and mean absolute difference {3}".format(
+              how_many_different, proportion_different * 100, mean_difference,
+              mean_abs_difference))
     return False
 
 
diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py
index c030575109..9c45359ee1 100644
--- a/tensorflow/tools/test/upload_test_benchmarks.py
+++ b/tensorflow/tools/test/upload_test_benchmarks.py
@@ -89,6 +89,7 @@ import shutil
 
 from six import text_type
 from google.cloud import datastore
+from six import text_type
 
 
 def is_real_file(dirpath, fname):
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 4f3df570a5..dbec66216a 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl_linux",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz"
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz",
       ],
-      sha256 = "d2305244fdc9b87db7426ed4496e87a4b3977ad3374d73b8000e8b7a5b7aa725",
-      strip_prefix = "mklml_lnx_2018.0.3.20180406",
+      sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146",
+      strip_prefix = "mklml_lnx_2018.0.2.20180127",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
   mkl_repository(
       name = "mkl_windows",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip"
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip"
       ],
-      sha256 = "a584a5bf1c8d2ad70b90d12b52652030e9a338217719064fdb84b7ad0d693694",
-      strip_prefix = "mklml_win_2018.0.3.20180406",
+      sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded",
+      strip_prefix = "mklml_win_2018.0.2.20180127",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
   mkl_repository(
       name = "mkl_darwin",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz"
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz"
       ],
-      sha256 = "094e3dfd61c816136dc8d12a45cc611ce26c5f4828176a3644cd0b0efa15a25b",
-      strip_prefix = "mklml_mac_2018.0.3.20180406",
+      sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943",
+      strip_prefix = "mklml_mac_2018.0.2.20180127",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
 
@@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "mkl_dnn",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.14.tar.gz",
-          "https://github.com/intel/mkl-dnn/archive/v0.14.tar.gz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz",
+          "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz",
       ],
-      sha256 = "efebc53882856afec86457a2da644693f5d59c68772d41d640d6b60a8efc4eb0",
-      strip_prefix = "mkl-dnn-0.14",
+      sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f",
+      strip_prefix = "mkl-dnn-0.13",
       build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
   )
 
@@ -187,11 +187,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "highwayhash",
       urls = [
-          "http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz",
-          "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz",
+          "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
+          "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
       ],
-      sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37",
-      strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968",
+      sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9",
+      strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b",
       build_file = clean_dep("//third_party:highwayhash.BUILD"),
   )
author	Akshay Modi <nareshmodi@google.com>	2018-06-18 11:48:36 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-06-18 11:55:03 -0700
commit	148b4381fd0259cae441e459ec8ebe2c5d557722 (patch)
tree	c66c96ea6c60c63385b528dce195af802b8acf3b /tensorflow
parent	fc03fbff3dd7a58fa4f16226df4ada1f21f8b53f (diff)