270 files changed, 10234 insertions, 3353 deletions
diff --git a/WORKSPACE b/WORKSPACE
index e0931512f4..a0c936af06 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -34,8 +34,8 @@ new_http_archive(
 new_http_archive(
   name = "mobile_multibox",
   build_file = "models.BUILD",
-  url = "https://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1.zip",
-  sha256 = "b4c178fd6236dcf0a20d25d07c45eebe85281263978c6a6f1dfc49d75befc45f"
+  url = "https://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip",
+  sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96"
 )
 
 new_http_archive(
diff --git a/configure b/configure
index ff4ec262e3..a8e7bb7738 100755
--- a/configure
+++ b/configure
@@ -9,6 +9,23 @@ SOURCE_BASE_DIR=`pwd -P`
 popd > /dev/null
 
 PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
+
+function is_linux() {
+  if [[ "${PLATFORM}" == "linux" ]]; then
+    true
+  else
+    false
+  fi
+}
+
+function is_macos() {
+  if [[ "${PLATFORM}" == "darwin" ]]; then
+    true
+  else
+    false
+  fi
+}
+
 function is_windows() {
   # On windows, the shell script is actually running in msys
   if [[ "${PLATFORM}" =~ msys_nt*|mingw*|cygwin*|uwin* ]]; then
@@ -65,16 +82,20 @@ if is_windows; then
   TF_NEED_OPENCL=0
 fi
 
-while [ "$TF_NEED_JEMALLOC" == "" ]; do
-  read -p "Do you wish to use jemalloc as the malloc implementation? "\
-"(Linux only) [Y/n] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "jemalloc enabled on Linux"; TF_NEED_JEMALLOC=1;;
-    [Nn]* ) echo "jemalloc disabled on Linux"; TF_NEED_JEMALLOC=0;;
-    "" ) echo "jemalloc enabled on Linux"; TF_NEED_JEMALLOC=1;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
+if is_linux; then
+  while [ "$TF_NEED_JEMALLOC" == "" ]; do
+    read -p "Do you wish to use jemalloc as the malloc implementation? [Y/n] "\
+      INPUT
+    case $INPUT in
+      [Yy]* ) echo "jemalloc enabled"; TF_NEED_JEMALLOC=1;;
+      [Nn]* ) echo "jemalloc disabled"; TF_NEED_JEMALLOC=0;;
+      "" ) echo "jemalloc enabled"; TF_NEED_JEMALLOC=1;;
+      * ) echo "Invalid selection: " $INPUT;;
+    esac
+  done
+else
+  TF_NEED_JEMALLOC=0
+fi
 
 if [ "$TF_NEED_JEMALLOC" == "1" ]; then
   sed -i -e "s/WITH_JEMALLOC = False/WITH_JEMALLOC = True/" tensorflow/core/platform/default/build_config.bzl
@@ -99,7 +120,7 @@ done
 if [ "$TF_NEED_GCP" == "1" ]; then
   ## Verify that libcurl header files are available.
   # Only check Linux, since on MacOS the header files are installed with XCode.
-  if [[ $(uname -a) =~ Linux ]] && [[ ! -f "/usr/include/curl/curl.h" ]]; then
+  if is_linux && [[ ! -f "/usr/include/curl/curl.h" ]]; then
     echo "ERROR: It appears that the development version of libcurl is not "\
 "available. Please install the libcurl3-dev package."
     exit 1
@@ -226,8 +247,6 @@ while ! is_windows && true; do
 done
 
 # Find out where the CUDA toolkit is installed
-OSNAME=`uname -s`
-
 while true; do
   # Configure the Cuda SDK version to use.
   if [ -z "$TF_CUDA_VERSION" ]; then
@@ -259,9 +278,9 @@ while true; do
 
   if is_windows; then
     CUDA_RT_LIB_PATH="lib/x64/cudart.lib"
-  elif [ "$OSNAME" == "Linux" ]; then
+  elif is_linux; then
     CUDA_RT_LIB_PATH="lib64/libcudart.so${TF_CUDA_EXT}"
-  elif [ "$OSNAME" == "Darwin" ]; then
+  elif is_macos; then
     CUDA_RT_LIB_PATH="lib/libcudart${TF_CUDA_EXT}.dylib"
   fi
 
@@ -307,10 +326,10 @@ while true; do
     if is_windows; then
       cudnn_lib_path="${CUDNN_INSTALL_PATH}/lib/x64/cudnn.lib"
       cudnn_alt_lib_path="${CUDNN_INSTALL_PATH}/lib/x64/cudnn.lib"
-    elif [ "$OSNAME" == "Linux" ]; then
+    elif is_linux; then
       cudnn_lib_path="${CUDNN_INSTALL_PATH}/lib64/libcudnn.so"
       cudnn_alt_lib_path="${CUDNN_INSTALL_PATH}/libcudnn.so"
-    elif [ "$OSNAME" == "Darwin" ]; then
+    elif is_macos; then
       cudnn_lib_path="${CUDNN_INSTALL_PATH}/lib/libcudnn.dylib"
       cudnn_alt_lib_path="${CUDNN_INSTALL_PATH}/libcudnn.dylib"
     fi
@@ -337,7 +356,7 @@ while true; do
       echo "libcudnn.dylib resolves to libcudnn${TF_CUDNN_EXT}"
     fi
   else
-    if [ "$OSNAME" == "Darwin" ]; then
+    if is_macos; then
       TF_CUDNN_EXT=".${TF_CUDNN_VERSION}.dylib"
     else
       TF_CUDNN_EXT=".$TF_CUDNN_VERSION"
@@ -347,10 +366,10 @@ while true; do
   if is_windows; then
     CUDA_DNN_LIB_PATH="lib/x64/cudnn.lib"
     CUDA_DNN_LIB_ALT_PATH="lib/x64/cudnn.lib"
-  elif [ "$OSNAME" == "Linux" ]; then
+  elif is_linux; then
     CUDA_DNN_LIB_PATH="lib64/libcudnn.so${TF_CUDNN_EXT}"
     CUDA_DNN_LIB_ALT_PATH="libcudnn.so${TF_CUDNN_EXT}"
-  elif [ "$OSNAME" == "Darwin" ]; then
+  elif is_macos; then
     CUDA_DNN_LIB_PATH="lib/libcudnn${TF_CUDNN_EXT}"
     CUDA_DNN_LIB_ALT_PATH="libcudnn${TF_CUDNN_EXT}"
   fi
@@ -361,7 +380,7 @@ while true; do
     break
   fi
 
-  if [ "$OSNAME" == "Linux" ]; then
+  if is_linux; then
     CUDNN_PATH_FROM_LDCONFIG="$(ldconfig -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
     if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
       export TF_CUDNN_VERSION
@@ -372,7 +391,7 @@ while true; do
   echo "Invalid path to cuDNN ${CUDNN_VERSION} toolkit. Neither of the following two files can be found:"
   echo "${CUDNN_INSTALL_PATH}/${CUDA_DNN_LIB_PATH}"
   echo "${CUDNN_INSTALL_PATH}/${CUDA_DNN_LIB_ALT_PATH}"
-  if [ "$OSNAME" == "Linux" ]; then
+  if is_linux; then
     echo "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}"
   fi
 
@@ -499,7 +518,7 @@ while true; do
     fi
   fi
 
-  if [ "$OSNAME" == "Linux" ]; then
+  if is_linux; then
     SYCL_RT_LIB_PATH="lib/libComputeCpp.so"
   fi
 
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index a6bc8fdc49..9e8ea84baf 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -20,6 +20,12 @@ load(
 # -----------------------------------------------------------------------------
 # Public targets
 
+filegroup(
+    name = "headers",
+    srcs = ["c_api.h"],
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
 tf_cuda_library(
     name = "c_api",
     srcs = ["c_api.cc"],
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index d3a16c57f6..38117d388f 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -355,6 +355,7 @@ tf_cc_test(
 
 tf_gen_op_wrappers_cc(
     name = "sendrecv_ops",
+    include_internal_ops = 1,
     op_lib_names = [
         "sendrecv_ops",
     ],
@@ -363,6 +364,7 @@ tf_gen_op_wrappers_cc(
 
 tf_gen_op_wrappers_cc(
     name = "function_ops",
+    include_internal_ops = 1,
     op_lib_names = [
         "function_ops",
     ],
diff --git a/tensorflow/cc/client/client_session.cc b/tensorflow/cc/client/client_session.cc
index 5a98deb259..b407d3ab03 100644
--- a/tensorflow/cc/client/client_session.cc
+++ b/tensorflow/cc/client/client_session.cc
@@ -45,20 +45,20 @@ SessionOptions ClientSession::MakeDefaultSessionOptions(
   return options;
 }
 
-Status ClientSession::Run(const std::vector<ops::Output>& fetch_outputs,
+Status ClientSession::Run(const std::vector<Output>& fetch_outputs,
                           std::vector<Tensor>* outputs) const {
   return Run(FeedType{}, fetch_outputs, {}, outputs);
 }
 
 Status ClientSession::Run(const FeedType& inputs,
-                          const std::vector<ops::Output>& fetch_outputs,
+                          const std::vector<Output>& fetch_outputs,
                           std::vector<Tensor>* outputs) const {
   return Run(inputs, fetch_outputs, {}, outputs);
 }
 
 Status ClientSession::Run(const FeedType& inputs,
-                          const std::vector<ops::Output>& fetch_outputs,
-                          const std::vector<ops::Operation>& run_outputs,
+                          const std::vector<Output>& fetch_outputs,
+                          const std::vector<Operation>& run_outputs,
                           std::vector<Tensor>* outputs) const {
   return Run(RunOptions(), inputs, fetch_outputs, run_outputs, outputs,
              nullptr);
@@ -77,8 +77,8 @@ Status ClientSession::MaybeExtendGraph() const {
 }
 
 Status ClientSession::Run(const RunOptions& run_options, const FeedType& inputs,
-                          const std::vector<ops::Output>& fetch_outputs,
-                          const std::vector<ops::Operation>& run_outputs,
+                          const std::vector<Output>& fetch_outputs,
+                          const std::vector<Operation>& run_outputs,
                           std::vector<Tensor>* outputs,
                           RunMetadata* run_metadata) const {
   std::vector<std::pair<string, Tensor>> feeds;
diff --git a/tensorflow/cc/client/client_session.h b/tensorflow/cc/client/client_session.h
index 9d480477f6..28ff3ec964 100644
--- a/tensorflow/cc/client/client_session.h
+++ b/tensorflow/cc/client/client_session.h
@@ -31,62 +31,59 @@ limitations under the License.
 
 namespace tensorflow {
 
-// A `ClientSession` object lets the caller drive the evaluation of the
-// TensorFlow graph constructed with the C++ API.
-//
-// Example:
-//
-// Scope root = Scope::NewRootScope();
-// auto a = Placeholder(root, DT_INT32);
-// auto c = Add(root, a, {41});
-//
-// ClientSession session(root);
-// std::vector<Tensor> outputs;
-//
-// Status s = session.Run({{a, {1}}}, {c}, &outputs);
-// if (!s.ok()) { /* Handle error */ }
+/// A `ClientSession` object lets the caller drive the evaluation of the
+/// TensorFlow graph constructed with the C++ API.
+///
+/// Example:
+///
+///     Scope root = Scope::NewRootScope();
+///     auto a = Placeholder(root, DT_INT32);
+///     auto c = Add(root, a, {41});
+///
+///     ClientSession session(root);
+///     std::vector<Tensor> outputs;
+///
+///     Status s = session.Run({ {a, {1}} }, {c}, &outputs);
+///     if (!s.ok()) { ... }
 class ClientSession {
  public:
-  // A data type to represent feeds to a Run call.
-  // This is a map of `Output` objects returned by op-constructors to the value
-  // to feed them with. See `ops::Input::Initializer` for details on what can be
-  // used as feed values.
-  typedef std::unordered_map<ops::Output, ops::Input::Initializer,
-                             ops::OutputHash>
-      FeedType;
-
-  // Create a new session to evaluate the graph contained in `scope` by
-  // connecting to the TensorFlow runtime specified by `target`.
+  /// A data type to represent feeds to a Run call.
+  ///
+  /// This is a map of `Output` objects returned by op-constructors to the value
+  /// to feed them with. See `Input::Initializer` for details on what can be
+  /// used as feed values.
+  typedef std::unordered_map<Output, Input::Initializer, OutputHash> FeedType;
+
+  /// Create a new session to evaluate the graph contained in `scope` by
+  /// connecting to the TensorFlow runtime specified by `target`.
   ClientSession(const Scope& scope, const string& target);
 
-  // Same as above, but use the empty string ("") as the target specification.
+  /// Same as above, but use the empty string ("") as the target specification.
   ClientSession(const Scope& scope);
 
-  // Create a new session, configuring it with `session_options`.
+  /// Create a new session, configuring it with `session_options`.
   ClientSession(const Scope& scope, const SessionOptions& session_options);
 
-  // Evaluate the tensors in `fetch_outputs`. The values are returned as
-  // `Tensor` objects in `outputs`. The number and order of `outputs` will match
-  // `fetch_outputs`.
-  Status Run(const std::vector<ops::Output>& fetch_outputs,
+  /// Evaluate the tensors in `fetch_outputs`. The values are returned as
+  /// `Tensor` objects in `outputs`. The number and order of `outputs` will
+  /// match `fetch_outputs`.
+  Status Run(const std::vector<Output>& fetch_outputs,
              std::vector<Tensor>* outputs) const;
 
-  // Same as above, but use the mapping in `inputs` as feeds.
-  Status Run(const FeedType& inputs,
-             const std::vector<ops::Output>& fetch_outputs,
+  /// Same as above, but use the mapping in `inputs` as feeds.
+  Status Run(const FeedType& inputs, const std::vector<Output>& fetch_outputs,
              std::vector<Tensor>* outputs) const;
 
-  // Same as above. Additionally runs the operations ins `run_outputs`.
-  Status Run(const FeedType& inputs,
-             const std::vector<ops::Output>& fetch_outputs,
-             const std::vector<ops::Operation>& run_outputs,
+  /// Same as above. Additionally runs the operations in `run_outputs`.
+  Status Run(const FeedType& inputs, const std::vector<Output>& fetch_outputs,
+             const std::vector<Operation>& run_outputs,
              std::vector<Tensor>* outputs) const;
 
-  // Use `run_options` to turn on performance profiling. `run_metadata`, if not
-  // null, is filled in with the profiling results.
+  /// Use `run_options` to turn on performance profiling. `run_metadata`, if not
+  /// null, is filled in with the profiling results.
   Status Run(const RunOptions& run_options, const FeedType& inputs,
-             const std::vector<ops::Output>& fetch_outputs,
-             const std::vector<ops::Operation>& run_outputs,
+             const std::vector<Output>& fetch_outputs,
+             const std::vector<Operation>& run_outputs,
              std::vector<Tensor>* outputs, RunMetadata* run_metadata) const;
 
   // TODO(keveman): Add support for partial run.
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index d191a73547..a4da3aa8e2 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -76,9 +76,9 @@ string ToGuard(const std::string& path) {
 }
 
 // Change:     Into:
-//   ABC         // ABC
-//               //
-//   DEF         // DEF
+//   ABC         /// ABC
+//               ///
+//   DEF         /// DEF
 string MakeComment(StringPiece text, StringPiece indent) {
   string ret;
   while (!text.empty()) {
@@ -89,9 +89,9 @@ string MakeComment(StringPiece text, StringPiece indent) {
       if (text[newline] != ' ') last_non_space = newline;
     }
     if (last_non_space == -1) {
-      strings::StrAppend(&ret, indent, "//\n");
+      strings::StrAppend(&ret, indent, "///\n");
     } else {
-      strings::StrAppend(&ret, indent, "// ",
+      strings::StrAppend(&ret, indent, "/// ",
                          text.substr(0, last_non_space + 1), "\n");
     }
     text.remove_prefix(newline + 1);
@@ -406,7 +406,7 @@ OpInfo::OpInfo(const OpDef& op_def) : op_def(op_def) {
   for (int i = 0; i < op_def.input_arg_size(); ++i) {
     const auto& arg(op_def.input_arg(i));
     arg_types.push_back(strings::StrCat(
-        "::tensorflow::ops::", ArgIsList(arg) ? "InputList" : "Input"));
+        "::tensorflow::", ArgIsList(arg) ? "InputList" : "Input"));
     arg_names.push_back(AvoidCPPKeywords(arg.name()));
 
     // TODO(keveman): Include input type information.
@@ -445,8 +445,8 @@ OpInfo::OpInfo(const OpDef& op_def) : op_def(op_def) {
   for (int i = 0; i < op_def.output_arg_size(); ++i) {
     const auto& arg = op_def.output_arg(i);
     bool is_list = ArgIsList(arg);
-    output_types.push_back(strings::StrCat("::tensorflow::ops::",
-                                           is_list ? "OutputList" : "Output"));
+    output_types.push_back(
+        strings::StrCat("::tensorflow::", is_list ? "OutputList" : "Output"));
     output_names.push_back(AvoidCPPKeywords(arg.name()));
     is_list_output.push_back(is_list);
   }
@@ -537,26 +537,26 @@ void OpInfo::WriteClassDecl(WritableFile* h) const {
   if (output_types.empty()) {
     // Allow casting this class to Operation.
     strings::StrAppend(&class_decl,
-                       "  operator ::tensorflow::ops::Operation() const { "
+                       "  operator ::tensorflow::Operation() const { "
                        "return operation; }\n");
   } else if (output_types.size() == 1) {
     if (is_list_output[0]) {
       // Write the subscript operator, allowing out[i] for the list-typed
       // output.
       strings::StrAppend(&class_decl,
-                         "  ::tensorflow::ops::Output operator[](size_t index) "
+                         "  ::tensorflow::Output operator[](size_t index) "
                          "const { return ",
                          output_names[0], "[index]; }\n\n");
 
     } else {
       // Write type cast functions, allowing casting this class to Input and
       // Output.
-      strings::StrAppend(
-          &class_decl, "  operator ::tensorflow::ops::Output() const { return ",
-          output_names[0], "; }\n");
-      strings::StrAppend(
-          &class_decl, "  operator ::tensorflow::ops::Input() const { return ",
-          output_names[0], "; }\n");
+      strings::StrAppend(&class_decl,
+                         "  operator ::tensorflow::Output() const { return ",
+                         output_names[0], "; }\n");
+      strings::StrAppend(&class_decl,
+                         "  operator ::tensorflow::Input() const { return ",
+                         output_names[0], "; }\n");
       // Write node() to get the Node* directly.
       strings::StrAppend(&class_decl,
                          "  ::tensorflow::Node* node() const { return ",
diff --git a/tensorflow/cc/framework/cc_op_gen.h b/tensorflow/cc/framework/cc_op_gen.h
index d1e83a87c3..3d35d0ef32 100644
--- a/tensorflow/cc/framework/cc_op_gen.h
+++ b/tensorflow/cc/framework/cc_op_gen.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Result is written to files dot_h and dot_cc.
+/// Result is written to files dot_h and dot_cc.
 void WriteCCOps(const OpList& ops, const std::string& dot_h_fname,
                 const std::string& dot_cc_fname);
 
diff --git a/tensorflow/cc/framework/grad_op_registry.h b/tensorflow/cc/framework/grad_op_registry.h
index e4da8570f1..190b96f685 100644
--- a/tensorflow/cc/framework/grad_op_registry.h
+++ b/tensorflow/cc/framework/grad_op_registry.h
@@ -24,30 +24,30 @@ limitations under the License.
 namespace tensorflow {
 namespace ops {
 
-// GradFunc is the signature for all gradient functions in GradOpRegistry.
-// Implementations should add operations to compute the gradient outputs of 'op'
-// (returned in 'grad_outputs') using 'scope' and 'grad_inputs'.
+/// GradFunc is the signature for all gradient functions in GradOpRegistry.
+/// Implementations should add operations to compute the gradient outputs of
+/// 'op' (returned in 'grad_outputs') using 'scope' and 'grad_inputs'.
 typedef Status (*GradFunc)(const Scope& scope, const Operation& op,
                            const std::vector<Output>& grad_inputs,
                            std::vector<Output>* grad_outputs);
 
-// GradOpRegistry maintains a static registry of gradient functions.
-// Gradient functions are indexed in the registry by the forward op name (i.e.
-// "MatMul" -> MatMulGrad func).
+/// GradOpRegistry maintains a static registry of gradient functions.
+/// Gradient functions are indexed in the registry by the forward op name (e.g.
+/// "MatMul" -> MatMulGrad func).
 class GradOpRegistry {
  public:
-  // Registers 'func' as the gradient function for 'op'.
-  // Returns true if registration was successful, check fails otherwise.
+  /// Registers 'func' as the gradient function for 'op'.
+  /// Returns true if registration was successful, check fails otherwise.
   bool Register(const string& op, GradFunc func);
 
-  // Sets 'func' to the gradient function for 'op' and returns Status OK if
-  // the gradient function for 'op' exists in the registry.
-  // Note that 'func' can be null for ops that have registered no-gradient with
-  // the registry.
-  // Returns error status otherwise.
+  /// Sets 'func' to the gradient function for 'op' and returns Status OK if
+  /// the gradient function for 'op' exists in the registry.
+  /// Note that 'func' can be null for ops that have registered no-gradient with
+  /// the registry.
+  /// Returns error status otherwise.
   Status Lookup(const string& op, GradFunc* func) const;
 
-  // Returns a pointer to the global gradient function registry.
+  /// Returns a pointer to the global gradient function registry.
   static GradOpRegistry* Global();
 
  private:
diff --git a/tensorflow/cc/framework/gradient_checker.cc b/tensorflow/cc/framework/gradient_checker.cc
index 89baa1a5bb..849a8eed6f 100644
--- a/tensorflow/cc/framework/gradient_checker.cc
+++ b/tensorflow/cc/framework/gradient_checker.cc
@@ -35,20 +35,20 @@ namespace {
 
 template <typename T>
 Status ComputeTheoreticalJacobianTranspose(
-    const Scope& scope, const ops::OutputList& xs,
+    const Scope& scope, const OutputList& xs,
     const std::vector<TensorShape>& x_shapes,
-    const std::vector<Tensor>& x_datas, const ops::OutputList& ys,
+    const std::vector<Tensor>& x_datas, const OutputList& ys,
     const std::vector<TensorShape>& y_shapes,
     std::vector<Tensor>& jacobian_ts) {
   int y_num = y_shapes.size();
   int x_num = x_shapes.size();
   // Call AddSymbolicGradients to get 'dxs' (we will feed 'dys').
-  ops::OutputList dys;
+  OutputList dys;
   for (const auto& y_shape : y_shapes) {
     // TODO(suharshs): This currently assumes that all x's are the same type.
     dys.push_back(Cast(scope, Const(scope, 1.0, y_shape), xs[0].type()));
   }
-  ops::OutputList dxs;
+  OutputList dxs;
   TF_RETURN_IF_ERROR(AddSymbolicGradients(scope, ys, xs, dys, &dxs));
 
   // Initialize 'dy_data' to zeros.
@@ -97,8 +97,8 @@ Status ComputeTheoreticalJacobianTranspose(
   return Status::OK();
 }
 
-Status EvaluateGraph(ClientSession& session, const ops::OutputList& xs,
-                     const ops::OutputList& ys, std::vector<Tensor>& x_datas,
+Status EvaluateGraph(ClientSession& session, const OutputList& xs,
+                     const OutputList& ys, std::vector<Tensor>& x_datas,
                      std::vector<Tensor>* y_datas) {
   // Create the feed list.
   ClientSession::FeedType feed_list;
@@ -123,11 +123,13 @@ Status EvaluateGraph(ClientSession& session, const ops::OutputList& xs,
 }
 
 template <typename T>
-Status ComputeNumericJacobianTranspose(
-    const Scope& scope, const ops::OutputList& xs,
-    const std::vector<TensorShape>& x_shapes, const ops::OutputList& ys,
-    const std::vector<TensorShape>& y_shapes, const T delta,
-    std::vector<Tensor>& x_datas, std::vector<Tensor>& jacobian_ts) {
+Status ComputeNumericJacobianTranspose(const Scope& scope, const OutputList& xs,
+                                       const std::vector<TensorShape>& x_shapes,
+                                       const OutputList& ys,
+                                       const std::vector<TensorShape>& y_shapes,
+                                       const T delta,
+                                       std::vector<Tensor>& x_datas,
+                                       std::vector<Tensor>& jacobian_ts) {
   int y_num = y_shapes.size();
   int x_num = x_shapes.size();
 
@@ -170,7 +172,7 @@ Status ComputeNumericJacobianTranspose(
 }
 
 template <typename T>
-void InitJacobians(const ops::OutputList& xs,
+void InitJacobians(const OutputList& xs,
                    const std::vector<TensorShape>& x_shapes,
                    const std::vector<TensorShape>& y_shapes,
                    std::vector<Tensor>& jacobians) {
@@ -191,10 +193,9 @@ void InitJacobians(const ops::OutputList& xs,
 }
 
 template <typename T>
-Status ComputeGradientErrorInternal(const Scope& scope,
-                                    const ops::OutputList& xs,
+Status ComputeGradientErrorInternal(const Scope& scope, const OutputList& xs,
                                     const std::vector<TensorShape>& x_shapes,
-                                    const ops::OutputList& ys,
+                                    const OutputList& ys,
                                     const std::vector<TensorShape>& y_shapes,
                                     std::vector<Tensor>& x_datas,
                                     T* max_error) {
@@ -231,9 +232,9 @@ Status ComputeGradientErrorInternal(const Scope& scope,
 }  // namespace
 
 template <typename T>
-Status ComputeGradientError(const Scope& scope, const ops::OutputList& xs,
+Status ComputeGradientError(const Scope& scope, const OutputList& xs,
                             const std::vector<TensorShape>& x_shapes,
-                            const ops::OutputList& ys,
+                            const OutputList& ys,
                             const std::vector<TensorShape>& y_shapes,
                             T* max_error) {
   if (xs.size() != x_shapes.size()) {
@@ -259,8 +260,8 @@ Status ComputeGradientError(const Scope& scope, const ops::OutputList& xs,
 }
 
 template <typename T>
-Status ComputeGradientError(const Scope& scope, const ops::Output& x,
-                            const Tensor& x_init_value, const ops::Output& y,
+Status ComputeGradientError(const Scope& scope, const Output& x,
+                            const Tensor& x_init_value, const Output& y,
                             const TensorShape& y_shape, T* max_error) {
   // Initialize 'x_data' from 'x_init_value'.
   std::vector<Tensor> x_datas(1, Tensor(x_init_value));
@@ -269,14 +270,14 @@ Status ComputeGradientError(const Scope& scope, const ops::Output& x,
                                       {y_shape}, x_datas, max_error);
 }
 
-#define INSTANTIATE_GRAD_ERR_TYPE(T)                                        \
-  template Status ComputeGradientError<T>(                                  \
-      const Scope& scope, const ops::OutputList& xs,                        \
-      const std::vector<TensorShape>& x_shapes, const ops::OutputList& ys,  \
-      const std::vector<TensorShape>& y_shapes, T* max_error);              \
-  template Status ComputeGradientError<T>(                                  \
-      const Scope& scope, const ops::Output& x, const Tensor& x_init_value, \
-      const ops::Output& y, const TensorShape& y_shape, T* max_error);
+#define INSTANTIATE_GRAD_ERR_TYPE(T)                                   \
+  template Status ComputeGradientError<T>(                             \
+      const Scope& scope, const OutputList& xs,                        \
+      const std::vector<TensorShape>& x_shapes, const OutputList& ys,  \
+      const std::vector<TensorShape>& y_shapes, T* max_error);         \
+  template Status ComputeGradientError<T>(                             \
+      const Scope& scope, const Output& x, const Tensor& x_init_value, \
+      const Output& y, const TensorShape& y_shape, T* max_error);
 
 INSTANTIATE_GRAD_ERR_TYPE(float);
 INSTANTIATE_GRAD_ERR_TYPE(double);
diff --git a/tensorflow/cc/framework/gradient_checker.h b/tensorflow/cc/framework/gradient_checker.h
index 66a2b3040c..2e61213615 100644
--- a/tensorflow/cc/framework/gradient_checker.h
+++ b/tensorflow/cc/framework/gradient_checker.h
@@ -22,20 +22,20 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Returns in 'max_error' the maximum element-wise error for dy/dx between the
-// computed and numeric Jacobian matrices where 'xs' and 'ys' are tensors.
-// This function adds operations to the graph associated with 'scope'.
+/// Returns in 'max_error' the maximum element-wise error for dy/dx between the
+/// computed and numeric Jacobian matrices where 'xs' and 'ys' are tensors.
+/// This function adds operations to the graph associated with 'scope'.
 template <typename T>
-Status ComputeGradientError(const Scope& scope, const ops::OutputList& xs,
+Status ComputeGradientError(const Scope& scope, const OutputList& xs,
                             const std::vector<TensorShape>& x_shapes,
-                            const ops::OutputList& ys,
+                            const OutputList& ys,
                             const std::vector<TensorShape>& y_shapes,
                             T* max_error);
 
-// Overload of ComputeGradientError which takes an initial value for 'x'.
+/// Overload of ComputeGradientError which takes an initial value for 'x'.
 template <typename T>
-Status ComputeGradientError(const Scope& scope, const ops::Output& x,
-                            const Tensor& x_init_value, const ops::Output& y,
+Status ComputeGradientError(const Scope& scope, const Output& x,
+                            const Tensor& x_init_value, const Output& y,
                             const TensorShape& y_shape, T* max_error);
 
 }  // namespace tensorflow
diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc
index 0059bdd6d1..2c60f947a5 100644
--- a/tensorflow/cc/framework/gradients.cc
+++ b/tensorflow/cc/framework/gradients.cc
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
-using namespace ops;  // NOLINT(build/namespaces)
-
 namespace {
 
 struct OutputHash {
@@ -48,7 +46,7 @@ struct OutputEq {
 class SymbolicGradientBuilder {
  public:
   SymbolicGradientBuilder(const Scope& scope,
-                          const GradOpRegistry* registry,
+                          const ops::GradOpRegistry* registry,
                           const std::vector<Output>& outputs,
                           const std::vector<Output>& inputs,
                           const std::vector<Output>& grad_inputs,
@@ -81,7 +79,7 @@ class SymbolicGradientBuilder {
                           std::vector<Output>* grad_outputs);
 
   const Scope& scope_;
-  const GradOpRegistry* registry_;
+  const ops::GradOpRegistry* registry_;
   const std::vector<Output>& outputs_;
   const std::vector<Output>& inputs_;
   const std::vector<Output>& grad_inputs_;
@@ -119,19 +117,15 @@ class SymbolicGradientBuilder {
 };
 
 SymbolicGradientBuilder::SymbolicGradientBuilder(
-    const Scope& scope,
-    const GradOpRegistry* registry,
-    const std::vector<Output>& outputs,
-    const std::vector<Output>& inputs,
-    const std::vector<Output>& grad_inputs,
-    std::vector<Output>* grad_outputs)
+    const Scope& scope, const ops::GradOpRegistry* registry,
+    const std::vector<Output>& outputs, const std::vector<Output>& inputs,
+    const std::vector<Output>& grad_inputs, std::vector<Output>* grad_outputs)
     : scope_(scope),
       registry_(registry),
       outputs_(outputs),
       inputs_(inputs),
       grad_inputs_(grad_inputs),
-      grad_outputs_(grad_outputs) {
-}
+      grad_outputs_(grad_outputs) {}
 
 Status SymbolicGradientBuilder::BackpropAlongEdge(const Output& dst_grad,
                                                   const Output& src) {
@@ -249,14 +243,14 @@ Status SymbolicGradientBuilder::SumGradients(const Output& src, Output* grad) {
   } else {
     // Otherwise, adds backprop-ed gradients.
     // TODO(andydavis) Use a better accumulator here.
-    *grad = AddN(scope_, grads_to_keep);
+    *grad = ops::AddN(scope_, grads_to_keep);
   }
 
   return Status::OK();
 }
 
 bool SymbolicGradientBuilder::IsPrimitiveOpWithNoGrad(const string& opname) {
-  GradFunc grad_fn;
+  ops::GradFunc grad_fn;
   Status s = registry_->Lookup(opname, &grad_fn);
   return s.ok() && (grad_fn == nullptr);
 }
@@ -265,7 +259,7 @@ Status SymbolicGradientBuilder::CallGradFunction(
     const Operation& op,
     const std::vector<Output>& grad_inputs,
     std::vector<Output>* grad_outputs) {
-  GradFunc grad_fn;
+  ops::GradFunc grad_fn;
   TF_RETURN_IF_ERROR(registry_->Lookup(op.node()->type_string(), &grad_fn));
   TF_RETURN_IF_ERROR(grad_fn(scope_, op, grad_inputs, grad_outputs));
   TF_RETURN_IF_ERROR(scope_.status());
@@ -333,7 +327,7 @@ Status SymbolicGradientBuilder::AddGradients() {
       // TODO(andydavis) If static shapes are known, replace 'ZerosLike' with
       // zero-filled Constant node of appropriate shape.
       for (const int dy_index : no_grad_dy_indices) {
-        dy[dy_index] = ZerosLike(scope_, Output(n, dy_index));
+        dy[dy_index] = ops::ZerosLike(scope_, Output(n, dy_index));
       }
     }
 
@@ -368,7 +362,7 @@ Status AddSymbolicGradients(const Scope& scope,
                             const std::vector<Output>& inputs,
                             const std::vector<Output>& grad_inputs,
                             std::vector<Output>* grad_outputs) {
-  SymbolicGradientBuilder builder(scope, GradOpRegistry::Global(), outputs,
+  SymbolicGradientBuilder builder(scope, ops::GradOpRegistry::Global(), outputs,
                                   inputs, grad_inputs, grad_outputs);
   return builder.AddGradients();
 }
diff --git a/tensorflow/cc/framework/gradients.h b/tensorflow/cc/framework/gradients.h
index fa5e608bd4..d076bc43b4 100644
--- a/tensorflow/cc/framework/gradients.h
+++ b/tensorflow/cc/framework/gradients.h
@@ -21,28 +21,28 @@ limitations under the License.
 
 namespace tensorflow {
 
-// NOTE: This API is a work in progress and will likely be changing frequently.
-//
-// Given initial gradients 'grad_inputs' (which represent the symbolic partial
-// derivatives of some loss function 'L' w.r.t 'outputs'), adds gradient nodes
-// to the graph associated with 'scope', which compute (and return in
-// 'grad_outputs') the symbolic partial derivatives of 'L' w.r.t 'inputs'.
-//
+/// NOTE: This API is a work in progress and will likely be changing frequently.
+///
+/// Given initial gradients 'grad_inputs' (which represent the symbolic partial
+/// derivatives of some loss function 'L' w.r.t 'outputs'), adds gradient nodes
+/// to the graph associated with 'scope', which compute (and return in
+/// 'grad_outputs') the symbolic partial derivatives of 'L' w.r.t 'inputs'.
+///
 
 // TODO(andydavis) Add overload of this function with no 'grad_inputs' arg.
 // Implementation will fill in 'OnesLike' for all shapes in 'outputs'.
 Status AddSymbolicGradients(const Scope& scope,
-                            const std::vector<ops::Output>& outputs,
-                            const std::vector<ops::Output>& inputs,
-                            const std::vector<ops::Output>& grad_inputs,
-                            std::vector<ops::Output>* grad_outputs);
-
-// Returns a sentinel Output that represents 'no gradient' (i.e. no gradient
-// flows along some graph edge during backpropagation).
-// Can be returned in 'grad_outputs' by an invocation of 'AddSymbolicGradients'
-// (note that gradient flow through an Output can be stopped through the use of
-// the StopGradient node).
-ops::Output NoGradient();
+                            const std::vector<Output>& outputs,
+                            const std::vector<Output>& inputs,
+                            const std::vector<Output>& grad_inputs,
+                            std::vector<Output>* grad_outputs);
+
+/// Returns a sentinel Output that represents 'no gradient' (i.e. no gradient
+/// flows along some graph edge during backpropagation).
+/// Can be returned in 'grad_outputs' by an invocation of 'AddSymbolicGradients'
+/// (note that gradient flow through an Output can be stopped through the use of
+/// the StopGradient node).
+Output NoGradient();
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/cc/framework/gradients_test.cc b/tensorflow/cc/framework/gradients_test.cc
index 9ae927a762..6e9ff3e01c 100644
--- a/tensorflow/cc/framework/gradients_test.cc
+++ b/tensorflow/cc/framework/gradients_test.cc
@@ -90,7 +90,7 @@ TEST_F(GradientsTest, OneMatMul) {
     } else {
       // Call AddSymbolicGradients.
       auto dz = Const(scope, {{1.0, 1.0}, {1.0, 1.0}});
-      std::vector<ops::Output> grad_outputs;
+      std::vector<Output> grad_outputs;
       TF_ASSERT_OK(
           AddSymbolicGradients(scope, {z}, {x, y}, {dz}, &grad_outputs));
     }
@@ -123,7 +123,7 @@ TEST_F(GradientsTest, TwoMatMuls_Chained) {
     } else {
       // Call AddSymbolicGradients.
       auto dz = Const(scope, {{1.0, 1.0}, {1.0, 1.0}});
-      std::vector<ops::Output> grad_outputs;
+      std::vector<Output> grad_outputs;
       TF_ASSERT_OK(
           AddSymbolicGradients(scope, {z}, {u, v}, {dz}, &grad_outputs));
     }
@@ -160,7 +160,7 @@ TEST_F(GradientsTest, TwoMatMuls_Independent) {
       // Call AddSymbolicGradients.
       auto dv = Const(scope, {{1.0, 1.0}, {1.0, 1.0}});
       auto dz = Const(scope, {{1.0, 1.0}, {1.0, 1.0}});
-      std::vector<ops::Output> grad_outputs;
+      std::vector<Output> grad_outputs;
       TF_ASSERT_OK(AddSymbolicGradients(scope, {v, z}, {t, u, x, y}, {dv, dz},
                                         &grad_outputs));
     }
@@ -191,7 +191,7 @@ TEST_F(GradientsTest, PackUnpack_Chained) {
       auto pack_grad = Unpack(scope, unpack_grad.output, 3);
     } else {
       // Call AddSymbolicGradients.
-      std::vector<ops::Output> grad_outputs;
+      std::vector<Output> grad_outputs;
       TF_ASSERT_OK(AddSymbolicGradients(scope, unpack.output, {a, b, c},
                                         {dx, dy, dz}, &grad_outputs));
     }
@@ -225,7 +225,7 @@ TEST_F(GradientsTest, PackUnpack_StopBackprop) {
       auto unpack_grad = Pack(scope, {dx, dy, dz});
     } else {
       // Call AddSymbolicGradients.
-      std::vector<ops::Output> grad_outputs;
+      std::vector<Output> grad_outputs;
       TF_ASSERT_OK(AddSymbolicGradients(scope, unpack.output, {pack},
                                         {dx, dy, dz}, &grad_outputs));
     }
@@ -252,7 +252,7 @@ TEST_F(GradientsTest, DependentGradOutputs) {
   // The gradient w.r.t to 'v' (returned in grad_outputs[0]) is dependent on
   // the gradient w.r.t. to 'x' (returned in grad_outputs[1]).
   auto dz = Const(scope_test_, {{5}});
-  std::vector<ops::Output> grad_outputs;
+  std::vector<Output> grad_outputs;
   TF_ASSERT_OK(
       AddSymbolicGradients(scope_test_, {z}, {v, x}, {dz}, &grad_outputs));
 
@@ -281,7 +281,7 @@ TEST_F(GradientsTest, MultipleNodeOutputGrads) {
                                {3, 4, 2});
   // clang-format on
 
-  std::vector<ops::Output> grad_outputs;
+  std::vector<Output> grad_outputs;
   TF_ASSERT_OK(AddSymbolicGradients(scope_test_, {pack}, unpack.output, {dx},
                                     &grad_outputs));
 
@@ -333,7 +333,7 @@ class StopGradientSingleOutputMultiEdgeTest : public ::testing::Test {
     auto g2 = Const(scope_, {{9, 10}, {11, 12}});
 
     // Call AddSymbolicGradients and compare against 'expected_grad'.
-    std::vector<ops::Output> grad_outputs;
+    std::vector<Output> grad_outputs;
     TF_EXPECT_OK(AddSymbolicGradients(scope_, {out0, out1, out2}, {z},
                                       {g0, g1, g2}, &grad_outputs));
 
@@ -410,7 +410,7 @@ class StopGradientMultiOutputTest : public ::testing::Test {
     auto g2 = Const(scope_, {17, 18, 19, 20, 21, 22, 23, 24}, {2, 4});
 
     // Call AddSymbolicGradients and compare against 'expected_grad'.
-    std::vector<ops::Output> grad_outputs;
+    std::vector<Output> grad_outputs;
     TF_EXPECT_OK(AddSymbolicGradients(scope_, {out0, out1, out2}, {x},
                                       {g0, g1, g2}, &grad_outputs));
 
diff --git a/tensorflow/cc/framework/ops.h b/tensorflow/cc/framework/ops.h
index 82ba9c68f0..32086d4123 100644
--- a/tensorflow/cc/framework/ops.h
+++ b/tensorflow/cc/framework/ops.h
@@ -28,7 +28,7 @@ namespace tensorflow {
 
 class Output;
 
-// Represents a node in the computation graph.
+/// Represents a node in the computation graph.
 class Operation {
  public:
   Operation() : node_(nullptr) {}
@@ -56,7 +56,7 @@ class Operation {
   Node* node_;
 };
 
-// Represents a tensor value produced by an Operation.
+/// Represents a tensor value produced by an Operation.
 class Output {
  public:
   Output() = default;
@@ -87,18 +87,18 @@ struct OutputHash {
   }
 };
 
-// Represents a tensor value that can be used as an operand to an Operation.
+/// Represents a tensor value that can be used as an operand to an Operation.
 class Input {
  public:
-  // Initializer enables constructing an Input object from various kinds of C++
-  // constants such as simple primitive constants and nested initializer lists
-  // representing a multi-dimensional array. Initializer constructors are all
-  // templates, so the aforementioned kinds of C++ constants can be used to
-  // construct an Initializer. Initializer stores the value it got constructed
-  // with in a Tensor object.
+  /// Initializer enables constructing an Input object from various kinds of C++
+  /// constants such as simple primitive constants and nested initializer lists
+  /// representing a multi-dimensional array. Initializer constructors are all
+  /// templates, so the aforementioned kinds of C++ constants can be used to
+  /// construct an Initializer. Initializer stores the value it got constructed
+  /// with in a Tensor object.
   struct Initializer {
-    // Construct from a scalar value of an arithmetic type or a type that can be
-    // converted to a string (eg. a string literal).
+    /// Construct from a scalar value of an arithmetic type or a type that can
+    /// be converted to a string (eg. a string literal).
     template <typename T, typename = typename std::enable_if<
                               std::is_arithmetic<T>::value ||
                               std::is_convertible<T, string>::value>::type>
@@ -111,7 +111,7 @@ class Input {
 
     Initializer(const Tensor& t) : tensor(t) {}  // NOLINT(runtime/explicit)
 
-    // Construct from a scalar value and an explicit shape
+    /// Construct from a scalar value and an explicit shape
     template <typename T, typename = typename std::enable_if<
                               std::is_arithmetic<T>::value ||
                               std::is_convertible<T, string>::value>::type>
@@ -124,7 +124,7 @@ class Input {
       tensor = t;
     }
 
-    // Construct from a initializer list of scalars (a one-dimensional tensor).
+    /// Construct from an initializer list of scalars (a 1-D tensor).
     template <typename T, typename = typename std::enable_if<
                               std::is_arithmetic<T>::value ||
                               std::is_convertible<T, string>::value>::type>
@@ -137,7 +137,7 @@ class Input {
       tensor = t;
     }
 
-    // Construct from a initializer list of scalars and an explicit shape.
+    /// Construct from an initializer list of scalars and an explicit shape.
     template <typename T, typename = typename std::enable_if<
                               std::is_arithmetic<T>::value ||
                               std::is_convertible<T, string>::value>::type>
@@ -154,11 +154,11 @@ class Input {
       tensor = t;
     }
 
-    // Construct a multi-dimensional tensor from a nested initializer list. Note
-    // that C++ syntax allows nesting of arbitrarily typed initializer lists, so
-    // such invalid initializers cannot be disallowed at compile time. This
-    // function performs checks to make sure that the nested initializer list is
-    // indeed a valid multi-dimensional tensor.
+    /// Construct a multi-dimensional tensor from a nested initializer
+    /// list. Note that C++ syntax allows nesting of arbitrarily typed
+    /// initializer lists, so such invalid initializers cannot be disallowed at
+    /// compile time. This function performs checks to make sure that the nested
+    /// initializer list is indeed a valid multi-dimensional tensor.
     Initializer(const std::initializer_list<Initializer>& v);
 
     template <typename T, bool = std::is_convertible<T, string>::value>
@@ -185,14 +185,14 @@ class Input {
     Tensor tensor;
   };
 
-  // All of Input's constructors are implicit. Input can be implicitly
-  // constructed from the following objects :
-  // * Output: This is so that the output of an Operation can be directly used
-  //   as the input to a op wrapper, which takes Inputs.
-  // * A scalar, or a multi-dimensional tensor specified as a recursive
-  //   initializer list. This enables directly passing constants as
-  //   inputs to op wrappers.
-  // * A Tensor object.
+  /// All of Input's constructors are implicit. Input can be implicitly
+  /// constructed from the following objects :
+  /// * Output: This is so that the output of an Operation can be directly used
+  ///   as the input to an op wrapper, which takes Inputs.
+  /// * A scalar, or a multi-dimensional tensor specified as a recursive
+  ///   initializer list. This enables directly passing constants as
+  ///   inputs to op wrappers.
+  /// * A Tensor object.
   Input(const Output& o) : output_(o) {}  // NOLINT(runtime/explicit)
 
   template <typename T, typename = typename std::enable_if<
@@ -220,8 +220,8 @@ class Input {
     tensor_ = Initializer(init).tensor;
   }
 
-  // Constructor specifying a node name, index and datatype. This should only be
-  // used for specifying a backward edge, needed by control flow.
+  /// Constructor specifying a node name, index and datatype. This should only
+  /// be used for specifying a backward edge, needed by control flow.
   Input(const string& name, int i, DataType dt)
       : node_name_(name), index_(i), data_type_(dt) {}
 
@@ -241,15 +241,15 @@ class Input {
   DataType data_type_ = DT_INVALID;
 };
 
-// A type for representing the output of ops that produce more than one output,
-// or a list of tensors.
+/// A type for representing the output of ops that produce more than one output,
+/// or a list of tensors.
 typedef std::vector<Output> OutputList;
 
-// A type for representing the input to ops that require a list of tensors.
+/// A type for representing the input to ops that require a list of tensors.
 class InputList {
  public:
-  // Implicitly convert a list of outputs to a list of inputs. This is useful to
-  // write code such as ops::Concat(ops::Split(x, 4)).
+  /// Implicitly convert a list of outputs to a list of inputs. This is useful
+  /// to write code such as ops::Concat(ops::Split(x, 4)).
   InputList(const OutputList& out) {  // NOLINT(runtime/explicit)
     for (auto const& x : out) {
       inputs_.push_back(x);
diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 2bce24f2fc..e1af5b36e8 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -70,14 +70,14 @@ Scope::Scope(const Scope& other, Scope::Tags::OpName, const string& name,
       colocation_constraints_(other.colocation_constraints_) {}
 
 Scope::Scope(const Scope& other, Scope::Tags::ControlDeps,
-             std::vector<ops::Operation> control_deps, bool clear_control_deps)
+             std::vector<Operation> control_deps, bool clear_control_deps)
     : graph_(other.graph_),
       status_(other.status_),
       name_map_(other.name_map_),
       refiner_(other.refiner_),
       scope_used_(other.scope_used_),
       control_deps_(clear_control_deps
-                        ? std::vector<ops::Operation>()
+                        ? std::vector<Operation>()
                         : (control_deps.insert(control_deps.begin(),
                                                other.control_deps_.begin(),
                                                other.control_deps_.end()),
@@ -148,7 +148,7 @@ Scope::Scope(const Scope& other, Scope::Tags::KernelLabel,
       colocation_constraints_(other.colocation_constraints_) {}
 
 Scope::Scope(const Scope& other, Scope::Tags::Colocate,
-             const ops::Operation& colocate_with_op, bool clear_colocations)
+             const Operation& colocate_with_op, bool clear_colocations)
     : graph_(other.graph_),
       status_(other.status_),
       name_map_(other.name_map_),
@@ -166,7 +166,7 @@ Scope::Scope(const Scope& other, Scope::Tags::Colocate,
               : other.GetColocationConstraints(colocate_with_op)) {}
 
 std::unordered_set<string> Scope::GetColocationConstraints(
-    const ops::Operation& colocate_with_op) const {
+    const Operation& colocate_with_op) const {
   std::unordered_set<string> current_constraints(colocation_constraints_);
   const NodeDef& node_def = colocate_with_op.node()->def();
   std::vector<string> node_constraints;
@@ -298,21 +298,20 @@ Scope Scope::WithOpName(const string& op_name) const {
 }
 
 Scope Scope::WithControlDependencies(
-    const gtl::ArraySlice<ops::Operation>& control_deps) const {
-  return Scope(
-      *this, Scope::Tags::ControlDeps(),
-      std::vector<ops::Operation>(control_deps.begin(), control_deps.end()),
-      /* clear_control_deps */ false);
+    const gtl::ArraySlice<Operation>& control_deps) const {
+  return Scope(*this, Scope::Tags::ControlDeps(),
+               std::vector<Operation>(control_deps.begin(), control_deps.end()),
+               /* clear_control_deps */ false);
 }
 
-Scope Scope::WithControlDependencies(const ops::Output& control_dep) const {
+Scope Scope::WithControlDependencies(const Output& control_dep) const {
   return Scope(*this, Scope::Tags::ControlDeps(),
-               std::vector<ops::Operation>(1, control_dep.op()),
+               std::vector<Operation>(1, control_dep.op()),
                /* clear_control_deps */ false);
 }
 
 Scope Scope::WithNoControlDependencies() const {
-  return Scope(*this, Scope::Tags::ControlDeps(), std::vector<ops::Operation>(),
+  return Scope(*this, Scope::Tags::ControlDeps(), std::vector<Operation>(),
                /* clear_control_deps */ true);
 }
 
@@ -320,13 +319,13 @@ Scope Scope::WithDevice(const string& device) const {
   return Scope(*this, Scope::Tags::Device(), device);
 }
 
-Scope Scope::ColocateWith(const ops::Operation& op) const {
+Scope Scope::ColocateWith(const Operation& op) const {
   return Scope(*this, Scope::Tags::Colocate(), op,
                /* clear_colocations */ false);
 }
 
 Scope Scope::ClearColocation() const {
-  return Scope(*this, Scope::Tags::Colocate(), ops::Operation(),
+  return Scope(*this, Scope::Tags::Colocate(), Operation(),
                /* clear_colocations */ true);
 }
 
diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h
index edf25e2227..47d1026bb2 100644
--- a/tensorflow/cc/framework/scope.h
+++ b/tensorflow/cc/framework/scope.h
@@ -33,129 +33,136 @@ class GraphDef;
 class NodeBuilder;
 struct CompositeOpScopes;
 
-// A `Scope` object represents a set of related TensorFlow ops that have the
-// same properties such as a common name prefix.
-// A Scope object is a container for TensorFlow Op properties. Op constructors
-// get a Scope object as a mandatory first argument and the constructed op
-// acquires the properties in the object.
-//
-// A simple example:
-//
-// using namespace ops;
-// Scope root = Scope::NewRootScope();
-// auto c1 = Const(root, {{1, 1}});
-// auto m = MatMul(root, c1, {{41}, {1}});
-// GraphDef gdef;
-// Status s = root.ToGraphDef(&gdef);
-// if (!s.ok()) { /* Handle error */ }
-//
-// Scope hierarchy:
-// The Scope class provides various With<> functions that create a new scope.
-// The new scope typically has one property changed while other properties are
-// inherited from the parent scope.
-// NewSubScope(name) method appends `name` to the prefix of names for ops
-// created within the scope, and WithOpName() changes the suffix which
-// otherwise defaults to the type of the op.
-//
-// Name examples:
-// Scope root = Scope::NewRootScope();
-// Scope linear = root.NewSubScope("linear");
-// /* W will be named "linear/W" */
-// auto W = Variable(linear.WithOpName("W"),
-//                   {2, 2}, DT_FLOAT);
-// /* b will be named "linear/b" */
-// auto b = Variable(linear.WithOpName("b"),
-//                   {2}, DT_FLOAT);
-// auto x = Const(linear, {...});  // name: "linear/Const"
-// auto m = MatMul(linear, x, W);  // name: "linear/MatMul"
-// auto r = BiasAdd(linear, m, b); // name: "linear/BiasAdd"
-//
-// Scope lifetime:
-// A new scope is created by calling Scope::NewRootScope. This creates some
-// resources that are shared by all the child scopes that inherit from this
-// scope, directly or transitively. For instance, a new scope creates a new
-// Graph object to which operations are added when the new scope or its children
-// are used by an Op constructor. The new scope also has a Status object which
-// will be used to indicate errors by Op-constructor functions called on any
-// child scope. The Op-constructor functions have to check the scope's status by
-// calling the ok() method before proceeding to construct the op.
-//
-// Thread safety:
-// A `Scope` object is NOT thread-safe. Threads cannot concurrently call
-// op-constructor functions on the same `Scope` object.
+/// A `Scope` object represents a set of related TensorFlow ops that have the
+/// same properties such as a common name prefix.
+///
+/// A Scope object is a container for TensorFlow Op properties. Op constructors
+/// get a Scope object as a mandatory first argument and the constructed op
+/// acquires the properties in the object.
+///
+/// A simple example:
+///
+///     using namespace ops;
+///     Scope root = Scope::NewRootScope();
+///     auto c1 = Const(root, { {1, 1} });
+///     auto m = MatMul(root, c1, { {41}, {1} });
+///     GraphDef gdef;
+///     Status s = root.ToGraphDef(&gdef);
+///     if (!s.ok()) { ... }
+///
+/// Scope hierarchy:
+///
+/// The Scope class provides various With<> functions that create a new scope.
+/// The new scope typically has one property changed while other properties are
+/// inherited from the parent scope.
+/// NewSubScope(name) method appends `name` to the prefix of names for ops
+/// created within the scope, and WithOpName() changes the suffix which
+/// otherwise defaults to the type of the op.
+///
+/// Name examples:
+///
+///     Scope root = Scope::NewRootScope();
+///     Scope linear = root.NewSubScope("linear");
+///     // W will be named "linear/W"
+///     auto W = Variable(linear.WithOpName("W"),
+///                       {2, 2}, DT_FLOAT);
+///     // b will be named "linear/b"
+///     auto b = Variable(linear.WithOpName("b"),
+///                       {2}, DT_FLOAT);
+///     auto x = Const(linear, {...});  // name: "linear/Const"
+///     auto m = MatMul(linear, x, W);  // name: "linear/MatMul"
+///     auto r = BiasAdd(linear, m, b); // name: "linear/BiasAdd"
+///
+/// Scope lifetime:
+///
+/// A new scope is created by calling Scope::NewRootScope. This creates some
+/// resources that are shared by all the child scopes that inherit from this
+/// scope, directly or transitively. For instance, a new scope creates a new
+/// Graph object to which operations are added when the new scope or its
+/// children are used by an Op constructor. The new scope also has a Status
+/// object which will be used to indicate errors by Op-constructor functions
+/// called on any child scope. The Op-constructor functions have to check the
+/// scope's status by calling the ok() method before proceeding to construct the
+/// op.
+///
+/// Thread safety:
+///
+/// A `Scope` object is NOT thread-safe. Threads cannot concurrently call
+/// op-constructor functions on the same `Scope` object.
 class Scope {
  public:
   // The following functions are for users making graphs. They return brand new
   // scopes, or scopes derived from an existing scope object.
 
-  // Return a new scope.
-  // This creates a new graph and all operations constructed in this graph
-  // should use the returned object as the "root" scope.
+  /// Return a new scope.
+  /// This creates a new graph and all operations constructed in this graph
+  /// should use the returned object as the "root" scope.
   static Scope NewRootScope();
 
-  // Return a new scope. Ops created with this scope will have
-  // <name>/<child_scope_name> as the prefix. The actual name will be unique
-  // in the current scope. All other properties are inherited from the current
-  // scope. If child_scope_name is empty, the '/' is elided.
+  /// Return a new scope. Ops created with this scope will have
+  /// <name>/<child_scope_name> as the prefix. The actual name will be unique
+  /// in the current scope. All other properties are inherited from the current
+  /// scope. If child_scope_name is empty, the '/' is elided.
   Scope NewSubScope(const string& child_scope_name) const;
 
-  // Return a new scope. All ops created within the returned scope will have
-  // names of the form <name>/<op_name>[_<suffix].
+  /// Return a new scope. All ops created within the returned scope will have
+  /// names of the form <name>/<op_name>[_<suffix>].
   Scope WithOpName(const string& op_name) const;
 
-  // Return a new scope. All ops created within the returned scope will have as
-  // control dependencies the union of operations in the control_deps vector and
-  // the control dependencies of the current scope.
+  /// Return a new scope. All ops created within the returned scope will have as
+  /// control dependencies the union of operations in the control_deps vector
+  /// and the control dependencies of the current scope.
   Scope WithControlDependencies(
-      const gtl::ArraySlice<ops::Operation>& control_deps) const;
-  // Same as above, but convenient to add control dependency on the operation
-  // producing the control_dep output.
-  Scope WithControlDependencies(const ops::Output& control_dep) const;
+      const gtl::ArraySlice<Operation>& control_deps) const;
+  /// Same as above, but convenient to add control dependency on the operation
+  /// producing the control_dep output.
+  Scope WithControlDependencies(const Output& control_dep) const;
 
-  // Return a new scope. All ops created within the returned scope will have no
-  // control dependencies on other operations.
+  /// Return a new scope. All ops created within the returned scope will have no
+  /// control dependencies on other operations.
   Scope WithNoControlDependencies() const;
 
-  // Return a new scope. All ops created within the returned scope will have the
-  // device field set to 'device'.
+  /// Return a new scope. All ops created within the returned scope will have
+  /// the device field set to 'device'.
   Scope WithDevice(const string& device) const;
 
-  // Return a new scope. All ops created within the returned scope will be
-  // co-located on the device where op is placed.
-  // NOTE: This function is intended to be use internal libraries only for
-  // controlling placement of ops on to devices. Public use is not encouraged
-  // because the implementation of device placement is subject to change.
-  Scope ColocateWith(const ops::Operation& op) const;
-  // Convenience function for above.
-  Scope ColocateWith(const ops::Output& out) const {
-    return ColocateWith(out.op());
-  }
-  // Clear all colocation constraints.
+  /// Return a new scope. All ops created within the returned scope will be
+  /// co-located on the device where op is placed.
+  /// NOTE: This function is intended to be used by internal libraries only for
+  /// controlling placement of ops onto devices. Public use is not encouraged
+  /// because the implementation of device placement is subject to change.
+  Scope ColocateWith(const Operation& op) const;
+  /// Convenience function for above.
+  Scope ColocateWith(const Output& out) const { return ColocateWith(out.op()); }
+  /// Clear all colocation constraints.
   Scope ClearColocation() const;
 
-  // Return a new scope. The op-constructor functions taking the returned scope
-  // as the scope argument will exit as soon as an error is detected, instead of
-  // setting the status on the scope.
+  /// Return a new scope. The op-constructor functions taking the returned scope
+  /// as the scope argument will exit as soon as an error is detected, instead
+  /// of setting the status on the scope.
   Scope ExitOnError() const;
 
-  // Return a new scope. All ops created with the new scope will have
-  // kernel_label as the value for their '_kernel' attribute;
+  /// Return a new scope. All ops created with the new scope will have
+  /// kernel_label as the value for their '_kernel' attribute.
   Scope WithKernelLabel(const string& kernel_label) const;
 
   // The following functions are for scope object consumers.
 
-  // Return a unique name, using default_name if an op name has not been
-  // specified.
+  /// Return a unique name, using default_name if an op name has not been
+  /// specified.
   string GetUniqueNameForOp(const string& default_name) const;
 
-  // Update the status on this scope.
-  // Note: The status object is shared between all children of this scope.
-  // If the resulting status is not Status::OK() and exit_on_error_ is set on
-  // this scope, this function exits by calling LOG(FATAL).
+  /// Update the status on this scope.
+  /// Note: The status object is shared between all children of this scope.
+  /// If the resulting status is not Status::OK() and exit_on_error_ is set on
+  /// this scope, this function exits by calling LOG(FATAL).
   void UpdateStatus(const Status s) const;
 
-  // Update the builder with properties accumulated in this scope.
+  // START_SKIP_DOXYGEN
+
+  /// Update the builder with properties accumulated in this scope.
   void UpdateBuilder(NodeBuilder* builder) const;
+  // END_SKIP_DOXYGEN
 
   CompositeOpScopes GetCompositeOpScopes(const string& composite_op_name) const;
 
@@ -169,23 +176,24 @@ class Scope {
 
   Status status() const { return *status_; }
 
-  // If status() is Status::OK(), convert the Graph object stored in this scope
-  // to a GraphDef proto and return Status::OK(). Otherwise, return the error
-  // status as is without performing GraphDef conversion.
+  /// If status() is Status::OK(), convert the Graph object stored in this scope
+  /// to a GraphDef proto and return Status::OK(). Otherwise, return the error
+  /// status as is without performing GraphDef conversion.
   Status ToGraphDef(GraphDef* gdef) const;
 
-  // If status() is Status::OK(), construct a Graph object using the default
-  // GraphConstructorOptions, and return Status::OK if graph construction was
-  // successful. Otherwise, return the error status.
+  // START_SKIP_DOXYGEN
+
+  /// If status() is Status::OK(), construct a Graph object using the default
+  /// GraphConstructorOptions, and return Status::OK if graph construction was
+  /// successful. Otherwise, return the error status.
   // TODO(josh11b, keveman): Make this faster; right now it converts
   // Graph->GraphDef->Graph.  This cleans up the graph (e.g. adds
   // edges from the source and to the sink node, resolves back edges
   // by name), and makes sure the resulting graph is valid.
   Status ToGraph(Graph* g) const;
+  // END_SKIP_DOXYGEN
 
-  const std::vector<ops::Operation>& control_deps() const {
-    return control_deps_;
-  }
+  const std::vector<Operation>& control_deps() const { return control_deps_; }
 
  private:
   // Tag types to choose the constructor to dispatch.
@@ -214,16 +222,16 @@ class Scope {
   Scope(const Scope& other, Tags::OpName, const string& name,
         const string& op_name);
   Scope(const Scope& other, Tags::ControlDeps,
-        std::vector<ops::Operation> control_deps, bool clear_control_deps);
+        std::vector<Operation> control_deps, bool clear_control_deps);
   Scope(const Scope& other, Tags::Device, const string& device);
   Scope(const Scope& other, Tags::SingleUseScope, const string& op_name);
   Scope(const Scope& other, Tags::ExitOnError);
   Scope(const Scope& other, Tags::KernelLabel, const string& kernel_label);
-  Scope(const Scope& other, Tags::Colocate,
-        const ops::Operation& colocate_with_op, bool clear_colocations);
+  Scope(const Scope& other, Tags::Colocate, const Operation& colocate_with_op,
+        bool clear_colocations);
 
   std::unordered_set<string> GetColocationConstraints(
-      const ops::Operation& colocate_with_op) const;
+      const Operation& colocate_with_op) const;
 
   // Helper functions to get a unique names.
   string GetUniqueName(const string& prefix, bool check_single_use) const;
@@ -245,7 +253,7 @@ class Scope {
   // GetUniqueNameForOp will cause an error status to be set on this scope.
   std::shared_ptr<bool> scope_used_ = nullptr;
 
-  const std::vector<ops::Operation> control_deps_;
+  const std::vector<Operation> control_deps_;
 
   const string name_ = "";
   const string op_name_ = "";
@@ -255,13 +263,13 @@ class Scope {
   const std::unordered_set<string> colocation_constraints_;
 };
 
-// A helper struct to hold the scopes that would be used by a function
-// constructing a composite op.
+/// A helper struct to hold the scopes that would be used by a function
+/// constructing a composite op.
 struct CompositeOpScopes {
-  // Scope to be used for creating the local ops (primitive or other composite
-  // ops).
+  /// Scope to be used for creating the local ops (primitive or other composite
+  /// ops).
   Scope child;
-  // Scope to be used for creating the last op.
+  /// Scope to be used for creating the last op.
   Scope last;
 };
 
diff --git a/tensorflow/cc/framework/scope_test.cc b/tensorflow/cc/framework/scope_test.cc
index 3882b5623b..9eca9d3fac 100644
--- a/tensorflow/cc/framework/scope_test.cc
+++ b/tensorflow/cc/framework/scope_test.cc
@@ -127,11 +127,11 @@ TEST(ScopeTest, SingleUseScope) {
 
 TEST(ScopeTest, ControlDeps) {
   Scope root = Scope::NewRootScope();
-  auto c1 = ops::Operation();
-  auto c2 = ops::Operation();
+  auto c1 = Operation();
+  auto c2 = Operation();
   Scope c = root.WithControlDependencies({c1, c2});
   EXPECT_EQ(c.control_deps().size(), 2);
-  Scope c_c = c.WithControlDependencies({ops::Operation()});
+  Scope c_c = c.WithControlDependencies({Operation()});
   EXPECT_EQ(c_c.control_deps().size(), 3);
 }
 
diff --git a/tensorflow/cc/framework/testutil.cc b/tensorflow/cc/framework/testutil.cc
index 58afc6b979..b0746913a1 100644
--- a/tensorflow/cc/framework/testutil.cc
+++ b/tensorflow/cc/framework/testutil.cc
@@ -20,8 +20,6 @@ limitations under the License.
 #include "tensorflow/core/graph/default_device.h"
 
 namespace tensorflow {
-using namespace ops;  // NOLINT(build/namespaces)
-
 namespace test {
 
 void GetTensors(const Scope& scope, OutputList tensors,
diff --git a/tensorflow/cc/framework/testutil.h b/tensorflow/cc/framework/testutil.h
index 5e67ede6ab..d027ad3744 100644
--- a/tensorflow/cc/framework/testutil.h
+++ b/tensorflow/cc/framework/testutil.h
@@ -22,12 +22,12 @@ limitations under the License.
 namespace tensorflow {
 namespace test {
 
-// Computes the outputs listed in 'tensors', returns the tensors in 'out'.
-void GetTensors(const Scope& scope, ops::OutputList tensors,
+/// Computes the outputs listed in 'tensors', returns the tensors in 'out'.
+void GetTensors(const Scope& scope, OutputList tensors,
                 std::vector<Tensor>* out);
 
-// Computes the output 'tensor', returning the resulting tensor in 'out'.
-void GetTensor(const Scope& scope, ops::Output tensor, Tensor* out);
+/// Computes the output 'tensor', returning the resulting tensor in 'out'.
+void GetTensor(const Scope& scope, Output tensor, Tensor* out);
 
 }  // namespace test
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/grad_testutil.h b/tensorflow/cc/gradients/grad_testutil.h
index 7a925f9b0e..d31f412754 100644
--- a/tensorflow/cc/gradients/grad_testutil.h
+++ b/tensorflow/cc/gradients/grad_testutil.h
@@ -22,12 +22,12 @@ limitations under the License.
 namespace tensorflow {
 namespace test {
 
-// Calls the gradient function registered for 'op', adding gradient operations
-// to the graph associated with 'scope'. Gradient outputs for each 'op' input
-// are returned in 'grad_outputs'.
-Status CallGradFunction(const Scope& scope, const ops::Operation& op,
-                        const std::vector<ops::Output>& grad_inputs,
-                        std::vector<ops::Output>* grad_outputs);
+/// Calls the gradient function registered for 'op', adding gradient operations
+/// to the graph associated with 'scope'. Gradient outputs for each 'op' input
+/// are returned in 'grad_outputs'.
+Status CallGradFunction(const Scope& scope, const Operation& op,
+                        const std::vector<Output>& grad_inputs,
+                        std::vector<Output>* grad_outputs);
 
 }  // namespace test
 }  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h
index 654e765170..7f2d560978 100644
--- a/tensorflow/cc/saved_model/constants.h
+++ b/tensorflow/cc/saved_model/constants.h
@@ -18,25 +18,25 @@ limitations under the License.
 
 namespace tensorflow {
 
-// SavedModel assets directory.
+/// SavedModel assets directory.
 constexpr char kSavedModelAssetsDirectory[] = "assets";
 
-// SavedModel assets key for graph collection-def.
+/// SavedModel assets key for graph collection-def.
 constexpr char kSavedModelAssetsKey[] = "saved_model_assets";
 
-// SavedModel proto filename.
+/// SavedModel proto filename.
 constexpr char kSavedModelFilenamePb[] = "saved_model.pb";
 
-// SavedModel text format proto filename.
+/// SavedModel text format proto filename.
 constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt";
 
-// SavedModel legacy init op key.
+/// SavedModel legacy init op key.
 constexpr char kSavedModelLegacyInitOpKey[] = "legacy_init_op";
 
-// Directory in which to save the SavedModel variables.
+/// Directory in which to save the SavedModel variables.
 constexpr char kSavedModelVariablesDirectory[] = "variables";
 
-// SavedModel variables filename.
+/// SavedModel variables filename.
 constexpr char kSavedModelVariablesFilename[] = "variables";
 
 }  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/loader.h b/tensorflow/cc/saved_model/loader.h
index 10157b0a99..9b9abdbb1f 100644
--- a/tensorflow/cc/saved_model/loader.h
+++ b/tensorflow/cc/saved_model/loader.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// SavedModel loading functions and SavedModelBundle struct.
+/// SavedModel loading functions and SavedModelBundle struct.
 
 #ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
 #define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
@@ -27,13 +27,13 @@ limitations under the License.
 
 namespace tensorflow {
 
-// SavedModel representation once the SavedModel is loaded from storage.
+/// SavedModel representation once the SavedModel is loaded from storage.
 struct SavedModelBundle {
   std::unique_ptr<Session> session;
   MetaGraphDef meta_graph_def;
 
-  // A TensorFlow Session does not Close itself on destruction. To avoid
-  // resource leaks, we explicitly call Close on Sessions that we create.
+  /// A TensorFlow Session does not Close itself on destruction. To avoid
+  /// resource leaks, we explicitly call Close on Sessions that we create.
   ~SavedModelBundle() {
     if (session) {
       session->Close();
@@ -43,20 +43,20 @@ struct SavedModelBundle {
   SavedModelBundle() = default;
 };
 
-// Loads a SavedModel from the specified export directory. The meta graph def to
-// be loaded is identified by the supplied tags, corresponding exactly to the
-// set of tags used at SavedModel build time. Returns a SavedModel bundle with a
-// session and the requested meta graph def, if found.
+/// Loads a SavedModel from the specified export directory. The meta graph def
+/// to be loaded is identified by the supplied tags, corresponding exactly to
+/// the set of tags used at SavedModel build time. Returns a SavedModel bundle
+/// with a session and the requested meta graph def, if found.
 Status LoadSavedModel(const SessionOptions& session_options,
                       const RunOptions& run_options, const string& export_dir,
                       const std::unordered_set<string>& tags,
                       SavedModelBundle* const bundle);
 
-// Checks whether the provided directory could contain a SavedModel. Note that
-// the method does not load any data by itself. If the method returns `false`,
-// the export directory definitely does not contain a SavedModel. If the method
-// returns `true`, the export directory may contain a SavedModel but provides no
-// guarantee that it can be loaded.
+/// Checks whether the provided directory could contain a SavedModel. Note that
+/// the method does not load any data by itself. If the method returns `false`,
+/// the export directory definitely does not contain a SavedModel. If the method
+/// returns `true`, the export directory may contain a SavedModel but provides
+/// no guarantee that it can be loaded.
 bool MaybeSavedModelDirectory(const string& export_dir);
 
 }  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/signature_constants.h b/tensorflow/cc/saved_model/signature_constants.h
index 5a784874cd..b2d39bd55b 100644
--- a/tensorflow/cc/saved_model/signature_constants.h
+++ b/tensorflow/cc/saved_model/signature_constants.h
@@ -18,48 +18,48 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Key in the signature def map for `default` serving signatures. The default
-// signature is used in inference requests where a specific signature was not
-// specified.
+/// Key in the signature def map for `default` serving signatures. The default
+/// signature is used in inference requests where a specific signature was not
+/// specified.
 static constexpr char kDefaultServingSignatureDefKey[] = "serving_default";
 
 ////////////////////////////////////////////////////////////////////////////////
-// Classification API constants.
+/// Classification API constants.
 
-// Classification inputs.
+/// Classification inputs.
 static constexpr char kClassifyInputs[] = "inputs";
 
-// Classification method name used in a SignatureDef.
+/// Classification method name used in a SignatureDef.
 static constexpr char kClassifyMethodName[] = "tensorflow/serving/classify";
 
-// Classification classes output.
+/// Classification classes output.
 static constexpr char kClassifyOutputClasses[] = "classes";
 
-// Classification scores output.
+/// Classification scores output.
 static constexpr char kClassifyOutputScores[] = "scores";
 
 ////////////////////////////////////////////////////////////////////////////////
-// Predict API constants.
+/// Predict API constants.
 
-// Predict inputs.
+/// Predict inputs.
 static constexpr char kPredictInputs[] = "inputs";
 
-// Predict method name used in a SignatureDef.
+/// Predict method name used in a SignatureDef.
 static constexpr char kPredictMethodName[] = "tensorflow/serving/predict";
 
-// Predict outputs.
+/// Predict outputs.
 static constexpr char kPredictOutputs[] = "outputs";
 
 ////////////////////////////////////////////////////////////////////////////////
-// Regression API constants.
+/// Regression API constants.
 
-// Regression inputs.
+/// Regression inputs.
 static constexpr char kRegressInputs[] = "inputs";
 
-// Regression method name used in a SignatureDef.
+/// Regression method name used in a SignatureDef.
 static constexpr char kRegressMethodName[] = "tensorflow/serving/regress";
 
-// Regression outputs.
+/// Regression outputs.
 static constexpr char kRegressOutputs[] = "outputs";
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/tensorflow/cc/saved_model/tag_constants.h b/tensorflow/cc/saved_model/tag_constants.h
index 8c4d12a57f..48ab1158e4 100644
--- a/tensorflow/cc/saved_model/tag_constants.h
+++ b/tensorflow/cc/saved_model/tag_constants.h
@@ -18,10 +18,10 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Tag for the `serving` graph.
+/// Tag for the `serving` graph.
 constexpr char kSavedModelTagServe[] = "serve";
 
-// Tag for the `training` graph.`
+/// Tag for the `training` graph.
 constexpr char kSavedModelTagTrain[] = "train";
 
 }  // namespace tensorflow
diff --git a/tensorflow/cc/training/coordinator.h b/tensorflow/cc/training/coordinator.h
index 58e95f40f6..dbcf072015 100644
--- a/tensorflow/cc/training/coordinator.h
+++ b/tensorflow/cc/training/coordinator.h
@@ -28,77 +28,77 @@ limitations under the License.
 
 namespace tensorflow {
 
-// The abstract interface for runners which must implement the Join function.
+/// The abstract interface for runners which must implement the Join function.
 class RunnerInterface {
  public:
   virtual ~RunnerInterface() {}
   virtual Status Join() = 0;
 
-  // Returns true iff the runner is running, i.e. if it is trying to populate
-  // its queue.
+  /// Returns true iff the runner is running, i.e. if it is trying to populate
+  /// its queue.
   virtual bool IsRunning() const = 0;
 };
 
-// Coordinator class manages the termination of a collection of QueueRunners.
-// Without a coordinator, QueueRunners have to be joined in a specific order;
-// otherwise the QueueRunner::Join() could sometimes hang. The
-// Coordinator::RequestStop() plays the key role which notifies all running
-// threads under a coordinator to stop. This function could be called by any
-// thread or any client.
-// Usage, in the client:
-//   Coordinator coord;
-//   std::unique_ptr<QueueRunner> qr(&coord, ...);
-//   qr.Start(session);
-//   coord.RegisterRunner(std::move(qr));
-//   // do some work
-//   TF_CHECK_OK(coord.Join());
-// In each thread of QueueRunner, the coordinator needs to be used as:
-//   void Run() {
-//     while (!coord->ShouldStop()) {
-//       // do some work
-//       if (error) {
-//         coord->RequestStop();
-//         coord->ReportStatus(error_status);
-//       }
-//     }
-//   }
+/// Coordinator class manages the termination of a collection of QueueRunners.
+/// Without a coordinator, QueueRunners have to be joined in a specific order;
+/// otherwise the QueueRunner::Join() could sometimes hang. The
+/// Coordinator::RequestStop() plays the key role which notifies all running
+/// threads under a coordinator to stop. This function could be called by any
+/// thread or any client.
+/// Usage, in the client:
+///   Coordinator coord;
+///   std::unique_ptr<QueueRunner> qr(&coord, ...);
+///   qr.Start(session);
+///   coord.RegisterRunner(std::move(qr));
+///   // do some work
+///   TF_CHECK_OK(coord.Join());
+/// In each thread of QueueRunner, the coordinator needs to be used as:
+///   void Run() {
+///     while (!coord->ShouldStop()) {
+///       // do some work
+///       if (error) {
+///         coord->RequestStop();
+///         coord->ReportStatus(error_status);
+///       }
+///     }
+///   }
 class Coordinator {
  public:
   Coordinator();
 
-  // Constructor with a list of error codes which would not be taken as errors
-  // in status reporting.
+  /// Constructor with a list of error codes which would not be taken as errors
+  /// in status reporting.
   Coordinator(const std::vector<error::Code>& clean_stop_errors);
 
-  // In the destructor, RequestStop() and Join() would be called.
+  /// In the destructor, RequestStop() and Join() would be called.
   ~Coordinator();
 
-  // Registers a runner, i.e. a unit of running threads which is usually a
-  // QueueRunner. It takes the ownership of runner to avoid lifecycle-related
-  // problems. Note, the coordinator would not start these threads; they are
-  // supposed to be in running state when they are registered here.
+  /// Registers a runner, i.e. a unit of running threads which is usually a
+  /// QueueRunner. It takes the ownership of runner to avoid lifecycle-related
+  /// problems. Note, the coordinator would not start these threads; they are
+  /// supposed to be in running state when they are registered here.
   Status RegisterRunner(std::unique_ptr<RunnerInterface> runner);
 
-  // Returns true iff all the registered runners have been stopped.
+  /// Returns true iff all the registered runners have been stopped.
   bool AllRunnersStopped();
 
-  // Requests all running threads to stop.
+  /// Requests all running threads to stop.
   Status RequestStop();
 
-  // Returns true if its RequestStop() has been called.
+  /// Returns true if its RequestStop() has been called.
   bool ShouldStop();
 
-  // Joins all threads, returns OK or the first reported and unexpected status.
+  /// Joins all threads, returns OK or the first reported and unexpected status.
   Status Join();
 
-  // Reports status to the coordinator. This is usually called by threads.
+  /// Reports status to the coordinator. This is usually called by threads.
   void ReportStatus(const Status& status);
 
-  // Returns the latest status.
+  /// Returns the latest status.
   Status GetStatus();
 
-  // Returns immediately if the coordinator is stopped or blocks until
-  // RequestStop() is called.
+  /// Returns immediately if the coordinator is stopped or blocks until
+  /// RequestStop() is called.
   void WaitForStop();
 
  private:
diff --git a/tensorflow/cc/training/queue_runner.h b/tensorflow/cc/training/queue_runner.h
index e5aae8219f..bfe6a30593 100644
--- a/tensorflow/cc/training/queue_runner.h
+++ b/tensorflow/cc/training/queue_runner.h
@@ -32,46 +32,46 @@ limitations under the License.
 
 namespace tensorflow {
 
-// QueueRunner class imitates the behavior of the python version of QueueRunner
-// which creates a thread for each enqueue op, runs close op on completion.
+/// QueueRunner class imitates the behavior of the python version of QueueRunner
+/// which creates a thread for each enqueue op, runs close op on completion.
 class QueueRunner : public RunnerInterface {
  public:
-  // Creates a new QueueRunner from proto.
+  /// Creates a new QueueRunner from proto.
   // TODO(yuefengz): we may want to initialize from queues and ops in the
   // future.
   static Status New(const QueueRunnerDef& queue_runner_def,
                     std::unique_ptr<QueueRunner>* result);
 
-  // Creates a new QueueRunner with a coordinator, see coordinator.h for usage.
+  /// Creates a new QueueRunner with a coordinator, see coordinator.h for usage.
   static Status New(const QueueRunnerDef& queue_runner_def, Coordinator* coord,
                     std::unique_ptr<QueueRunner>* result);
 
-  // Adds a callback that the queue runner will call when it detects an error.
+  /// Adds a callback that the queue runner will call when it detects an error.
   void AddErrorCallback(const std::function<void(Status)>& cb);
 
-  // Delete the previously registered callbacks.
+  /// Delete the previously registered callbacks.
   void ClearErrorCallbacks();
 
-  // The destructor would join all the threads.
+  /// The destructor would join all the threads.
   ~QueueRunner();
 
-  // Starts the queue runner with the given session.
+  /// Starts the queue runner with the given session.
   Status Start(Session* sess);
 
-  // Starts the queue runner with the given session, and wait for up to the
-  // specified time (in milliseconds) for the queues to start to fill up.
+  /// Starts the queue runner with the given session, and wait for up to the
+  /// specified time (in milliseconds) for the queues to start to fill up.
   Status Start(Session* sess, int wait_for_ms);
 
-  // Requests to stop and runs the cancel op. It would be called in a separate
-  // thread when coordinator is set. If there is no coordinator it should be
-  // called before calling Join.
+  /// Requests to stop and runs the cancel op. It would be called in a separate
+  /// thread when coordinator is set. If there is no coordinator it should be
+  /// called before calling Join.
   void Stop(Session* sess);
 
-  // Joins all the threads. Returns okay if all threads run successfully;
-  // otherwise returns the first captured failure status.
+  /// Joins all the threads. Returns okay if all threads run successfully;
+  /// otherwise returns the first captured failure status.
   Status Join() final;
 
-  // Returns the latest status.
+  /// Returns the latest status.
   Status GetStatus();
 
  private:
diff --git a/tensorflow/compiler/jit/graph_to_functiondef_test.cc b/tensorflow/compiler/jit/graph_to_functiondef_test.cc
index df45f455a9..04b2385c9c 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef_test.cc
+++ b/tensorflow/compiler/jit/graph_to_functiondef_test.cc
@@ -50,8 +50,7 @@ TEST(GraphToFunctionDefTest, Basics) {
   auto d = ops::Add(root.WithOpName("D"), a, b);
   auto e = ops::Add(root.WithOpName("b"), d, c);
   auto f = ops::Neg(root.WithOpName("h"), e);
-  auto g =
-      ops::AddN(root.WithOpName("G"), std::initializer_list<ops::Output>{e, f});
+  auto g = ops::AddN(root.WithOpName("G"), std::initializer_list<Output>{e, f});
   auto h = ops::_Retval(root.WithOpName("H"), g, 0);
 
   GraphDef graph_def;
diff --git a/tensorflow/compiler/jit/xla_local_launch_op.cc b/tensorflow/compiler/jit/xla_local_launch_op.cc
index 7945e057cf..acf2ccb890 100644
--- a/tensorflow/compiler/jit/xla_local_launch_op.cc
+++ b/tensorflow/compiler/jit/xla_local_launch_op.cc
@@ -45,6 +45,9 @@ REGISTER_OP("_XlaLaunch")
     .Output("results: Tresults")
     .Attr("Tresults: list(type) >= 0")
     .Attr("function: func")
+    // XLA random-number generation ops are stateful.
+    // TODO(phawkins): create stateful and non-stateful variants of _XlaLaunch.
+    .SetIsStateful()
     .Doc("XLA Launch Op. For use by the XLA JIT only.");
 
 // Adapter class that wraps a Tensorflow allocator as an XLA allocator.
@@ -313,9 +316,10 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
       }
       Tensor output_tensor;
       // Looks up the owning Tensor by buffer address.
-      OP_REQUIRES_OK(ctx, xla_allocator.MakeTensorFromBuffer(
-                              buffer, ctx->expected_output_dtype(i), shape,
-                              &output_tensor));
+      OP_REQUIRES_OK(
+          ctx,
+          xla_allocator.MakeTensorFromBuffer(
+              buffer, ctx->expected_output_dtype(i), shape, &output_tensor));
       ctx->set_output(i, output_tensor);
       ++output_num;
     }
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index b4f01de4f2..5c78ab7061 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -180,6 +180,20 @@ tf_xla_py_test(
 )
 
 tf_xla_py_test(
+    name = "random_ops_test",
+    size = "small",
+    srcs = ["random_ops_test.py"],
+    # TODO(b/31361304): enable RNG ops on GPU when parallelized.
+    disabled_backends = ["gpu"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+    ],
+)
+
+tf_xla_py_test(
     name = "reduce_ops_test",
     size = "medium",
     srcs = ["reduce_ops_test.py"],
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 7fb8e0a26d..820db13d0b 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -9,7 +9,7 @@ def all_backends():
     return ["cpu"]
 
 def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None,
-                   backends=None, **kwargs):
+                   disabled_backends=None, **kwargs):
   """Generates py_test targets, one per XLA backend.
 
   This rule generates py_test() targets named name_backend, for each backend
@@ -31,15 +31,16 @@ def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None,
     tags: Tags to apply to the generated targets.
     data: Data dependencies of the target.
     main: Same as py_test's main attribute.
-    backends: A list of backends to test. Supported values include "cpu" and
-      "gpu". If not specified, defaults to all backends.
+    disabled_backends: A list of backends that should not be tested. Supported
+      values include "cpu" and "gpu". If not specified, nothing is disabled.
     **kwargs: keyword arguments passed onto the generated py_test() rules.
   """
-  if backends == None:
-    backends = all_backends()
+  if disabled_backends == None:
+    disabled_backends = []
 
+  enabled_backends = [b for b in all_backends() if b not in disabled_backends]
   test_names = []
-  for backend in backends:
+  for backend in enabled_backends:
     test_name = "{}_{}".format(name, backend)
     backend_tags = ["tf_xla_{}".format(backend)]
     backend_args = []
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
new file mode 100644
index 0000000000..31173c717d
--- /dev/null
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -0,0 +1,67 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for random-number generation ops in the XLA JIT compiler."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import googletest
+
+
+class RandomOpsTest(XLATestCase):
+  """Test cases for random-number generating operators."""
+
+  def _testRngIsNotConstant(self, rng, dtype):
+    # Tests that 'rng' does not always return the same value.
+    with self.test_session() as sess:
+      with self.test_scope():
+        x = rng(dtype)
+
+      # A correctly working random-number generator should only rarely produce
+      # the same output on repeated runs.
+      y = sess.run(x)
+      z = sess.run(x)
+      w = sess.run(x)
+
+      # We use exact equality here. If the random-number generator is producing
+      # deterministic output, all three outputs will be bitwise identical.
+      self.assertTrue((not np.array_equal(y, z)) or
+                      (not np.array_equal(z, w)) or
+                      (not np.array_equal(y, w)))
+
+  def testRandomUniformIsNotConstant(self):
+    def rng(dtype):
+      return random_ops.random_uniform(shape=[2], dtype=dtype,
+                                       maxval=1000000)
+    for dtype in self.numeric_types:
+      self._testRngIsNotConstant(rng, dtype)
+
+  def testRandomNormalIsNotConstant(self):
+    def rng(dtype):
+      return random_ops.random_normal(shape=[2], dtype=dtype)
+
+    # TODO(b/34339814): implement inverse erf support for non-F32 types.
+    dtype = dtypes.float32
+    self._testRngIsNotConstant(rng, dtype)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 299b5e98c0..10b4a6d054 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -55,8 +55,6 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:stream_executor_no_cuda",
-        "//tensorflow/core:tensorflow_opensource",
-        "//tensorflow/core/kernels:cwise_op",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/compiler/tf2xla/op_registrations.cc b/tensorflow/compiler/tf2xla/op_registrations.cc
index d8a4dad4b3..d1a7abb22c 100644
--- a/tensorflow/compiler/tf2xla/op_registrations.cc
+++ b/tensorflow/compiler/tf2xla/op_registrations.cc
@@ -59,9 +59,10 @@ REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT,
                     Name("Ceil").TypeConstraint("T", kCpuFloatTypes));
 REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT,
                     Name("Concat").TypeConstraint("T", kCpuAllTypes));
-REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("ConcatV2")
-                                            .TypeConstraint("T", kCpuAllTypes)
-                                            .TypeConstraint("Tidx", DT_INT32));
+REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT,
+                    Name("ConcatV2")
+                        .TypeConstraint("T", kCpuAllTypes)
+                        .TypeConstraint("Tidx", DT_INT32));
 REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("ConcatOffset"));
 REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT,
                     Name("Conv2D").TypeConstraint("T", kCpuFloatTypes));
@@ -165,8 +166,11 @@ REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT,
                     Name("Prod").TypeConstraint("T", kCpuNumericTypes));
 REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT,
                     Name("Range").TypeConstraint("Tidx", kCpuNumericTypes));
-// TODO(b/31361304): disabled because of XLA bugs.
-// REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("RandomStandardNormal"));
+// TODO(b/34339814): implement inverse erf for double types and update the
+// type constraint.
+REGISTER_XLA_KERNEL(
+    DEVICE_CPU_XLA_JIT,
+    Name("RandomStandardNormal").TypeConstraint("dtype", DT_FLOAT));
 REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("RandomUniform"));
 REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("RandomUniformInt"));
 REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("Rank"));
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index d291888a75..517eae2f5d 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -330,6 +330,8 @@ Status XlaCompiler::CompileGraph(string const& name,
       &result->computation, &result->requires_runtime_context,
       &compile_time_constants, &num_nonconst_outputs));
 
+  VLOG(2) << "Outputs: constant: " << compile_time_constants.size()
+          << " nonconstant: " << num_nonconst_outputs;
   result->outputs.resize(compile_time_constants.size() + num_nonconst_outputs);
   for (const auto& c : compile_time_constants) {
     if (!c.status.ok()) {
diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
index 7a966ce241..07bbcd802f 100644
--- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
+++ b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
@@ -127,7 +127,6 @@ static const char* binary_name;
 // Test that when we use both the environment variable and actual
 // commend line flags (when the latter is possible), the latter win.
 TEST(ParseFlagsFromEnv, EnvAndFlag) {
-  // TODO(m3b):  convert to Subprocess when CL 137771604 is finished.
   static struct {
     const char* env;
     const char* arg;
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index f03b158fa7..2465edc498 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -93,6 +93,38 @@ namespace xla {
       ComputationBuilder::CreateDefaultConvDimensionNumbers());
 }
 
+/* static */ std::unique_ptr<Array4D<float>>
+ReferenceUtil::SeparableConvArray4D(const Array4D<float>& input,
+                                    const Array4D<float>& depthwise_weights,
+                                    const Array4D<float>& pointwise_weights,
+                                    std::pair<int64, int64> kernel_stride,
+                                    Padding padding) {
+  const int64 depth_multiplier = depthwise_weights.planes();
+  CHECK_EQ(pointwise_weights.depth(), input.depth() * depth_multiplier);
+
+  // Combine the two weights by reducing the depth_multiplier, so that we can
+  // apply a single convolution on the combined weights.
+  Array4D<float> weights(pointwise_weights.planes(), input.depth(),
+                         depthwise_weights.height(), depthwise_weights.width());
+  for (int64 kx = 0; kx < depthwise_weights.width(); ++kx) {
+    for (int64 ky = 0; ky < depthwise_weights.height(); ++ky) {
+      for (int64 kz = 0; kz < input.depth(); ++kz) {
+        for (int64 out = 0; out < pointwise_weights.planes(); ++out) {
+          float weight = 0.0;
+          for (int64 depth = 0; depth < depth_multiplier; ++depth) {
+            weight +=
+                depthwise_weights(depth, kz, ky, kx) *
+                pointwise_weights(out, depth + kz * depth_multiplier, 0, 0);
+          }
+          weights(out, kz, ky, kx) = weight;
+        }
+      }
+    }
+  }
+
+  return ConvArray4D(input, weights, kernel_stride, padding);
+}
+
 /* static */ int64 ReferenceUtil::WindowCount(int64 unpadded_width,
                                               int64 window_len, int64 stride,
                                               Padding padding) {
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index 27421b2ac4..d19d5f9dbb 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -73,6 +73,15 @@ class ReferenceUtil {
       std::pair<int64, int64> lhs_dilation,
       std::pair<int64, int64> rhs_dilation, ConvolutionDimensionNumbers dnums);
 
+  // Returns the result of a separable convolution with the given parameters.
+  // kernel_stride and padding apply to the depthwise convolution during
+  // the separable convolution. pointwise_weights.depth() must be equal to
+  // input.depth() * depthwise_weights.planes().
+  static std::unique_ptr<Array4D<float>> SeparableConvArray4D(
+      const Array4D<float>& input, const Array4D<float>& depthwise_weights,
+      const Array4D<float>& pointwise_weights,
+      std::pair<int64, int64> kernel_stride, Padding padding);
+
   // Returns the result of reducing a matrix to a column vector. init is the
   // initial value for the reduce operation, and reduce_function is the function
   // to apply for each reduction step.
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 1a87a0043a..4d118d2e4e 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -281,7 +281,7 @@ llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value,
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
                                                       llvm::Value* x) const {
   if (prim_type != F32) {
-    return Unimplemented("inverse erf");
+    return Unimplemented("inverse erf only implemented for F32 (b/34339814)");
   }
   auto getFloat = [&](const float f) {
     return llvm::ConstantFP::get(ir_builder_->getFloatTy(), f);
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 373ab79ab2..ac478afabc 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -115,6 +115,9 @@ class Executable {
 
   const HloModuleConfig& module_config() const { return *module_config_; }
 
+  // Returns whether this executable has an associated HloModuleConfig.
+  bool has_module_config() const { return module_config_ != nullptr; }
+
   // Returns the versioned computation handle of the computation computed by
   // this executable.
   const VersionedComputationHandle& entry_computation_handle() const {
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 9aeebe42f8..8353731fdd 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -365,6 +365,38 @@ cc_library(
 )
 
 cc_library(
+    name = "fusion_merger",
+    srcs = ["fusion_merger.cc"],
+    hdrs = ["fusion_merger.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_cost_analysis",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_test(
+    name = "fusion_merger_test",
+    srcs = ["fusion_merger_test.cc"],
+    deps = [
+        ":fusion_merger",
+        ":instruction_fusion",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
     name = "pad_insertion",
     srcs = ["pad_insertion.cc"],
     hdrs = ["pad_insertion.h"],
@@ -386,6 +418,7 @@ cc_library(
     deps = [
         ":convolution_folding",
         ":copy_insertion",
+        ":fusion_merger",
         ":gpu_executable",
         ":hlo_schedule",
         ":instruction_fusion",
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
new file mode 100644
index 0000000000..caa919b688
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -0,0 +1,270 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
+
+#include <algorithm>
+
+#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+
+// Traverses users of tuple shape, adding leaf instructions to 'instructions'.
+void MaybeResolveTupleElements(HloInstruction* instruction,
+                               std::vector<HloInstruction*>* instructions) {
+  if (ShapeUtil::IsTuple(instruction->shape())) {
+    for (auto tuple_user : instruction->users()) {
+      MaybeResolveTupleElements(tuple_user, instructions);
+    }
+  } else {
+    instructions->push_back(instruction);
+  }
+}
+
+// Returns the bytes read by fusion parameter 'param', by returning the byte
+// size of 'param' shape (or the cumulative byte sizes of all leaf tuple
+// elements if 'param' is tuple-shaped).
+// In the special case where all users of 'param' (or all users of a leaf
+// tuple element if 'param' is tuple-shaped) are Slice instructions, the size
+// of each slice instruction is accumulated instead, to give a more accurate
+// value for bytes read.
+double CalculateBytesReadByFusionParameter(HloInstruction* param) {
+  CHECK_EQ(HloOpcode::kParameter, param->opcode());
+
+  // Adds all leaf tuple elements to 'instructions' if 'param' is tuple-shaped.
+  // Adds 'param' to 'instructions' otherwise.
+  std::vector<HloInstruction*> instructions;
+  MaybeResolveTupleElements(param, &instructions);
+
+  // Iterate through 'instructions' accumulating byte sizes of each instruction
+  // shape. For each 'instruction' in 'instructions', if all users of
+  // 'instruction' are Slice instructions, accumulates the byte sizes of each
+  // Slice for a more accurate estimate of bytes read.
+  double bytes = 0.0;
+  for (auto& instruction : instructions) {
+    if (std::all_of(instruction->users().begin(), instruction->users().end(),
+                    [](const HloInstruction* instruction) {
+                      return instruction->opcode() == HloOpcode::kSlice ||
+                             instruction->opcode() == HloOpcode::kDynamicSlice;
+                    })) {
+      // All users are slice: accumulate bytes of all user slice instructions.
+      for (auto& user : instruction->users()) {
+        bytes += ShapeUtil::ByteSizeOf(user->shape());
+      }
+    } else {
+      // Some users are not slice: accumulate full size of 'instruction'.
+      bytes += ShapeUtil::ByteSizeOf(instruction->shape());
+    }
+  }
+  return bytes;
+}
+
+// Returns the bytes read by all fusion parameters of instruction 'fusion'.
+double CalculateBytesReadByFusionInstruction(HloInstruction* fusion) {
+  double bytes = 0.0;
+  for (const auto& fused_instruction : fusion->fused_instructions()) {
+    if (fused_instruction->opcode() != HloOpcode::kParameter) {
+      continue;
+    }
+    bytes += CalculateBytesReadByFusionParameter(fused_instruction.get());
+  }
+  return bytes;
+}
+
+// Returns the flops to bytes transferred ratio of instruction 'fusion'.
+double CalculateFlopsToBytesRatio(HloInstruction* fusion) {
+  CHECK_EQ(HloOpcode::kFusion, fusion->opcode());
+  // Calculate total bytes transferred in/out.
+  double bytes = CalculateBytesReadByFusionInstruction(fusion);
+  // Add bytes written to root instructions buffer.
+  bytes += ShapeUtil::ByteSizeOf(fusion->fused_expression_root()->shape());
+  // Calculate flops for all fused instructions.
+  HloCostAnalysis analysis;
+  TF_CHECK_OK(fusion->fused_expression_root()->Accept(&analysis));
+  // Return flops / bytes.
+  return bytes > 0.0 ? analysis.flop_count() / bytes : analysis.flop_count();
+}
+
+// Returns bytes transferred by instruction 'fusion', including the bytes
+// that would be read by all users.
+double GetCurrentBytesTransferred(HloInstruction* fusion) {
+  CHECK_EQ(HloOpcode::kFusion, fusion->opcode());
+  const double bytes_read = CalculateBytesReadByFusionInstruction(fusion);
+  const double bytes_written =
+      ShapeUtil::ByteSizeOf(fusion->fused_expression_root()->shape());
+  // Current bytes transferred (ignoring non 'fusion' user operands) is bytes
+  // read and written by 'fusion', plus reads of size 'bytes_written' for each
+  // user.
+  return bytes_read + bytes_written * (fusion->user_count() + 1);
+}
+
+// Returns bytes transferred if 'fusion' were to be merged into its users.
+double GetMergedBytesTransferred(HloInstruction* fusion) {
+  CHECK_EQ(HloOpcode::kFusion, fusion->opcode());
+  return CalculateBytesReadByFusionInstruction(fusion) * fusion->user_count();
+}
+
+}  // anonymous namespace
+
+// FusionInstructionMerger visits all fusion instructions in 'computation'
+// in post order, attempting to merge each into all of its users.
+// Accumulates and reports stats on successful/failed merge attempts.
+class FusionInstructionMerger {
+ public:
+  explicit FusionInstructionMerger(HloComputation* computation)
+      : computation_(computation) {}
+
+  Status Run();
+
+  bool changed() const { return changed_; }
+
+ private:
+  Status HandleFusion(HloInstruction* fusion);
+
+  HloComputation* computation_;
+  bool changed_ = false;
+
+  // Fusion instruction merge stats.
+  int total_visited_ = 0;
+  int total_merged_ = 0;
+  int num_fail_no_users_ = 0;
+  int num_fail_not_loop_fusion_ = 0;
+  int num_fail_merge_all_users_ = 0;
+  int num_fail_flops_to_byte_ratio_ = 0;
+  int num_fail_net_bytes_transferred_ratio_ = 0;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FusionInstructionMerger);
+};
+
+Status FusionInstructionMerger::Run() {
+  for (auto* instruction : computation_->MakeInstructionPostOrder()) {
+    if (instruction->opcode() == HloOpcode::kFusion) {
+      TF_RETURN_IF_ERROR(HandleFusion(instruction));
+    }
+  }
+
+  VLOG(1) << "FusionInstructionMerger EXIT"
+          << " computation: " << computation_->name()
+          << " total_visited: " << total_visited_
+          << " total_merged: " << total_merged_ << " merge failures { "
+          << " no_users: " << num_fail_no_users_
+          << " not_loop_fusion: " << num_fail_not_loop_fusion_
+          << " merge_all_users: " << num_fail_merge_all_users_
+          << " flops_to_byte_ratio: " << num_fail_flops_to_byte_ratio_
+          << " net_bytes_transferred: " << num_fail_net_bytes_transferred_ratio_
+          << " }";
+  return Status::OK();
+}
+
+Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
+  VLOG(3) << "FusionInstructionMerger ENTRY fusion: " << fusion->name()
+          << " flops_to_bytes_ratio: " << CalculateFlopsToBytesRatio(fusion);
+  ++total_visited_;
+  // Skip 'fusion' instruction if there are no users into which we can merge.
+  if (fusion->users().empty()) {
+    ++num_fail_no_users_;
+    return Status::OK();
+  }
+
+  // Skip 'fusion' instruction if it is not a loop fusion. Library fusion
+  // instructions match specific patterns, so they shouldn't be further fused.
+  // Input fusion instructions need to be rooted at a particular HLO (e.g.
+  // kReduce), so they shouldn't be further fused either.
+  if (fusion->fusion_kind() != HloInstruction::FusionKind::kLoop) {
+    ++num_fail_not_loop_fusion_;
+    return Status::OK();
+  }
+  // Skip 'fusion' instruction if we cannot merge into all of its users.
+  // Merging into all users enables the removal of 'fusion' from the
+  // computation.
+  if (!std::all_of(fusion->users().begin(), fusion->users().end(),
+                   [](const HloInstruction* instruction) {
+                     return instruction->opcode() == HloOpcode::kFusion &&
+                            instruction->fusion_kind() ==
+                                HloInstruction::FusionKind::kLoop;
+                   })) {
+    ++num_fail_merge_all_users_;
+    return Status::OK();
+  }
+  // Skip 'fusion' instruction if its flops to bytes transferred ratio
+  // exceeds the threshold value.
+  if (CalculateFlopsToBytesRatio(fusion) >
+      FusionMerger::GetThresholdFlopsToBytesRatio()) {
+    ++num_fail_flops_to_byte_ratio_;
+    return Status::OK();
+  }
+  // Skip 'fusion' instruction if merging it into all users would result in a
+  // net increase in bytes transferred (currently allowing the net bytes
+  // transferred to be exceeded up to ~10% in exchange for eliminating the
+  // overhead from a GPU kernel launch).
+  const double current_bytes_transferred = GetCurrentBytesTransferred(fusion);
+  const double merged_bytes_transferred = GetMergedBytesTransferred(fusion);
+  const double merged_to_current_bytes_ratio =
+      merged_bytes_transferred / std::max(1.0, current_bytes_transferred);
+  if (merged_to_current_bytes_ratio > 1.10) {
+    ++num_fail_net_bytes_transferred_ratio_;
+    return Status::OK();
+  }
+  // Merge fused instructions from 'fusion' into each user.
+  std::set<HloInstruction*> users = fusion->users();
+  for (HloInstruction* user : users) {
+    user->MergeFusionInstruction(fusion);
+    changed_ = true;
+  }
+  ++total_merged_;
+  VLOG(2) << "Merged fusion instruction: " << fusion->name()
+          << " flops_to_bytes_ratio: " << CalculateFlopsToBytesRatio(fusion)
+          << " merged_to_current_bytes_ratio: " << merged_to_current_bytes_ratio
+          << " into users { "
+          << tensorflow::str_util::Join(users, ", ",
+                                        [](string* out, HloInstruction* user) {
+                                          tensorflow::strings::StrAppend(
+                                              out, user->name());
+                                        })
+          << " }";
+  // Remove 'fusion' instruction.
+  CHECK_EQ(0, fusion->user_count());
+  computation_->RemoveInstruction(fusion);
+  return Status::OK();
+}
+
+StatusOr<bool> FusionMerger::Run(HloModule* module) {
+  bool changed = false;
+  VLOG(2) << "FusionMerger for module: " << module->name();
+  for (auto& computation : module->computations()) {
+    VLOG(1) << "Before running FusionInstructionMerger for computation: "
+            << computation->name();
+    XLA_VLOG_LINES(3, computation->ToString());
+
+    FusionInstructionMerger fusion_merger(computation.get());
+    TF_RETURN_IF_ERROR(fusion_merger.Run());
+    changed |= fusion_merger.changed();
+
+    VLOG(1) << "After running FusionInstructionMerger for computation: "
+            << computation->name() << " changed: " << changed;
+    XLA_VLOG_LINES(3, computation->ToString());
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.h b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
new file mode 100644
index 0000000000..717eb15b85
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
@@ -0,0 +1,47 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass.h"
+
+namespace xla {
+namespace gpu {
+
+// An HLO pass that attempts to merge fusion instructions to reduce kernel
+// launch overhead and improve data locality.
+//
+// Fusion instructions are merged into their users if two conditions are met:
+//
+// 1) The flops_to_bytes ratio of the fusion instruction is below the threshold
+//    value of 1.0.
+// 2) The result of merging the fusion instruction into its users would not
+//    increase bytes transferred by more than ~10%.
+//
+class FusionMerger : public HloPass {
+ public:
+  FusionMerger() : HloPass("fusion merger") {}
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+  static double GetThresholdFlopsToBytesRatio() { return 1.0; }
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
new file mode 100644
index 0000000000..a87e66ca86
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
@@ -0,0 +1,456 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
+
+#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class FusionMergerTest : public HloTestBase {
+ protected:
+  FusionMergerTest() : module_(TestName()) {}
+
+  // Builds the following computation:
+  //
+  //                 Param
+  //               /   |   \
+  //              /    |    \
+  //  OnesVec  GTE(0) GTE(1) GTE(2)
+  //       \   /         \   /
+  //        Add           Add  OnesVec
+  //         \           /  \  /
+  //           \      Add   Mul  OnesVec
+  //            \      |     |  /
+  //             \    Mul    Add
+  //              \    |    /
+  //               \   |   /
+  //                 Tuple
+  //
+  HloComputation* BuildComputation0() {
+    auto builder = HloComputation::Builder(TestName() + ".Computation0");
+    // Create param instruction to access computation state.
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, tuple_shape3_, "param"));
+
+    // Create GetTupleElement instructions for each tuple element.
+    auto gte0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, param, 0));
+    auto gte1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, param, 1));
+    auto gte2 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, param, 2));
+
+    // Create const vector of ones to be used in element-wise computations.
+    auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
+
+    // Create simple fusable computation for tuple element 0 (won't get merged).
+    auto out0 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kAdd, one_vec, gte0));
+
+    // Create fusable computation which is dependent on second and third tuple
+    // elements (will initially be fused on its own).
+    auto add1 = builder.AddInstruction(
+        HloInstruction::CreateBinary(data_shape_, HloOpcode::kAdd, gte1, gte2));
+
+    // Create two sub-computations, both of which are users of 'add1'.
+
+    // First sub-computation: out1 = Mul(Add(add1, one_vec), one_vec)
+    auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kAdd, add1, one_vec));
+    auto out1 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kMultiply, add2, one_vec));
+
+    // Second sub-computation: out2 = Add(Mul(add1, one_vec), one_vec)
+    auto mul0 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kMultiply, add1, one_vec));
+    auto out2 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kAdd, mul0, one_vec));
+
+    // Create output Tuple.
+    builder.AddInstruction(HloInstruction::CreateTuple({out0, out1, out2}));
+    return module_.AddEntryComputation(builder.Build());
+  }
+
+  // Builds the following computation:
+  //
+  //                 Param
+  //               /      \
+  //            GTE(0)   GTE(1)
+  //            | | \   /
+  //            | |  Mul
+  //             \  \ |
+  //              \  Mul
+  //               \ |
+  //      OnesVec   Mul  OnesVec
+  //             \  /  \ /
+  //     OnesVec  Add  Mul  OnesVec
+  //            \  |    |  /
+  //             Mul    Add
+  //               \    /
+  //                \  /
+  //                Tuple
+  //
+  HloComputation* BuildComputation1() {
+    auto builder = HloComputation::Builder(TestName() + ".Computation1");
+    // The state parameter is a 2-tuple of 'data_shape_' (fixture tuple_shape2_).
+    // Create param instruction to access computation state.
+    auto state = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, tuple_shape2_, "state"));
+
+    // Create shared sub-computation (will initially be fused on its own).
+    auto gte0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, state, 0));
+    auto gte1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, state, 1));
+    // Calculate the flops we need to generate for this shared computation
+    // to exceed the threshold flops_to_bytes_ratio.
+    // Note that bytes transferred is multiplied by 3 because there are two
+    // operands and one output of size 'data_shape_'.
+    const int64 flops_needed = FusionMerger::GetThresholdFlopsToBytesRatio() *
+                               ShapeUtil::ByteSizeOf(data_shape_) * 3;
+    const int64 vec_elements = ShapeUtil::ElementsIn(data_shape_);
+    const int64 iters = (flops_needed + vec_elements - 1) / vec_elements;
+
+    auto mul0 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kMultiply, gte0, gte1));
+    for (int i = 0; i < iters; ++i) {
+      mul0 = builder.AddInstruction(HloInstruction::CreateBinary(
+          data_shape_, HloOpcode::kMultiply, gte0, mul0));
+    }
+
+    // Create two sub-computations, both of which are users of 'mul0'.
+    auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
+
+    // First sub-computation: out0 = Mul(Add(mul0, one_vec), one_vec)
+    auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kAdd, mul0, one_vec));
+    auto out0 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kMultiply, add0, one_vec));
+
+    // Second sub-computation: out1 = Add(Mul(mul0, one_vec), one_vec)
+    auto mul1 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kMultiply, mul0, one_vec));
+    auto out1 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kAdd, mul1, one_vec));
+
+    // Create output Tuple.
+    builder.AddInstruction(HloInstruction::CreateTuple({out0, out1}));
+    return module_.AddEntryComputation(builder.Build());
+  }
+
+  // Builds the following computation:
+  //
+  //                Param
+  //             /   |   |  \
+  //            /    |   |   \
+  //           /     |   |    \
+  //      GTE(0) GTE(1) GTE(2) GTE(3)
+  //           \   /    /     /
+  //            Add    /     /
+  //              \   /     /
+  //               Add     /
+  //                 \    /
+  //                  \  /
+  //         OnesVec   Add  OnesVec
+  //                \  /  \ /
+  //        OnesVec  Add  Mul OnesVec
+  //              \  |    |  /
+  //               Mul    Add
+  //                 \    /
+  //                  \  /
+  //                  Tuple
+  //
+  HloComputation* BuildComputation2(bool add_extra_input) {
+    auto builder = HloComputation::Builder(TestName() + ".Computation2");
+    Shape state_shape = add_extra_input ? tuple_shape4_ : tuple_shape3_;
+    // Create param instruction to access computation state.
+    auto state = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, state_shape, "state"));
+
+    // Create GetTupleElement instructions for each tuple element.
+    auto gte0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, state, 0));
+    auto gte1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, state, 1));
+    auto gte2 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, state, 2));
+
+    // Create shared fusable computation that reduces its operands.
+    auto reduce0 = builder.AddInstruction(
+        HloInstruction::CreateBinary(data_shape_, HloOpcode::kAdd, gte0, gte1));
+    auto reduce_out = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kAdd, reduce0, gte2));
+    if (add_extra_input) {
+      auto gte3 = builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(data_shape_, state, 3));
+      reduce_out = builder.AddInstruction(HloInstruction::CreateBinary(
+          data_shape_, HloOpcode::kAdd, reduce_out, gte3));
+    }
+
+    // Create two fusable sub-computations which are dependent on shared
+    // computation 'reduce_out'.
+    auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
+
+    // First sub-computation: out0 = Mul(Add(reduce_out, one_vec), one_vec)
+    auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kAdd, reduce_out, one_vec));
+    auto out0 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kMultiply, add2, one_vec));
+
+    // Second sub-computation: out1 = Add(Mul(reduce_out, one_vec), one_vec)
+    auto mul0 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kMultiply, reduce_out, one_vec));
+    auto out1 = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kAdd, mul0, one_vec));
+
+    // Create output Tuple.
+    builder.AddInstruction(HloInstruction::CreateTuple({out0, out1}));
+    return module_.AddEntryComputation(builder.Build());
+  }
+
+  Shape data_shape_ = ShapeUtil::MakeShape(F32, {4});
+  Shape tuple_shape2_ = ShapeUtil::MakeTupleShape({data_shape_, data_shape_});
+  Shape tuple_shape3_ =
+      ShapeUtil::MakeTupleShape({data_shape_, data_shape_, data_shape_});
+  Shape tuple_shape4_ = ShapeUtil::MakeTupleShape(
+      {data_shape_, data_shape_, data_shape_, data_shape_});
+
+  HloModule module_;
+};
+
+// Tests that we can merge a fusion instruction that is below threshold.
+//
+// Original computation:
+//
+//                 Param
+//                /  |  \
+//               /   |   \
+//  OnesVec  GTE(0) GTE(1) GTE(2)
+//       \   /         \   /
+//        Add           Add  OnesVec
+//         \           /  \  /
+//           \      Add   Mul  OnesVec
+//            \      |     |  /
+//             \    Mul    Add
+//              \    |    /
+//               \   |   /
+//                 Tuple
+//
+// Computation after fusion passes:
+//
+//                  Param
+//                 /     \
+//            Fusion3    Fusion2
+//               |       /     \
+//                \ Fusion0  Fusion1
+//                 \    |   /
+//                  \   |  /
+//                   Tuple
+//
+// Computation after fusion merger pass (Fusion2 is merged into Fusion0 and
+// Fusion1):
+//                   Param
+//                 /   |   \
+//          Fusion3 Fusion0 Fusion1
+//                 \   |   /
+//                   Tuple
+//
+TEST_F(FusionMergerTest, MergeSharedFusionInstruction) {
+  auto computation = BuildComputation0();
+  // Run standard fusion passes.
+  EXPECT_TRUE(
+      GpuInstructionFusion(/*may_duplicate=*/false).Run(&module_).ValueOrDie());
+  EXPECT_FALSE(
+      GpuInstructionFusion(/*may_duplicate=*/true).Run(&module_).ValueOrDie());
+  // Run fusion merger pass, which should merge the shared fusion instruction
+  // into its two users.
+  EXPECT_TRUE(FusionMerger().Run(&module_).ValueOrDie());
+
+  auto* root = computation->root_instruction();
+  EXPECT_EQ(HloOpcode::kTuple, root->opcode());
+  // Check operand 0 (not merged). Should have 4 instructions.
+  auto* operand0 = root->operand(0);
+  EXPECT_EQ(HloOpcode::kFusion, operand0->opcode());
+  EXPECT_EQ(4, operand0->fused_instructions().size());
+  // Check operand 1 (should have merged in its operand fusion instruction).
+  auto* operand1 = root->operand(1);
+  EXPECT_EQ(HloOpcode::kFusion, operand1->opcode());
+  EXPECT_EQ(7, operand1->fused_instructions().size());
+  // Check operand 2 (should have merged in its operand fusion instruction).
+  auto* operand2 = root->operand(2);
+  EXPECT_EQ(HloOpcode::kFusion, operand2->opcode());
+  EXPECT_EQ(7, operand2->fused_instructions().size());
+}
+
+// Tests that we do not merge a fusion instruction that is above the flops to
+// bytes threshold.
+//
+// Original computation:
+//
+//                 Param
+//                /     \
+//            GTE(0)   GTE(1)
+//            | | \   /
+//            | |  Mul
+//             \  \ |
+//              \  Mul
+//               \ |
+//      OnesVec   Mul  OnesVec
+//             \  /  \ /
+//     OnesVec  Add  Mul  OnesVec
+//            \  |    |  /
+//             Mul    Add
+//               \    /
+//                \  /
+//                Tuple
+//
+// Computation after fusion passes and fusion merger pass (Fusion2 is not
+// merged because it exceeds the threshold flops to bytes ratio).
+//
+//                 Param
+//                   |
+//                Fusion2
+//                /     \
+//           Fusion0  Fusion1
+//                \    /
+//                 Tuple
+//
+TEST_F(FusionMergerTest, FlopsToBytesRatioThresholdExceeded) {
+  BuildComputation1();
+  // Run standard fusion passes.
+  EXPECT_TRUE(
+      GpuInstructionFusion(/*may_duplicate=*/false).Run(&module_).ValueOrDie());
+  EXPECT_FALSE(
+      GpuInstructionFusion(/*may_duplicate=*/true).Run(&module_).ValueOrDie());
+  // Run fusion merger pass, which should detect that the flops/bytes of the
+  // shared fusion instruction exceeds the threshold ratio, and therefore
+  // cannot be merged with other fusion instructions.
+  EXPECT_FALSE(FusionMerger().Run(&module_).ValueOrDie());
+}
+
+// Tests that threshold for bytes transferred if merged is exceeded.
+//
+// Original computation:
+//
+//                Param
+//             /   |   |  \
+//            /    |   |   \
+//           /     |   |    \
+//      GTE(0) GTE(1) GTE(2) GTE(3)
+//           \   /    /     /
+//            Add    /     /
+//              \   /     /
+//               Add     /
+//                 \    /
+//                  \  /
+//         OnesVec   Add  OnesVec
+//                \  /  \ /
+//        OnesVec  Add  Mul OnesVec
+//              \  |    |  /
+//               Mul    Add
+//                 \    /
+//                  \  /
+//                  Tuple
+//
+// Computation after fusion passes and fusion merger pass. Fusion2 is not
+// merged because it exceeds the threshold bytes transferred. This is because
+// the bytes read by Fusion2 (when replicated if the instruction is merged
+// into Fusion0 and Fusion1) would exceed the bytes transferred threshold.
+//
+//                 Param
+//                   |
+//                Fusion2
+//                /     \
+//           Fusion0  Fusion1
+//                \    /
+//                 Tuple
+//
+TEST_F(FusionMergerTest, BytesTransferredThresholdExeceeded) {
+  BuildComputation2(/*add_extra_input=*/true);
+  // Run standard fusion passes.
+  EXPECT_TRUE(
+      GpuInstructionFusion(/*may_duplicate=*/false).Run(&module_).ValueOrDie());
+  EXPECT_FALSE(
+      GpuInstructionFusion(/*may_duplicate=*/true).Run(&module_).ValueOrDie());
+  // Run fusion merger pass, which should detect that the net bytes transferred
+  // (if merged) would increase.
+  EXPECT_FALSE(FusionMerger().Run(&module_).ValueOrDie());
+}
+
+// Tests that threshold for bytes transferred if merged is not exceeded.
+//
+// Original computation:
+//
+//               Param
+//             /   |  \
+//            /    |   \
+//           /     |    \
+//      GTE(0) GTE(1) GTE(2)
+//           \   /    /
+//            Add    /
+//              \   /
+//     OnesVec   Add  OnesVec
+//            \  /  \ /
+//   OnesVec  Add   Mul OnesVec
+//              \  /   \  /
+//               Mul    Add
+//                 \    /
+//                  \  /
+//                  Tuple
+//
+// Computation after fusion passes:
+//
+//                 Param
+//                   |
+//                Fusion2
+//                /     \
+//           Fusion0  Fusion1
+//                \    /
+//                 Tuple
+//
+// Computation after fusion merger pass (Fusion2 is merged into Fusion0 and
+// Fusion1 because the bytes read from Param by Fusion2 are reduced for this
+// test, which keeps the merge operation below the bytes transferred
+// threshold):
+//
+//                   Param
+//                   /  \
+//             Fusion0  Fusion1
+//                   \    /
+//                   Tuple
+//
+TEST_F(FusionMergerTest, BytesTransferredThresholdNotExeceeded) {
+  BuildComputation2(/*add_extra_input=*/false);
+  // Run standard fusion passes.
+  EXPECT_TRUE(
+      GpuInstructionFusion(/*may_duplicate=*/false).Run(&module_).ValueOrDie());
+  EXPECT_FALSE(
+      GpuInstructionFusion(/*may_duplicate=*/true).Run(&module_).ValueOrDie());
+  // Run fusion merger pass, which should detect that the net bytes transferred
+  // (if merged) would not increase.
+  EXPECT_TRUE(FusionMerger().Run(&module_).ValueOrDie());
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 2f95446e6c..b5d7ba48d2 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_folding.h"
 #include "tensorflow/compiler/xla/service/gpu/copy_insertion.h"
+#include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
@@ -132,6 +133,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
     HloPassFix<HloPassPipeline> fusion("fusion", dump_hlo);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true);
+    fusion.AddPass<FusionMerger>();
     return fusion.Run(hlo_module).status();
   }
 }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 7ae0a995af..48be0bd2c0 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -420,6 +420,37 @@ HloInstruction::CreateFusionForBackwardConvolution(
   return fusion;
 }
 
+void HloInstruction::MergeFusionInstruction(
+    HloInstruction* instruction_to_merge) {
+  CHECK_EQ(opcode_, HloOpcode::kFusion);
+  CHECK_EQ(instruction_to_merge->opcode(), HloOpcode::kFusion);
+  // Clone the instruction from which to merge fused instructions.
+  std::unique_ptr<HloInstruction> clone = instruction_to_merge->Clone();
+  // Walk the clone's fused instructions: rewire uses of each fused parameter
+  // to the matching operand of the clone, and collect every non-parameter
+  // fused instruction in 'unfused_instructions' so it can be fused into
+  // 'this'.
+  std::vector<HloInstruction*> unfused_instructions;
+  for (auto& fused_instruction : clone->fused_instructions()) {
+    if (fused_instruction->opcode() == HloOpcode::kParameter) {
+      fused_instruction->ReplaceAllUsesWith(
+          clone->mutable_operand(fused_instruction->parameter_number()));
+    } else {
+      unfused_instructions.push_back(fused_instruction.get());
+    }
+  }
+  CHECK(unfused_instructions.front() == clone->fused_expression_root());
+  // Replace the use of 'instruction_to_merge' in 'this' with the unfused root.
+  instruction_to_merge->ReplaceUseWith(this, unfused_instructions.front());
+  // Fuse 'unfused_instructions' into 'this'.
+  for (auto& instruction : unfused_instructions) {
+    FuseInstruction(instruction);
+    instruction->DetachFromOperands();
+  }
+  CHECK_EQ(0, clone->user_count());
+  clone->DetachFromOperands();
+}
+
 HloInstruction* HloInstruction::FuseInstruction(
     HloInstruction* instruction_to_fuse) {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 8e7a253578..ecf29a476d 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -79,11 +79,6 @@ class HloInstruction {
       const Shape& shape, RandomDistribution distribution,
       tensorflow::gtl::ArraySlice<HloInstruction*> parameters);
 
-  // Creates an n-ary elementwise operation.
-  static std::unique_ptr<HloInstruction> CreateNary(
-      const Shape& shape, HloOpcode opcode,
-      tensorflow::gtl::ArraySlice<HloInstruction*> operands);
-
   // Creates a unary instruction (one operand).
   // Precondition: opcode must be a legitimate unary operation.
   static std::unique_ptr<HloInstruction> CreateUnary(const Shape& shape,
@@ -492,6 +487,13 @@ class HloInstruction {
     return fusion_kind_;
   }
 
+  // Merges the fused instructions from 'instruction_to_merge' into the
+  // fused instruction set of 'this', updating operands as necessary.
+  //
+  // Precondition: opcode() == HloOpcode::kFusion
+  // Precondition: 'instruction_to_merge' must be an operand of 'this'.
+  void MergeFusionInstruction(HloInstruction* instruction_to_merge);
+
   // Fuses the given instruction in this fusion instruction. instruction_to_fuse
   // is cloned and the clone is placed in the fusion
   // instruction. instruction_to_fuse is unchanged. Instruction is cloned rather
@@ -636,6 +638,11 @@ class HloInstruction {
  private:
   enum class UseKind { kNoUse, kReuse, kUsePermutingElements, kUse };
 
+  // Creates an n-ary elementwise operation.
+  static std::unique_ptr<HloInstruction> CreateNary(
+      const Shape& shape, HloOpcode opcode,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands);
+
   // Appends operand to the list of operands and adds this instruction as a user
   // of the operand.
   void AppendOperand(HloInstruction* operand);
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 7f86a3cbb5..30bf450c5b 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -247,10 +247,9 @@ LocalService::CompileAheadOfTime(
             *instance.result_layout));
   }
 
-  return execute_backend_->compiler()
-      ->CompileAheadOfTime(std::move(hlo_modules), std::move(module_configs),
-                           MakeHloDumper(), options)
-      .ConsumeValueOrDie();
+  return execute_backend_->compiler()->CompileAheadOfTime(
+      std::move(hlo_modules), std::move(module_configs), MakeHloDumper(),
+      options);
 }
 
 tensorflow::Status LocalService::ValidateExecuteOptions(
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index ab2c43cd3d..6626fe5af8 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -37,32 +37,62 @@ limitations under the License.
 
 namespace xla {
 
-/* static */ bool ShapeUtil::CompareShapes(const Shape& lhs, const Shape& rhs,
-                                           bool compare_layouts) {
-  if (IsTuple(lhs)) {
-    return IsTuple(rhs) &&
-           ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
-                           [=](const Shape& l, const Shape& r) {
-                             return CompareShapes(l, r, compare_layouts);
-                           });
+namespace {
+
+// Recursive helper for comparing the equality of two shapes. Returns true if
+// the shapes are the same. If compare_layouts is true, then layouts must also
+// match.
+bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
+  if (ShapeUtil::IsTuple(lhs)) {
+    if (!ShapeUtil::IsTuple(rhs)) {
+      VLOG(3) << "CompareShapes: lhs is a tuple, rhs not a tuple";
+      return false;
+    }
+
+    if (!ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
+                         [=](const Shape& l, const Shape& r) {
+                           return CompareShapes(l, r, compare_layouts);
+                         })) {
+      VLOG(3) << "CompareShapes: tuples on lhs and rhs not equal";
+      return false;
+    }
   }
   // Explicitly compare the fields rather than using MessageDifferencer because
   // we want empty layouts to be treated identically to missing layouts.
-  if (compare_layouts &&
-      (!ContainersEqual(lhs.layout().minor_to_major(),
-                        rhs.layout().minor_to_major()) ||
-       !ContainersEqual(lhs.layout().padded_dimensions(),
-                        rhs.layout().padded_dimensions()) ||
-       lhs.layout().padding_value() != rhs.layout().padding_value())) {
+  if (compare_layouts) {
+    if (!ContainersEqual(lhs.layout().minor_to_major(),
+                         rhs.layout().minor_to_major())) {
+      VLOG(3) << "CompareShapes: lhs layout != rhs layout";
+      return false;
+    }
+    if (!ContainersEqual(lhs.layout().padded_dimensions(),
+                         rhs.layout().padded_dimensions())) {
+      VLOG(3)
+          << "CompareShapes: lhs padded_dimensions != rhs padded_dimensions";
+      return false;
+    }
+    if (lhs.layout().padding_value() != rhs.layout().padding_value()) {
+      VLOG(3) << "CompareShapes: lhs padding value != rhs padding_value";
+      return false;
+    }
+  }
+
+  if (!ShapeUtil::SameDimensions(lhs, rhs)) {
+    VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions";
     return false;
   }
-  return SameDimensions(lhs, rhs) && SameElementType(lhs, rhs);
+  if (!ShapeUtil::SameElementType(lhs, rhs)) {
+    VLOG(3) << "CompareShapes: lhs element type != rhs element type";
+    return false;
+  }
+  return true;
 }
 
+}  // namespace
+
 /* static */ bool ShapeUtil::Equal(const Shape& lhs, const Shape& rhs) {
   bool equal = CompareShapes(lhs, rhs, /*compare_layouts=*/true);
   if (!equal && VLOG_IS_ON(3)) {
-    // TODO(jeff): Maybe print more info about where lhs and rhs differ
     VLOG(3) << "ShapeUtil::Equal differ: lhs = " << lhs.ShortDebugString()
             << ", rhs = " << rhs.ShortDebugString();
   }
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index fa5fcc0224..963a3e4805 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -388,12 +388,6 @@ class ShapeUtil {
                                 Shape shape);
 
  private:
-  // Recursive helper for comparing the equality of two shapes. Returns true if
-  // the shapes are the same. If compare_layouts is true, then layouts must also
-  // match.
-  static bool CompareShapes(const Shape& lhs, const Shape& rhs,
-                            bool compare_layouts);
-
   // Validates all of the non-layout properties of the shape -- this is a helper
   // used by both the layout-optional and layout-required public method.
   static Status ValidateShapeWithOptionalLayoutInternal(const Shape& shape);
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 4e8a496e7e..fb2f8fb284 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -150,6 +150,26 @@ TEST(ShapeUtilTest, EmptyLayoutEqualsMissingLayout) {
   EXPECT_TRUE(ShapeUtil::Equal(scalar1, scalar2));
 }
 
+TEST(ShapeUtilTest, CompareShapesWithPaddedDimensionsMismatch) {
+  Shape shape1 = ShapeUtil::MakeShape(F32, {20, 30});
+  shape1.mutable_layout()->add_padded_dimensions(10);
+
+  Shape shape2 = ShapeUtil::MakeShape(F32, {20, 30});
+  shape2.mutable_layout()->add_padded_dimensions(11);
+
+  EXPECT_FALSE(ShapeUtil::Equal(shape1, shape2));
+}
+
+TEST(ShapeUtilTest, CompareShapesWithPaddingValueMismatch) {
+  Shape shape1 = ShapeUtil::MakeShape(F32, {20, 30});
+  shape1.mutable_layout()->set_padding_value(ZERO_PAD);
+
+  Shape shape2 = ShapeUtil::MakeShape(F32, {20, 30});
+  shape2.mutable_layout()->set_padding_value(LOWEST_PAD);
+
+  EXPECT_FALSE(ShapeUtil::Equal(shape1, shape2));
+}
+
 TEST(ShapeUtilTest, ScalarUnpopulatedLayoutEqualsScalarLayout) {
   Shape scalar_unpopulated = ShapeUtil::MakeShape(F32, {});
   scalar_unpopulated.clear_layout();
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py b/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
index 81fbf2a6ef..f378966562 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
@@ -26,7 +26,9 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
@@ -146,6 +148,32 @@ class CategoricalTest(test.TestCase):
           -(0.6 * np.log(0.6) + 0.4 * np.log(0.4))
       ])
 
+  def testEntropyGradient(self):
+    with self.test_session() as sess:
+      logits = constant_op.constant([[1., 2., 3.], [2., 5., 1.]])
+
+      probabilities = nn_ops.softmax(logits)
+      log_probabilities = nn_ops.log_softmax(logits)
+      true_entropy = - math_ops.reduce_sum(
+          probabilities * log_probabilities, axis=-1)
+
+      categorical_distribution = categorical.Categorical(p=probabilities)
+      categorical_entropy = categorical_distribution.entropy()
+
+      # Gradients of both entropies with respect to the logits.
+      true_entropy_g = gradients_impl.gradients(true_entropy, [logits])
+      categorical_entropy_g = gradients_impl.gradients(
+          categorical_entropy, [logits])
+
+      res = sess.run({"true_entropy": true_entropy,
+                      "categorical_entropy": categorical_entropy,
+                      "true_entropy_g": true_entropy_g,
+                      "categorical_entropy_g": categorical_entropy_g})
+      self.assertAllClose(res["true_entropy"],
+                          res["categorical_entropy"])
+      self.assertAllClose(res["true_entropy_g"],
+                          res["categorical_entropy_g"])
+
   def testSample(self):
     with self.test_session():
       histograms = [[[0.2, 0.8], [0.4, 0.6]]]
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
index 57c873f59e..0181ded643 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
@@ -569,10 +569,11 @@ class SoftplusTest(test.TestCase):
   def testInverseSoftplusGradientNeverNan(self):
     with self.test_session():
       # Note that this range contains both zero and inf.
-      x = constant_op.constant((10.**np.arange(-8, 6)).astype(np.float16))
-      y = distribution_util.softplus_inverse(x).eval()
+      x = constant_op.constant(np.logspace(-8, 6).astype(np.float16))
+      y = distribution_util.softplus_inverse(x)
+      grads = gradients_impl.gradients(y, x)[0].eval()
       # Equivalent to `assertAllFalse` (if it existed).
-      self.assertAllEqual(np.zeros_like(y).astype(np.bool), np.isnan(y))
+      self.assertAllEqual(np.zeros_like(grads).astype(np.bool), np.isnan(grads))
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijector.py b/tensorflow/contrib/distributions/python/ops/bijector.py
index 7e92f49677..41a4f9d859 100644
--- a/tensorflow/contrib/distributions/python/ops/bijector.py
+++ b/tensorflow/contrib/distributions/python/ops/bijector.py
@@ -1977,7 +1977,7 @@ class AffineLinearOperator(Bijector):
         if scale.tensor_rank is not None:
           batch_ndims = scale.tensor_rank - 2
         else:
-          batch_ndims = scale.tensor_rank_dynamic() - 2
+          batch_ndims = scale.tensor_rank_tensor() - 2
           graph_parents += [batch_ndims]
       else:
         batch_ndims = 0  # We won't need shape inference when scale is None.
diff --git a/tensorflow/contrib/distributions/python/ops/categorical.py b/tensorflow/contrib/distributions/python/ops/categorical.py
index feca611d00..9573e89237 100644
--- a/tensorflow/contrib/distributions/python/ops/categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/categorical.py
@@ -209,17 +209,8 @@ class Categorical(distribution.Distribution):
     return math_ops.exp(self._log_prob(k))
 
   def _entropy(self):
-    if self.logits.get_shape().ndims == 2:
-      logits_2d = self.logits
-    else:
-      logits_2d = array_ops.reshape(self.logits, [-1, self.num_classes])
-    histogram_2d = nn_ops.softmax(logits_2d)
-    ret = array_ops.reshape(
-        nn_ops.softmax_cross_entropy_with_logits(labels=histogram_2d,
-                                                 logits=logits_2d),
-        self.batch_shape())
-    ret.set_shape(self.get_batch_shape())
-    return ret
+    return -math_ops.reduce_sum(
+        nn_ops.log_softmax(self.logits) * self.p, axis=-1)
 
   def _mode(self):
     ret = math_ops.argmax(self.logits, dimension=self._batch_rank)
@@ -245,5 +236,6 @@ def _kl_categorical_categorical(a, b, name=None):
     name, "kl_categorical_categorical", [a.logits, b.logits]):
     # sum(p*ln(p/q))
     return math_ops.reduce_sum(
-        nn_ops.softmax(a.logits)*(nn_ops.log_softmax(a.logits)
-            - nn_ops.log_softmax(b.logits)), reduction_indices=[-1])
+        nn_ops.softmax(a.logits) * (
+            nn_ops.log_softmax(a.logits) - nn_ops.log_softmax(b.logits)),
+        axis=-1)
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index 6fb347c834..832698b8a0 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -26,13 +26,13 @@ from six import iteritems
 from six import iterkeys
 from six import string_types
 from six import StringIO
-
 from tensorflow.contrib.graph_editor import edit
 from tensorflow.contrib.graph_editor import reroute
 from tensorflow.contrib.graph_editor import select
 from tensorflow.contrib.graph_editor import subgraph
 from tensorflow.contrib.graph_editor import util
 from tensorflow.python.framework import ops as tf_ops
+from tensorflow.python.platform import tf_logging as logging
 
 __all__ = [
     "replace_t_with_placeholder_handler",
@@ -87,17 +87,24 @@ def keep_t_if_possible_handler(info, t):
 def assign_renamed_collections_handler(info, elem, elem_):
   """Add the transformed elem to the (renamed) collections of elem.
 
+  A collection is renamed only if it is not a known key, as described in
+  `tf.GraphKeys`.
+
   Args:
     info: Transform._Info instance.
     elem: the original element (`tf.Tensor` or `tf.Operation`)
     elem_: the transformed element
   """
-  # TODO(fkp): handle known special cases
+  known_collection_names = util.get_predefined_collection_names()
   for name, collection in iteritems(info.collections):
     if elem not in collection:
       continue
-    collection_name_ = info.transformer.new_name(name)
-    info.graph_.add_to_collection(collection_name_, elem_)
+
+    if name in known_collection_names:
+      transformed_name = name
+    else:
+      transformed_name = info.transformer.new_name(name)
+    info.graph_.add_to_collection(transformed_name, elem_)
 
 
 def transform_op_if_inside_handler(info, op, keep_if_possible=True):
@@ -150,6 +157,11 @@ def copy_op_handler(info, op, copy_shape=True):
   # Transform inputs:
   inputs_ = [info.transformer._transform_t(t) for t in op.inputs]
 
+  # Leave inputs empty if a graph cycle was found.
+  if None in inputs_:
+    info.cyclic_ops.append(op)
+    inputs_ = []
+
   # Clone the node def:
   node_def_ = deepcopy(op._node_def)
 
@@ -239,7 +251,7 @@ class Transformer(object):
       self.transformed_ts = {}
       self.collections = dict((key, self.graph.get_collection(key))
                               for key in self.graph.get_all_collection_keys())
-
+      self.cyclic_ops = []
 
   class ResultInfo(object):
     """"Contains information about the result of a transform operation."""
@@ -452,6 +464,17 @@ class Transformer(object):
     for op in remaining_roots:
       self._transform_op(op)
 
+    # Finalize cyclic ops:
+    for op in self._info.cyclic_ops:
+      logging.debug("Finalizing cyclic op: %s", op.name)
+      op_ = self._info.transformed_ops[op]
+      inputs_ = [self._info.transformed_ts[t] for t in op.inputs]
+      if None in inputs_:
+        raise ValueError("Could not find all the inputs of cyclic op: {}"
+                         .format(op_.name))
+      for input_id, t_ in enumerate(inputs_):
+        op_._update_input(input_id, t_)  # pylint: disable=protected-access
+
     sgv_ = self._transform_sgv(sgv)
 
     res_info = Transformer.ResultInfo(self._info)
@@ -506,9 +529,13 @@ class Transformer(object):
     Returns:
       The transformed tensor.
     """
+    logging.debug("Transforming tensor: %s", t.name)
     if t in self._info.transformed_ts:
       return self._info.transformed_ts[t]
 
+    # Mark as None to detect cycle.
+    self._info.transformed_ts[t] = None
+
     op, op_index = t.op, t.value_index
 
     # If op is not in the subgraph:
diff --git a/tensorflow/contrib/graph_editor/util.py b/tensorflow/contrib/graph_editor/util.py
index 11ee2435c9..d8824f6792 100644
--- a/tensorflow/contrib/graph_editor/util.py
+++ b/tensorflow/contrib/graph_editor/util.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import re
 from six import iteritems
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.ops import array_ops as tf_array_ops
@@ -465,3 +466,75 @@ def make_placeholder_from_dtype_and_shape(dtype, shape=None, scope=None):
   """
   return tf_array_ops.placeholder(
       dtype=dtype, shape=shape, name=placeholder_name(scope=scope))
+
+
+_INTERNAL_VARIABLE_RE = re.compile(r"^__\w+__$")
+
+
+def get_predefined_collection_names():
+  """Return the values of all non-dunder attributes of `tf.GraphKeys`."""
+  return [getattr(tf_ops.GraphKeys, key) for key in dir(tf_ops.GraphKeys)
+          if not _INTERNAL_VARIABLE_RE.match(key)]
+
+
+def find_corresponding_elem(target, dst_graph, dst_scope="", src_scope=""):
+  """Find corresponding op/tensor in a different graph.
+
+  Args:
+    target: A `tf.Tensor` or a `tf.Operation` belonging to the original graph.
+    dst_graph: The graph in which the corresponding graph element must be found.
+    dst_scope: A scope which is prepended to the name to look for.
+    src_scope: A scope which is removed from the original name of `target`.
+
+  Returns:
+    The corresponding `tf.Tensor` or `tf.Operation`.
+
+  Raises:
+    ValueError: if the name of `target` does not start with `src_scope`.
+    TypeError: if `target` is not a `tf.Tensor` or a `tf.Operation`
+    KeyError: If the corresponding graph element cannot be found.
+  """
+  src_name = target.name
+  if src_scope:
+    src_scope = scope_finalize(src_scope)
+    if not src_name.startswith(src_scope):
+      raise ValueError("{} does not start with {}".format(src_name, src_scope))
+    src_name = src_name[len(src_scope):]
+
+  dst_name = src_name
+  if dst_scope:
+    dst_name = scope_finalize(dst_scope) + dst_name
+
+  if isinstance(target, tf_ops.Tensor):
+    return dst_graph.get_tensor_by_name(dst_name)
+  if isinstance(target, tf_ops.Operation):
+    return dst_graph.get_operation_by_name(dst_name)
+  raise TypeError("Expected tf.Tensor or tf.Operation, got: {}".format(
+      type(target)))
+
+
+def find_corresponding(targets, dst_graph, dst_scope="", src_scope=""):
+  """Find corresponding ops/tensors in a different graph.
+
+  `targets` is a Python tree, that is, a nested structure of iterables
+  (list, tuple, dictionary) whose leaves are instances of
+  `tf.Tensor` or `tf.Operation`.
+
+  Args:
+    targets: A Python tree containing `tf.Tensor` or `tf.Operation`
+      belonging to the original graph.
+    dst_graph: The graph in which the corresponding graph element must be found.
+    dst_scope: A scope which is prepended to the name to look for.
+    src_scope: A scope which is removed from the original name of each leaf.
+
+  Returns:
+    A Python tree containing the corresponding `tf.Tensor` or `tf.Operation`.
+
+  Raises:
+    ValueError: if the name of a leaf does not start with `src_scope`.
+    TypeError: if a leaf is not a `tf.Tensor` or a `tf.Operation`
+    KeyError: If the corresponding graph element cannot be found.
+  """
+  def func(top):
+    return find_corresponding_elem(top, dst_graph, dst_scope, src_scope)
+  return transform_tree(targets, func)
diff --git a/tensorflow/contrib/hvx/hexagon_controller/Makefile b/tensorflow/contrib/hvx/hexagon_controller/Makefile
new file mode 100644
index 0000000000..9fe2ed596a
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/Makefile
@@ -0,0 +1,19 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include glue/defines.min
+
+include target/make/android.min
+
+include $(RULES_MIN)
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v1_graph_init.c b/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v1_graph_init.c
new file mode 100644
index 0000000000..3ca5532c38
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v1_graph_init.c
@@ -0,0 +1,16 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+void init_graph_v1(int nn_id) {}
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v3_dummy_float_data.c b/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v3_dummy_float_data.c
new file mode 100644
index 0000000000..dc61ae754a
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v3_dummy_float_data.c
@@ -0,0 +1,16 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Dummy all-zero float input with 299*299*3 elements. `{0}` is used because
+// an empty initializer list (`{}`) is not valid C before C23.
+float inception_dummy_float_data_299x299[299*299*3] = {0};
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v3_dummy_int_data.c b/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v3_dummy_int_data.c
new file mode 100644
index 0000000000..27e1ca40b9
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v3_dummy_int_data.c
@@ -0,0 +1,17 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+
+// Dummy all-zero byte input with 299*299*3 elements. `{0}` is used because
+// an empty initializer list (`{}`) is not valid C before C23.
+uint8_t inception_dummy_int_data_299x299[299*299*3] = {0};
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v3_graph_init.c b/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v3_graph_init.c
new file mode 100644
index 0000000000..9def665827
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_dummy_data/inception_v3_graph_init.c
@@ -0,0 +1,16 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+
+// Dummy stand-in for the inception v3 graph initializer: intentionally does
+// nothing. The parameter is uint32_t so this definition agrees with the
+// `extern void init_graph(uint32_t id)` declaration used by its caller.
+void init_graph(uint32_t nn_id) {}
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/graph_functions_wrapper.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/graph_functions_wrapper.c
new file mode 100644
index 0000000000..567485b035
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/graph_functions_wrapper.c
@@ -0,0 +1,355 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// USE_ION_MEMORY is defined (before the includes below) to demonstrate the
+// performance difference between ION and HLOS memory for sharing with ADSP.
+#define USE_ION_MEMORY
+
+#include <limits.h>
+#include <stdio.h>
+
+#include "hexagon_controller.h"
+#include "hexagon_nn.h"
+#include "tfm_log.h"
+
+// Capacity of the hexagon_nn_perfinfo arrays used by the perf dump functions.
+static const uint32_t MAX_NODES = 2048;
+// Not referenced anywhere in this file.
+static const uint32_t MAX_EVENT_COUNT = 256;
+
+// When true, hexagon_controller_PrintMaxNIdx logs every input value.
+static const bool DUMP_OUTPUT = false;
+// When true, hexagon_controller_ExecuteGraph logs its inputs and results.
+static const bool DBG_EXECUTION = true;
+
+// Ranking depth used by hexagon_controller_ExecuteInceptionDummyData.
+static const int OUT_RANKING_SIZE = 5;
+
+// File-local scratch buffer that receives the graph output in
+// hexagon_controller_ExecuteInceptionDummyData (300 * 300 * 3 * 4 floats).
+// TODO(satok): allocate dynamically
+static float s_output_values[300 * 300 * 3 * 4];
+
+// Graph initializers and pre-generated input data defined in other
+// translation units.
+extern void init_graph(uint32_t id);
+extern void init_graph_v1(uint32_t id);
+extern uint8_t inception_dummy_int_data_299x299[];
+extern uint8_t inception_sample_int_data_224x224[];
+// Declared with a single "_299x299" suffix, which is the name the dummy data
+// in src_dummy_data defines. The symbol is not referenced in this file.
+extern float inception_dummy_float_data_299x299[];
+
+// Inception model variants this wrapper can initialize.
+enum InceptionVersion {
+  INCEPTION_V1,
+  INCEPTION_V3,
+};
+
+// Version recorded by hexagon_controller_InitGraph when it is given a
+// supported version; starts out as v3.
+static enum InceptionVersion s_inception_version = INCEPTION_V3;
+
+/////////////////////////////////////////////////
+// file local functions
+
+// Maps a graph node id to a display name for the perf dumps. Not implemented
+// yet: every id yields the placeholder "?".
+static const char *ConvertGraphInfoIdToName(unsigned int id) {
+  // TODO(satok): implement
+  return "?";
+}
+
+// Maps a graph node id to an op name for the perf dumps. Not implemented
+// yet: every id yields the placeholder "?".
+static const char *ConvertGraphInfoIdToOpName(unsigned int id) {
+  // TODO(satok): implement
+  return "?";
+}
+
+/////////////////////////////////////////////////
+// file local utilities
+// Returns the index of the largest value in data[0..entries) whose index does
+// not appear in exclude_idx[0..exclude_size). Ties keep the lowest index.
+// Returns 0 when entries == 0 or when every index is excluded.
+static uint32_t FindMaxIdxWithExcludeList(
+    const float *data, uint32_t entries, const int exclude_size,
+    const int* exclude_idx) {
+  uint32_t maxidx = 0;
+  float maxval = 0.0f;
+  // The running maximum is seeded from the first non-excluded element rather
+  // than from data[0]; otherwise an excluded index 0 holding the largest value
+  // would be returned again.
+  bool found = false;
+  for (uint32_t i = 0; i < entries; i++) {
+    bool exclude = false;
+    for (int j = 0; j < exclude_size; ++j) {
+      // Negative entries can never name a valid index.
+      if (exclude_idx[j] >= 0 && (uint32_t)exclude_idx[j] == i) {
+        exclude = true;
+        break;
+      }
+    }
+    if (exclude) {
+      continue;
+    }
+    if (!found || maxval < data[i]) {
+      found = true;
+      maxval = data[i];
+      maxidx = i;
+    }
+  }
+  return maxidx;
+}
+
+// Returns the index chosen by FindMaxIdxWithExcludeList() when called with an
+// empty exclude list.
+static uint32_t FindMaxIdx(const float* data, uint32_t entries) {
+  return FindMaxIdxWithExcludeList(data, entries, 0, NULL);
+}
+
+// Fills out_ranking[0..n) by calling FindMaxIdxWithExcludeList() n times, each
+// time passing the indices ranked so far as the exclude list, then logs the
+// ranking. When DUMP_OUTPUT is set, every input value is logged first.
+// out_ranking must have room for n ints.
+void hexagon_controller_PrintMaxNIdx(const float *data, const uint32_t entries,
+                         const int n, int* out_ranking) {
+  for (int idx = 0; DUMP_OUTPUT && idx < entries; ++idx) {
+    TFMLOGD("%d: val = %f", idx, data[idx]);
+  }
+  // Fill every slot with INT_MAX first so that slots not yet ranked act as
+  // placeholders in the exclude list passed below.
+  int rank = 0;
+  while (rank < n) {
+    out_ranking[rank++] = INT_MAX;
+  }
+  for (rank = 0; rank < n; ++rank) {
+    out_ranking[rank] =
+        FindMaxIdxWithExcludeList(data, entries, n, out_ranking);
+  }
+  TFMLOGD("=== RANKING ===");
+  for (rank = 0; rank < n; ++rank) {
+    const int winner = out_ranking[rank];
+    TFMLOGD("%d: id = %d, val = %f", rank, winner, data[winner]);
+  }
+}
+
+// Builds one cycle count from a perfinfo entry: counter_hi shifted left by 32
+// bits, OR-ed with counter_lo.
+static inline unsigned long long int GetCounter(hexagon_nn_perfinfo s) {
+  const unsigned long long int upper = s.counter_hi;
+  const unsigned long long int lower = s.counter_lo;
+  return (upper << 32) | lower;
+}
+
+// qsort() comparator that orders hexagon_nn_perfinfo entries by ascending
+// cycle count, as computed by GetCounter().
+static int CompareCycle(const void *va, const void *vb) {
+  const hexagon_nn_perfinfo *lhs = va;
+  const hexagon_nn_perfinfo *rhs = vb;
+  const unsigned long long int lhs_cycles = GetCounter(*lhs);
+  const unsigned long long int rhs_cycles = GetCounter(*rhs);
+  // Compare explicitly: the difference of two counts may not fit in an int.
+  if (lhs_cycles == rhs_cycles) {
+    return 0;
+  }
+  return (lhs_cycles < rhs_cycles) ? -1 : 1;
+}
+
+/////////////////////////////////////////////////
+// Graph functions
+
+// Obtains a graph id from hexagon_nn_init(), sets that graph's debug level
+// to 0 and returns the id.
+uint32_t hexagon_controller_InstantiateGraph() {
+  const uint32_t nn_id = hexagon_nn_init();
+  // The disabled call below would set debug level 99; level 0 is what is
+  // actually applied.
+  //hexagon_nn_set_debug_level(nn_id, 99);
+  // TODO(satok): make this as argument
+  hexagon_nn_set_debug_level(nn_id, 0);
+  return nn_id;
+}
+
+// Records the requested inception version (1 or 3) in s_inception_version and
+// runs the matching graph initializer on nn_id. Any other version is logged
+// as an error and nothing else happens.
+void hexagon_controller_InitGraph(int version, uint32_t nn_id) {
+  if (version != 1 && version != 3) {
+    TFMLOGE("Unsupported inception version %d", version);
+    return;
+  }
+  if (version == 1) {
+    s_inception_version = INCEPTION_V1;
+    init_graph_v1(nn_id);
+  } else {
+    s_inception_version = INCEPTION_V3;
+    init_graph(nn_id);
+  }
+  TFMLOGD("Init graph (inception version = %d) done.", version);
+}
+
+// Runs hexagon_nn_prepare() on the graph and logs the outcome.
+// Returns true when the call reports 0, false otherwise.
+bool hexagon_controller_ConstructGraph(uint32_t nn_id) {
+  const int err = hexagon_nn_prepare(nn_id);
+  if (err == 0) {
+    TFMLOGD("Prepare success!\n");
+    return true;
+  }
+  TFMLOGE("Prepare failed! returned 0x%x\n", err);
+  return false;
+}
+
+// Convenience wrapper: instantiates a graph, initializes it for the given
+// inception version and prepares it. Returns the graph id.
+// NOTE(review): the result of hexagon_controller_ConstructGraph is not
+// checked, so the id is returned even when preparation failed.
+uint32_t hexagon_controller_SetupGraph(int version)  {
+  const uint32_t nn_id = hexagon_controller_InstantiateGraph();
+  hexagon_controller_InitGraph(version, nn_id);
+  hexagon_controller_ConstructGraph(nn_id);
+  return nn_id;
+}
+
+// Runs graph nn_id through hexagon_nn_execute(), forwarding the input
+// dimensions (batches, height, width, depth), the input data and its size,
+// and the out_* pointers that receive the output dimensions, values and byte
+// count. output_val_byte_size is the capacity passed along for out_vals.
+// Returns true when hexagon_nn_execute() reports 0. Inputs and results are
+// logged when DBG_EXECUTION is set.
+bool hexagon_controller_ExecuteGraph(
+    const uint32_t nn_id,
+    const uint32_t batches,
+    const uint32_t height,
+    const uint32_t width,
+    const uint32_t depth,
+    uint8_t* int_data,
+    const uint32_t int_data_size,
+    uint32_t* out_batches,
+    uint32_t* out_height,
+    uint32_t* out_width,
+    uint32_t* out_depth,
+    uint8_t* out_vals,
+    const uint32_t output_val_byte_size,
+    uint32_t* out_data_byte_size) {
+  if (DBG_EXECUTION) {
+    TFMLOGD("Preparing to execute...");
+    TFMLOGD("Input: %d, %d, %d, %d, %d, %d",
+            batches, height, width, depth, int_data[0], int_data_size);
+    TFMLOGD("Output: %d, %p", output_val_byte_size, out_vals);
+    LogDHexagon("Execute graph!");
+  }
+
+  const int status = hexagon_nn_execute(
+      nn_id, batches, height, width, depth, int_data, int_data_size,
+      out_batches, out_height, out_width, out_depth,
+      out_vals, output_val_byte_size, out_data_byte_size);
+  const bool succeeded = (status == 0);
+  if (!DBG_EXECUTION) {
+    return succeeded;
+  }
+  if (!succeeded) {
+    LogDHexagon("Execution failed!");
+    TFMLOGE("execute got err: %d\n",status);
+    return false;
+  }
+  LogDHexagon("Execution succeeded!");
+  TFMLOGD("%d x %d x %d x %d, byte size = %d\n",
+          *out_batches,
+          *out_height,
+          *out_width,
+          *out_depth,
+          *out_data_byte_size);
+  return true;
+}
+
+// Executes the graph on inception_dummy_int_data_299x299 using the v3 input
+// dimensions and checks the ranking of the output. Returns true only when
+// execution succeeds and the two top-ranked indices are 169 and 7.
+bool hexagon_controller_ExecuteInceptionDummyData(uint32_t nn_id) {
+  uint32_t out_batches, out_height, out_width, out_depth;
+  uint32_t out_data_size;
+  // The output is written into s_output_values (300 * 300 * 3 * 4 floats).
+  const bool success = hexagon_controller_ExecuteGraph(
+      nn_id, INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V3,
+      INCEPTION_PARAM_WIDTH_V3, INCEPTION_PARAM_DEPTH,
+      (uint8_t *)inception_dummy_int_data_299x299,
+      INCEPTION_PARAM_HEIGHT_V3 * INCEPTION_PARAM_WIDTH_V3 *
+      INCEPTION_PARAM_DEPTH,
+      &out_batches, &out_height, &out_width, &out_depth,
+      (uint8_t *)s_output_values, sizeof(s_output_values),
+      &out_data_size);
+  if (!success) {
+    return false;
+  }
+
+  int out_ranking[OUT_RANKING_SIZE];
+  hexagon_controller_PrintMaxNIdx(
+      s_output_values,
+      out_batches * out_height * out_width * out_depth,
+      OUT_RANKING_SIZE, out_ranking);
+  TFMLOGD("%d x %d x %d x %d, size = %d\n",
+          out_batches, out_height, out_width, out_depth, out_data_size);
+  TFMLOGD("max idx: %d\n", FindMaxIdx(
+      s_output_values,
+      out_batches * out_height * out_width * out_depth));
+
+  const bool matches_expected =
+      (out_ranking[0] == 169 && out_ranking[1] == 7);
+  if (!matches_expected) {
+    TFMLOGD("Result is wrong! %d, %d", out_ranking[0], out_ranking[1]);
+  }
+  return matches_expected;
+}
+
+// Fetches per-node perf counters for graph nn_id, sorts them by ascending
+// cycle count and logs, for every node, its cycles and the running total,
+// each also as a percentage of all cycles. Logs an error and returns early
+// when hexagon_nn_get_perfinfo() fails.
+// NOTE(review): cycle counts are unsigned long long but are logged with %lld.
+void hexagon_controller_DumpPerf(uint32_t nn_id) {
+  hexagon_nn_perfinfo info[MAX_NODES];
+  int n_nodes;
+  TFMLOGD("Perf dump follows:");
+  if (hexagon_nn_get_perfinfo(nn_id, info, MAX_NODES,&n_nodes) != 0) {
+    TFMLOGE("perf info failure");
+    return;
+  }
+  TFMLOGD("Total %d nodes.",n_nodes);
+  qsort(info,n_nodes,sizeof(info[0]), CompareCycle);
+
+  // First pass: the grand total is needed before percentages can be logged.
+  unsigned long long int total_cycles = 0;
+  for (int node = 0; node < n_nodes; node++) {
+    total_cycles += GetCounter(info[node]);
+  }
+  TFMLOGD("Total %lld cycles.",total_cycles);
+
+  // Second pass: one log line per node, in sorted order.
+  unsigned long long int cum_cycles = 0;
+  for (int node = 0; node < n_nodes; node++) {
+    const hexagon_nn_perfinfo *entry = &info[node];
+    const unsigned long long int cycles = GetCounter(*entry);
+    cum_cycles += cycles;
+    TFMLOGD("node,0x%x,%s,%s,executions,%d,cycles,%lld,%f %%,"
+            "cum_cycles,%lld,%f %%\n",
+            entry->node_id,
+            ConvertGraphInfoIdToName(entry->node_id),
+            ConvertGraphInfoIdToOpName(entry->node_id),
+            entry->executions,
+            cycles,
+            100*((double)cycles)/total_cycles,
+            cum_cycles,
+            100*((double)cum_cycles)/total_cycles);
+  }
+#ifdef ENABLE_HVX_FULL_DEBUG
+  DumpAllPerf(nn_id);
+#endif
+}
+
+// Logs the same per-node cycle report as the perf dump (nodes sorted by
+// ascending cycle count, with per-node and cumulative percentages), preceded
+// by a "Show node name" line. Returns early when hexagon_nn_get_perfinfo()
+// fails.
+// NOTE(review): cycle counts are unsigned long long but are logged with %lld.
+void hexagon_controller_DumpNodeName(uint32_t nn_id) {
+  TFMLOGD("Show node name");
+  hexagon_nn_perfinfo info[MAX_NODES];
+  int node_count;
+  TFMLOGD("Perf dump follows:");
+  if (hexagon_nn_get_perfinfo(nn_id, info, MAX_NODES, &node_count) != 0) {
+    TFMLOGD("perf info failure");
+    return;
+  }
+  TFMLOGD("Total %d nodes.",node_count);
+  qsort(info, node_count, sizeof(info[0]), CompareCycle);
+
+  // First pass: the grand total is needed before percentages can be logged.
+  unsigned long long int total_cycles = 0;
+  for (int node = 0; node < node_count; node++) {
+    total_cycles += GetCounter(info[node]);
+  }
+  TFMLOGD("Total %lld cycles.", total_cycles);
+
+  // Second pass: one log line per node, in sorted order.
+  unsigned long long int cum_cycles = 0;
+  for (int node = 0; node < node_count; node++) {
+    const hexagon_nn_perfinfo *entry = &info[node];
+    const unsigned long long int cycles = GetCounter(*entry);
+    cum_cycles += cycles;
+    TFMLOGD("node,0x%x,%s,%s,executions,%d,cycles,%lld,%f %%,"
+            "cum_cycles,%lld,%f %%",
+            entry->node_id,
+            ConvertGraphInfoIdToName(entry->node_id),
+            ConvertGraphInfoIdToOpName(entry->node_id),
+            entry->executions,
+            cycles,
+            100*((double)cycles)/total_cycles,
+            cum_cycles,
+            100*((double)cum_cycles)/total_cycles);
+  }
+}
+
+// Forwards nn_id to hexagon_nn_teardown(); nothing is returned to the caller.
+void hexagon_controller_Teardown(uint32_t nn_id) {
+  hexagon_nn_teardown(nn_id);
+}
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
new file mode 100644
index 0000000000..fe329e2f59
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
@@ -0,0 +1,374 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// USE_ION_MEMORY is defined (before the includes below) to demonstrate the
+// performance difference between ION and HLOS memory for sharing with ADSP.
+#define USE_ION_MEMORY
+
+#include "hexagon_controller.h"
+
+#include <malloc.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "adspmsgd.h"
+#include "dspCV.h"
+#include "rpcmem.h"    // helper API's for shared buffer allocation
+#include "soc_interface.h"
+#include "tfm_log.h"
+
+// If true, the input byte buffer is sized for floats and the v3 dummy input
+// is the float data; if false, int (byte) data is used as input instead.
+// This is only for acceleration purposes.
+static const bool USE_FLOAT_DATA = true;
+
+// If true, node ids are included in the per-node log lines.
+static const bool DBG_SHOW_ID = false;
+
+// Size in bytes of the text buffers used to format a node's input and output
+// parameters for logging.
+static const uint32_t OUTPUT_PARAM_MAX_LINE_SIZE = 1000;
+
+// extern pre-generated inception dummy data
+extern uint8_t inception_dummy_int_data_224x224[];
+extern uint8_t inception_dummy_int_data_299x299[];
+// Declared with a single "_299x299" suffix, which is the name the dummy data
+// in src_dummy_data defines; a mismatched name fails at link time.
+extern float inception_dummy_float_data_299x299[];
+
+#define GEMM_WRAPPER_VERSION 1
+
+// Size of the statically allocated graph print buffer (2 MB).
+#define PRINT_BUFSIZE (2 * 1024 * 1024)
+
+static unsigned char s_print_buf[PRINT_BUFSIZE];
+
+// input node data buffer size
+// x2 1024 * 1024 * 2 > 299 * 299 * 3 * 4 > 1024 * 1024
+static const int INPUT_NODE_DATA_BUFFER_SIZE = 1024 * 1024 * 2;
+// output node data buffer size
+// (1008 is enough for inception)
+static const int OUTPUT_NODE_DATA_BUFFER_SIZE = 300 * 300 * 3 * 4;
+
+static struct NodeDataFloat s_input_node_data_float_buffer;
+static float* s_output_node_data_float_buffer;
+static int s_output_node_data_float_buffer_byte_size;
+static int s_output_node_data_float_array_size;
+static uint32_t s_target_graph_id;
+
+static bool s_dbg_use_inception_dummy_data = false;
+
+// Copies the pre-generated inception dummy input for the given version into
+// the input node buffer:
+//   version 1: the 224x224 byte data (rejected with an error log while
+//              USE_FLOAT_DATA is set);
+//   version 3: the 299x299 float data when USE_FLOAT_DATA is set, otherwise
+//              the 299x299 byte data.
+// Any other version does nothing. The result of the copy is not checked.
+void hexagon_controller_InitInputNodeDataToInceptionDummyData(int version) {
+  if (version == 1) {
+    if (USE_FLOAT_DATA) {
+      TFMLOGE("ERROR!!!! Do not use float data for v1");
+      return;
+    }
+    hexagon_controller_CopyByteNodeData(
+        INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V1,
+        INCEPTION_PARAM_WIDTH_V1, INCEPTION_PARAM_DEPTH,
+        1, inception_dummy_int_data_224x224);
+  } else if (version == 3) {
+    if (USE_FLOAT_DATA) {
+      hexagon_controller_CopyByteNodeData(
+          INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V3,
+          INCEPTION_PARAM_WIDTH_V3, INCEPTION_PARAM_DEPTH,
+          sizeof(float), (uint8_t*)inception_dummy_float_data_299x299);
+    } else {
+      hexagon_controller_CopyByteNodeData(
+          INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V3,
+          INCEPTION_PARAM_WIDTH_V3, INCEPTION_PARAM_DEPTH,
+          1, inception_dummy_int_data_299x299);
+    }
+  }
+}
+
+// Executes graph nn_id on the contents of the input node buffer, writes the
+// result into the output node buffer and records the output element count.
+// Returns false if execution fails. With show_ranking set, the top
+// OUT_RANKING_SIZE outputs are logged; when the inception dummy-data debug
+// flag is also set, the result must rank indices 169 and 7 first or false is
+// returned.
+bool hexagon_controller_ExecuteGraphWithBuffer(
+    uint32_t nn_id, bool show_ranking) {
+  // Zero-initialized so the element count computed below is well defined (0)
+  // even if a failed execution leaves the output dimensions unwritten.
+  uint32_t out_batches = 0, out_height = 0, out_width = 0, out_depth = 0;
+  uint32_t out_data_size = 0;
+  int x = s_input_node_data_float_buffer.x;
+  int y = s_input_node_data_float_buffer.y;
+  int z = s_input_node_data_float_buffer.z;
+  int d = s_input_node_data_float_buffer.d;
+  uint8_t *byte_data = s_input_node_data_float_buffer.byte_array_data;
+  int array_size = s_input_node_data_float_buffer.array_size;
+  const bool success = hexagon_controller_ExecuteGraph(
+      nn_id, x, y, z, d, byte_data, array_size,
+      &out_batches, &out_height, &out_width, &out_depth,
+      (uint8_t *)s_output_node_data_float_buffer,
+      s_output_node_data_float_buffer_byte_size,
+      &out_data_size);
+  s_output_node_data_float_array_size =
+      out_batches * out_height * out_width * out_depth;
+  if (!success) {
+    TFMLOGE("Execution failed");
+    return false;
+  } else if (!show_ranking) {
+    return true;
+  }
+
+  static const int OUT_RANKING_SIZE = 5;
+  int out_ranking[OUT_RANKING_SIZE];
+  hexagon_controller_PrintMaxNIdx(
+      s_output_node_data_float_buffer,
+      out_batches * out_height * out_width * out_depth,
+      OUT_RANKING_SIZE, out_ranking);
+  TFMLOGD("%d x %d x %d x %d, byte size = %d\n",
+          out_batches,
+          out_height,
+          out_width,
+          out_depth,
+          out_data_size);
+  if (s_dbg_use_inception_dummy_data) {
+    // Check the result of inception with a dummy data. This step shouldn't
+    // be passed when show_ranking != true to avoid adding unnecessary
+    // additional computation cost.
+    if (out_ranking[0] == 169 && out_ranking[1] == 7) {
+      TFMLOGD("Result is correct! %d, %d", out_ranking[0], out_ranking[1]);
+      return true;
+    } else {
+      TFMLOGD("Result is wrong! %d, %d", out_ranking[0], out_ranking[1]);
+      return false;
+    }
+  }
+  return true;
+}
+
+// Returns the graph id currently stored in s_target_graph_id.
+uint32_t hexagon_controller_GetTargetGraphId() {
+  return s_target_graph_id;
+}
+
+// Stores graph_id in s_target_graph_id for later retrieval.
+void hexagon_controller_SetTargetGraphId(uint32_t graph_id) {
+  s_target_graph_id = graph_id;
+}
+
+// Asks hexagon_nn_snpprint() to render graph `id` into s_print_buf, logs the
+// buffer as a string, and logs an error if the call returned non-zero.
+void hexagon_controller_PrintGraph(uint32_t id) {
+  int retval = hexagon_nn_snpprint(id, s_print_buf, PRINT_BUFSIZE);
+  // Force termination so the "%s" below can never read past the buffer,
+  // whatever the call wrote.
+  s_print_buf[PRINT_BUFSIZE - 1] = '\0';
+  TFMLOGD("PrintGraph %s\n", s_print_buf);
+  if (retval) {
+    TFMLOGE("Error on print graph\n");
+  }
+}
+
+// Returns the compile-time wrapper version (GEMM_WRAPPER_VERSION).
+int hexagon_controller_GetWrapperVersion() {
+  return GEMM_WRAPPER_VERSION;
+}
+
+// Returns the value written by hexagon_nn_GetHexagonBinaryVersion(), or 0 if
+// that call leaves the output untouched. The call's own result is ignored.
+int hexagon_controller_GetHexagonBinaryVersion() {
+  int retval = 0;
+  hexagon_nn_GetHexagonBinaryVersion(&retval);
+  return retval;
+}
+
+// Allocates the input node buffers (a float array and a byte array) and the
+// output float buffer. The float array holds input_size floats; the byte
+// array holds input_size floats when USE_FLOAT_DATA is set, otherwise
+// input_size bytes; the output buffer holds output_size floats.
+// Returns false, keeping no allocation, when the input buffer already exists
+// or when any malloc() fails.
+bool hexagon_controller_AllocateNodeDataBuffers(
+    int input_size, int output_size) {
+  TFMLOGD("Allocate memory for input / output node data float");
+  if (s_input_node_data_float_buffer.buf_size != 0) {
+    TFMLOGE("ERROR! input buffer is already allocated!!");
+    return false;
+  }
+  const size_t byte_array_data_size = USE_FLOAT_DATA ?
+      input_size * sizeof(float) : (size_t)input_size;
+  // Allocate everything up front so that a failure leaves the module state
+  // untouched (buf_size stays 0 and a later call can retry).
+  // unused? remove?
+  void* float_array = malloc(input_size * sizeof(float));
+  void* byte_array = malloc(byte_array_data_size);
+  void* output_array = malloc(output_size * sizeof(float));
+  if (float_array == NULL || byte_array == NULL || output_array == NULL) {
+    TFMLOGE("ERROR! failed to allocate node data buffers");
+    // free(NULL) is a no-op, so the buffers that did succeed are released.
+    free(float_array);
+    free(byte_array);
+    free(output_array);
+    return false;
+  }
+
+  s_input_node_data_float_buffer.buf_size = input_size;
+  s_input_node_data_float_buffer.array_data = float_array;
+  s_input_node_data_float_buffer.byte_array_data = byte_array;
+  s_output_node_data_float_buffer = output_array;
+  s_output_node_data_float_buffer_byte_size = output_size * sizeof(float);
+  s_output_node_data_float_array_size = 0;
+  TFMLOGD("allocate node data buffers");
+  return true;
+}
+
+// Frees the buffers created by hexagon_controller_AllocateNodeDataBuffers():
+// both input arrays and the output buffer, resetting the recorded sizes and
+// clearing the pointers. Returns false (after logging) when the input buffer
+// or the output buffer is recorded as not allocated.
+bool hexagon_controller_ReleaseNodeDataBuffers() {
+  if (s_input_node_data_float_buffer.buf_size == 0) {
+    TFMLOGE("ERROR! input buffer has not been allocated yet!!");
+    return false;
+  }
+  // Both input arrays belong to the input buffer, so they are released
+  // together before the output buffer is looked at.
+  s_input_node_data_float_buffer.buf_size = 0;
+  free(s_input_node_data_float_buffer.array_data);
+  free(s_input_node_data_float_buffer.byte_array_data);
+  s_input_node_data_float_buffer.array_data = NULL;
+  s_input_node_data_float_buffer.byte_array_data = NULL;
+
+  if (s_output_node_data_float_buffer_byte_size == 0) {
+    TFMLOGE("ERROR! output buffer has not been allocated yet!!");
+    return false;
+  }
+  s_output_node_data_float_buffer_byte_size = 0;
+  s_output_node_data_float_array_size = 0;
+  free(s_output_node_data_float_buffer);
+  s_output_node_data_float_buffer = NULL;
+  return true;
+}
+
+// Copies x * y * z * d elements of type_byte_size bytes each from array_data
+// into the input byte buffer and records the byte count and the four
+// dimensions. Returns false, copying nothing, when array_data is NULL, the
+// computed size is negative, or the size exceeds the recorded buffer size.
+bool hexagon_controller_CopyByteNodeData(
+    int x, int y, int z, int d, int type_byte_size, uint8_t* array_data) {
+  int array_byte_size = x * y * z * d * type_byte_size;
+  TFMLOGD("--- %d, %d, %d, %d, %d, %d",x,y,z,d,type_byte_size,array_byte_size);
+  // A negative size would pass the capacity check below and then turn into a
+  // huge size_t inside memcpy().
+  if (array_data == NULL || array_byte_size < 0) {
+    TFMLOGE("ERROR! invalid input node data (byte size = %d)",
+            array_byte_size);
+    return false;
+  }
+  if (s_input_node_data_float_buffer.buf_size < array_byte_size) {
+    TFMLOGE("ERROR! input buffer size is too small! %d < %d",
+            s_input_node_data_float_buffer.buf_size, array_byte_size);
+    return false;
+  }
+  memcpy(s_input_node_data_float_buffer.byte_array_data,
+         array_data, array_byte_size);
+  s_input_node_data_float_buffer.array_size = array_byte_size;
+  s_input_node_data_float_buffer.x = x;
+  s_input_node_data_float_buffer.y = y;
+  s_input_node_data_float_buffer.z = z;
+  s_input_node_data_float_buffer.d = d;
+  return true;
+}
+
+// Initializes the DSP side for maximum performance: starts adspmsgd,
+// optionally disables DCVS (when enable_dcvs is 0), calls
+// dspCV_initQ6_with_attributes() with the attribute table below, allocates
+// the node data buffers and, in dummy-data debug mode, preloads the inception
+// dummy input for `version`. bus_usage is used as the BUS_USAGE_PERCENT
+// attribute. Returns the result of dspCV_initQ6_with_attributes().
+int hexagon_controller_InitHexagonWithMaxAttributes(
+    int enable_dcvs, int bus_usage, int version) {
+  TFMLOGI("Init hexagon with max attributes");
+  const int MCPS = 1000;
+  const int MBPS = 12000;
+
+  adspmsgd_start(0, RPCMEM_HEAP_DEFAULT, 4096);
+
+  dspCV_Attribute attrib[] = {
+    // The below values will result in the maximum aDSP performance,
+    // at Turbo voltage.
+    // Slightly more MCPS than are available on current targets
+    {DSP_TOTAL_MCPS, MCPS},
+    // drive the clock to MAX on known targets
+    {DSP_MCPS_PER_THREAD, MCPS / 2},
+    // 12 GB/sec is slightly higher than the max realistic
+    // max BW on existing targets.
+    {PEAK_BUS_BANDWIDTH_MBPS, MBPS},
+    // This app is non-real time, and constantly reading/writing memory
+    {BUS_USAGE_PERCENT, bus_usage},
+  };
+  int retval = 0;
+  if (!enable_dcvs) {
+    // A failure here is only logged; retval is overwritten just below.
+    retval = hexagon_nn_disableDcvs();
+    if (retval) {
+      TFMLOGE("Failed to disable DSP DCVS: %x\n", retval);
+    }
+  }
+
+  retval =
+      dspCV_initQ6_with_attributes(attrib, sizeof(attrib) / sizeof(attrib[0]));
+  TFMLOGD("Return value from dspCV_initQ6() : %d\n", retval);
+
+  // NOTE(review): the allocation result is not checked.
+  hexagon_controller_AllocateNodeDataBuffers(
+      INPUT_NODE_DATA_BUFFER_SIZE, OUTPUT_NODE_DATA_BUFFER_SIZE);
+
+  if (s_dbg_use_inception_dummy_data) {
+    hexagon_controller_InitInputNodeDataToInceptionDummyData(version);
+  }
+  s_target_graph_id = 0;
+
+  return retval;
+}
+
+// Shuts the DSP side down: stops adspmsgd, calls dspCV_deinitQ6() and releases
+// the node data buffers. Returns the result of dspCV_deinitQ6(); the result of
+// releasing the buffers is not checked.
+int hexagon_controller_DeInitHexagon() {
+  adspmsgd_stop();
+  TFMLOGI("Finalize hexagon");
+  const int retval = dspCV_deinitQ6();
+  TFMLOGD("return value from dspCV_deinitQ6(): %d \n", retval);
+
+  hexagon_controller_ReleaseNodeDataBuffers();
+
+  return retval;
+}
+
+// Calls hexagon_nn_config(); any result of that call is not returned.
+// NOTE(review): presumably this is what grows the memory size — confirm
+// against the hexagon_nn documentation.
+void hexagon_controller_GrowMemorySize() {
+  hexagon_nn_config();
+}
+
+// Returns a pointer to the file-static input node buffer descriptor; the
+// caller does not own it.
+struct NodeDataFloat* hexagon_controller_GetInputNodeDataFloatBuffer() {
+  return &s_input_node_data_float_buffer;
+}
+
+// Returns the shared output float buffer and stores its recorded element
+// count in *out_array_size. node_name is currently not used.
+float* hexagon_controller_GetOutputNodeDataFloatBuffer(
+    const char *const node_name, int* out_array_size) {
+  *out_array_size = s_output_node_data_float_array_size;
+  return s_output_node_data_float_buffer;
+}
+
+// Append const node to the graph
+// Logs the node parameters (including node_id only when DBG_SHOW_ID is set),
+// then forwards to hexagon_nn_append_const_node(). Returns that call's
+// result; a non-zero result is also logged as an error.
+int hexagon_controller_AppendConstNode(
+    const char* const name, int graph_id, int node_id,
+    int batch, int height, int width, int depth,
+    const uint8_t* const data, int data_length) {
+  if (!DBG_SHOW_ID) {
+    TFMLOGV("---(CONST) %s, %d, %d, %d, %d, %d",
+            name, batch, height, width, depth, data_length);
+  } else {
+    TFMLOGV("---(CONST) %s, %d, %d, %d, %d, %d, %d",
+            name, node_id, batch, height, width, depth, data_length);
+  }
+  const int status = hexagon_nn_append_const_node(
+      graph_id, node_id, batch, height, width, depth, data, data_length);
+  if (status != 0) {
+    TFMLOGE("Failed to append const node %d", node_id);
+  }
+  return status;
+}
+
+// Clamps a write offset into [0, OUTPUT_PARAM_MAX_LINE_SIZE - 1] so that the
+// next snprintf() target and size always stay inside a parameter line buffer.
+static int ClampParamLinePos(int pos) {
+  const int last = (int)OUTPUT_PARAM_MAX_LINE_SIZE - 1;
+  if (pos < 0) {
+    return 0;
+  }
+  return pos > last ? last : pos;
+}
+
+// Append node to the graph
+// Formats the node's inputs and outputs into two text lines for the verbose
+// log, then forwards to hexagon_nn_append_node(). Returns that call's result;
+// a non-zero result is also logged as an error. Text that does not fit into
+// a line buffer is truncated.
+int hexagon_controller_AppendNode(
+    const char* const name, int graph_id, int node_id, int ops_id,
+    int padding_id, const hexagon_nn_input* const inputs,
+    int inputs_count, const hexagon_nn_output* const outputs,
+    int outputs_count) {
+  const int cap = (int)OUTPUT_PARAM_MAX_LINE_SIZE;
+
+  // Every snprintf() is given only the space left after `pos`; its return
+  // value is the untruncated length, so the new offset is clamped.
+  char input_param_buf[OUTPUT_PARAM_MAX_LINE_SIZE];
+  memset(input_param_buf, 0, OUTPUT_PARAM_MAX_LINE_SIZE);
+  int pos = 0;
+  pos = ClampParamLinePos(
+      pos + snprintf(&input_param_buf[pos], cap - pos, "in: "));
+  for (int i = 0; i < inputs_count; ++i) {
+    if (DBG_SHOW_ID) {
+      pos = ClampParamLinePos(
+          pos + snprintf(&input_param_buf[pos], cap - pos, "(%d, %d), ",
+                         inputs[i].src_id, inputs[i].output_idx));
+    } else {
+      pos = ClampParamLinePos(
+          pos + snprintf(&input_param_buf[pos], cap - pos, "(%d), ",
+                         inputs[i].output_idx));
+    }
+  }
+
+  char output_param_buf[OUTPUT_PARAM_MAX_LINE_SIZE];
+  memset(output_param_buf, 0, OUTPUT_PARAM_MAX_LINE_SIZE);
+  pos = 0;
+  pos = ClampParamLinePos(
+      pos + snprintf(&output_param_buf[pos], cap - pos, "out: "));
+  for (int i = 0; i < outputs_count; ++i) {
+    pos = ClampParamLinePos(
+        pos + snprintf(&output_param_buf[pos], cap - pos, "(%d), ",
+                       outputs[i].max_size));
+  }
+
+  if (DBG_SHOW_ID) {
+    TFMLOGV("---(OP) %s, %d, %d, %d, %d, %d, %s, %s", name, node_id,
+            ops_id, padding_id, inputs_count, outputs_count, input_param_buf,
+            output_param_buf);
+  } else {
+    TFMLOGV("---(OP) %s, %d, %d, %d, %d, %s, %s", name,
+            ops_id, padding_id, inputs_count, outputs_count, input_param_buf,
+            output_param_buf);
+  }
+  const int retval = hexagon_nn_append_node(
+      graph_id, node_id, ops_id, padding_id,
+      inputs, inputs_count,
+      outputs, outputs_count);
+  if (retval != 0) {
+    TFMLOGE("Failed to append node %d", node_id);
+  }
+  return retval;
+}
+
+// Turns the inception dummy-data debug mode on or off. The flag is read when
+// hexagon is initialized and when a buffered execution checks its ranking.
+void hexagon_controller_EnableDbgUseInceptionDummyData(bool enable) {
+  s_dbg_use_inception_dummy_data = enable;
+}
+
+// Returns whether the inception dummy-data debug mode is currently enabled.
+bool hexagon_controller_IsDbgUseInceptionDummyDataEnabled() {
+  return s_dbg_use_inception_dummy_data;
+}
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/include/hexagon_controller.h b/tensorflow/contrib/hvx/hexagon_controller/src_impl/include/hexagon_controller.h
new file mode 100644
index 0000000000..eaf4a58751
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/include/hexagon_controller.h
@@ -0,0 +1,124 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef GEMM_WRAPPER_H
+#define GEMM_WRAPPER_H
+
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include "hexagon_nn.h"
+#include "node_data_float.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Input dimensions used for the inception dummy data: batch count, height and
+// width for v1 (224) and v3 (299), and depth.
+#define INCEPTION_PARAM_BATCHES 1
+#define INCEPTION_PARAM_HEIGHT_V1 224
+#define INCEPTION_PARAM_WIDTH_V1 224
+#define INCEPTION_PARAM_HEIGHT_V3 299
+#define INCEPTION_PARAM_WIDTH_V3 299
+#define INCEPTION_PARAM_DEPTH 3
+
+// General functions
+// Logs a text rendering of the graph.
+void hexagon_controller_PrintGraph(uint32_t nn_id);
+
+// Returns the version of this wrapper.
+int hexagon_controller_GetWrapperVersion();
+
+// Returns the version reported by the hexagon binary.
+int hexagon_controller_GetHexagonBinaryVersion();
+
+// Hexagon perf functions
+// Initializes hexagon with maximum-performance attributes and allocates the
+// node data buffers. Returns the DSP initialization result.
+int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs,
+                                                    int bus_usage, int version);
+
+// Allocates / releases the shared input and output node data buffers.
+// Both return false on failure.
+bool hexagon_controller_AllocateNodeDataBuffers(int input_size,
+                                                int output_size);
+
+bool hexagon_controller_ReleaseNodeDataBuffers();
+
+// Copies x * y * z * d elements of type_byte_size bytes into the input buffer.
+// Returns false when the data does not fit.
+bool hexagon_controller_CopyByteNodeData(int x, int y, int z, int d,
+                                         int type_byte_size,
+                                         uint8_t* array_data);
+
+// Shuts hexagon down and releases the node data buffers.
+int hexagon_controller_DeInitHexagon();
+
+// Accessors for the stored target graph id.
+uint32_t hexagon_controller_GetTargetGraphId();
+
+void hexagon_controller_SetTargetGraphId(uint32_t graph_id);
+
+// Hexagon config functions
+void hexagon_controller_GrowMemorySize();
+
+// Graph data transfer functions
+// Returns the shared input buffer descriptor (not owned by the caller).
+struct NodeDataFloat* hexagon_controller_GetInputNodeDataFloatBuffer();
+
+// Returns the shared output buffer and stores its element count in
+// *out_array_size.
+float* hexagon_controller_GetOutputNodeDataFloatBuffer(
+    const char* const node_name, int* out_array_size);
+
+// Graph functions
+// Obtains a new graph id.
+uint32_t hexagon_controller_InstantiateGraph();
+
+// Runs the graph initializer for inception `version` (1 or 3) on nn_id.
+void hexagon_controller_InitGraph(int version, uint32_t nn_id);
+
+// Prepares the graph; returns true on success.
+bool hexagon_controller_ConstructGraph(uint32_t nn_id);
+
+// Instantiate + init + construct in one call; returns the graph id.
+uint32_t hexagon_controller_SetupGraph(int version);
+
+// Executes the graph on the 299x299 dummy byte data and checks the ranking.
+bool hexagon_controller_ExecuteInceptionDummyData(uint32_t nn_id);
+
+// Executes the graph on the given input; the out_* pointers receive the
+// output dimensions, values and byte count. Returns true on success.
+bool hexagon_controller_ExecuteGraph(
+    const uint32_t nn_id, const uint32_t batches, const uint32_t height,
+    const uint32_t width, const uint32_t depth, uint8_t* int_data,
+    const uint32_t int_data_size, uint32_t* out_batches, uint32_t* out_height,
+    uint32_t* out_width, uint32_t* out_depth, uint8_t* out_vals,
+    const uint32_t output_val_byte_size, uint32_t* out_data_byte_size);
+
+// Executes the graph on the shared input buffer, writing to the shared
+// output buffer; optionally logs the output ranking.
+bool hexagon_controller_ExecuteGraphWithBuffer(uint32_t nn_id,
+                                               bool show_ranking);
+
+// Log per-node cycle counts for the graph.
+void hexagon_controller_DumpPerf(uint32_t nn_id);
+
+void hexagon_controller_DumpNodeName(uint32_t nn_id);
+
+// Tears the graph down.
+void hexagon_controller_Teardown(uint32_t nn_id);
+
+// Stores the indices of the n top-ranked values of data in out_ranking and
+// logs them.
+void hexagon_controller_PrintMaxNIdx(const float* data, const uint32_t entries,
+                                     const int n, int* out_ranking);
+
+// Copies the inception dummy input for `version` into the input buffer.
+void hexagon_controller_InitInputNodeDataToInceptionDummyData(int version);
+
+// Append an op node / a const node to graph graph_id. Both return the result
+// of the underlying hexagon_nn call.
+int hexagon_controller_AppendNode(const char* const name, int graph_id,
+                                  int node_id, int op_id, int padding_id,
+                                  const hexagon_nn_input* const inputs,
+                                  int inputs_count,
+                                  const hexagon_nn_output* const outputs,
+                                  int outputs_count);
+
+int hexagon_controller_AppendConstNode(const char* const name, int graph_id,
+                                       int node_id, int batch, int height,
+                                       int width, int depth,
+                                       const uint8_t* const data,
+                                       int data_length);
+
+// Setter / getter for the inception dummy-data debug mode.
+void hexagon_controller_EnableDbgUseInceptionDummyData(bool enable);
+
+bool hexagon_controller_IsDbgUseInceptionDummyDataEnabled();
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif  // GEMM_WRAPPER_H
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_log/include/tfm_log.h b/tensorflow/contrib/hvx/hexagon_controller/src_log/include/tfm_log.h
new file mode 100644
index 0000000000..e8615fd4ec
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_log/include/tfm_log.h
@@ -0,0 +1,74 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef GEMM_WRAPPER_LOG_H
+#define GEMM_WRAPPER_LOG_H
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#define TFM_LOG_LEVEL_VERBOSE -2
+#define TFM_LOG_LEVEL_DEBUG -1
+#define TFM_LOG_LEVEL_INFO 0
+#define TFM_LOG_LEVEL_WARNING 1
+#define TFM_LOG_LEVEL_ERROR 2
+#define TFM_LOG_LEVEL_FATAL 3
+// Threshold; as a header-level static, each including file gets its own copy.
+static int s_log_level = TFM_LOG_LEVEL_INFO;
+// True when a message at log_level passes the current threshold.
+static inline bool IsLogOn(int log_level) { return log_level >= s_log_level; }
+// Updates the threshold read by IsLogOn (in the calling file's copy only).
+static inline void SetLogLevel(int log_level) { s_log_level = log_level; }
+
+#define TFMLOGV(fmt, ...)                       \
+  do {                                          \
+    if (!IsLogOn(TFM_LOG_LEVEL_VERBOSE)) break; \
+    printf(fmt "\n", ##__VA_ARGS__);            \
+  } while (0)
+
+#define TFMLOGD(fmt, ...)                     \
+  do {                                        \
+    if (!IsLogOn(TFM_LOG_LEVEL_DEBUG)) break; \
+    printf(fmt "\n", ##__VA_ARGS__);          \
+  } while (0)
+
+#define TFMLOGI(fmt, ...)                    \
+  do {                                       \
+    if (!IsLogOn(TFM_LOG_LEVEL_INFO)) break; \
+    printf(fmt "\n", ##__VA_ARGS__);         \
+  } while (0)
+
+#define TFMLOGE(fmt, ...)                     \
+  do {                                        \
+    if (!IsLogOn(TFM_LOG_LEVEL_ERROR)) break; \
+    printf(fmt "\n", ##__VA_ARGS__);          \
+  } while (0)
+
+static inline void PrintLogHexagon(const char* fmt, va_list ap) {
+  char buffer[200];
+  // vsnprintf takes the va_list and always NUL-terminates within the buffer.
+  if (vsnprintf(buffer, sizeof(buffer), fmt, ap) < 0) buffer[0] = '\0';
+  TFMLOGI("%s", buffer);
+}
+
+static inline void LogDHexagon(const char* fmt, ...) {
+  va_list ap;
+  va_start(ap, fmt);
+  PrintLogHexagon(fmt, ap);
+  va_end(ap);
+}
+
+#endif
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/node_data_float.h b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/node_data_float.h
new file mode 100644
index 0000000000..a9c3296e9f
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/node_data_float.h
@@ -0,0 +1,41 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef NODE_DATA_FLOAT_H
+#define NODE_DATA_FLOAT_H
+
+#include <inttypes.h>  // uint8_t; needed by both C and C++ includers.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#define NODE_DATA_FLOAT_NODE_NAME_BUF_SIZE 100
+
+struct NodeDataFloat {
+  int x;
+  int y;
+  int z;
+  int d;
+  int buf_size;
+  int array_size;
+  float* array_data;
+  uint8_t* byte_array_data;
+  char node_name[NODE_DATA_FLOAT_NODE_NAME_BUF_SIZE];
+};
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // NODE_DATA_FLOAT_H
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c
index ebcbb963e8..7db8d4870c 100755
--- a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c
@@ -15,110 +15,230 @@ limitations under the License.
 
 #include "soc_interface.h"
 
+#include <inttypes.h>
+
+#include "hexagon_controller.h"
+#include "hexagon_nn.h"
+#include "node_data_float.h"
+#include "tfm_log.h"
+
+const int64_t FLAG_ENABLE_INCEPTION_DUMMY_BINARY_INPUT = 0x01;
+
+static const int INCEPTION_VERSION = 3;
+
+static hexagon_nn_input* s_node_inputs_array;
+static int s_node_inputs_array_index;
+static int s_node_inputs_array_max_count;
+
+static hexagon_nn_output* s_node_outputs_array;
+static int s_node_outputs_array_index;
+static int s_node_outputs_array_max_count;
+
 int soc_interface_GetWrapperVersion() {
-  // TODO(satok): implement
-  return -1;
+  TFMLOGD("GetWrapperVersion");
+  return hexagon_controller_GetWrapperVersion();
 }
 
 int soc_interface_GetSocControllerVersion() {
-  // TODO(satok): implement
-  return -1;
+  TFMLOGD("GetSocControllerVersion");
+  return hexagon_controller_GetHexagonBinaryVersion();
 }
 
 bool soc_interface_Init() {
-  // TODO(satok): implement
-  return false;
+  TFMLOGD("Init");
+  hexagon_controller_InitHexagonWithMaxAttributes(
+      0, 100, INCEPTION_VERSION /* version */);
+  hexagon_controller_GrowMemorySize();
+  return true;
 }
 
 bool soc_interface_Finalize() {
-  // TODO(satok): implement
-  return false;
+  TFMLOGD("Finalize");
+  hexagon_controller_DeInitHexagon();
+  return true;
 }
 
 bool soc_interface_ExecuteGraph() {
-  // TODO(satok): implement
-  return false;
+  TFMLOGD("ExecuteGraph");
+  if (hexagon_controller_IsDbgUseInceptionDummyDataEnabled()) {
+    hexagon_controller_InitInputNodeDataToInceptionDummyData(
+        INCEPTION_VERSION /* version */);
+  }
+  const uint32_t graph_id = hexagon_controller_GetTargetGraphId();
+  if (graph_id == 0) {
+    TFMLOGE("Graph id has not been set yet.");
+    return false;
+  }
+  // Propagate the controller's result instead of always reporting success.
+  return hexagon_controller_ExecuteGraphWithBuffer(graph_id, true);
 }
 
 bool soc_interface_TeardownGraph() {
-  // TODO(satok): implement
-  return false;
+  TFMLOGD("TeardownGraph");
+  const uint32_t graph_id = hexagon_controller_GetTargetGraphId();
+  if (graph_id == 0) {
+    TFMLOGE("Graph id has not been set yet.");
+    return false;
+  }
+  hexagon_controller_Teardown(graph_id);
+  return true;
 }
 
 bool soc_interface_FillInputNodeFloat(
-    int x, int y, int z, int d, const uint8_t* const buf, uint64_t buf_size) {
-  // TODO(satok): implement
-  return false;
+    int x, int y, int z, int d, const uint8_t* const buf,
+    uint64_t buf_size) {
+  TFMLOGD("FillInputNodeFloat");
+  struct NodeDataFloat* node_data_float =
+      hexagon_controller_GetInputNodeDataFloatBuffer();
+  const int array_size = x * y * z * d;
+  if (array_size > node_data_float->buf_size) {
+    TFMLOGE("Array size exceeds buf size %d > %d",
+            array_size, node_data_float->buf_size);
+    return false;
+  }
+  if (buf_size != array_size * sizeof(float)) {
+    TFMLOGE("Invalid buf size!");
+    return false;
+  }
+  memcpy(node_data_float->byte_array_data, buf, buf_size);
+  node_data_float->x = x;
+  node_data_float->y = y;
+  node_data_float->z = z;
+  node_data_float->d = d;
+  node_data_float->array_size = buf_size;
+  return true;
 }
 
 // TODO(satok): Remove and use runtime version
 bool soc_interface_ReadOutputNodeFloat(
     const char* const node_name, uint8_t** buf, uint64_t *buf_size) {
-  // TODO(satok): implement
-  return false;
+  TFMLOGD("ReadOutputNodeFloat");
+  int array_size = -1;
+  float* output_node_data_float =
+      hexagon_controller_GetOutputNodeDataFloatBuffer(node_name, &array_size);
+  if (array_size < 0) {
+    TFMLOGE("Failed to read data.");
+    return false;
+  }
+  *buf = (uint8_t*)output_node_data_float;
+  *buf_size = array_size * sizeof(float);
+  return true;
 }
 
 bool soc_interface_SetupGraphDummy(int version) {
-  // TODO(satok): implement
-  return false;
+  TFMLOGD("SetupGraphDummy");
+  const uint32_t graph_id = hexagon_controller_SetupGraph(version);
+  if (graph_id == 0) {
+    TFMLOGE("Failed to setup graph");
+    return false;
+  }
+  hexagon_controller_SetTargetGraphId(graph_id);
+  return true;
 }
 
 bool soc_interface_AllocateNodeInputAndNodeOutputArray(
     int total_input_count, int total_output_count) {
-  // TODO(satok): implement
-  return false;
+  TFMLOGD("Allocate node inputs and node outputs array %d, %d",
+          total_input_count, total_output_count);
+  s_node_inputs_array = malloc(total_input_count * sizeof(hexagon_nn_input));
+  s_node_outputs_array = malloc(total_output_count * sizeof(hexagon_nn_output));
+  s_node_inputs_array_index = 0;
+  s_node_outputs_array_index = 0;
+  s_node_inputs_array_max_count = total_input_count;
+  s_node_outputs_array_max_count = total_output_count;
+  return true;
 }
 
 bool soc_interface_ReleaseNodeInputAndNodeOutputArray() {
-  // TODO(satok): implement
-  return false;
+  TFMLOGD("Release node inputs and node outputs array");
+  free(s_node_inputs_array);
+  free(s_node_outputs_array);
+  return true;
 }
 
 void* soc_interface_SetOneNodeInputs(
     int input_count, const int* const node_id, const int* const port) {
-  // TODO(satok): implement
-  return 0;
+  if (s_node_inputs_array_index + input_count > s_node_inputs_array_max_count) {
+    TFMLOGE("input count exceeds limit");
+    return 0;
+  }
+  for (int i = 0; i < input_count; ++i) {
+    const int index = s_node_inputs_array_index + i;
+    s_node_inputs_array[index].src_id = node_id[i];
+    s_node_inputs_array[index].output_idx = port[i];
+  }
+  void* retval = (void*)(&s_node_inputs_array[s_node_inputs_array_index]);
+  s_node_inputs_array_index += input_count;
+  return retval;
 }
 
 void* soc_interface_SetOneNodeOutputs(int output_count, int* max_size) {
-  // TODO(satok): implement
-  return 0;
+  if (s_node_outputs_array_index + output_count >
+      s_node_outputs_array_max_count) {
+    TFMLOGE("output count exceeds limit");
+    return 0;
+  }
+  for (int i = 0; i < output_count; ++i) {
+    const int index = s_node_outputs_array_index + i;
+    s_node_outputs_array[index].max_size = max_size[i];
+  }
+  void* retval = (void*)(&s_node_outputs_array[s_node_outputs_array_index]);
+  s_node_outputs_array_index += output_count;
+  return retval;
 }
 
 // Append const node to the graph
 bool soc_interface_AppendConstNode(
-    const char* const name, int node_id, int batch, int height, int width,
-    int depth, const uint8_t* const data, int data_length) {
-  // TODO(satok): implement
-  return false;
+    const char* const name, int node_id, int batch, int height, int width, int depth,
+    const uint8_t* const data, int data_length) {
+  const uint32_t graph_id = hexagon_controller_GetTargetGraphId();
+  const int retval = hexagon_controller_AppendConstNode(
+      name, graph_id, node_id, batch, height, width, depth, data, data_length);
+  if (retval != 0) {
+    TFMLOGE("Failed to append const node %d", node_id);
+    return false;
+  }
+  return true;
 }
 
 // Append node to the graph
 bool soc_interface_AppendNode(
-    const char* const name, int node_id, int ops_id, int padding_id,
-    const void* const inputs, int inputs_count, const void* const outputs,
-    int outputs_count) {
-  // TODO(satok): implement
-  return false;
+    const char* const name, int node_id, int ops_id, int padding_id, const void* const inputs,
+    int inputs_count, const void* const outputs, int outputs_count) {
+  const uint32_t graph_id = hexagon_controller_GetTargetGraphId();
+  const int retval = hexagon_controller_AppendNode(
+      name, graph_id, node_id, ops_id, padding_id,
+      (const hexagon_nn_input*)inputs, inputs_count,
+      (const hexagon_nn_output*)outputs, outputs_count);
+  if (retval != 0) {
+    TFMLOGE("Failed to append node %d", node_id);
+    return false;
+  }
+  return true;
 }
 
 
 // Instantiate graph
 bool soc_interface_InstantiateGraph() {
-  // TODO(satok): implement
-  return false;
+  const uint32_t nn_id = hexagon_controller_InstantiateGraph();
+  hexagon_controller_SetTargetGraphId(nn_id);
+  return nn_id != 0;  // 0 is treated as "no graph" elsewhere in this file.
 }
 
 // Construct graph
 bool soc_interface_ConstructGraph() {
-  // TODO(satok): implement
-  return false;
+  const uint32_t graph_id = hexagon_controller_GetTargetGraphId();
+  return hexagon_controller_ConstructGraph(graph_id);
 }
 
 void soc_interface_SetLogLevel(int log_level) {
-  // TODO(satok): implement
+  SetLogLevel(log_level);
 }
 
 void soc_interface_SetDebugFlag(uint64_t flag) {
-  // TODO(satok): implement
+  TFMLOGI("Set debug flag 0x%" PRIx64, flag);
+  if ((flag & FLAG_ENABLE_INCEPTION_DUMMY_BINARY_INPUT) != 0) {
+    TFMLOGI("Enable always use panda data");
+    hexagon_controller_EnableDbgUseInceptionDummyData(true);
+  }
 }
diff --git a/tensorflow/contrib/hvx/hexagon_controller/target/make/android.min b/tensorflow/contrib/hvx/hexagon_controller/target/make/android.min
new file mode 100644
index 0000000000..4770d31c56
--- /dev/null
+++ b/tensorflow/contrib/hvx/hexagon_controller/target/make/android.min
@@ -0,0 +1,70 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+$(info ------------------------------------------)
+$(info --- V = $(V))
+$(info --- GLUE_DIR = $(GLUE_DIR))
+$(info --- HEXAGON_SDK_ROOT = $(HEXAGON_SDK_ROOT))
+$(info ------------------------------------------)
+
+INCDIRS += ../../../libs/common/adspmsgd/ship/android_Release
+
+INCDIRS += src_impl/include
+INCDIRS += src_log/include
+INCDIRS += src_soc_interface/include
+
+LIBDIRS += ../../../libs/common/adspmsgd/ship/android_Release
+
+BUILD_DLLS=libhexagon_controller
+
+hexagon_controller_lib_QAICIDLS += \
+interface/hexagon_nn \
+$(MAKE_D_DSPCV_INCDIR)/dspCV
+
+# hexagon controller library
+hexagon_controller_lib_C_SRCS += \
+src_impl/hexagon_controller \
+src_impl/graph_functions_wrapper \
+src_soc_interface/soc_interface
+
+# dummy data
+hexagon_controller_lib_C_SRCS += \
+src_dummy_data/inception_v1_graph_init \
+src_dummy_data/inception_v3_dummy_float_data \
+src_dummy_data/inception_v3_dummy_int_data \
+src_dummy_data/inception_v3_graph_init
+
+# hexagon interface
+hexagon_controller_lib_C_SRCS += \
+$V/hexagon_nn_stub \
+$V/dspCV_stub
+
+hexagon_controller_lib_DLLS += libadsprpc
+hexagon_controller_lib_LIBS += rpcmem adspmsgd
+hexagon_controller_lib_LD_FLAGS += -llog
+hexagon_controller_lib_DEFINES += VERIFY_PRINT_ERROR
+
+libhexagon_controller_QAICIDLS += $(hexagon_controller_lib_QAICIDLS)
+libhexagon_controller_C_SRCS += $(hexagon_controller_lib_C_SRCS)
+libhexagon_controller_DLLS += $(hexagon_controller_lib_DLLS)
+libhexagon_controller_LIBS += $(hexagon_controller_lib_LIBS)
+libhexagon_controller_LD_FLAGS += $(hexagon_controller_lib_LD_FLAGS)
+libhexagon_controller_DEFINES += $(hexagon_controller_lib_DEFINES)
+
+BUILD_COPIES = \
+   $(DLLS) \
+   $(EXES) \
+   $(LIBS) \
+   $(SHIP_DIR)/ ;
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 2673495b90..e47342f966 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -1385,7 +1385,8 @@ def fully_connected(inputs,
   if not isinstance(num_outputs, six.integer_types):
     raise ValueError('num_outputs should be int or long, got %s.', num_outputs)
 
-  layer_variable_getter = _build_variable_getter({'bias': 'biases'})
+  layer_variable_getter = _build_variable_getter({'bias': 'biases',
+                                                  'kernel': 'weights'})
 
   with variable_scope.variable_scope(
       scope, 'fully_connected', [inputs],
@@ -1395,9 +1396,9 @@ def fully_connected(inputs,
         units=num_outputs,
         activation=None,
         use_bias=not normalizer_fn and biases_initializer,
-        weights_initializer=weights_initializer,
+        kernel_initializer=weights_initializer,
         bias_initializer=biases_initializer,
-        weights_regularizer=weights_regularizer,
+        kernel_regularizer=weights_regularizer,
         bias_regularizer=biases_regularizer,
         activity_regularizer=None,
         trainable=trainable,
@@ -1408,7 +1409,7 @@ def fully_connected(inputs,
     outputs = layer.apply(inputs)
 
     # Add variables to collections.
-    _add_variable_to_collections(layer.w, variables_collections, 'weights')
+    _add_variable_to_collections(layer.kernel, variables_collections, 'weights')
     if layer.bias is not None:
       _add_variable_to_collections(layer.bias, variables_collections, 'biases')
 
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index d1b35e33c2..6043d4dc0e 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1563,7 +1563,7 @@ class FCTest(test.TestCase):
       _layers.fully_connected(inputs, 32, weights_regularizer=weight_decay)
       wd = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)[0]
       self.assertEqual(wd.op.name,
-                       'fully_connected/weights/Regularizer/l2_regularizer')
+                       'fully_connected/kernel/Regularizer/l2_regularizer')
       sess.run(variables_lib.global_variables_initializer())
       self.assertLess(sess.run(wd), 0.4)
 
diff --git a/tensorflow/contrib/layers/python/layers/optimizers.py b/tensorflow/contrib/layers/python/layers/optimizers.py
index 0b50d93b72..bab59d0048 100644
--- a/tensorflow/contrib/layers/python/layers/optimizers.py
+++ b/tensorflow/contrib/layers/python/layers/optimizers.py
@@ -176,6 +176,11 @@ def optimize_loss(loss,
                                                 str(type(learning_rate))))
     if summaries is None:
       summaries = ["loss", "learning_rate"]
+    else:
+      for summ in summaries:
+        if summ not in OPTIMIZER_SUMMARIES:
+          raise ValueError("Summaries should be one of [%s], you provided %s." %
+                           (", ".join(OPTIMIZER_SUMMARIES), summ))
     if learning_rate is not None and learning_rate_decay_fn is not None:
       if global_step is None:
         raise ValueError("global_step is required for learning_rate_decay_fn.")
diff --git a/tensorflow/contrib/layers/python/layers/optimizers_test.py b/tensorflow/contrib/layers/python/layers/optimizers_test.py
index b7b984b1e8..9dc612e58e 100644
--- a/tensorflow/contrib/layers/python/layers/optimizers_test.py
+++ b/tensorflow/contrib/layers/python/layers/optimizers_test.py
@@ -108,6 +108,14 @@ class OptimizersTest(test.TestCase):
             optimizers_lib.optimize_loss(
                 loss, global_step, learning_rate=0.1, optimizer=optimizer)
 
+  def testBadSummaries(self):
+    with ops.Graph().as_default() as g, self.test_session(graph=g):
+      _, _, loss, global_step = _setup_model()
+      with self.assertRaises(ValueError):
+        optimizers_lib.optimize_loss(
+            loss, global_step, learning_rate=0.1, optimizer="SGD",
+            summaries=["loss", "bad_summary"])
+
   def testInvalidLoss(self):
     with ops.Graph().as_default() as g, self.test_session(graph=g):
       _, _, _, global_step = _setup_model()
diff --git a/tensorflow/contrib/learn/python/learn/__init__.py b/tensorflow/contrib/learn/python/learn/__init__.py
index d7b9aaffd4..6a6ff10d44 100644
--- a/tensorflow/contrib/learn/python/learn/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/__init__.py
@@ -46,4 +46,5 @@ from tensorflow.contrib.learn.python.learn.learn_io import *
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.learn.python.learn.monitors import NanLossDuringTrainingError
 from tensorflow.contrib.learn.python.learn.trainable import Trainable
+from tensorflow.contrib.learn.python.learn.utils import *
 # pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 1d36389722..becdf61709 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -36,7 +36,6 @@ from tensorflow.contrib.framework import deprecated_arg_values
 from tensorflow.contrib.framework import deprecated_args
 from tensorflow.contrib.framework import list_variables
 from tensorflow.contrib.framework import load_variable
-from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
 from tensorflow.contrib.learn.python.learn import evaluable
 from tensorflow.contrib.learn.python.learn import metric_spec
@@ -68,7 +67,6 @@ from tensorflow.python.training import basic_session_run_hooks
 from tensorflow.python.training import device_setter
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver
-from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import summary_io
 from tensorflow.python.util import compat
 
@@ -815,9 +813,10 @@ class BaseEstimator(
 
       update_op, eval_dict = self._extract_metric_update_ops(eval_dict)
 
-      hooks = hooks or []
+      # We need to copy the hook array as we modify it, thus [:].
+      hooks = hooks[:] if hooks else []
       if feed_fn:
-        hooks.append(_FeedFnHook(feed_fn))
+        hooks.append(basic_session_run_hooks.FeedFnHook(feed_fn))
       if steps:
         hooks.append(
             evaluation.StopAfterNEvalsHook(
@@ -1216,22 +1215,20 @@ class Estimator(BaseEstimator):
         self._labels_info)
     return self._call_model_fn(features, labels, model_fn_lib.ModeKeys.INFER)
 
-  @experimental
   def export_savedmodel(
-      self, export_dir_base, input_fn,
+      self, export_dir_base, serving_input_fn,
       default_output_alternative_key=None,
       assets_extra=None,
-      as_text=False,
-      exports_to_keep=None):
+      as_text=False):
     """Exports inference graph as a SavedModel into given dir.
 
     Args:
       export_dir_base: A string containing a directory to write the exported
         graph and checkpoints.
-      input_fn: A function that takes no argument and
+      serving_input_fn: A function that takes no argument and
         returns an `InputFnOps`.
       default_output_alternative_key: the name of the head to serve when none is
-        specified.
+        specified.  Not needed for single-headed models.
       assets_extra: A dict specifying how to populate the assets.extra directory
         within the exported SavedModel.  Each key should give the destination
         path (including the filename) relative to the assets.extra directory.
@@ -1240,7 +1237,6 @@ class Estimator(BaseEstimator):
         renaming it is specified as
         `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
       as_text: whether to write the SavedModel proto in text format.
-      exports_to_keep: Number of exports to keep.
 
     Returns:
       The string path to the exported directory.
@@ -1248,14 +1244,14 @@ class Estimator(BaseEstimator):
     Raises:
       ValueError: if an unrecognized export_type is requested.
     """
-    if input_fn is None:
-      raise ValueError('input_fn must be defined.')
+    if serving_input_fn is None:
+      raise ValueError('serving_input_fn must be defined.')
 
     with ops.Graph().as_default() as g:
       contrib_variables.create_global_step(g)
 
-      # Call the input_fn and collect the input alternatives.
-      input_ops = input_fn()
+      # Call the serving_input_fn and collect the input alternatives.
+      input_ops = serving_input_fn()
       input_alternatives, features = (
           saved_model_export_utils.get_input_alternatives(input_ops))
 
@@ -1266,7 +1262,7 @@ class Estimator(BaseEstimator):
           saved_model_export_utils.get_output_alternatives(
               model_fn_ops, default_output_alternative_key))
 
-      # Build the SignatureDefs from all pairs of input and output signatures
+      # Build the SignatureDefs from all pairs of input and output alternatives
       signature_def_map = saved_model_export_utils.build_all_signature_defs(
           input_alternatives, output_alternatives,
           actual_default_output_alternative_key)
@@ -1317,17 +1313,6 @@ class Estimator(BaseEstimator):
       return export_dir
 
 
-class _FeedFnHook(session_run_hook.SessionRunHook):
-  """Runs feed_fn and sets the feed_dict accordingly."""
-
-  def __init__(self, feed_fn):
-    self.feed_fn = feed_fn
-
-  def before_run(self, run_context):  # pylint: disable=unused-argument
-    return session_run_hook.SessionRunArgs(
-        fetches=None, feed_dict=self.feed_fn())
-
-
 # For time of deprecation x,y from Estimator allow direct access.
 # pylint: disable=protected-access
 class SKCompat(sklearn.BaseEstimator):
@@ -1343,7 +1328,7 @@ class SKCompat(sklearn.BaseEstimator):
                                       epochs=None)
     all_monitors = []
     if feed_fn:
-      all_monitors = [_FeedFnHook(feed_fn)]
+      all_monitors = [basic_session_run_hooks.FeedFnHook(feed_fn)]
     if monitors:
       all_monitors.extend(monitors)
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index 0b4897d4b2..ffa2e17aec 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -211,12 +211,12 @@ def _build_estimator_for_export_tests(tmpdir):
 
   feature_spec = feature_column_lib.create_feature_spec_for_parsing(
       feature_columns)
-  export_input_fn = input_fn_utils.build_parsing_serving_input_fn(feature_spec)
+  serving_input_fn = input_fn_utils.build_parsing_serving_input_fn(feature_spec)
 
   # hack in an op that uses an asset, in order to test asset export.
   # this is not actually valid, of course.
-  def export_input_fn_with_asset():
-    features, labels, inputs = export_input_fn()
+  def serving_input_fn_with_asset():
+    features, labels, inputs = serving_input_fn()
 
     vocab_file_name = os.path.join(tmpdir, 'my_vocab_file')
     vocab_file = gfile.GFile(vocab_file_name, mode='w')
@@ -229,7 +229,7 @@ def _build_estimator_for_export_tests(tmpdir):
 
     return input_fn_utils.InputFnOps(features, labels, inputs)
 
-  return est, export_input_fn_with_asset
+  return est, serving_input_fn_with_asset
 
 
 class CheckCallsMonitor(monitors_lib.BaseMonitor):
@@ -620,6 +620,16 @@ class EstimatorTest(test.TestCase):
     predictions = list(est.predict(x=iris.data))
     self.assertEqual(len(predictions), iris.target.shape[0])
 
+  def testHooksNotChanged(self):
+    est = estimator.Estimator(model_fn=logistic_model_no_mode_fn)
+    # We pass an empty list and expect it to remain empty after calling fit
+    # and evaluate. This requires the estimator to copy the list internally
+    # before appending any hooks of its own.
+    my_array = []
+    est.fit(input_fn=iris_input_fn, steps=100, monitors=my_array)
+    _ = est.evaluate(input_fn=iris_input_fn, steps=1, hooks=my_array)
+    self.assertEqual(my_array, [])
+
   def testIrisInputFnLabelsDict(self):
     iris = base.load_iris()
     est = estimator.Estimator(model_fn=logistic_model_no_mode_fn)
@@ -811,7 +821,7 @@ class EstimatorTest(test.TestCase):
 
   def test_export_savedmodel(self):
     tmpdir = tempfile.mkdtemp()
-    est, export_input_fn = _build_estimator_for_export_tests(tmpdir)
+    est, serving_input_fn = _build_estimator_for_export_tests(tmpdir)
 
     extra_file_name = os.path.join(
         compat.as_bytes(tmpdir), compat.as_bytes('my_extra_file'))
@@ -823,7 +833,7 @@ class EstimatorTest(test.TestCase):
     export_dir_base = os.path.join(
         compat.as_bytes(tmpdir), compat.as_bytes('export'))
     export_dir = est.export_savedmodel(
-        export_dir_base, export_input_fn, assets_extra=assets_extra)
+        export_dir_base, serving_input_fn, assets_extra=assets_extra)
 
     self.assertTrue(gfile.Exists(export_dir_base))
     self.assertTrue(gfile.Exists(export_dir))
diff --git a/tensorflow/contrib/learn/python/learn/estimators/svm.py b/tensorflow/contrib/learn/python/learn/estimators/svm.py
index e7805d9a90..c898a4865b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/svm.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/svm.py
@@ -18,14 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
-import re
-
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.learn.python.learn import evaluable
-from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import linear
@@ -38,15 +32,7 @@ def _as_iterable(preds, output):
     yield pred[output]
 
 
-def _get_metric_args(metric):
-  if hasattr(metric, "__code__"):
-    return inspect.getargspec(metric).args
-  elif hasattr(metric, "func") and hasattr(metric, "keywords"):
-    return [arg for arg in inspect.getargspec(metric.func).args
-            if arg not in metric.keywords.keys()]
-
-
-class SVM(trainable.Trainable, evaluable.Evaluable):
+class SVM(estimator.Estimator):
   """Support Vector Machine (SVM) model for binary classification.
 
   Currently, only linear SVMs are supported. For the underlying optimization
@@ -106,7 +92,7 @@ class SVM(trainable.Trainable, evaluable.Evaluable):
                kernels=None,
                config=None,
                feature_engineering_fn=None):
-    """Constructs a `SVM~ estimator object.
+    """Constructs an `SVM` estimator object.
 
     Args:
       example_id_column: A string defining the feature column name representing
@@ -139,15 +125,15 @@ class SVM(trainable.Trainable, evaluable.Evaluable):
     """
     if kernels is not None:
       raise ValueError("Kernel SVMs are not currently supported.")
-    self._optimizer = sdca_optimizer.SDCAOptimizer(
+    optimizer = sdca_optimizer.SDCAOptimizer(
         example_id_column=example_id_column,
         num_loss_partitions=num_loss_partitions,
         symmetric_l1_regularization=l1_regularization,
         symmetric_l2_regularization=l2_regularization)
 
     self._feature_columns = feature_columns
-    self._chief_hook = linear._SdcaUpdateWeightsHook()  # pylint: disable=protected-access
-    self._estimator = estimator.Estimator(
+    chief_hook = linear._SdcaUpdateWeightsHook()  # pylint: disable=protected-access
+    super(SVM, self).__init__(
         model_fn=linear.sdca_model_fn,
         model_dir=model_dir,
         config=config,
@@ -156,62 +142,20 @@ class SVM(trainable.Trainable, evaluable.Evaluable):
                 weight_column_name=weight_column_name,
                 enable_centered_bias=False),
             "feature_columns": feature_columns,
-            "optimizer": self._optimizer,
+            "optimizer": optimizer,
             "weight_column_name": weight_column_name,
-            "update_weights_hook": self._chief_hook,
+            "update_weights_hook": chief_hook,
         },
         feature_engineering_fn=feature_engineering_fn)
-    if not self._estimator.config.is_chief:
-      self._chief_hook = None
-
-  @property
-  def model_dir(self):
-    """See trainable.Evaluable."""
-    return self._estimator.model_dir
-
-  def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
-          monitors=None, max_steps=None):
-    """See trainable.Trainable."""
-    if monitors is None:
-      monitors = []
-    if self._chief_hook:
-      monitors.append(self._chief_hook)
-    return self._estimator.fit(x=x, y=y, input_fn=input_fn, steps=steps,
-                               batch_size=batch_size, monitors=monitors,
-                               max_steps=max_steps)
-
-  # pylint: disable=protected-access
-  def evaluate(self,
-               x=None,
-               y=None,
-               input_fn=None,
-               feed_fn=None,
-               batch_size=None,
-               steps=None,
-               metrics=None,
-               name=None,
-               checkpoint_path=None,
-               hooks=None):
-    """See evaluable.Evaluable."""
-    return self._estimator.evaluate(
-        x=x,
-        y=y,
-        input_fn=input_fn,
-        feed_fn=feed_fn,
-        batch_size=batch_size,
-        steps=steps,
-        metrics=metrics,
-        name=name,
-        checkpoint_path=checkpoint_path,
-        hooks=hooks)
 
   @deprecated_arg_values(
       estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS,
       as_iterable=False)
-  def predict(self, x=None, input_fn=None, batch_size=None, as_iterable=True):
+  def predict_classes(self, x=None, input_fn=None, batch_size=None,
+                      as_iterable=True):
     """Runs inference to determine the predicted class."""
     key = prediction_key.PredictionKey.CLASSES
-    preds = self._estimator.predict(
+    preds = super(SVM, self).predict(
         x=x,
         input_fn=input_fn,
         batch_size=batch_size,
@@ -228,7 +172,7 @@ class SVM(trainable.Trainable, evaluable.Evaluable):
                     as_iterable=True):
     """Runs inference to determine the class probability predictions."""
     key = prediction_key.PredictionKey.PROBABILITIES
-    preds = self._estimator.predict(
+    preds = super(SVM, self).predict(
         x=x,
         input_fn=input_fn,
         batch_size=batch_size,
@@ -239,51 +183,30 @@ class SVM(trainable.Trainable, evaluable.Evaluable):
     return preds[key]
   # pylint: enable=protected-access
 
-  def get_variable_names(self):
-    return self._estimator.get_variable_names()
-
   def export(self, export_dir, signature_fn=None,
              input_fn=None, default_batch_size=1,
              exports_to_keep=None):
     """See BaseEstimator.export."""
+    return self.export_with_defaults(
+        export_dir=export_dir,
+        signature_fn=signature_fn,
+        input_fn=input_fn,
+        default_batch_size=default_batch_size,
+        exports_to_keep=exports_to_keep)
+
+  def export_with_defaults(
+      self,
+      export_dir,
+      signature_fn=None,
+      input_fn=None,
+      default_batch_size=1,
+      exports_to_keep=None):
+    """Same as BaseEstimator.export, but uses some defaults."""
     def default_input_fn(unused_estimator, examples):
       return layers.parse_feature_columns_from_examples(
           examples, self._feature_columns)
-    return self._estimator.export(export_dir=export_dir,
-                                  signature_fn=signature_fn,
-                                  input_fn=input_fn or default_input_fn,
-                                  default_batch_size=default_batch_size,
-                                  exports_to_keep=exports_to_keep)
-
-  @experimental
-  def export_savedmodel(self,
-                        export_dir_base,
-                        input_fn,
-                        default_output_alternative_key=None,
-                        assets_extra=None,
-                        as_text=False,
-                        exports_to_keep=None):
-    return self._estimator.export_savedmodel(
-        export_dir_base,
-        input_fn,
-        default_output_alternative_key=default_output_alternative_key,
-        assets_extra=assets_extra,
-        as_text=as_text,
-        exports_to_keep=exports_to_keep)
-
-  @property
-  def weights_(self):
-    values = {}
-    optimizer_regex = r".*/"+self._optimizer.get_name() + r"(_\d)?$"
-    for name in self.get_variable_names():
-      if (name.startswith("linear/") and
-          name != "linear/bias_weight" and
-          not re.match(optimizer_regex, name)):
-        values[name] = self.get_variable_value(name)
-    if len(values) == 1:
-      return values[list(values.keys())[0]]
-    return values
-
-  @property
-  def bias_(self):
-    return self.get_variable_value("linear/bias_weight")
+    return super(SVM, self).export(export_dir=export_dir,
+                                   signature_fn=signature_fn,
+                                   input_fn=input_fn or default_input_fn,
+                                   default_batch_size=default_batch_size,
+                                   exports_to_keep=exports_to_keep)
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 3bc5013540..ed0e546442 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -139,8 +139,8 @@ class Experiment(object):
     self._continuous_eval_throttle_secs = continuous_eval_throttle_secs
     self._min_eval_frequency = min_eval_frequency
     self._delay_workers_by_global_step = delay_workers_by_global_step
+    self._train_monitors = list(train_monitors or [])  # Private copy.
     # Mutable fields, using the setters.
-    self.train_monitors = train_monitors
     self.eval_hooks = eval_hooks
     self.export_strategies = export_strategies
     self.continuous_eval_predicate_fn = continuous_eval_predicate_fn
@@ -170,12 +170,9 @@ class Experiment(object):
     return self._eval_steps
 
   @property
-  def train_monitors(self):
-    return self._train_monitors
-
-  @train_monitors.setter
-  def train_monitors(self, value):
-    self._train_monitors = value or []
+  def train_hooks(self):
+    """Shallow copy of the training hooks; edits do not affect `self`."""
+    return list(self._train_monitors)
 
   @property
   def eval_hooks(self):
@@ -232,6 +229,10 @@ class Experiment(object):
       raise ValueError("`export_strategies` must be an ExportStrategy, "
                        "a list of ExportStrategies, or None.")
 
+  def extend_train_hooks(self, additional_hooks):
+    """Appends every hook in `additional_hooks` to the training hooks."""
+    self._train_monitors.extend(additional_hooks)
+
   def train(self, delay_secs=None):
     """Fit the estimator using the training data.
 
@@ -378,7 +379,8 @@ class Experiment(object):
                                                steps=self._eval_steps,
                                                metrics=self._eval_metrics,
                                                name=name,
-                                               checkpoint_path=latest_path)
+                                               checkpoint_path=latest_path,
+                                               hooks=self._eval_hooks)
         # Ensure eval result is not None for next round of evaluation.
         if not eval_result:
           eval_result = {}
@@ -454,14 +456,15 @@ class Experiment(object):
         self._train_monitors += [monitors.ValidationMonitor(
             input_fn=self._eval_input_fn, eval_steps=self._eval_steps,
             metrics=self._eval_metrics, every_n_steps=self._min_eval_frequency,
-            name=eval_dir_suffix,
+            name=eval_dir_suffix, hooks=self._eval_hooks
         )]
       self.train(delay_secs=0)
 
     eval_result = self._estimator.evaluate(input_fn=self._eval_input_fn,
                                            steps=self._eval_steps,
                                            metrics=self._eval_metrics,
-                                           name=eval_dir_suffix)
+                                           name=eval_dir_suffix,
+                                           hooks=self._eval_hooks)
     export_results = self._maybe_export(eval_result)
     return eval_result, export_results
 
diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py
index 8b43973bb8..096d334e8c 100644
--- a/tensorflow/contrib/learn/python/learn/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/experiment_test.py
@@ -42,6 +42,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
+from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import compat
 from tensorflow.python.util.all_util import reveal_undocumented
 
@@ -74,6 +75,7 @@ class TestEstimator(evaluable.Evaluable, trainable.Trainable):
     self._max_evals = max_evals
     self.export_count = 0
     self.monitors = []
+    self.eval_hooks = []
     self._config = config or run_config.RunConfig()
     self._model_dir = tempfile.mkdtemp()
 
@@ -87,6 +89,8 @@ class TestEstimator(evaluable.Evaluable, trainable.Trainable):
 
   def evaluate(self, **kwargs):
     tf_logging.info('evaluate called with args: %s' % kwargs)
+    if 'hooks' in kwargs:
+      self.eval_hooks = kwargs['hooks']
     self.eval_count += 1
     if self.eval_count > self._max_evals:
       tf_logging.info('Ran %d evals. Done.' % self.eval_count)
@@ -109,14 +113,18 @@ class TestEstimator(evaluable.Evaluable, trainable.Trainable):
       self.monitors = kwargs['monitors']
     return [(key, kwargs[key]) for key in sorted(kwargs.keys())]
 
-  def export_savedmodel(self, export_dir_base, export_input_fn, **kwargs):
+  def export_savedmodel(self, export_dir_base, serving_input_fn, **kwargs):
     tf_logging.info('export_savedmodel called with args: %s, %s, %s' %
-                    (export_dir_base, export_input_fn, kwargs))
+                    (export_dir_base, serving_input_fn, kwargs))
     self.export_count += 1
     return os.path.join(
         compat.as_bytes(export_dir_base), compat.as_bytes('bogus_timestamp'))
 
 
+class _NoopHook(session_run_hook.SessionRunHook):
+  """A `SessionRunHook` that overrides nothing, for identity checks."""
+
+
 class ExperimentTest(test.TestCase):
 
   def _cluster_spec(self):
@@ -253,52 +261,63 @@ class ExperimentTest(test.TestCase):
   def test_evaluate(self):
     est = TestEstimator()
     est.fake_checkpoint()
+    noop_hook = _NoopHook()
     ex = experiment.Experiment(
         est,
         train_input_fn='train_input',
         eval_input_fn='eval_input',
         eval_metrics='eval_metrics',
+        eval_hooks=[noop_hook],
         eval_steps='steps',
         eval_delay_secs=0)
     ex.evaluate()
-    self.assertEquals(1, est.eval_count)
     self.assertEquals(0, est.fit_count)
+    self.assertEquals(1, est.eval_count)
+    self.assertEquals([noop_hook], est.eval_hooks)
 
   def test_evaluate_delay(self):
     est = TestEstimator()
     est.fake_checkpoint()
+    noop_hook = _NoopHook()
     ex = experiment.Experiment(
-        est, train_input_fn='train_input', eval_input_fn='eval_input')
+        est, train_input_fn='train_input', eval_input_fn='eval_input',
+        eval_hooks=[noop_hook])
 
     for delay in [0, 1, 3]:
       with test.mock.patch('time.sleep', SheepCounter()) as sheep:
         ex.evaluate(delay_secs=delay)
       self.assertAlmostEqual(delay, sheep.total_time, delta=0.1)
+      self.assertEquals([noop_hook], est.eval_hooks)
 
   def test_continuous_eval(self):
     est = TestEstimator()
     est.fake_checkpoint()
+    noop_hook = _NoopHook()
     ex = experiment.Experiment(
         est,
         train_input_fn='train_input',
         eval_input_fn='eval_input',
         eval_metrics='eval_metrics',
+        eval_hooks=[noop_hook],
         eval_delay_secs=0,
         continuous_eval_throttle_secs=0)
     self.assertRaises(
         StopIteration, ex.continuous_eval, evaluate_checkpoint_only_once=False)
-    self.assertEquals(6, est.eval_count)
     self.assertEquals(0, est.fit_count)
+    self.assertEquals(6, est.eval_count)
+    self.assertEquals([noop_hook], est.eval_hooks)
 
   def test_continuous_eval_throttle_delay(self):
     for delay in [0, 1, 2]:
       est = TestEstimator()
       est.fake_checkpoint()
+      noop_hook = _NoopHook()
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
           eval_input_fn='eval_input',
           eval_metrics='eval_metrics',
+          eval_hooks=[noop_hook],
           continuous_eval_throttle_secs=delay,
           eval_delay_secs=0)
       with test.mock.patch('time.sleep', SheepCounter()) as sheep:
@@ -311,6 +330,7 @@ class ExperimentTest(test.TestCase):
   def test_continuous_eval_predicate_fn(self):
     est = TestEstimator()
     est.fake_checkpoint()
+    noop_hook = _NoopHook()
 
     def _predicate_fn(unused_eval_result):
       return est.eval_count < 3
@@ -320,20 +340,24 @@ class ExperimentTest(test.TestCase):
         train_input_fn='train_input',
         eval_input_fn='eval_input',
         eval_metrics='eval_metrics',
+        eval_hooks=[noop_hook],
         eval_delay_secs=0,
         continuous_eval_throttle_secs=0,
         continuous_eval_predicate_fn=_predicate_fn)
     ex.continuous_eval(evaluate_checkpoint_only_once=False)
-    self.assertEquals(3, est.eval_count)
     self.assertEquals(0, est.fit_count)
+    self.assertEquals(3, est.eval_count)
+    self.assertEquals([noop_hook], est.eval_hooks)
 
   def test_run_local(self):
     est = TestEstimator()
+    noop_hook = _NoopHook()
     ex = experiment.Experiment(
         est,
         train_input_fn='train_input',
         eval_input_fn='eval_input',
         eval_metrics='eval_metrics',
+        eval_hooks=[noop_hook],
         train_steps=100,
         eval_steps=100,
         local_eval_frequency=10)
@@ -341,17 +365,42 @@ class ExperimentTest(test.TestCase):
     self.assertEquals(1, est.fit_count)
     self.assertEquals(1, est.eval_count)
     self.assertEquals(1, len(est.monitors))
+    self.assertEquals([noop_hook], est.eval_hooks)
     self.assertTrue(isinstance(est.monitors[0], monitors.ValidationMonitor))
 
+  def test_train_monitors_returns_shallow_copy(self):
+    noop_hook = _NoopHook()
+    ex = experiment.Experiment(
+        TestEstimator(),
+        train_input_fn='train_input',
+        eval_input_fn='eval_input',
+        eval_metrics='eval_metrics',
+        train_monitors=[noop_hook],
+        train_steps=100,
+        eval_steps=100,
+        local_eval_frequency=10)
+    self.assertAllEqual([noop_hook], ex.train_hooks)
+
+    another_noop_hook = _NoopHook()
+    # Extending the returned copy must leave the experiment's hooks unchanged.
+    ex.train_hooks.extend([another_noop_hook])
+    self.assertAllEqual([noop_hook], ex.train_hooks)
+
+    # extend_train_hooks, by contrast, appends to the experiment's own hooks.
+    ex.extend_train_hooks([another_noop_hook])
+    self.assertAllEqual([noop_hook, another_noop_hook], ex.train_hooks)
+
   def test_train_and_evaluate(self):
     est = TestEstimator()
+    noop_hook = _NoopHook()
     export_strategy = saved_model_export_utils.make_export_strategy(
-        est, 'export_input')
+        est, 'export_input', exports_to_keep=None)
     ex = experiment.Experiment(
         est,
         train_input_fn='train_input',
         eval_input_fn='eval_input',
         eval_metrics='eval_metrics',
+        eval_hooks=[noop_hook],
         train_steps=100,
         eval_steps=100,
         export_strategies=export_strategy)
@@ -360,6 +409,7 @@ class ExperimentTest(test.TestCase):
     self.assertEquals(1, est.eval_count)
     self.assertEquals(1, est.export_count)
     self.assertEquals(1, len(est.monitors))
+    self.assertEquals([noop_hook], est.eval_hooks)
     self.assertTrue(isinstance(est.monitors[0], monitors.ValidationMonitor))
 
   @test.mock.patch.object(server_lib, 'Server')
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index d8fe2315da..ab6ea0fb02 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -618,7 +618,8 @@ class ValidationMonitor(EveryN):
 
   def __init__(self, x=None, y=None, input_fn=None, batch_size=None,
                eval_steps=None,
-               every_n_steps=100, metrics=None, early_stopping_rounds=None,
+               every_n_steps=100, metrics=None, hooks=None,
+               early_stopping_rounds=None,
                early_stopping_metric="loss",
                early_stopping_metric_minimize=True, name=None):
     """Initializes a ValidationMonitor.
@@ -632,6 +633,8 @@ class ValidationMonitor(EveryN):
       every_n_steps: Check for new checkpoints to evaluate every N steps. If a
           new checkpoint is found, it is evaluated. See `EveryN`.
       metrics: See `BaseEstimator.evaluate`.
+      hooks: A list of `SessionRunHook` hooks to pass to the
+        `Estimator`'s `evaluate` function.
       early_stopping_rounds: `int`. If the metric indicated by
           `early_stopping_metric` does not change according to
           `early_stopping_metric_minimize` for this many steps, then training
@@ -660,6 +663,7 @@ class ValidationMonitor(EveryN):
     self.batch_size = batch_size
     self.eval_steps = eval_steps
     self.metrics = metrics
+    self.hooks = hooks
     self.early_stopping_rounds = early_stopping_rounds
     self.early_stopping_metric = early_stopping_metric
     self.early_stopping_metric_minimize = early_stopping_metric_minimize
@@ -709,7 +713,8 @@ class ValidationMonitor(EveryN):
     # Run evaluation and log it.
     validation_outputs = self._estimator.evaluate(
         x=self.x, y=self.y, input_fn=self.input_fn, batch_size=self.batch_size,
-        steps=self.eval_steps, metrics=self.metrics, name=self.name)
+        steps=self.eval_steps, metrics=self.metrics, hooks=self.hooks,
+        name=self.name)
     stats = []
     for name in validation_outputs:
       stats.append("%s = %s" % (name, str(validation_outputs[name])))
diff --git a/tensorflow/contrib/learn/python/learn/utils/__init__.py b/tensorflow/contrib/learn/python/learn/utils/__init__.py
index f313699c14..74236da979 100644
--- a/tensorflow/contrib/learn/python/learn/utils/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/utils/__init__.py
@@ -20,3 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.learn.python.learn.utils.export import export_estimator
+from tensorflow.contrib.learn.python.learn.utils.input_fn_utils import build_default_serving_input_fn
+from tensorflow.contrib.learn.python.learn.utils.input_fn_utils import build_parsing_serving_input_fn
+from tensorflow.contrib.learn.python.learn.utils.saved_model_export_utils import make_export_strategy
+
diff --git a/tensorflow/contrib/learn/python/learn/utils/export_test.py b/tensorflow/contrib/learn/python/learn/utils/export_test.py
index caae60029a..ce1d73256a 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export_test.py
@@ -112,7 +112,7 @@ class ExportTest(test.TestCase):
   def testExportMonitorInputFeatureKeyMissing(self):
     random.seed(42)
 
-    def _export_input_fn():
+    def _serving_input_fn():
       return {
           _X_KEY:
               random_ops.random_uniform(
@@ -123,7 +123,7 @@ class ExportTest(test.TestCase):
     monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
         export_dir=tempfile.mkdtemp() + 'export/',
-        input_fn=_export_input_fn,
+        input_fn=_serving_input_fn,
         input_feature_key=input_feature_key,
         exports_to_keep=2,
         signature_fn=export.generic_signature_fn)
@@ -135,13 +135,13 @@ class ExportTest(test.TestCase):
     random.seed(42)
     input_feature_key = 'my_example_key'
 
-    def _export_input_fn():
+    def _serving_input_fn():
       return {input_feature_key: None}, None
 
     monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
         export_dir=tempfile.mkdtemp() + 'export/',
-        input_fn=_export_input_fn,
+        input_fn=_serving_input_fn,
         input_feature_key=input_feature_key,
         exports_to_keep=2,
         signature_fn=export.generic_signature_fn)
@@ -154,7 +154,7 @@ class ExportTest(test.TestCase):
     random.seed(42)
     input_feature_key = 'my_example_key'
 
-    def _export_input_fn():
+    def _serving_input_fn():
       return {
           input_feature_key:
               None,
@@ -166,7 +166,7 @@ class ExportTest(test.TestCase):
     monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
         export_dir=tempfile.mkdtemp() + 'export/',
-        input_fn=_export_input_fn,
+        input_fn=_serving_input_fn,
         input_feature_key=input_feature_key,
         exports_to_keep=2,
         signature_fn=export.generic_signature_fn)
@@ -178,7 +178,7 @@ class ExportTest(test.TestCase):
     random.seed(42)
     input_feature_key = 'my_example_key'
 
-    def _export_input_fn():
+    def _serving_input_fn():
       return {
           input_feature_key:
               array_ops.placeholder(
@@ -188,7 +188,7 @@ class ExportTest(test.TestCase):
     monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
         export_dir=tempfile.mkdtemp() + 'export/',
-        input_fn=_export_input_fn,
+        input_fn=_serving_input_fn,
         input_feature_key=input_feature_key,
         exports_to_keep=2,
         signature_fn=export.generic_signature_fn)
@@ -200,7 +200,7 @@ class ExportTest(test.TestCase):
     random.seed(42)
     input_feature_key = 'my_example_key'
 
-    def _export_input_fn():
+    def _serving_input_fn():
       return {
           input_feature_key:
               array_ops.placeholder(
@@ -214,7 +214,7 @@ class ExportTest(test.TestCase):
     monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
         export_dir=export_dir,
-        input_fn=_export_input_fn,
+        input_fn=_serving_input_fn,
         input_feature_key=input_feature_key,
         exports_to_keep=2,
         signature_fn=export.generic_signature_fn)
diff --git a/tensorflow/contrib/learn/python/learn/utils/input_fn_utils.py b/tensorflow/contrib/learn/python/learn/utils/input_fn_utils.py
index 18bfdc61c6..1a51971619 100644
--- a/tensorflow/contrib/learn/python/learn/utils/input_fn_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/input_fn_utils.py
@@ -41,7 +41,7 @@ InputFnOps = collections.namedtuple('InputFnOps',
                                      'default_inputs'])
 
 
-def build_parsing_serving_input_fn(feature_spec, default_batch_size=1):
+def build_parsing_serving_input_fn(feature_spec, default_batch_size=None):
   """Build an input_fn appropriate for serving, expecting fed tf.Examples.
 
   Creates an input_fn that expects a serialized tf.Example fed into a string
@@ -52,6 +52,7 @@ def build_parsing_serving_input_fn(feature_spec, default_batch_size=1):
   Args:
     feature_spec: a dict of string to `VarLenFeature`/`FixedLenFeature`.
     default_batch_size: the number of query examples expected per batch.
+        Leave unset for variable batch size (recommended).
 
   Returns:
     An input_fn suitable for use in serving.
@@ -68,7 +69,7 @@ def build_parsing_serving_input_fn(feature_spec, default_batch_size=1):
   return input_fn
 
 
-def build_default_serving_input_fn(features, default_batch_size=1):
+def build_default_serving_input_fn(features, default_batch_size=None):
   """Build an input_fn appropriate for serving, expecting feature Tensors.
 
   Creates an input_fn that expects all features to be fed directly.
@@ -78,6 +79,7 @@ def build_default_serving_input_fn(features, default_batch_size=1):
   Args:
     features: a dict of string to `Tensor`.
     default_batch_size: the number of query examples expected per batch.
+        Leave unset for variable batch size (recommended).
 
   Returns:
     An input_fn suitable for use in serving.
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
index 9e452d0905..8d53b01511 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import re
 import time
 
 from tensorflow.contrib.learn.python.learn import export_strategy
@@ -208,7 +207,7 @@ def get_timestamped_export_dir(export_dir_base):
   Each export is written into a new subdirectory named using the
   current time.  This guarantees monotonically increasing version
   numbers even across multiple runs of the pipeline.
-  The timestamp used is the number of milliseconds since epoch UTC.
+  The timestamp used is the number of seconds since epoch UTC.
 
   Args:
     export_dir_base: A string containing a directory to write the exported
@@ -216,7 +215,7 @@ def get_timestamped_export_dir(export_dir_base):
   Returns:
     The full path of the new subdirectory (which is not actually created yet).
   """
-  export_timestamp = int(time.time() * 1e3)
+  export_timestamp = int(time.time())
 
   export_dir = os.path.join(
       compat.as_bytes(export_dir_base),
@@ -241,37 +240,63 @@ def garbage_collect_exports(export_dir_base, exports_to_keep):
   keep_filter = gc.largest_export_versions(exports_to_keep)
   delete_filter = gc.negation(keep_filter)
 
-  # Export dir must not end with / or it will break the re match below.
-  if export_dir_base.endswith('/'):
-    export_dir_base = export_dir_base[:-1]
-
   # create a simple parser that pulls the export_version from the directory.
   def parser(path):
-    match = re.match('^' + export_dir_base + '/(\\d{13})$', path.path)
-    if not match:
+    filename = os.path.basename(path.path)
+    if not (len(filename) == 10 and filename.isdigit()):
       return None
-    return path._replace(export_version=int(match.group(1)))
+    return path._replace(export_version=int(filename))
 
   for p in delete_filter(gc.get_paths(export_dir_base, parser=parser)):
     gfile.DeleteRecursively(p.path)
 
 
-def make_export_strategy(export_input_fn,
+def make_export_strategy(serving_input_fn,
                          default_output_alternative_key='default',
                          assets_extra=None,
                          as_text=False,
-                         exports_to_keep=None):
-  """Create an ExportStrategy for use with Experiment."""
+                         exports_to_keep=5):
+  """Create an ExportStrategy for use with Experiment.
+
+  Args:
+    serving_input_fn: A function that takes no arguments and returns an
+      `InputFnOps`.
+    default_output_alternative_key: the name of the head to serve when an
+      incoming serving request does not explicitly request a specific head.
+      Not needed for single-headed models.
+    assets_extra: A dict specifying how to populate the assets.extra directory
+      within the exported SavedModel.  Each key should give the destination
+      path (including the filename) relative to the assets.extra directory.
+      The corresponding value gives the full path of the source file to be
+      copied.  For example, the simple case of copying a single file without
+      renaming it is specified as
+      `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
+    as_text: whether to write the SavedModel proto in text format.
+    exports_to_keep: Number of exports to keep.  Older exports will be
+      garbage-collected.  Defaults to 5.  Set to None to disable garbage
+      collection.
+
+  Returns:
+    an ExportStrategy that can be passed to the Experiment constructor.
+  """
 
   def export_fn(estimator, export_dir_base):
-    """Exports the given Estimator as a SavedModel."""
+    """Exports the given Estimator as a SavedModel.
+
+    Args:
+      estimator: the Estimator to export.
+      export_dir_base: A string containing a directory to write the exported
+        graph and checkpoints.
+
+    Returns:
+      The string path to the exported directory.
+    """
     export_result = estimator.export_savedmodel(
         export_dir_base,
-        export_input_fn,
+        serving_input_fn,
         default_output_alternative_key=default_output_alternative_key,
         assets_extra=assets_extra,
-        as_text=as_text,
-        exports_to_keep=exports_to_keep)
+        as_text=as_text)
 
     garbage_collect_exports(export_dir_base, exports_to_keep)
     return export_result
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
index 955e14ae44..e22f11943b 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
@@ -240,21 +240,21 @@ class SavedModelExportUtilsTest(test.TestCase):
     export_dir_base = tempfile.mkdtemp() + "export/"
     export_dir_1 = saved_model_export_utils.get_timestamped_export_dir(
         export_dir_base)
-    time.sleep(0.001)
+    time.sleep(1)
     export_dir_2 = saved_model_export_utils.get_timestamped_export_dir(
         export_dir_base)
-    time.sleep(0.001)
+    time.sleep(1)
     export_dir_3 = saved_model_export_utils.get_timestamped_export_dir(
         export_dir_base)
 
-    # Export directories should be named using a timestamp that is milliseconds
-    # since epoch.  Such a timestamp is 13 digits long.
+    # Export directories should be named using a timestamp that is seconds
+    # since epoch.  Such a timestamp is 10 digits long.
     time_1 = os.path.basename(export_dir_1)
-    self.assertEqual(13, len(time_1))
+    self.assertEqual(10, len(time_1))
     time_2 = os.path.basename(export_dir_2)
-    self.assertEqual(13, len(time_2))
+    self.assertEqual(10, len(time_2))
     time_3 = os.path.basename(export_dir_3)
-    self.assertEqual(13, len(time_3))
+    self.assertEqual(10, len(time_3))
 
     self.assertTrue(int(time_1) < int(time_2))
     self.assertTrue(int(time_2) < int(time_3))
@@ -283,10 +283,10 @@ class SavedModelExportUtilsTest(test.TestCase):
 
   def test_make_export_strategy(self):
     """Only tests that an ExportStrategy instance is created."""
-    def _export_input_fn():
+    def _serving_input_fn():
       return array_ops.constant([1]), None
     export_strategy = saved_model_export_utils.make_export_strategy(
-        export_input_fn=_export_input_fn,
+        serving_input_fn=_serving_input_fn,
         default_output_alternative_key="default",
         assets_extra={"from/path": "to/path"},
         as_text=False,
@@ -299,7 +299,7 @@ def _create_test_export_dir(export_dir_base):
   export_dir = saved_model_export_utils.get_timestamped_export_dir(
       export_dir_base)
   gfile.MkDir(export_dir)
-  time.sleep(0.001)
+  time.sleep(1)
   return export_dir
 
 
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py
index 2f60554104..6309d36258 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py
@@ -200,16 +200,16 @@ class NonSquareLinearOperatorCompositionTest(
     operator = linalg.LinearOperatorComposition(operators)
     self.assertAllEqual((2, 3, 5), operator.shape)
 
-  def test_dynamic_shapes_when_statically_available(self):
+  def test_shape_tensors_when_statically_available(self):
     operators = [
         linalg.LinearOperatorMatrix(rng.rand(2, 3, 4)),
         linalg.LinearOperatorMatrix(rng.rand(2, 4, 5))
     ]
     operator = linalg.LinearOperatorComposition(operators)
     with self.test_session():
-      self.assertAllEqual((2, 3, 5), operator.shape_dynamic().eval())
+      self.assertAllEqual((2, 3, 5), operator.shape_tensor().eval())
 
-  def test_dynamic_shapes_when_only_dynamically_available(self):
+  def test_shape_tensors_when_only_dynamically_available(self):
     mat_1 = rng.rand(1, 2, 3, 4)
     mat_2 = rng.rand(1, 2, 4, 5)
     mat_ph_1 = array_ops.placeholder(dtypes.float64)
@@ -223,7 +223,7 @@ class NonSquareLinearOperatorCompositionTest(
     operator = linalg.LinearOperatorComposition(operators)
     with self.test_session():
       self.assertAllEqual(
-          (1, 2, 3, 5), operator.shape_dynamic().eval(feed_dict=feed_dict))
+          (1, 2, 3, 5), operator.shape_tensor().eval(feed_dict=feed_dict))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py
index 8f77c5e6e3..c099194eed 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py
@@ -31,7 +31,7 @@ rng = np.random.RandomState(123)
 
 
 class LinearOperatorShape(linalg.LinearOperator):
-  """LinearOperator that implements the methods ._shape and _shape_dynamic."""
+  """LinearOperator that implements the methods ._shape and _shape_tensor."""
 
   def __init__(self,
                shape,
@@ -49,7 +49,7 @@ class LinearOperatorShape(linalg.LinearOperator):
   def _shape(self):
     return tensor_shape.TensorShape(self._stored_shape)
 
-  def _shape_dynamic(self):
+  def _shape_tensor(self):
     return constant_op.constant(self._stored_shape, dtype=dtypes.int32)
 
 
@@ -71,7 +71,7 @@ class LinearOperatorApplyOnly(linalg.LinearOperator):
   def _shape(self):
     return self._matrix.get_shape()
 
-  def _shape_dynamic(self):
+  def _shape_tensor(self):
     return array_ops.shape(self._matrix)
 
   def _apply(self, x, adjoint=False):
@@ -96,11 +96,11 @@ class LinearOperatorTest(test.TestCase):
       shape = (1, 2, 3, 4)
       operator = LinearOperatorShape(shape)
 
-      self.assertAllEqual(shape, operator.shape_dynamic().eval())
-      self.assertAllEqual(4, operator.tensor_rank_dynamic().eval())
-      self.assertAllEqual((1, 2), operator.batch_shape_dynamic().eval())
-      self.assertAllEqual(4, operator.domain_dimension_dynamic().eval())
-      self.assertAllEqual(3, operator.range_dimension_dynamic().eval())
+      self.assertAllEqual(shape, operator.shape_tensor().eval())
+      self.assertAllEqual(4, operator.tensor_rank_tensor().eval())
+      self.assertAllEqual((1, 2), operator.batch_shape_tensor().eval())
+      self.assertAllEqual(4, operator.domain_dimension_tensor().eval())
+      self.assertAllEqual(3, operator.range_dimension_tensor().eval())
 
   def test_is_x_properties(self):
     operator = LinearOperatorShape(
@@ -120,7 +120,7 @@ class LinearOperatorTest(test.TestCase):
       self.assertAllEqual((2, 3, 4), operator_dense.get_shape())
       self.assertAllClose(matrix, operator_dense.eval())
 
-  def test_generic_to_dense_method_non_square_matrix_dynamic(self):
+  def test_generic_to_dense_method_non_square_matrix_tensor(self):
     matrix = rng.randn(2, 3, 4)
     matrix_ph = array_ops.placeholder(dtypes.float64)
     operator = LinearOperatorApplyOnly(matrix_ph)
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py
index 4eac01092f..bf6f8f8302 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py
@@ -96,7 +96,7 @@ class DomainDimensionStubOperator(object):
   def __init__(self, domain_dimension):
     self._domain_dimension = ops.convert_to_tensor(domain_dimension)
 
-  def domain_dimension_dynamic(self):
+  def domain_dimension_tensor(self):
     return self._domain_dimension
 
 
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator.py b/tensorflow/contrib/linalg/python/ops/linear_operator.py
index e229820edc..2467603605 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator.py
@@ -180,13 +180,15 @@ class LinearOperator(object):
     self._is_positive_definite = is_positive_definite
     self._name = name or type(self).__name__
 
-    # We will cache some values to avoid repeatedly adding shape
-    # manipulation ops to the graph.  Cleaner.
-    self._cached_shape_dynamic = None
-    self._cached_batch_shape_dynamic = None
-    self._cached_domain_dimension_dynamic = None
-    self._cached_range_dimension_dynamic = None
-    self._cached_tensor_rank_dynamic = None
+    # We will cache some tensors to avoid repeatedly adding shape
+    # manipulation ops to the graph.
+    # Naming convention:
+    #   self._cached_X_tensor is the cached version of self._X_tensor.
+    self._cached_shape_tensor = None
+    self._cached_batch_shape_tensor = None
+    self._cached_domain_dimension_tensor = None
+    self._cached_range_dimension_tensor = None
+    self._cached_tensor_rank_tensor = None
 
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
@@ -240,10 +242,10 @@ class LinearOperator(object):
     """
     return self._shape()
 
-  def _shape_dynamic(self):
-    raise NotImplementedError("_shape_dynamic is not implemented.")
+  def _shape_tensor(self):
+    raise NotImplementedError("_shape_tensor is not implemented.")
 
-  def shape_dynamic(self, name="shape_dynamic"):
+  def shape_tensor(self, name="shape_tensor"):
     """Shape of this `LinearOperator`, determined at runtime.
 
     If this operator acts like the batch matrix `A` with
@@ -258,14 +260,14 @@ class LinearOperator(object):
     """
     with self._name_scope(name):
       # Be clean by avoiding adding shape Ops to the graph too many times.
-      if self._cached_shape_dynamic is None:
+      if self._cached_shape_tensor is None:
         # Prefer to use statically defined shape if available.
         if self.shape.is_fully_defined():
-          self._cached_shape_dynamic = linear_operator_util.shape_tensor(
+          self._cached_shape_tensor = linear_operator_util.shape_tensor(
               self.shape.as_list())
         else:
-          self._cached_shape_dynamic = self._shape_dynamic()
-      return self._cached_shape_dynamic
+          self._cached_shape_tensor = self._shape_tensor()
+      return self._cached_shape_tensor
 
   @property
   def batch_shape(self):
@@ -281,7 +283,7 @@ class LinearOperator(object):
     # Derived classes get this "for free" once .shape is implemented.
     return self.shape[:-2]
 
-  def batch_shape_dynamic(self, name="batch_shape_dynamic"):
+  def batch_shape_tensor(self, name="batch_shape_tensor"):
     """Shape of batch dimensions of this operator, determined at runtime.
 
     If this operator acts like the batch matrix `A` with
@@ -296,14 +298,14 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_batch_shape_dynamic is None:
+      if self._cached_batch_shape_tensor is None:
         # Prefer to use statically defined shape if available.
         if self.batch_shape.is_fully_defined():
-          self._cached_batch_shape_dynamic = linear_operator_util.shape_tensor(
+          self._cached_batch_shape_tensor = linear_operator_util.shape_tensor(
               self.batch_shape.as_list(), name="batch_shape")
         else:
-          self._cached_batch_shape_dynamic = self.shape_dynamic()[:-2]
-      return self._cached_batch_shape_dynamic
+          self._cached_batch_shape_tensor = self.shape_tensor()[:-2]
+      return self._cached_batch_shape_tensor
 
   @property
   def tensor_rank(self, name="tensor_rank"):
@@ -322,7 +324,7 @@ class LinearOperator(object):
     with self._name_scope(name):
       return self.shape.ndims
 
-  def tensor_rank_dynamic(self, name="tensor_rank_dynamic"):
+  def tensor_rank_tensor(self, name="tensor_rank_tensor"):
     """Rank (in the sense of tensors) of matrix corresponding to this operator.
 
     If this operator acts like the batch matrix `A` with
@@ -336,15 +338,15 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_tensor_rank_dynamic is None:
+      if self._cached_tensor_rank_tensor is None:
         # Prefer to use statically defined shape if available.
         if self.tensor_rank is not None:
-          self._cached_tensor_rank_dynamic = ops.convert_to_tensor(
+          self._cached_tensor_rank_tensor = ops.convert_to_tensor(
               self.tensor_rank)
         else:
-          self._cached_tensor_rank_dynamic = array_ops.size(
-              self.shape_dynamic())
-      return self._cached_tensor_rank_dynamic
+          self._cached_tensor_rank_tensor = array_ops.size(
+              self.shape_tensor())
+      return self._cached_tensor_rank_tensor
 
   @property
   def domain_dimension(self):
@@ -359,7 +361,7 @@ class LinearOperator(object):
     # Derived classes get this "for free" once .shape is implemented.
     return self.shape[-1]
 
-  def domain_dimension_dynamic(self, name="domain_dimension_dynamic"):
+  def domain_dimension_tensor(self, name="domain_dimension_tensor"):
     """Dimension (in the sense of vector spaces) of the domain of this operator.
 
     Determined at runtime.
@@ -375,14 +377,14 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_domain_dimension_dynamic is None:
+      if self._cached_domain_dimension_tensor is None:
         # Prefer to use statically defined shape if available.
         if self.domain_dimension.value is not None:
-          self._cached_domain_dimension_dynamic = ops.convert_to_tensor(
+          self._cached_domain_dimension_tensor = ops.convert_to_tensor(
               self.domain_dimension.value)
         else:
-          self._cached_domain_dimension_dynamic = self.shape_dynamic()[-1]
-      return self._cached_domain_dimension_dynamic
+          self._cached_domain_dimension_tensor = self.shape_tensor()[-1]
+      return self._cached_domain_dimension_tensor
 
   @property
   def range_dimension(self):
@@ -397,7 +399,7 @@ class LinearOperator(object):
     # Derived classes get this "for free" once .shape is implemented.
     return self.shape[-2]
 
-  def range_dimension_dynamic(self, name="range_dimension_dynamic"):
+  def range_dimension_tensor(self, name="range_dimension_tensor"):
     """Dimension (in the sense of vector spaces) of the range of this operator.
 
     Determined at runtime.
@@ -413,14 +415,14 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_range_dimension_dynamic is None:
+      if self._cached_range_dimension_tensor is None:
         # Prefer to use statically defined shape if available.
         if self.range_dimension.value is not None:
-          self._cached_range_dimension_dynamic = ops.convert_to_tensor(
+          self._cached_range_dimension_tensor = ops.convert_to_tensor(
               self.range_dimension.value)
         else:
-          self._cached_range_dimension_dynamic = self.shape_dynamic()[-2]
-      return self._cached_range_dimension_dynamic
+          self._cached_range_dimension_tensor = self.shape_tensor()[-2]
+      return self._cached_range_dimension_tensor
 
   def _assert_non_singular(self):
     raise NotImplementedError("assert_non_singular is not implemented.")
@@ -574,12 +576,12 @@ class LinearOperator(object):
     if self.batch_shape.is_fully_defined():
       batch_shape = self.batch_shape
     else:
-      batch_shape = self.batch_shape_dynamic()
+      batch_shape = self.batch_shape_tensor()
 
     if self.domain_dimension.value is not None:
       n = self.domain_dimension.value
     else:
-      n = self.domain_dimension_dynamic()
+      n = self.domain_dimension_tensor()
 
     eye = linalg_ops.eye(num_rows=n, batch_shape=batch_shape, dtype=self.dtype)
     return self.apply(eye)
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py b/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
index 3e118ebbd4..81e7735841 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
@@ -202,7 +202,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
 
     return batch_shape.concatenate(matrix_shape)
 
-  def _shape_dynamic(self):
+  def _shape_tensor(self):
     # Avoid messy broadcasting if possible.
     if self.shape.is_fully_defined():
       return ops.convert_to_tensor(
@@ -212,14 +212,14 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
     # the graph.  Things will fail at runtime naturally if shapes are
     # incompatible.
     matrix_shape = array_ops.stack([
-        self.operators[0].range_dimension_dynamic(),
-        self.operators[-1].domain_dimension_dynamic()
+        self.operators[0].range_dimension_tensor(),
+        self.operators[-1].domain_dimension_tensor()
     ])
 
     # Dummy Tensor of zeros.  Will never be materialized.
-    zeros = array_ops.zeros(shape=self.operators[0].batch_shape_dynamic())
+    zeros = array_ops.zeros(shape=self.operators[0].batch_shape_tensor())
     for operator in self.operators[1:]:
-      zeros += array_ops.zeros(shape=operator.batch_shape_dynamic())
+      zeros += array_ops.zeros(shape=operator.batch_shape_tensor())
     batch_shape = array_ops.shape(zeros)
 
     return array_ops.concat((batch_shape, matrix_shape), 0)
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
index d59e8be767..4700e65518 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
@@ -166,7 +166,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
     d_shape = self._diag.get_shape()
     return d_shape.concatenate(d_shape[-1:])
 
-  def _shape_dynamic(self):
+  def _shape_tensor(self):
     d_shape = array_ops.shape(self._diag)
     k = d_shape[-1]
     return array_ops.concat((d_shape, [k]), 0)
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py b/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
index 3304698ec6..6559f8b116 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
@@ -261,7 +261,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
     batch_shape = tensor_shape.TensorShape(self._batch_shape_static)
     return batch_shape.concatenate(matrix_shape)
 
-  def _shape_dynamic(self):
+  def _shape_tensor(self):
     matrix_shape = array_ops.stack(
         (self._num_rows, self._num_rows), axis=0)
     if self._batch_shape_arg is None:
@@ -307,7 +307,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
     # Dynamic broadcast:
     #   Always add to an array of zeros, rather than using a "cond", since a
     #   cond would require copying data from GPU --> CPU.
-    special_shape = array_ops.concat((self.batch_shape_dynamic(), [1, 1]), 0)
+    special_shape = array_ops.concat((self.batch_shape_tensor(), [1, 1]), 0)
     zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
     return x + zeros
 
@@ -320,10 +320,10 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
     return self._possibly_broadcast_batch_shape(x)
 
   def _determinant(self):
-    return array_ops.ones(shape=self.batch_shape_dynamic(), dtype=self.dtype)
+    return array_ops.ones(shape=self.batch_shape_tensor(), dtype=self.dtype)
 
   def _log_abs_determinant(self):
-    return array_ops.zeros(shape=self.batch_shape_dynamic(), dtype=self.dtype)
+    return array_ops.zeros(shape=self.batch_shape_tensor(), dtype=self.dtype)
 
   def _solve(self, rhs, adjoint=False):
     return self._apply(rhs)
@@ -566,7 +566,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
     batch_shape = self.multiplier.get_shape()
     return batch_shape.concatenate(matrix_shape)
 
-  def _shape_dynamic(self):
+  def _shape_tensor(self):
     matrix_shape = array_ops.stack(
         (self._num_rows, self._num_rows), axis=0)
 
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_matrix.py b/tensorflow/contrib/linalg/python/ops/linear_operator_matrix.py
index 7ca18450d1..3b5dc7c481 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_matrix.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_matrix.py
@@ -157,7 +157,7 @@ class LinearOperatorMatrix(linear_operator.LinearOperator):
   def _shape(self):
     return self._matrix.get_shape()
 
-  def _shape_dynamic(self):
+  def _shape_tensor(self):
     return array_ops.shape(self._matrix)
 
   def _apply(self, x, adjoint=False):
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
index 5de9bb5d77..85cd7fcd9a 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
@@ -174,6 +174,29 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 feed_dict=feed_dict)
             self.assertAC(op_det_v, mat_det_v)
 
+  def test_log_abs_det(self):
+    self._maybe_skip("log_abs_det")
+    for use_placeholder in False, True:
+      for shape in self._shapes_to_test:
+        for dtype in self._dtypes_to_test:
+          if dtype.is_complex:
+            self.skipTest(
+                "tf.matrix_determinant does not work with complex, so this "
+                "test is being skipped.")
+          with self.test_session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
+                shape, dtype, use_placeholder=use_placeholder)
+            op_log_abs_det = operator.log_abs_determinant()
+            mat_log_abs_det = math_ops.log(
+                math_ops.abs(linalg_ops.matrix_determinant(mat)))
+            if not use_placeholder:
+              self.assertAllEqual(shape[:-2], op_log_abs_det.get_shape())
+            op_log_abs_det_v, mat_log_abs_det_v = sess.run(
+                [op_log_abs_det, mat_log_abs_det],
+                feed_dict=feed_dict)
+            self.assertAC(op_log_abs_det_v, mat_log_abs_det_v)
+
   def test_apply(self):
     self._maybe_skip("apply")
     for use_placeholder in False, True:
@@ -262,8 +285,8 @@ class SquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
       n = operator.domain_dimension.value
       x_shape = batch_shape + [n, r]
     else:
-      batch_shape = operator.batch_shape_dynamic()
-      n = operator.domain_dimension_dynamic()
+      batch_shape = operator.batch_shape_tensor()
+      n = operator.domain_dimension_tensor()
       x_shape = array_ops.concat((batch_shape, [n, r]), 0)
 
     return random_normal(x_shape, dtype=operator.dtype)
@@ -291,7 +314,7 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   @property
   def _tests_to_skip(self):
     """List of test names to skip."""
-    return ["solve", "det"]
+    return ["solve", "det", "log_abs_det"]
 
   @property
   def _shapes_to_test(self):
@@ -316,11 +339,11 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
         n = operator.domain_dimension.value
       x_shape = batch_shape + [n, r]
     else:
-      batch_shape = operator.batch_shape_dynamic()
+      batch_shape = operator.batch_shape_tensor()
       if adjoint:
-        n = operator.range_dimension_dynamic()
+        n = operator.range_dimension_tensor()
       else:
-        n = operator.domain_dimension_dynamic()
+        n = operator.domain_dimension_tensor()
       x_shape = array_ops.concat((batch_shape, [n, r]), 0)
 
     return random_normal(x_shape, dtype=operator.dtype)
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
index 7c5b9b6b54..2b1fb4c04c 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
@@ -157,7 +157,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
   def _shape(self):
     return self._tril.get_shape()
 
-  def _shape_dynamic(self):
+  def _shape_tensor(self):
     return array_ops.shape(self._tril)
 
   def _assert_non_singular(self):
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_util.py
index 44092f0c06..6e56fac2e3 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_util.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_util.py
@@ -83,10 +83,10 @@ def assert_compatible_matrix_dimensions(operator, x):
   Returns:
     `Assert` `Op`.
   """
-  # Static checks are done in the base class.  Only dynamic asserts here.
+  # Static checks are done in the base class.  Only tensor asserts here.
   assert_same_dd = check_ops.assert_equal(
       array_ops.shape(x)[-2],
-      operator.domain_dimension_dynamic(),
+      operator.domain_dimension_tensor(),
       message=(
           "Incompatible matrix dimensions.  "
           "shape[-2] of argument to be the same as this operator"))
diff --git a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
index 3bad4c42a9..986150cb3f 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
@@ -44,6 +44,7 @@ CXXFLAGS += -DTENSORFLOW_DISABLE_META
 CXXFLAGS += -D__ANDROID_TYPES_FULL__
 
 GRAPH_EXECUTION_SRCS := \
+tensorflow/core/kernels/hexagon/graph_transfer_utils.cc \
 tensorflow/core/kernels/hexagon/graph_transferer.cc \
 tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc \
 tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc \
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index c3f59dd84c..96acead47f 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -28,6 +28,7 @@ tensorflow/core/kernels/split_op.cc
 tensorflow/core/kernels/split_v_op.cc
 tensorflow/core/kernels/split_lib_cpu.cc
 tensorflow/core/kernels/sparse_to_dense_op.cc
+tensorflow/core/kernels/sparse_matmul_op.cc
 tensorflow/core/kernels/softsign_op.cc
 tensorflow/core/kernels/softplus_op.cc
 tensorflow/core/kernels/softmax_op.cc
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 6ceeacbc72..7ac337732a 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -167,10 +167,10 @@ def streaming_true_positives(predictions, labels, weights=None,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    predictions: The predicted values, a `bool` `Tensor` of arbitrary
-      dimensions.
-    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
-      match `predictions`.
+    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
+      be cast to `bool`.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions
       must be either `1`, or the same as the corresponding `labels`
@@ -206,10 +206,10 @@ def streaming_true_negatives(predictions, labels, weights=None,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    predictions: The predicted values, a `bool` `Tensor` of arbitrary
-      dimensions.
-    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
-      match `predictions`.
+    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
+      be cast to `bool`.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions
       must be either `1`, or the same as the corresponding `labels`
@@ -233,11 +233,11 @@ def streaming_true_negatives(predictions, labels, weights=None,
   with variable_scope.variable_scope(
       name, 'true_negatives', (predictions, labels, weights)):
 
-    predictions = ops.convert_to_tensor(predictions)
-    labels = ops.convert_to_tensor(labels)
+    predictions = math_ops.cast(predictions, dtype=dtypes.bool)
+    labels = math_ops.cast(labels, dtype=dtypes.bool)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    is_true_negative = math_ops.logical_and(math_ops.equal(labels, 0),
-                                            math_ops.equal(predictions, 0))
+    is_true_negative = math_ops.logical_and(math_ops.equal(labels, False),
+                                            math_ops.equal(predictions, False))
     return _count_condition(is_true_negative, weights, metrics_collections,
                             updates_collections)
 
@@ -251,10 +251,10 @@ def streaming_false_positives(predictions, labels, weights=None,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    predictions: The predicted values, a `bool` `Tensor` of arbitrary
-      dimensions.
-    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
-      match `predictions`.
+    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
+      be cast to `bool`.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions
       must be either `1`, or the same as the corresponding `labels`
@@ -290,10 +290,10 @@ def streaming_false_negatives(predictions, labels, weights=None,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    predictions: The predicted values, a `bool` `Tensor` of arbitrary
-      dimensions.
-    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
-      match `predictions`.
+    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
+      be cast to `bool`.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions
       must be either `1`, or the same as the corresponding `labels`
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 3e2e408e6f..4fb244e3d4 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -663,35 +663,41 @@ class StreamingTruePositivesTest(test.TestCase):
     _assert_local_variables(self, ('true_positives/count:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((1, 0, 1, 0),
-                                        (0, 1, 1, 1),
-                                        (0, 0, 0, 0)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
-    tp, tp_update_op = metrics.streaming_true_positives(predictions, labels)
+    for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions = math_ops.cast(constant_op.constant(
+          ((1, 0, 1, 0),
+           (0, 1, 1, 1),
+           (0, 0, 0, 0))), dtype=dtype)
+      labels = math_ops.cast(constant_op.constant(
+          ((0, 1, 1, 0),
+           (1, 0, 0, 0),
+           (0, 0, 0, 0))), dtype=dtype)
+      tp, tp_update_op = metrics.streaming_true_positives(predictions, labels)
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, tp.eval())
-      self.assertEqual(1, tp_update_op.eval())
-      self.assertEqual(1, tp.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertEqual(0, tp.eval())
+        self.assertEqual(1, tp_update_op.eval())
+        self.assertEqual(1, tp.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((1, 0, 1, 0),
-                                        (0, 1, 1, 1),
-                                        (0, 0, 0, 0)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
-    tp, tp_update_op = metrics.streaming_true_positives(
-        predictions, labels, weights=37.0)
+    for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions = math_ops.cast(constant_op.constant(
+          ((1, 0, 1, 0),
+           (0, 1, 1, 1),
+           (0, 0, 0, 0))), dtype=dtype)
+      labels = math_ops.cast(constant_op.constant(
+          ((0, 1, 1, 0),
+           (1, 0, 0, 0),
+           (0, 0, 0, 0))), dtype=dtype)
+      tp, tp_update_op = metrics.streaming_true_positives(
+          predictions, labels, weights=37.0)
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, tp.eval())
-      self.assertEqual(37.0, tp_update_op.eval())
-      self.assertEqual(37.0, tp.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertEqual(0, tp.eval())
+        self.assertEqual(37.0, tp_update_op.eval())
+        self.assertEqual(37.0, tp.eval())
 
 
 class StreamingFalseNegativesTest(test.TestCase):
@@ -706,35 +712,41 @@ class StreamingFalseNegativesTest(test.TestCase):
     _assert_local_variables(self, ('false_negatives/count:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((1, 0, 1, 0),
-                                        (0, 1, 1, 1),
-                                        (0, 0, 0, 0)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
-    fn, fn_update_op = metrics.streaming_false_negatives(predictions, labels)
+    for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions = math_ops.cast(constant_op.constant(
+          ((1, 0, 1, 0),
+           (0, 1, 1, 1),
+           (0, 0, 0, 0))), dtype=dtype)
+      labels = math_ops.cast(constant_op.constant(
+          ((0, 1, 1, 0),
+           (1, 0, 0, 0),
+           (0, 0, 0, 0))), dtype=dtype)
+      fn, fn_update_op = metrics.streaming_false_negatives(predictions, labels)
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, fn.eval())
-      self.assertEqual(2, fn_update_op.eval())
-      self.assertEqual(2, fn.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertEqual(0, fn.eval())
+        self.assertEqual(2, fn_update_op.eval())
+        self.assertEqual(2, fn.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((1, 0, 1, 0),
-                                        (0, 1, 1, 1),
-                                        (0, 0, 0, 0)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
-    fn, fn_update_op = metrics.streaming_false_negatives(
-        predictions, labels, weights=((3.0,), (5.0,), (7.0,)))
+    for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions = math_ops.cast(constant_op.constant(
+          ((1, 0, 1, 0),
+           (0, 1, 1, 1),
+           (0, 0, 0, 0))), dtype=dtype)
+      labels = math_ops.cast(constant_op.constant(
+          ((0, 1, 1, 0),
+           (1, 0, 0, 0),
+           (0, 0, 0, 0))), dtype=dtype)
+      fn, fn_update_op = metrics.streaming_false_negatives(
+          predictions, labels, weights=((3.0,), (5.0,), (7.0,)))
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, fn.eval())
-      self.assertEqual(8.0, fn_update_op.eval())
-      self.assertEqual(8.0, fn.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertEqual(0, fn.eval())
+        self.assertEqual(8.0, fn_update_op.eval())
+        self.assertEqual(8.0, fn.eval())
 
 
 class StreamingFalsePositivesTest(test.TestCase):
@@ -749,39 +761,45 @@ class StreamingFalsePositivesTest(test.TestCase):
     _assert_local_variables(self, ('false_positives/count:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((1, 0, 1, 0),
-                                        (0, 1, 1, 1),
-                                        (0, 0, 0, 0)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
-    fp, fp_update_op = metrics.streaming_false_positives(predictions, labels)
+    for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions = math_ops.cast(constant_op.constant(
+          ((1, 0, 1, 0),
+           (0, 1, 1, 1),
+           (0, 0, 0, 0))), dtype=dtype)
+      labels = math_ops.cast(constant_op.constant(
+          ((0, 1, 1, 0),
+           (1, 0, 0, 0),
+           (0, 0, 0, 0))), dtype=dtype)
+      fp, fp_update_op = metrics.streaming_false_positives(predictions, labels)
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, fp.eval())
-      self.assertEqual(4, fp_update_op.eval())
-      self.assertEqual(4, fp.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertEqual(0, fp.eval())
+        self.assertEqual(4, fp_update_op.eval())
+        self.assertEqual(4, fp.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((1, 0, 1, 0),
-                                        (0, 1, 1, 1),
-                                        (0, 0, 0, 0)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
-    fp, fp_update_op = metrics.streaming_false_positives(
-        predictions,
-        labels,
-        weights=((1.0, 2.0, 3.0, 5.0),
-                 (7.0, 11.0, 13.0, 17.0),
-                 (19.0, 23.0, 29.0, 31.0)))
+    for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions = math_ops.cast(constant_op.constant(
+          ((1, 0, 1, 0),
+           (0, 1, 1, 1),
+           (0, 0, 0, 0))), dtype=dtype)
+      labels = math_ops.cast(constant_op.constant(
+          ((0, 1, 1, 0),
+           (1, 0, 0, 0),
+           (0, 0, 0, 0))), dtype=dtype)
+      fp, fp_update_op = metrics.streaming_false_positives(
+          predictions,
+          labels,
+          weights=((1.0, 2.0, 3.0, 5.0),
+                   (7.0, 11.0, 13.0, 17.0),
+                   (19.0, 23.0, 29.0, 31.0)))
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, fp.eval())
-      self.assertEqual(42.0, fp_update_op.eval())
-      self.assertEqual(42.0, fp.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertEqual(0, fp.eval())
+        self.assertEqual(42.0, fp_update_op.eval())
+        self.assertEqual(42.0, fp.eval())
 
 
 class StreamingTrueNegativesTest(test.TestCase):
@@ -796,35 +814,41 @@ class StreamingTrueNegativesTest(test.TestCase):
     _assert_local_variables(self, ('true_negatives/count:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((1, 0, 1, 0),
-                                        (0, 1, 1, 1),
-                                        (0, 0, 0, 0)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
-    tn, tn_update_op = metrics.streaming_true_negatives(predictions, labels)
+    for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions = math_ops.cast(constant_op.constant(
+          ((1, 0, 1, 0),
+           (0, 1, 1, 1),
+           (0, 0, 0, 0))), dtype=dtype)
+      labels = math_ops.cast(constant_op.constant(
+          ((0, 1, 1, 0),
+           (1, 0, 0, 0),
+           (0, 0, 0, 0))), dtype=dtype)
+      tn, tn_update_op = metrics.streaming_true_negatives(predictions, labels)
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, tn.eval())
-      self.assertEqual(5, tn_update_op.eval())
-      self.assertEqual(5, tn.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertEqual(0, tn.eval())
+        self.assertEqual(5, tn_update_op.eval())
+        self.assertEqual(5, tn.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((1, 0, 1, 0),
-                                        (0, 1, 1, 1),
-                                        (0, 0, 0, 0)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
-    tn, tn_update_op = metrics.streaming_true_negatives(
-        predictions, labels, weights=((0.0, 2.0, 3.0, 5.0),))
+    for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions = math_ops.cast(constant_op.constant(
+          ((1, 0, 1, 0),
+           (0, 1, 1, 1),
+           (0, 0, 0, 0))), dtype=dtype)
+      labels = math_ops.cast(constant_op.constant(
+          ((0, 1, 1, 0),
+           (1, 0, 0, 0),
+           (0, 0, 0, 0))), dtype=dtype)
+      tn, tn_update_op = metrics.streaming_true_negatives(
+          predictions, labels, weights=((0.0, 2.0, 3.0, 5.0),))
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, tn.eval())
-      self.assertEqual(15.0, tn_update_op.eval())
-      self.assertEqual(15.0, tn.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertEqual(0, tn.eval())
+        self.assertEqual(15.0, tn_update_op.eval())
+        self.assertEqual(15.0, tn.eval())
 
 
 class StreamingTruePositivesAtThresholdsTest(test.TestCase):
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index 3c314e2f28..a739487ae3 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -72,6 +72,46 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "sampling_decoder_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/sampling_decoder_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "decoder_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/decoder_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
new file mode 100644
index 0000000000..b3c6c593c5
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
@@ -0,0 +1,156 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.seq2seq.decoder."""
+# pylint: disable=unused-import,g-bad-import-order
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# pylint: enable=unused-import
+
+import sys
+
+# TODO(jart): #6568 Remove this hack that makes dlopen() not crash.
+if hasattr(sys, "getdlopenflags") and hasattr(sys, "setdlopenflags"):
+  import ctypes  # pylint: disable=g-import-not-at-top
+  sys.setdlopenflags(sys.getdlopenflags() | ctypes.RTLD_GLOBAL)
+
+# pylint: disable=g-import-not-at-top
+import numpy as np
+
+from tensorflow.contrib.rnn import core_rnn_cell
+from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.contrib.seq2seq.python.ops import sampling_decoder
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import variables
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import test
+# pylint: enable=g-import-not-at-top
+
+
+class DynamicDecodeRNNTest(test.TestCase):
+
+  def _testDynamicDecodeRNN(self, time_major):
+    """Checks dynamic_decode_rnn output types and shapes for one layout."""
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = 10
+    max_out = max(sequence_length)
+
+    with self.test_session() as sess:
+      if time_major:
+        inputs = np.random.randn(max_time, batch_size,
+                                 input_depth).astype(np.float32)
+      else:
+        inputs = np.random.randn(batch_size, max_time,
+                                 input_depth).astype(np.float32)
+      cell = core_rnn_cell.LSTMCell(cell_depth)
+      sampler = sampling_decoder.BasicTrainingSampler(
+          inputs, sequence_length, time_major=time_major)
+      my_decoder = sampling_decoder.BasicSamplingDecoder(
+          cell=cell,
+          sampler=sampler,
+          initial_state=cell.zero_state(
+              dtype=dtypes.float32, batch_size=batch_size))
+
+      final_outputs, final_state = decoder.dynamic_decode_rnn(
+          my_decoder, output_time_major=time_major)
+
+      def _t(shape):
+        if time_major:
+          return (shape[1], shape[0]) + shape[2:]
+        return shape
+
+      self.assertIsInstance(final_outputs,
+                            sampling_decoder.SamplingDecoderOutput)
+      self.assertIsInstance(final_state, core_rnn_cell.LSTMStateTuple)
+
+      self.assertEqual(
+          _t((batch_size, None, cell_depth)),
+          tuple(final_outputs.rnn_output.get_shape().as_list()))
+      self.assertEqual(
+          _t((batch_size, None)),
+          tuple(final_outputs.sample_id.get_shape().as_list()))
+
+      sess.run(variables.global_variables_initializer())
+      sess_results = sess.run({
+          "final_outputs": final_outputs,
+          "final_state": final_state
+      })
+
+      self.assertEqual(
+          _t((batch_size, max_out, cell_depth)),
+          sess_results["final_outputs"].rnn_output.shape)
+      self.assertEqual(
+          _t((batch_size, max_out)),
+          sess_results["final_outputs"].sample_id.shape)
+
+  def testDynamicDecodeRNNBatchMajor(self):
+    self._testDynamicDecodeRNN(time_major=False)
+
+  def testDynamicDecodeRNNTimeMajor(self):
+    self._testDynamicDecodeRNN(time_major=True)
+
+  def testDynamicDecodeRNNWithBasicTrainingSamplerMatchesDynamicRNN(self):
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = 10
+    max_out = max(sequence_length)
+
+    with self.test_session() as sess:
+      inputs = np.random.randn(batch_size, max_time,
+                               input_depth).astype(np.float32)
+
+      cell = core_rnn_cell.LSTMCell(cell_depth)
+      zero_state = cell.zero_state(dtype=dtypes.float32, batch_size=batch_size)
+      sampler = sampling_decoder.BasicTrainingSampler(inputs, sequence_length)
+      my_decoder = sampling_decoder.BasicSamplingDecoder(
+          cell=cell, sampler=sampler, initial_state=zero_state)
+
+      # Match the variable scope of dynamic_rnn below so we end up
+      # using the same variables
+      with vs.variable_scope("rnn"):
+        final_decoder_outputs, final_decoder_state = decoder.dynamic_decode_rnn(
+            my_decoder)
+
+      with vs.variable_scope(vs.get_variable_scope(), reuse=True):
+        final_rnn_outputs, final_rnn_state = rnn.dynamic_rnn(
+            cell,
+            inputs,
+            sequence_length=sequence_length,
+            initial_state=zero_state)
+
+      sess.run(variables.global_variables_initializer())
+      sess_results = sess.run({
+          "final_decoder_outputs": final_decoder_outputs,
+          "final_decoder_state": final_decoder_state,
+          "final_rnn_outputs": final_rnn_outputs,
+          "final_rnn_state": final_rnn_state
+      })
+
+      # Decoder only runs out to max_out; ensure values are identical
+      # to dynamic_rnn, which also zeros out outputs and passes along state.
+      self.assertAllClose(sess_results["final_decoder_outputs"].rnn_output,
+                          sess_results["final_rnn_outputs"][:, 0:max_out, :])
+      self.assertAllClose(sess_results["final_decoder_state"],
+                          sess_results["final_rnn_state"])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/sampling_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/sampling_decoder_test.py
new file mode 100644
index 0000000000..ba945a0ecb
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/sampling_decoder_test.py
@@ -0,0 +1,109 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.seq2seq.sampling_decoder."""
+# pylint: disable=unused-import,g-bad-import-order
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# pylint: enable=unused-import
+
+import sys
+
+# TODO(jart): #6568 Remove this hack that makes dlopen() not crash.
+if hasattr(sys, "getdlopenflags") and hasattr(sys, "setdlopenflags"):
+  import ctypes  # pylint: disable=g-import-not-at-top
+  sys.setdlopenflags(sys.getdlopenflags() | ctypes.RTLD_GLOBAL)
+
+# pylint: disable=g-import-not-at-top
+import numpy as np
+
+from tensorflow.contrib.rnn import core_rnn_cell
+from tensorflow.contrib.seq2seq.python.ops import sampling_decoder
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+# pylint: enable=g-import-not-at-top
+
+
+class BasicSamplingDecoderTest(test.TestCase):
+
+  def testStepWithBasicTrainingSampler(self):
+    """Checks one BasicSamplingDecoder step driven by BasicTrainingSampler."""
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = 10
+
+    with self.test_session() as sess:
+      inputs = np.random.randn(batch_size, max_time,
+                               input_depth).astype(np.float32)
+      cell = core_rnn_cell.LSTMCell(cell_depth)
+      sampler = sampling_decoder.BasicTrainingSampler(
+          inputs, sequence_length, time_major=False)
+      my_decoder = sampling_decoder.BasicSamplingDecoder(
+          cell=cell,
+          sampler=sampler,
+          initial_state=cell.zero_state(
+              dtype=dtypes.float32, batch_size=batch_size))
+      batch_size_t = my_decoder.batch_size
+      self.assertEqual(
+          sampling_decoder.SamplingDecoderOutput(cell_depth,
+                                                 tensor_shape.TensorShape([])),
+          my_decoder.output_size)
+      self.assertEqual(
+          sampling_decoder.SamplingDecoderOutput(dtypes.float32, dtypes.int32),
+          my_decoder.output_dtype)
+
+      (first_finished, first_inputs, first_state) = my_decoder.initialize()
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+
+      self.assertIsInstance(first_state, core_rnn_cell.LSTMStateTuple)
+      self.assertIsInstance(step_state, core_rnn_cell.LSTMStateTuple)
+      self.assertIsInstance(step_outputs,
+                            sampling_decoder.SamplingDecoderOutput)
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      sess.run(variables.global_variables_initializer())
+      sess_results = sess.run({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      self.assertAllEqual([False, False, False, False, True],
+                          sess_results["first_finished"])
+      self.assertAllEqual([False, False, False, True, True],
+                          sess_results["step_finished"])
+      self.assertAllEqual([-1] * batch_size,
+                          sess_results["step_outputs"].sample_id)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py
new file mode 100644
index 0000000000..3ab6cb0e8c
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py
@@ -0,0 +1,237 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Seq2seq layer operations for use in neural networks.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.util import nest
+
+__all__ = ["Decoder", "dynamic_decode_rnn"]
+
+
+def _transpose_batch_time(x):
+  """Transpose the batch and time dimensions of a Tensor.
+
+  Retains as much of the static shape information as possible.
+
+  Args:
+    x: A tensor of rank 2 or higher.
+
+  Returns:
+    x transposed along the first two dimensions.
+
+  Raises:
+    ValueError: if the static rank of `x` is known and is less than 2.
+  """
+  x_static_shape = x.get_shape()
+  if x_static_shape.ndims is not None and x_static_shape.ndims < 2:
+    raise ValueError(
+        "Expected input tensor %s to have rank at least 2, but saw shape: %s" %
+        (x, x_static_shape))
+  x_rank = array_ops.rank(x)
+  x_t = array_ops.transpose(
+      x, array_ops.concat_v2(
+          ([1, 0], math_ops.range(2, x_rank)), axis=0))
+  x_t.set_shape(
+      tensor_shape.TensorShape([
+          x_static_shape[1].value, x_static_shape[0].value
+      ]).concatenate(x_static_shape[2:]))
+  return x_t
+
+
+@six.add_metaclass(abc.ABCMeta)
+class Decoder(object):
+  """An RNN Decoder abstract interface object."""
+
+  @property
+  def batch_size(self):
+    """The batch size, i.e. the leading dimension of each step's outputs."""
+    raise NotImplementedError
+
+  @property
+  def output_size(self):
+    """A (possibly nested tuple of...) integer[s] or `TensorShape` object[s]."""
+    raise NotImplementedError
+
+  @property
+  def output_dtype(self):
+    """A (possibly nested tuple of...) dtype[s]."""
+    raise NotImplementedError
+
+  @abc.abstractmethod
+  def initialize(self, name=None):
+    """Called before any decoding iterations.
+
+    Args:
+      name: Name scope for any created operations.
+
+    Returns:
+      `(finished, first_inputs, initial_state)`.
+    """
+    raise NotImplementedError
+
+  @abc.abstractmethod
+  def step(self, time, inputs, state):
+    """Called per step of decoding (but only once for dynamic decoding).
+
+    Args:
+      time: Scalar `int32` tensor.
+      inputs: Input (possibly nested tuple of) tensor[s] for this time step.
+      state: State (possibly nested tuple of) tensor[s] from previous time step.
+
+    Returns:
+      `(outputs, next_state, next_inputs, finished)`.
+    """
+    raise NotImplementedError
+
+
+def _create_zero_outputs(size, dtype, batch_size):
+  """Create zero Tensors of shape `[batch_size] + size` for each output."""
+  def _suffix_shape(s):
+    if isinstance(s, ops.Tensor):
+      return s
+    return constant_op.constant(
+        tensor_shape.TensorShape(s).as_list(),
+        dtype=dtypes.int32, name="zero_suffix_shape")
+
+  def _zeros(s, d):
+    shape = array_ops.concat(([batch_size], _suffix_shape(s)), axis=0)
+    return array_ops.zeros(shape, dtype=d)
+
+  return nest.map_structure(_zeros, size, dtype)
+
+
+def dynamic_decode_rnn(decoder,
+                       output_time_major=False,
+                       parallel_iterations=32,
+                       swap_memory=False):
+  """Perform dynamic decoding with `decoder`.
+
+  Args:
+    decoder: A `Decoder` instance.
+    output_time_major: Python boolean.  Default: `False` (batch major).  If
+      `True`, outputs are returned as time major tensors (this mode is faster).
+      Otherwise, outputs are returned as batch major tensors (this adds extra
+      time to the computation).
+    parallel_iterations: Argument passed to `tf.while_loop`.
+    swap_memory: Argument passed to `tf.while_loop`.
+
+  Returns:
+    `(final_outputs, final_state)`; outputs are stacked over decoding steps.
+
+  Raises:
+    TypeError: if `decoder` is not an instance of `Decoder`.
+  """
+  if not isinstance(decoder, Decoder):
+    raise TypeError("Expected decoder to be type Decoder, but saw: %s" %
+                    type(decoder))
+
+  zero_outputs = _create_zero_outputs(decoder.output_size, decoder.output_dtype,
+                                      decoder.batch_size)
+
+  initial_finished, initial_inputs, initial_state = decoder.initialize()
+  initial_time = constant_op.constant(0, dtype=dtypes.int32)
+
+  def _shape(batch_size, from_shape):
+    if not isinstance(from_shape, tensor_shape.TensorShape):
+      return tensor_shape.TensorShape(None)
+    else:
+      batch_size = tensor_util.constant_value(
+          ops.convert_to_tensor(
+              batch_size, name="batch_size"))
+      return tensor_shape.TensorShape([batch_size]).concatenate(from_shape)
+
+  def _create_ta(s, d):
+    return tensor_array_ops.TensorArray(
+        dtype=d, size=0, dynamic_size=True,
+        element_shape=_shape(decoder.batch_size, s))
+
+  initial_outputs_ta = nest.map_structure(
+      _create_ta, decoder.output_size, decoder.output_dtype)
+
+  def condition(unused_time, unused_outputs_ta, unused_state, unused_inputs,
+                finished):
+    return math_ops.logical_not(math_ops.reduce_all(finished))
+
+  def body(time, outputs_ta, state, inputs, finished):
+    """Internal while_loop body.
+
+    Args:
+      time: scalar int32 tensor.
+      outputs_ta: structure of TensorArray.
+      state: (structure of) state tensors and TensorArrays.
+      inputs: (structure of) input tensors.
+      finished: 1-D bool tensor.
+
+    Returns:
+      `(time + 1, outputs_ta, next_state, next_inputs, next_finished)`.
+    """
+    (next_outputs, decoder_state, next_inputs, decoder_finished) = decoder.step(
+        time, inputs, state)
+    next_finished = math_ops.logical_or(decoder_finished, finished)
+
+    nest.assert_same_structure(state, decoder_state)
+    nest.assert_same_structure(outputs_ta, next_outputs)
+    nest.assert_same_structure(inputs, next_inputs)
+
+    # Zero out output values for entries already finished before this step.
+    emit = nest.map_structure(
+        lambda out, zero: array_ops.where(finished, zero, out), next_outputs,
+        zero_outputs)
+
+    # Finished entries keep their previous state; TensorArrays always update.
+    def _maybe_copy_state(new, cur):
+      return (new if isinstance(cur, tensor_array_ops.TensorArray) else
+              array_ops.where(finished, cur, new))
+
+    next_state = nest.map_structure(_maybe_copy_state, decoder_state, state)
+    outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out),
+                                    outputs_ta, emit)
+    return (time + 1, outputs_ta, next_state, next_inputs, next_finished)
+
+  res = control_flow_ops.while_loop(
+      condition,
+      body,
+      loop_vars=[
+          initial_time, initial_outputs_ta, initial_state, initial_inputs,
+          initial_finished
+      ],
+      parallel_iterations=parallel_iterations,
+      swap_memory=swap_memory)
+
+  final_outputs_ta = res[1]
+  final_state = res[2]
+
+  final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta)
+  if not output_time_major:
+    final_outputs = nest.map_structure(_transpose_batch_time, final_outputs)
+
+  return final_outputs, final_state
diff --git a/tensorflow/contrib/seq2seq/python/ops/sampling_decoder.py b/tensorflow/contrib/seq2seq/python/ops/sampling_decoder.py
new file mode 100644
index 0000000000..c4654e535d
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/sampling_decoder.py
@@ -0,0 +1,190 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A class of Decoders that may sample to generate the next input.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import collections
+
+import six
+
+from tensorflow.contrib.rnn import core_rnn_cell
+from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.util import nest
+
+__all__ = [
+    "Sampler", "SamplingDecoderOutput", "BasicSamplingDecoder",
+    "BasicTrainingSampler"
+]
+
+_transpose_batch_time = decoder._transpose_batch_time  # pylint: disable=protected-access
+
+
+@six.add_metaclass(abc.ABCMeta)
+class Sampler(object):
+  """Abstract source of finished flags and next inputs for a decoder."""
+  @abc.abstractproperty
+  def batch_size(self):
+    """Batch size of the sampler; concrete subclasses must define it."""
+
+  @abc.abstractmethod
+  def initialize(self):
+    """Returns `(finished, first_inputs)`."""
+
+  @abc.abstractmethod
+  def sample(self, time, outputs, state):
+    """Returns `(sample_id, finished, next_inputs)`."""
+
+
+class SamplingDecoderOutput(
+    collections.namedtuple(
+        "SamplingDecoderOutput", ["rnn_output", "sample_id"])):
+  """One decoding step's output: the RNN cell output and the sample id."""
+
+
+class BasicSamplingDecoder(decoder.Decoder):
+  """Decoder that runs an `RNNCell` and defers input choice to a `Sampler`."""
+
+  def __init__(self, cell, sampler, initial_state):
+    """Stores the cell, sampler and initial state after type-checking them.
+
+    Args:
+      cell: `RNNCell` instance called once per decoding step.
+      sampler: `Sampler` instance providing finished flags and next inputs.
+      initial_state: (possibly nested) tensors and TensorArrays, kept as is.
+
+    Raises:
+      TypeError: when `cell` is not an `RNNCell`, or when `sampler` is not a
+        `Sampler`.
+    """
+    if not isinstance(cell, core_rnn_cell.RNNCell):
+      raise TypeError("cell must be an RNNCell, received: %s" % type(cell))
+    if not isinstance(sampler, Sampler):
+      sampler_type = type(sampler)
+      raise TypeError("sampler must be a Sampler, received: %s" % sampler_type)
+    self._cell, self._sampler = cell, sampler
+    self._initial_state = initial_state
+
+  @property
+  def batch_size(self):
+    """Batch size, as reported by the sampler."""
+    return self._sampler.batch_size
+
+  @property
+  def output_size(self):
+    """Per-step output sizes: the cell's output size and a scalar id shape."""
+    return SamplingDecoderOutput(
+        self._cell.output_size, tensor_shape.TensorShape([]))
+
+  @property
+  def output_dtype(self):
+    """Per-step output dtypes, laid out like `output_size`."""
+    # Assumption: the cell emits the dtype of the first flattened component
+    # of the initial state.  The sample id is always int32.
+    first_state_dtype = nest.flatten(self._initial_state)[0].dtype
+    rnn_dtypes = nest.map_structure(
+        lambda _: first_state_dtype, self._cell.output_size)
+    return SamplingDecoderOutput(rnn_dtypes, dtypes.int32)
+
+  def initialize(self, name=None):
+    """Returns the sampler's initial tuple with the initial state appended."""
+    return self._sampler.initialize() + (self._initial_state,)
+
+  def step(self, time, inputs, state):
+    """Runs the cell once and asks the sampler for the next inputs.
+
+    Args:
+      time: scalar `int32` tensor.
+      inputs: A (structure of) input tensors.
+      state: A (structure of) state tensors and TensorArrays.
+
+    Returns:
+      `(outputs, next_state, next_inputs, finished)`.
+    """
+    cell_outputs, next_state = self._cell(inputs, state)
+    sample_id, finished, next_inputs = self._sampler.sample(
+        time=time, outputs=cell_outputs, state=next_state)
+    return (SamplingDecoderOutput(cell_outputs, sample_id), next_state,
+            next_inputs, finished)
+
+
+class BasicTrainingSampler(Sampler):
+  """A (non-)sampler for use during training.  Only reads inputs."""
+
+  def __init__(self, inputs, sequence_length, time_major=False):
+    """Initializer.
+
+    Args:
+      inputs: Input tensors; passed through `ops.convert_to_tensor` as a whole.
+      sequence_length: An int32 vector tensor; its size is the batch size.
+      time_major: Python bool.  If false, `_transpose_batch_time` is applied.
+
+    Raises:
+      ValueError: if `sequence_length` is not a 1D tensor.
+    """
+    inputs = ops.convert_to_tensor(inputs, name="inputs")
+    if not time_major:
+      inputs = nest.map_structure(_transpose_batch_time, inputs)
+
+    def _unstack_ta(inp):
+      return tensor_array_ops.TensorArray(
+          dtype=inp.dtype, size=array_ops.shape(inp)[0],
+          element_shape=inp.get_shape()[1:]).unstack(inp)
+    self._input_tas = nest.map_structure(_unstack_ta, inputs)
+    sequence_length = ops.convert_to_tensor(
+        sequence_length, name="sequence_length")
+    if sequence_length.get_shape().ndims != 1:
+      raise ValueError(
+          "Expected sequence_length to be a vector, but received shape: %s" %
+          sequence_length.get_shape())
+    self._sequence_length = sequence_length
+    self._zero_inputs = nest.map_structure(
+        lambda inp: array_ops.zeros_like(inp[0, :]), inputs)
+    self._batch_size = array_ops.size(sequence_length)
+
+  @property
+  def batch_size(self):
+    """Scalar tensor: the number of entries in `sequence_length`."""
+    return self._batch_size
+
+  def _read_or_zeros(self, time, finished):
+    """Inputs at `time`, or the zero inputs once every sequence is finished."""
+    return control_flow_ops.cond(
+        math_ops.reduce_all(finished), lambda: self._zero_inputs,
+        lambda: nest.map_structure(lambda ta: ta.read(time), self._input_tas))
+
+  def initialize(self):
+    """Returns `(finished, first_inputs)` for time step 0."""
+    finished = math_ops.equal(0, self._sequence_length)
+    return (finished, self._read_or_zeros(0, finished))
+
+  def sample(self, time, **unused_kwargs):
+    """Returns `(sample_id, finished, next_inputs)`; `sample_id` is all -1."""
+    next_time = time + 1
+    finished = (next_time >= self._sequence_length)
+    sample_id = array_ops.tile([constant_op.constant(-1)], [self._batch_size])
+    return (sample_id, finished, self._read_or_zeros(next_time, finished))
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index 28d9b43bbd..420a6d3138 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -18,7 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import framework as contrib_framework
-from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.contrib.learn.python.learn import evaluable
 from tensorflow.contrib.learn.python.learn import trainable
 
@@ -355,18 +354,15 @@ class TensorForestEstimator(evaluable.Evaluable, trainable.Trainable):
     # pylint: enable=protected-access
     return result
 
-  @experimental
   def export_savedmodel(self,
                         export_dir_base,
-                        input_fn,
+                        serving_input_fn,
                         default_output_alternative_key=None,
                         assets_extra=None,
-                        as_text=False,
-                        exports_to_keep=None):
+                        as_text=False):
     return self._estimator.export_savedmodel(
         export_dir_base,
-        input_fn,
+        serving_input_fn,
         default_output_alternative_key=default_output_alternative_key,
         assets_extra=assets_extra,
-        as_text=as_text,
-        exports_to_keep=exports_to_keep)
+        as_text=as_text)
diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index 9fd102d0f6..0f52c2128d 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -26,6 +26,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
@@ -41,6 +43,31 @@ class BatchSequencesWithStatesTest(test.TestCase):
   def setUp(self):
     super(BatchSequencesWithStatesTest, self).setUp()
     self.value_length = 4
+    ind1 = np.array([
+        [0, 0],
+        [1, 0], [1, 3], [1, 4],
+        [3, 2], [3, 3]])
+    val1 = np.array([0, 10, 13, 14, 32, 33])
+    shape1 = np.array([self.value_length, 6])
+    sp_tensor1 = sparse_tensor.SparseTensor(
+        array_ops.constant(ind1, dtypes.int64),
+        array_ops.constant(val1, dtypes.int64),
+        array_ops.constant(shape1, dtypes.int64))
+    ind2 = np.array([
+        [0, 0, 1],
+        [0, 1, 0],
+        [0, 1, 2],
+        [1, 0, 3],
+        [1, 1, 0],
+        [1, 1, 1],
+        [1, 1, 2],
+        [1, 2, 2]])
+    val2 = np.array([1, 10, 12, 103, 150, 149, 150, 122])
+    shape2 = np.array([self.value_length, 3, 4])
+    sp_tensor2 = sparse_tensor.SparseTensor(
+        array_ops.constant(ind2, dtypes.int64),
+        array_ops.constant(val2, dtypes.int64),
+        array_ops.constant(shape2, dtypes.int64))
     self.batch_size = 2
     self.key = string_ops.string_join([
         "key_", string_ops.as_string(
@@ -48,8 +75,9 @@ class BatchSequencesWithStatesTest(test.TestCase):
     ])
     self.sequences = {
         "seq1": np.random.rand(self.value_length, 5),
-        "seq2": np.random.rand(self.value_length, 4, 2)
-    }
+        "seq2": np.random.rand(self.value_length, 4, 2),
+        "seq3": sp_tensor1,
+        "seq4": sp_tensor2}
     self.context = {"context1": [3, 4]}
     self.initial_states = {
         "state1": np.random.rand(6, 7),
@@ -60,9 +88,12 @@ class BatchSequencesWithStatesTest(test.TestCase):
     return set(
         [s.decode("ascii").split(":")[0].encode("ascii") for s in key_value])
 
-  def _testBasics(self, num_unroll, length, pad, expected_seq1_batch1,
-                  expected_seq2_batch1, expected_seq1_batch2,
-                  expected_seq2_batch2):
+  def _testBasics(self, num_unroll, length, pad,
+                  expected_seq1_batch1, expected_seq2_batch1,
+                  expected_seq1_batch2, expected_seq2_batch2,
+                  expected_seq3_batch1, expected_seq3_batch2,
+                  expected_seq4_batch1, expected_seq4_batch2):
+
     with self.test_session() as sess:
       next_batch = sqss.batch_sequences_with_states(
           input_key=self.key,
@@ -99,12 +130,13 @@ class BatchSequencesWithStatesTest(test.TestCase):
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
       # Step 1
-      (key_value, next_key_value, seq1_value, seq2_value, context1_value,
-       state1_value, state2_value, length_value, _, _) = sess.run(
+      (key_value, next_key_value, seq1_value, seq2_value, seq3_value,
+       seq4_value, context1_value, state1_value, state2_value, length_value,
+       _, _) = sess.run(
            (next_batch.key, next_batch.next_key, next_batch.sequences["seq1"],
-            next_batch.sequences["seq2"], next_batch.context["context1"],
+            next_batch.sequences["seq2"], next_batch.sequences["seq3"],
+            next_batch.sequences["seq4"], next_batch.context["context1"],
             state1, state2, next_batch.length, state1_update, state2_update))
-
       expected_first_keys = set([b"00000_of_00002"])
       expected_second_keys = set([b"00001_of_00002"])
       expected_final_keys = set([b"STOP"])
@@ -116,6 +148,14 @@ class BatchSequencesWithStatesTest(test.TestCase):
           context1_value)
       self.assertAllEqual(expected_seq1_batch1, seq1_value)
       self.assertAllEqual(expected_seq2_batch1, seq2_value)
+      self.assertAllEqual(expected_seq3_batch1.indices, seq3_value.indices)
+      self.assertAllEqual(expected_seq3_batch1.values, seq3_value.values)
+      self.assertAllEqual(expected_seq3_batch1.dense_shape,
+                          seq3_value.dense_shape)
+      self.assertAllEqual(expected_seq4_batch1.indices, seq4_value.indices)
+      self.assertAllEqual(expected_seq4_batch1.values, seq4_value.values)
+      self.assertAllEqual(expected_seq4_batch1.dense_shape,
+                          seq4_value.dense_shape)
       self.assertAllEqual(
           np.tile(self.initial_states["state1"], (self.batch_size, 1, 1)),
           state1_value)
@@ -125,12 +165,13 @@ class BatchSequencesWithStatesTest(test.TestCase):
       self.assertAllEqual(length_value, [num_unroll, num_unroll])
 
       # Step 2
-      (key_value, next_key_value, seq1_value, seq2_value, context1_value,
-       state1_value, state2_value, length_value, _, _) = sess.run(
+      (key_value, next_key_value, seq1_value, seq2_value, seq3_value,
+       seq4_value, context1_value, state1_value, state2_value, length_value,
+       _, _) = sess.run(
            (next_batch.key, next_batch.next_key, next_batch.sequences["seq1"],
-            next_batch.sequences["seq2"], next_batch.context["context1"],
-            next_batch.state("state1"), next_batch.state("state2"),
-            next_batch.length, state1_update, state2_update))
+            next_batch.sequences["seq2"], next_batch.sequences["seq3"],
+            next_batch.sequences["seq4"], next_batch.context["context1"],
+            state1, state2, next_batch.length, state1_update, state2_update))
 
       self.assertEqual(expected_second_keys, self._prefix(key_value))
       self.assertEqual(expected_final_keys, self._prefix(next_key_value))
@@ -139,6 +180,14 @@ class BatchSequencesWithStatesTest(test.TestCase):
           context1_value)
       self.assertAllEqual(expected_seq1_batch2, seq1_value)
       self.assertAllEqual(expected_seq2_batch2, seq2_value)
+      self.assertAllEqual(expected_seq3_batch2.indices, seq3_value.indices)
+      self.assertAllEqual(expected_seq3_batch2.values, seq3_value.values)
+      self.assertAllEqual(expected_seq3_batch2.dense_shape,
+                          seq3_value.dense_shape)
+      self.assertAllEqual(expected_seq4_batch2.indices, seq4_value.indices)
+      self.assertAllEqual(expected_seq4_batch2.values, seq4_value.values)
+      self.assertAllEqual(expected_seq4_batch2.dense_shape,
+                          seq4_value.dense_shape)
       self.assertAllEqual(1 + np.tile(self.initial_states["state1"],
                                       (self.batch_size, 1, 1)), state1_value)
       self.assertAllEqual(-1 + np.tile(self.initial_states["state2"],
@@ -148,7 +197,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
       coord.request_stop()
       coord.join(threads, stop_grace_period_secs=2)
 
-  def testBasicPadding(self):
+  def _testBasicPadding(self, pad):
     num_unroll = 2  # Divisor of value_length - so no padding necessary.
     expected_seq1_batch1 = np.tile(
         self.sequences["seq1"][np.newaxis, 0:num_unroll, :],
@@ -162,37 +211,74 @@ class BatchSequencesWithStatesTest(test.TestCase):
     expected_seq2_batch2 = np.tile(
         self.sequences["seq2"][np.newaxis, num_unroll:self.value_length, :, :],
         (self.batch_size, 1, 1, 1))
+    ind1_1 = np.array([
+        # batch entry 1
+        [0, 0, 0],
+        [0, 1, 0], [0, 1, 3], [0, 1, 4],
+        # batch entry 2
+        [1, 0, 0],
+        [1, 1, 0], [1, 1, 3], [1, 1, 4]])
+    ind1_2 = np.array([
+        # batch entry 1
+        [0, 1, 2], [0, 1, 3],
+        # batch entry 2
+        [1, 1, 2], [1, 1, 3]])
+    val1_1 = np.array([0, 10, 13, 14,
+                       0, 10, 13, 14])
+    val1_2 = np.array([32, 33,
+                       32, 33])
+    shape1 = np.array([self.batch_size, num_unroll, 6])
+
+    # For sp_tensor2 all values fall into the first segment.
+    ind2_1 = np.array([
+        # batch entry 1
+        [0, 0, 0, 1],
+        [0, 0, 1, 0],
+        [0, 0, 1, 2],
+        [0, 1, 0, 3],
+        [0, 1, 1, 0],
+        [0, 1, 1, 1],
+        [0, 1, 1, 2],
+        [0, 1, 2, 2],
+        # batch entry 2
+        [1, 0, 0, 1],
+        [1, 0, 1, 0],
+        [1, 0, 1, 2],
+        [1, 1, 0, 3],
+        [1, 1, 1, 0],
+        [1, 1, 1, 1],
+        [1, 1, 1, 2],
+        [1, 1, 2, 2],
+    ])
+    val2_1 = np.array([1, 10, 12, 103, 150, 149, 150, 122,
+                       1, 10, 12, 103, 150, 149, 150, 122])
+    shape2 = np.array([self.batch_size, num_unroll, 3, 4])
+    expected_seq3_batch1 = sparse_tensor.SparseTensorValue(
+        ind1_1, val1_1, shape1)
+    expected_seq3_batch2 = sparse_tensor.SparseTensorValue(
+        ind1_2, val1_2, shape1)
+    expected_seq4_batch1 = sparse_tensor.SparseTensorValue(
+        ind2_1, val2_1, shape2)
+    expected_seq4_batch2 = sparse_tensor.SparseTensorValue(
+        np.empty(shape=[0, 4], dtype=np.int64), np.array([]), shape2)
     self._testBasics(
         num_unroll=num_unroll,
         length=3,
-        pad=True,
+        pad=pad,
         expected_seq1_batch1=expected_seq1_batch1,
-        expected_seq2_batch1=expected_seq2_batch1,
         expected_seq1_batch2=expected_seq1_batch2,
-        expected_seq2_batch2=expected_seq2_batch2)
-
-  def testBasics(self):
-    num_unroll = 2  # Divisor of value_length - so no padding necessary.
-    expected_seq1_batch1 = np.tile(
-        self.sequences["seq1"][np.newaxis, 0:num_unroll, :],
-        (self.batch_size, 1, 1))
-    expected_seq2_batch1 = np.tile(
-        self.sequences["seq2"][np.newaxis, 0:num_unroll, :, :],
-        (self.batch_size, 1, 1, 1))
-    expected_seq1_batch2 = np.tile(
-        self.sequences["seq1"][np.newaxis, num_unroll:self.value_length, :],
-        (self.batch_size, 1, 1))
-    expected_seq2_batch2 = np.tile(
-        self.sequences["seq2"][np.newaxis, num_unroll:self.value_length, :, :],
-        (self.batch_size, 1, 1, 1))
-    self._testBasics(
-        num_unroll=num_unroll,
-        length=3,
-        pad=False,
-        expected_seq1_batch1=expected_seq1_batch1,
         expected_seq2_batch1=expected_seq2_batch1,
-        expected_seq1_batch2=expected_seq1_batch2,
-        expected_seq2_batch2=expected_seq2_batch2)
+        expected_seq2_batch2=expected_seq2_batch2,
+        expected_seq3_batch1=expected_seq3_batch1,
+        expected_seq3_batch2=expected_seq3_batch2,
+        expected_seq4_batch1=expected_seq4_batch1,
+        expected_seq4_batch2=expected_seq4_batch2)
+
+  def testBasicPadding(self):
+    self._testBasicPadding(pad=True)  # run shared checks with padding on
+
+  def testBasicNoPadding(self):
+    self._testBasicPadding(pad=False)  # run shared checks with padding off
 
   def testNotAMultiple(self):
     num_unroll = 3  # Not a divisor of value_length -
@@ -254,14 +340,69 @@ class BatchSequencesWithStatesTest(test.TestCase):
     expected_seq2_batch2 = np.concatenate(
         [padded_seq2] * self.batch_size, axis=0)
 
+    ind1_1 = np.array([
+        # batch entry 1
+        [0, 0, 0],
+        [0, 1, 0], [0, 1, 3], [0, 1, 4],
+        # batch entry 2
+        [1, 0, 0],
+        [1, 1, 0], [1, 1, 3], [1, 1, 4]])
+    ind1_2 = np.array([
+        # batch entry 1
+        [0, 0, 2], [0, 0, 3],
+        # batch entry 2
+        [1, 0, 2], [1, 0, 3]])
+    val1_1 = np.array([0, 10, 13, 14,
+                       0, 10, 13, 14])
+    val1_2 = np.array([32, 33,
+                       32, 33])
+    shape1 = np.array([self.batch_size, num_unroll, 6])
+
+    # For sp_tensor2 all values fall into the first segment.
+    ind2_1 = np.array([
+        # batch entry 1
+        [0, 0, 0, 1],
+        [0, 0, 1, 0],
+        [0, 0, 1, 2],
+        [0, 1, 0, 3],
+        [0, 1, 1, 0],
+        [0, 1, 1, 1],
+        [0, 1, 1, 2],
+        [0, 1, 2, 2],
+        # batch entry 2
+        [1, 0, 0, 1],
+        [1, 0, 1, 0],
+        [1, 0, 1, 2],
+        [1, 1, 0, 3],
+        [1, 1, 1, 0],
+        [1, 1, 1, 1],
+        [1, 1, 1, 2],
+        [1, 1, 2, 2],
+    ])
+    val2_1 = np.array([1, 10, 12, 103, 150, 149, 150, 122,
+                       1, 10, 12, 103, 150, 149, 150, 122])
+    shape2 = np.array([self.batch_size, num_unroll, 3, 4])
+    expected_seq3_batch1 = sparse_tensor.SparseTensorValue(
+        ind1_1, val1_1, shape1)
+    expected_seq3_batch2 = sparse_tensor.SparseTensorValue(
+        ind1_2, val1_2, shape1)
+    expected_seq4_batch1 = sparse_tensor.SparseTensorValue(
+        ind2_1, val2_1, shape2)
+    expected_seq4_batch2 = sparse_tensor.SparseTensorValue(
+        np.empty(shape=[0, 4], dtype=np.int64), np.array([]), shape2)
+
     self._testBasics(
         num_unroll=num_unroll,
         length=None,
         pad=True,
         expected_seq1_batch1=expected_seq1_batch1,
-        expected_seq2_batch1=expected_seq2_batch1,
         expected_seq1_batch2=expected_seq1_batch2,
-        expected_seq2_batch2=expected_seq2_batch2)
+        expected_seq2_batch1=expected_seq2_batch1,
+        expected_seq2_batch2=expected_seq2_batch2,
+        expected_seq3_batch1=expected_seq3_batch1,
+        expected_seq3_batch2=expected_seq3_batch2,
+        expected_seq4_batch1=expected_seq4_batch1,
+        expected_seq4_batch2=expected_seq4_batch2)
 
 
 class PaddingTest(test.TestCase):
@@ -270,8 +411,8 @@ class PaddingTest(test.TestCase):
     with ops.Graph().as_default() as g, self.test_session(graph=g):
       sequences = {
           "key_1": constant_op.constant([1, 2, 3]),  # length 3
-          "key_2": constant_op.constant([1.5, 2.5])
-      }  # length 2
+          "key_2": constant_op.constant([1.5, 2.5])  # length 2
+      }
 
       _, padded_seq = sqss._padding(sequences, 2)
       with self.assertRaisesOpError(
@@ -300,5 +441,63 @@ class PaddingTest(test.TestCase):
             math_ops.reduce_all(math_ops.equal(val, padded_seq[key])).eval())
 
 
+class SparseTensorReConstructionTest(test.TestCase):
+
+  def testAddManyTakeManyRoundTripBatched(self):
+    with self.test_session(use_gpu=False) as sess:
+      # Both inputs are fed dense shape [4, 5], i.e. 4 rows on the first axis.
+      indices_value_1 = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
+      values_value_1 = np.array([b"a", b"b", b"c"])
+      shape_value_1 = np.array([4, 5], dtype=np.int64)
+      sparse_tensor_1 = sparse_tensor.SparseTensor(
+          array_ops.placeholder(dtypes.int64),
+          array_ops.placeholder(dtypes.string),
+          array_ops.placeholder(dtypes.int64))
+      dict1 = {"key": sparse_tensor_1}
+      indices_value_2 = np.array([[1, 4], [2, 3]], dtype=np.int64)
+      values_value_2 = np.array([b"d", b"e"])
+      shape_value_2 = np.array([4, 5], dtype=np.int64)
+      sparse_tensor_2 = sparse_tensor.SparseTensor(
+          array_ops.placeholder(dtypes.int64),
+          array_ops.placeholder(dtypes.string),
+          array_ops.placeholder(dtypes.int64))
+      dict2 = {"key": sparse_tensor_2}
+
+      input_seq1, keys1, tensor_list1 = sqss._deconstruct_sparse_tensor_seq(
+          dict1, shared_name="a")
+      handles_1 = input_seq1["key"]
+      input_seq2, _, _ = sqss._deconstruct_sparse_tensor_seq(
+          dict2, shared_name="a")  # same shared_name as the first call
+      handles_2 = input_seq2["key"]
+
+      combined_handles = array_ops.stack(  # entries 1..3 of each input
+          [handles_1[1], handles_1[2], handles_1[3],
+           handles_2[1], handles_2[2], handles_2[3]])
+      batched_dict = {"key": combined_handles}
+      sqss._reconstruct_sparse_tensor_seq(
+          batched_dict,
+          keys1,
+          tensor_list1,
+          batch_size=2,
+          num_unroll=3)  # presumably rewrites batched_dict in place
+
+      roundtrip_value, = sess.run(
+          [batched_dict["key"]],
+          feed_dict={sparse_tensor_1.indices: indices_value_1,
+                     sparse_tensor_1.values: values_value_1,
+                     sparse_tensor_1.dense_shape: shape_value_1,
+                     sparse_tensor_2.indices: indices_value_2,
+                     sparse_tensor_2.values: values_value_2,
+                     sparse_tensor_2.dense_shape: shape_value_2})
+
+      self.assertAllEqual(roundtrip_value.indices,  # (batch, unroll, col)
+                          np.array([[0, 1, 0], [1, 0, 4], [1, 1, 3]],
+                                   dtype=np.int64))
+      self.assertAllEqual(roundtrip_value.values,
+                          np.array([b"c", b"d", b"e"]))
+      self.assertAllEqual(roundtrip_value.dense_shape,
+                          np.array([2, 3, 5], dtype=np.int64))
+                          np.array([2, 3, 5], dtype=np.int64))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index a4f753acca..19e0809be8 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -29,16 +29,23 @@ import six
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.summary import summary
 from tensorflow.python.training import queue_runner
 
+# pylint: disable=protected-access
+_restore_sparse = sparse_ops._take_many_sparse_from_tensors_map
+_store_sparse = sparse_ops._add_many_sparse_to_tensors_map
+# pylint: enable=protected-access
+
 
 class _SequenceInputWrapper(object):
   """A wrapper object for storing sequence-related input.
@@ -1418,23 +1425,60 @@ def batch_sequences_with_states(input_key,
     elif input_sequences:
       # Assert that value_length is a multiple of num_unroll.
       for key, value in input_sequences.items():
-        value_length = array_ops.shape(value)[0]
-        with ops.control_dependencies([
-            control_flow_ops.Assert(
-                math_ops.logical_and(
-                    math_ops.equal(value_length % num_unroll, 0),
-                    math_ops.not_equal(value_length, 0)),
-                [
-                    string_ops.string_join([
-                        "Tensor %s first dimension should be a multiple of: " %
-                        key, string_ops.as_string(num_unroll),
-                        ", but saw value: ", string_ops.as_string(value_length),
-                        ". Consider setting pad=True."
-                    ])
-                ])
-        ]):
-          input_sequences[key] = array_ops.identity(
-              value, name="multiple_of_checked")
+        if (isinstance(value, sparse_tensor.SparseTensor) or
+            isinstance(value, sparse_tensor.SparseTensorValue)):
+          value_length = value.dense_shape[0]
+          with ops.control_dependencies([
+              control_flow_ops.Assert(
+                  math_ops.logical_and(
+                      math_ops.equal(value_length % num_unroll, 0),
+                      math_ops.not_equal(value_length, 0)),
+                  [
+                      string_ops.string_join([
+                          "SparseTensor %s first dimension should be a "
+                          "multiple of: " % key,
+                          string_ops.as_string(num_unroll),
+                          ", but saw value: ",
+                          string_ops.as_string(value_length),
+                          ". Consider setting pad=True."])])]):
+            input_sequences[key] = sparse_tensor.SparseTensor(
+                indices=value.indices,
+                values=array_ops.identity(
+                    value.values, name="multiple_of_checked"),
+                dense_shape=value.dense_shape)
+        else:
+          if not isinstance(value, ops.Tensor):
+            try:
+              value = ops.convert_to_tensor(value)
+            except TypeError:
+              raise TypeError(
+                  "Unsupported input_sequences expected Tensor or SparseTensor "
+                  "values, got: %s for key %s" % (str(type(value)), key))
+          value_length = array_ops.shape(value)[0]
+          with ops.control_dependencies([
+              control_flow_ops.Assert(
+                  math_ops.logical_and(
+                      math_ops.equal(value_length % num_unroll, 0),
+                      math_ops.not_equal(value_length, 0)),
+                  [
+                      string_ops.string_join([
+                          "Tensor %s first dimension should be a multiple "
+                          "of: " % key,
+                          string_ops.as_string(num_unroll),
+                          ", but saw value: ",
+                          string_ops.as_string(value_length),
+                          ". Consider setting pad=True."
+                      ])
+                  ])
+          ]):
+            input_sequences[key] = array_ops.identity(
+                value, name="multiple_of_checked")
+
+    # Deconstruct SparseTensors in sequence into a dense Tensor before inputting
+    # to SQSS.
+    (transformed_input_seq,
+     sparse_tensor_keys,
+     tensor_list) = _deconstruct_sparse_tensor_seq(input_sequences)
 
     # setup stateful queue reader
     stateful_reader = SequenceQueueingStateSaver(
@@ -1442,7 +1486,7 @@ def batch_sequences_with_states(input_key,
         num_unroll,
         input_length=input_length,
         input_key=input_key,
-        input_sequences=input_sequences,
+        input_sequences=transformed_input_seq,
         input_context=input_context,
         initial_states=initial_states,
         capacity=capacity,
@@ -1457,7 +1501,16 @@ def batch_sequences_with_states(input_key,
         queue_closed_exception_types=(errors.OutOfRangeError,
                                       errors.CancelledError))
     queue_runner.add_queue_runner(q_runner)
-    return stateful_reader.next_batch
+    batch = stateful_reader.next_batch
+
+    # Reconstruct SparseTensors in sequence.
+    _reconstruct_sparse_tensor_seq(
+        batch.sequences,
+        sparse_tensor_keys,
+        tensor_list,
+        batch_size,
+        num_unroll)
+    return batch
 
 
 def _padding(sequences, num_unroll):
@@ -1489,38 +1542,187 @@ def _padding(sequences, num_unroll):
 
   sequences_dict = {}
   for key, value in sequences.items():
-    sequences_dict[key] = ops.convert_to_tensor(value)
-
-  lengths = [array_ops.shape(value)[0] for value in sequences_dict.values()]
-  length = lengths[0]
-  all_lengths_equal = [
-      control_flow_ops.Assert(
-          math_ops.equal(l, length), [
-              string_ops.string_join([
-                  "All sequence lengths must match, but received lengths: ",
-                  string_ops.as_string(lengths)
-              ])
-          ]) for l in lengths
-  ]
+    if not (isinstance(value, sparse_tensor.SparseTensor) or
+            isinstance(value, sparse_tensor.SparseTensorValue)):
+      sequences_dict[key] = ops.convert_to_tensor(value)
+    else:
+      sequences_dict[key] = value
+
+  lengths = [array_ops.shape(value)[0] for value in sequences_dict.values()
+             if isinstance(value, ops.Tensor)]
+  if lengths:
+    length = lengths[0]
+    all_lengths_equal = [
+        control_flow_ops.Assert(
+            math_ops.equal(l, length), [string_ops.string_join(
+                ["All sequence lengths must match, but received lengths: ",
+                 string_ops.as_string(lengths)])])
+        for l in lengths]
+    length = control_flow_ops.with_dependencies(all_lengths_equal, length)
+  else:  # Only have SparseTensors; reduce to one int32 scalar like above.
+    length = math_ops.reduce_max(math_ops.to_int32(
+        [value.dense_shape[0] for value in sequences_dict.values()
+         if isinstance(value, sparse_tensor.SparseTensor)]))
 
-  length = control_flow_ops.with_dependencies(all_lengths_equal, length)
   unroll = array_ops.constant(num_unroll)
   padded_length = length + ((unroll - (length % unroll)) % unroll)
   padded_sequences = {}
   for key, value in sequences_dict.items():
-    # 1. create shape of paddings
-    # first dimension of value will be increased by num_paddings to
-    # padded_length
-    num_paddings = [padded_length - array_ops.shape(value)[0]]
-    # the shape of the paddings that we concat with the original value will be
-    # [num_paddings, tf.shape(value)[1], tf.shape(value)[2], ...,
-    #  tf.shape(value)[tf.rank(value) - 1])]
-    padding_shape = array_ops.concat((num_paddings, array_ops.shape(value)[1:]),
-                                     0)
-    # 2. fill padding shape with dummies
-    dummy = array_ops.constant(
-        "" if value.dtype == dtypes.string else 0, dtype=value.dtype)
-    paddings = array_ops.fill(dims=padding_shape, value=dummy)
-    # 3. concat values with paddings
-    padded_sequences[key] = array_ops.concat([value, paddings], 0)
+    if isinstance(value, ops.Tensor):
+      # 1. create shape of paddings
+      # first dimension of value will be increased by num_paddings to
+      # padded_length
+      num_paddings = [padded_length - array_ops.shape(value)[0]]
+      # the shape of the paddings that we concat with the original value will be
+      # [num_paddings, tf.shape(value)[1], tf.shape(value)[2], ...,
+      #  tf.shape(value)[tf.rank(value) - 1])]
+      padding_shape = array_ops.concat(
+          (num_paddings, array_ops.shape(value)[1:]), 0)
+      # 2. fill padding shape with dummies
+      dummy = array_ops.constant(
+          "" if value.dtype == dtypes.string else 0, dtype=value.dtype)
+      paddings = array_ops.fill(dims=padding_shape, value=dummy)
+      # 3. concat values with paddings
+      padded_sequences[key] = array_ops.concat([value, paddings], 0)
+    else:
+      padded_shape = array_ops.concat([[math_ops.to_int64(padded_length)],
+                                       value.dense_shape[1:]], 0)
+      padded_sequences[key] = sparse_tensor.SparseTensor(
+          indices=value.indices,
+          values=value.values,
+          dense_shape=padded_shape)
   return length, padded_sequences
+
+
+def _deconstruct_sparse_tensor_seq(input_sequence, shared_name=None):
+  """Converts `SparseTensor` values into `Tensors` of IDs and meta data.
+
+  Given a dict of keys -> `Tensor` or `SparseTensor` transforms the
+  `SparseTensor` values into `Tensor` values of IDs by calling `_store_sparse`.
+  The IDs are pointers into an underlying `SparseTensorsMap` that is being
+  constructed. Additional meta data is returned in order to be able to
+  reconstruct `SparseTensor` values after batching and segmenting the IDs
+  `Tensor`.
+
+  Args:
+    input_sequence: dictionary with `Tensor` or `SparseTensor` values.
+    shared_name: The shared name for the underlying `SparseTensorsMap`
+      (optional, defaults to the name of the newly created op).
+  Returns:
+    A tuple `(sequence, sparse_tensor_keys, tensor_op_list)`. `sequence` is a
+    copy of `input_sequence` whose `SparseTensor` values are replaced by the
+    values returned by `_store_sparse`; `sparse_tensor_keys` is the sorted
+    list of converted keys; `tensor_op_list` holds the `.op` of each of those
+    values. Without `SparseTensor` values: `(input_sequence, None, [])`.
+  """
+  sparse_tensor_keys = [
+      k for k in sorted(input_sequence.keys())
+      if isinstance(input_sequence[k], sparse_tensor.SparseTensor)]
+  if not sparse_tensor_keys:
+    return input_sequence, None, sparse_tensor_keys
+  sparse_tensor_list = [input_sequence[k] for k in sparse_tensor_keys]
+  tensor_list = [_store_sparse(sp_tensor, shared_name=shared_name)
+                 for sp_tensor in sparse_tensor_list]
+  transformed_input_seq = dict(input_sequence)
+  tensor_op_list = []
+  for i, k in enumerate(sparse_tensor_keys):
+    transformed_input_seq[k] = tensor_list[i]
+    tensor_op_list += [tensor_list[i].op]
+  return transformed_input_seq, sparse_tensor_keys, tensor_op_list
+
+
+def _reconstruct_sparse_tensor_seq(sequence,
+                                   sparse_tensor_keys,
+                                   tensor_op_list,
+                                   batch_size,
+                                   num_unroll):
+  """Inverse of _deconstruct_sparse_tensor_seq.
+
+  Given a dict of keys -> `Tensor` reconstructs `SparseTensor` values for keys
+  in `sparse_tensor_keys`. Their `Tensor` values are assumed to be IDs into the
+  underlying `SparseTensorsMap`. The `dense_shape` of the `SparseTensor`s is
+  `[batch_size, num_unroll, d_0, d_1, ..., d_n]` when the original
+  `SparseTensor` that got deconstructed with `_deconstruct_sparse_tensor_seq`
+  has a `dense_shape` of `[None, d_0, d_1, ..., d_n]`.
+
+  Args:
+    sequence: dictionary with only `Tensor` values that is being updated.
+    sparse_tensor_keys: list of the keys present in `sequence` identifying
+      `SparseTensor` values that should be reconstructed.
+    tensor_op_list: list of the same length as `sparse_tensor_keys` with the
+      ops passed to `_restore_sparse` as `sparse_map_op`, one per key.
+    batch_size: int or int32 scalar `Tensor`, how large minibatches should
+      be.
+    num_unroll: Python integer, how many time steps were unrolled at a time.
+  """
+  def _flatten_tensor(tensor):
+    """Flattens `Tensor` of `shape [batch_size, num_unroll]` into 1D `Tensor`.
+
+    The main use of this function is to work around the limitation of
+    `_restore_sparse` to only accept 1D handles.
+
+    Args:
+      tensor: 2D `Tensor` of `shape [batch_size, num_unroll]`
+    Returns:
+      1D `Tensor`.
+    """
+    return array_ops.reshape(tensor, [-1])
+
+  def _unflatten_sparse_tensor(sp_tensor):
+    """Recreates `[batch_size, num_unroll]` dimensions in the `SparseTensor`.
+
+    Counter-part of `_flatten_tensor` which is called on the input of
+    `_restore_sparse` while this method is called on the output of it.
+    Together they work around the limitation of `_restore_sparse` to only
+    accept 1D handles.
+
+    The `indices` in `sp_tensor` is a 2D `Tensor` of `shape [N, ndims]`, where
+    `N` is the number of `values` and `ndims` is the number of dimension in its
+    dense counterpart. Among `ndims` the first entry corresponds to the batch
+    dimension `[0, num_unroll * batch_size)` from which we need to recreate the
+    2 dimensions `batch_size` and `num_unroll`.
+
+    The reason this reconstruction works is because the output of
+    `_restore_sparse` despite being a `SparseTensor` is actually dense w.r.t.
+    that first entry.
+
+    Args:
+      sp_tensor: A SparseTensor.
+    Returns:
+      A SparseTensor with a +1 higher rank than the input.
+    """
+    idx_batch = math_ops.to_int64(
+        math_ops.floor(sp_tensor.indices[:, 0] / num_unroll))
+    idx_time = math_ops.mod(sp_tensor.indices[:, 0], num_unroll)
+    indices = array_ops.concat_v2([array_ops.expand_dims(idx_batch, 1),
+                                   array_ops.expand_dims(idx_time, 1),
+                                   sp_tensor.indices[:, 1:]], axis=1)
+    dense_shape = array_ops.concat_v2(
+        [[math_ops.to_int64(batch_size)], [num_unroll],
+         sp_tensor.dense_shape[1:]], axis=0)
+    return sparse_tensor.SparseTensor(
+        indices=indices, values=sp_tensor.values, dense_shape=dense_shape)
+
+  if not sparse_tensor_keys:
+    return
+  tensor_list = [sequence[k] for k in sparse_tensor_keys]
+  sp_tensors = [
+      _restore_sparse(sparse_map_op=i,
+                      # Flatten the 2D Tensor [batch_size, num_unroll] of
+                      # handles to a 1D Tensor.
+                      # Reconstruct the dimensions later.
+                      # TODO(b/34247140): Remove this workaround.
+                      sparse_handles=_flatten_tensor(s), rank=None)
+      for i, s in zip(tensor_op_list, tensor_list)]
+  num_unroll = ops.convert_to_tensor(num_unroll, dtype=dtypes.int64,
+                                     name="num_unroll_int64")
+
+  # Recreate the [batch_size, num_unroll] dimensions in the SparseTensors.
+  # The dense_shape will have a +1 higher rank.
+  # TODO(b/34247140): Remove this workaround.
+  sp_tensors_higher_dim = [_unflatten_sparse_tensor(s) for s in sp_tensors]
+
+  # Set values to SparseTensors for sparse_tensor_keys.
+  for i, key in enumerate(sparse_tensor_keys):
+    sequence[key] = sp_tensors_higher_dim[i]
+  return
diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc b/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc
index cb1e7577cf..096ca0f0cf 100644
--- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc
+++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc
@@ -52,7 +52,7 @@ TEST(ConvertGraphdefMemmappedFormatTest, ConvertModel) {
   test::FillFn<float>(&test_tensor2, [](int) -> float { return 3.0; });
 
   auto root = Scope::NewRootScope().ExitOnError();
-  ops::Output m = ops::MatMul(root, test_tensor1, test_tensor2);
+  Output m = ops::MatMul(root, test_tensor1, test_tensor2);
   const string result_name = m.node()->name();
 
   GraphDef graph_def;
@@ -103,7 +103,7 @@ TEST(ConvertGraphdefMemmappedFormatTest, NotSupportedTypesConvert) {
   Tensor test_tensor2(DT_STRING, kTestTensorShape);
   test::FillFn<string>(&test_tensor2, [](int) -> string { return "XYZ"; });
   auto root = Scope::NewRootScope().ExitOnError();
-  ops::Output m = ops::Add(root, test_tensor1, test_tensor2);
+  Output m = ops::Add(root, test_tensor1, test_tensor2);
   const string result_name = m.node()->name();
 
   GraphDef graph_def;
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c27cc48805..72268c8824 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -258,6 +258,7 @@ cc_library(
         "platform/net.h",
         "platform/notification.h",
         "platform/prefetch.h",
+        "platform/profile_utils/clock_cycle_profiler.h",
         "platform/profile_utils/cpu_utils.h",
         "platform/protobuf.h",
         "platform/stacktrace.h",
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 85ce9d772a..38eb283b10 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -739,8 +739,7 @@ Status DirectSession::SendInputs(const NamedTensorList& inputs,
   for (const auto& input : inputs) {
     auto it = executors_and_keys->input_keys.find(input.first);
     if (it == executors_and_keys->input_keys.end()) {
-      return errors::InvalidArgument("'", input.first,
-                                     "' is not a pre-defined feed!");
+      return errors::Internal("'", input.first, "' is not a pre-defined feed.");
     }
     const string& input_key = it->second;
 
@@ -775,9 +774,8 @@ Status DirectSession::RecvOutputs(const std::vector<string>& output_names,
     const string& output_name = output_names[output_offset];
     auto it = executors_and_keys->output_keys.find(output_name);
     if (it == executors_and_keys->output_keys.end()) {
-      return errors::InvalidArgument("'", output_name,
-                                     "' was not defined as a fetch"
-                                     " target in PRunSetup.");
+      return errors::Internal("'", output_name,
+                              "' is not a pre-defined fetch.");
     }
     const string& output_key = it->second;
     Tensor output_tensor;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
index 3aaaf87e79..b186c9d88c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
@@ -107,7 +107,7 @@ TEST_F(GpuStreamUtilTest, StreamOverrides) {
   auto root = Scope::NewRootScope().ExitOnError();
   ops::_Recv(root.WithOpName("input"), DT_FLOAT, "input", "/cpu:0", 0,
              "/gpu:0");
-  ops::Output n = ops::MatMul(root, {}, {});
+  Output n = ops::MatMul(root, {}, {});
   ops::_Send(root.WithOpName("output"), n, "output", "/gpu:0", 0, "/cpu:0");
   Graph g(OpRegistry::Global());
   TF_ASSERT_OK(root.ToGraph(&g));
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index cd4bf579c9..e2be3a6086 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -18,131 +18,10 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/constant_folding.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/optimizer_cse.h"
 
 namespace tensorflow {
-namespace {
-
-// Replaces occurrences of parallel_concat with the implementation based on
-// unsafe ops. Sets removed_any to true if any parallel_concats were removed;
-// leaves it untouched otherwise.
-// TODO(apassos) Use NodeBuilder.
-Status RemoveParallelConcat(bool* removed_any, Graph* g) {
-  gtl::InlinedVector<Node*, 2> matches;
-  for (Node* n : g->nodes()) {
-    if (n->type_string() == "ParallelConcat") {
-      matches.push_back(n);
-    }
-  }
-  for (Node* n : matches) {
-    AttrSlice n_attrs(n->def());
-    auto make_node = [n, g, &n_attrs](string op) {
-      NodeDef node;
-      node.set_op(op);
-      node.set_name(g->NewName(n->name()));
-      node.set_device(n->def().device());
-      string colo;
-      if (GetNodeAttr(n_attrs, "_class", &colo).ok()) {
-        AddNodeAttr("_class", colo, &node);
-      }
-      return node;
-    };
-    DataType dtype;
-    TF_RETURN_IF_ERROR(GetNodeAttr(n_attrs, "T", &dtype));
-    TensorShapeProto shape;
-    TF_RETURN_IF_ERROR(GetNodeAttr(n_attrs, "shape", &shape));
-    // Add the constant shape input to the start node.
-    NodeDef shape_node_def = make_node("Const");
-    AddNodeAttr("dtype", DT_INT32, &shape_node_def);
-    TensorProto shape_tensor;
-    shape_tensor.set_dtype(DT_INT32);
-    shape_tensor.mutable_tensor_shape()->add_dim()->set_size(shape.dim_size());
-    for (int i = 0; i < shape.dim_size(); ++i) {
-      shape_tensor.add_int_val(shape.dim(i).size());
-    }
-    AddNodeAttr("value", shape_tensor, &shape_node_def);
-    Status status = Status::OK();
-    Node* shape_node = g->AddNode(shape_node_def, &status);
-    if (!status.ok()) return status;
-
-    // Add the start node
-    NodeDef start_def = make_node("_ParallelConcatStart");
-    AddNodeAttr("dtype", dtype, &start_def);
-    AddNodeAttr("Tshape", DT_INT32, &start_def);
-    AddNodeAttr("init", false, &start_def);
-    start_def.add_input(shape_node_def.name());
-    Node* start = g->AddNode(start_def, &status);
-    if (!status.ok()) return status;
-    // TODO(apassos): make the shape an attr of _ParallelStackBegin.
-    g->AddEdge(shape_node, 0, start, 0);
-
-    // Add all the inplace_updates.
-    std::vector<string> control_dependencies;
-    std::vector<Node*> control_nodes;
-    int i = 0;
-    for (const Edge* input_edge : n->in_edges()) {
-      if (input_edge->IsControlEdge()) {
-        g->AddControlEdge(input_edge->src(), start);
-        continue;
-      }
-      // Constant index for the update node.
-      // TODO(apassos): make _ParallelStackUpdate take this as an attr.
-      NodeDef update_idx_def = make_node("Const");
-      AddNodeAttr("dtype", DT_INT64, &update_idx_def);
-      TensorProto index_tensor;
-      index_tensor.set_dtype(DT_INT64);
-      index_tensor.mutable_tensor_shape()->add_dim()->set_size(1);
-      index_tensor.add_int64_val(i);
-      AddNodeAttr("value", index_tensor, &update_idx_def);
-      Node* index = g->AddNode(update_idx_def, &status);
-      if (!status.ok()) return status;
-
-      NodeDef update_def = make_node("_ParallelConcatUpdate");
-      control_dependencies.push_back(update_def.name());
-      AddNodeAttr("T", dtype, &update_def);
-      AddNodeAttr("Tshape", DT_INT64, &update_def);
-      update_def.add_input(start_def.name());
-      update_def.add_input(update_idx_def.name());
-      update_def.add_input(strings::StrCat(input_edge->src()->name(), ":",
-                                           input_edge->src_output()));
-      Node* update = g->AddNode(update_def, &status);
-      if (!status.ok()) return status;
-      g->AddEdge(start, 0, update, 0);
-      g->AddEdge(index, 0, update, 1);
-      g->AddEdge(input_edge->src(), input_edge->src_output(), update, 2);
-      control_nodes.push_back(update);
-
-      ++i;
-    }
-
-    // Add the final identity.
-    NodeDef identity_def = make_node("Identity");
-    AddNodeAttr("T", dtype, &identity_def);
-    identity_def.add_input(start_def.name());
-    for (const string& s : control_dependencies) {
-      identity_def.add_input(strings::StrCat("^", s));
-    }
-    Node* identity_node = g->AddNode(identity_def, &status);
-    if (!status.ok()) return status;
-    g->AddEdge(start, 0, identity_node, 0);
-    for (Node* inp : control_nodes) {
-      g->AddControlEdge(inp, identity_node);
-    }
-
-    // Remove the node and redirect edges.
-    for (auto* e : n->out_edges()) {
-      if (e->IsControlEdge()) {
-        g->AddControlEdge(identity_node, e->dst());
-      } else {
-        g->AddEdge(identity_node, 0, e->dst(), e->dst_input());
-      }
-    }
-    g->RemoveNode(n);
-    *removed_any = true;
-  }
-  return Status::OK();
-}
-}
 
 GraphOptimizer::GraphOptimizer(const OptimizerOptions& opts) : opts_(opts) {
   if (opts_.opt_level() >= OptimizerOptions::L1) {
@@ -166,11 +45,6 @@ void GraphOptimizer::Optimize(FunctionLibraryRuntime* runtime, Env* env,
       DumpGraph("RemoveListArrayConverter", g);
       changed = true;
     }
-    auto s = RemoveParallelConcat(&changed, g);
-    if (!s.ok()) {
-      // TODO(apassos): figure out how to halt here.
-      LOG(WARNING) << s;
-    }
     if (opts_.do_function_inlining() && RemoveDeadNodes(g)) {
       DumpGraph("RemoveDeadNodes", g);
       changed = true;
diff --git a/tensorflow/core/common_runtime/parallel_concat_optimizer.cc b/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
new file mode 100644
index 0000000000..ffbfbc74f1
--- /dev/null
+++ b/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
@@ -0,0 +1,126 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/graph_optimizer.h"
+
+#include "tensorflow/core/common_runtime/constant_folding.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/optimizer_cse.h"
+
+namespace tensorflow {
+namespace {
+
+// Graph optimization pass that replaces every ParallelConcat node with an
+// equivalent subgraph built from the internal _ParallelConcatStart and
+// _ParallelConcatUpdate ops, finished by an Identity named like the original.
+class ParallelConcatRemovePass : public GraphOptimizationPass {
+ public:
+  Status Run(const GraphOptimizationPassOptions& options) override {
+    if (options.graph == nullptr) {
+      // TODO(apassos) returning OK feels weird here as we can't do anything
+      // without a graph, but some tests require this.
+      return Status::OK();
+    }
+    Graph* g = options.graph->get();
+    if (g == nullptr) {
+      return errors::Internal(
+          "Parallel concat removal should happen before partitioning and a "
+          "graph should be available.");
+    }
+    gtl::InlinedVector<Node*, 2> matches;
+    for (Node* n : g->nodes()) {
+      if (n->type_string() == "ParallelConcat") {
+        matches.push_back(n);
+      }
+    }
+    for (Node* n : matches) {
+      AttrSlice n_attrs(n->def());
+      auto base_make_node = [n, &n_attrs](const string& op,
+                                          const string& name) {
+        NodeBuilder node_builder(name, op);
+        node_builder.Device(n->def().device());
+        string colo;
+        if (GetNodeAttr(n_attrs, "_class", &colo).ok()) {
+          node_builder.Attr("_class", colo);
+        }
+        return node_builder;
+      };
+      auto make_node = [n, g, &base_make_node](const string& op) {
+        return base_make_node(
+            op, g->NewName(strings::StrCat(n->name(), "/Internal")));
+      };
+      DataType dtype;
+      TF_RETURN_IF_ERROR(GetNodeAttr(n_attrs, "T", &dtype));
+      TensorShapeProto shape;
+      TF_RETURN_IF_ERROR(GetNodeAttr(n_attrs, "shape", &shape));
+
+      // Add the start node
+      Node* start;
+      TF_RETURN_IF_ERROR(make_node("_ParallelConcatStart")
+                             .Attr("shape", shape)
+                             .Attr("dtype", dtype)
+                             .Finalize(g, &start));
+
+      // Add all the inplace_updates. in_edges() does not promise input
+      // order, so the slot each input writes to is taken from the edge's
+      // dst_input() rather than from the iteration order.
+      std::vector<Node*> control_nodes;
+      for (const Edge* input_edge : n->in_edges()) {
+        if (input_edge->IsControlEdge()) {
+          g->AddControlEdge(input_edge->src(), start);
+          continue;
+        }
+
+        Node* update;
+        const int64 loc = input_edge->dst_input();
+        TF_RETURN_IF_ERROR(
+            make_node("_ParallelConcatUpdate")
+                .Attr("loc", loc)
+                .Input(start)
+                .Input(input_edge->src(), input_edge->src_output())
+                .Finalize(g, &update));
+        control_nodes.push_back(update);
+      }
+
+      // Add the final identity.
+      NodeBuilder identity_def = base_make_node("Identity", n->name());
+      identity_def.Input(start, 0);
+      for (Node* s : control_nodes) {
+        identity_def.ControlInput(s);
+      }
+      Node* identity_node;
+      TF_RETURN_IF_ERROR(identity_def.Finalize(g, &identity_node));
+
+      // Remove the node and redirect edges.
+      for (auto* e : n->out_edges()) {
+        if (e->IsControlEdge()) {
+          g->AddControlEdge(identity_node, e->dst());
+        } else {
+          g->AddEdge(identity_node, 0, e->dst(), e->dst_input());
+        }
+      }
+      g->RemoveNode(n);
+    }
+    return Status::OK();
+  }
+};
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 0,
+                      ParallelConcatRemovePass);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index 420594d98a..f7d5a9cfc9 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -492,7 +492,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_Shape) {
     TF_ASSERT_OK(
         NodeBuilder("in", pass == 0 ? "WithPartialShape" : "WithUnknownShape")
             .Finalize(root.graph(), &input));
-    auto shape = ops::Shape(root, ops::Output(input));
+    auto shape = ops::Shape(root, Output(input));
     Node* result;
     TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt32")
                      .Input(shape.node())
@@ -518,12 +518,13 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt32) {
   TF_ASSERT_OK(NodeBuilder("in", "NonConstScalarInt32")
                    .Finalize(root.graph(), &scalar_non_const));
 
-  ops::InputList inputs{
-      ops::Input(ops::Const<int32>(root, 10)),
-      ops::Input(ops::Const<int32>(root, 20)),
-      ops::Input(ops::Output(scalar_non_const)),
-      ops::Input(ops::Const<int32>(root, 40)),
-  };
+  InputList inputs{
+      // clang-format off
+      Input(ops::Const<int32>(root, 10)),
+      Input(ops::Const<int32>(root, 20)),
+      Input(Output(scalar_non_const)),
+      Input(ops::Const<int32>(root, 40)),
+  };  // clang-format on
   auto pack = ops::Pack(root, inputs);
   TF_ASSERT_OK(root.status());
 
@@ -549,12 +550,13 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt64) {
   TF_ASSERT_OK(NodeBuilder("in", "NonConstScalarInt64")
                    .Finalize(root.graph(), &scalar_non_const));
 
-  ops::InputList inputs{
-      ops::Input(ops::Const<int64>(root, 10LL)),
-      ops::Input(ops::Const<int64>(root, 20LL)),
-      ops::Input(ops::Output(scalar_non_const)),
-      ops::Input(ops::Const<int64>(root, 1LL << 40)),
-  };
+  InputList inputs{
+      // clang-format off
+      Input(ops::Const<int64>(root, 10LL)),
+      Input(ops::Const<int64>(root, 20LL)),
+      Input(Output(scalar_non_const)),
+      Input(ops::Const<int64>(root, 1LL << 40)),
+  };  // clang-format on
   auto pack = ops::Pack(root, inputs);
   TF_ASSERT_OK(root.status());
 
@@ -577,9 +579,9 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt64) {
 TEST(ShapeRefinerTest, ConstantValueAsShape_PackUnknownDim) {
   Scope root = Scope::NewRootScope();
 
-  ops::InputList inputs{
-      ops::Input(ops::Const<int64>(root, 10LL)),
-      ops::Input(ops::Const<int64>(root, -1LL)),
+  InputList inputs{
+      Input(ops::Const<int64>(root, 10LL)),
+      Input(ops::Const<int64>(root, -1LL)),
   };
   auto pack = ops::Pack(root, inputs);
   TF_ASSERT_OK(root.status());
@@ -604,9 +606,9 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackInvalidInput) {
   Scope root = Scope::NewRootScope();
 
   // Inputs are length 2 vectors instead of scalars.
-  ops::InputList inputs{
-      ops::Input(ops::Const<int64>(root, {10LL, 20LL})),
-      ops::Input(ops::Const<int64>(root, {10LL, 21LL})),
+  InputList inputs{
+      Input(ops::Const<int64>(root, {10LL, 20LL})),
+      Input(ops::Const<int64>(root, {10LL, 21LL})),
   };
   auto pack = ops::Pack(root, inputs);
   TF_ASSERT_OK(root.status());
@@ -633,10 +635,12 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_Concat) {
   TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape").Finalize(g, &partial_1));
   TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape2").Finalize(g, &partial_2));
   auto const_input = ops::Const(root, {9, 10, 11});
-  ops::OutputList concat_inputs{
-      ops::Shape(root, ops::Output(partial_1)),
-      ops::Shape(root, ops::Output(partial_2)), const_input,
-  };
+  OutputList concat_inputs{
+      // clang-format off
+      ops::Shape(root, Output(partial_1)),
+      ops::Shape(root, Output(partial_2)),
+      const_input,
+  };  // clang-format on
   auto concat_dim = ops::Const(root, 0);
   auto concat = ops::Concat(root, concat_dim, concat_inputs);
   TF_ASSERT_OK(root.status());
@@ -673,11 +677,12 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatWithUnknown) {
   TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape").Finalize(g, &partial_1));
   TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape2").Finalize(g, &partial_2));
   TF_ASSERT_OK(NodeBuilder("in", "WithUnknownShape").Finalize(g, &unknown));
-  ops::OutputList concat_inputs{
-      ops::Shape(root, ops::Output(partial_1)),
-      ops::Shape(root, ops::Output(partial_2)),
-      ops::Shape(root, ops::Output(unknown)),
-  };
+  OutputList concat_inputs{
+      // clang-format off
+      ops::Shape(root, Output(partial_1)),
+      ops::Shape(root, Output(partial_2)),
+      ops::Shape(root, Output(unknown)),
+  };  // clang-format on
   auto concat_dim = ops::Const(root, 0);
   auto concat = ops::Concat(root, concat_dim, concat_inputs);
   TF_ASSERT_OK(root.status());
@@ -714,11 +719,12 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatInvalidDimValue) {
   TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape").Finalize(g, &partial_1));
   TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape2").Finalize(g, &partial_2));
   auto const_input = ops::Const(root, {9, -2, 11});
-  ops::OutputList concat_inputs{
-      ops::Shape(root, ops::Output(partial_1)),
-      ops::Shape(root, ops::Output(partial_2)),  //
+  OutputList concat_inputs{
+      // clang-format off
+      ops::Shape(root, Output(partial_1)),
+      ops::Shape(root, Output(partial_2)),
       const_input,
-  };
+  };  // clang-format on
   auto concat_dim = ops::Const(root, 0);
   auto concat = ops::Concat(root, concat_dim, concat_inputs);
   TF_ASSERT_OK(root.status());
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 73018ec258..e267414654 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -392,7 +392,7 @@ tf_cuda_cc_test(
     name = "rpcbench_test",
     size = "small",
     srcs = ["rpcbench_test.cc"],
-    linkstatic = tf_kernel_tests_linkstatic(),
+    linkstatic = 1,
     tags = tf_cuda_tests_tags(),
     deps = [
         "//tensorflow/cc:cc_ops",
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index d155051273..44646e9241 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -67,6 +67,7 @@ class GraphConstructor {
                      : in.prefix + "/"),
           input_map(in.input_map),
           control_dependencies(in.control_dependencies),
+          return_tensors(in.return_tensors),
           importing(true) {}
 
     bool allow_internal_ops;
@@ -75,6 +76,7 @@ class GraphConstructor {
     string prefix;
     std::map<TensorId, TensorId> input_map;
     std::vector<string> control_dependencies;
+    std::vector<TensorId> return_tensors;
 
     // TODO(ashankar): This bool exists to separate out functionality required
     // to make ImportGraphDef a close equivalent of Python's import_graph_def
@@ -88,11 +90,12 @@ class GraphConstructor {
   };
 
   static Status Construct(const Options& opts, const GraphDef* gdef, Graph* g,
-                          ShapeRefiner* refiner) {
+                          ShapeRefiner* refiner,
+                          std::vector<std::pair<Node*, int>>* return_tensors) {
     TF_RETURN_IF_ERROR(CheckVersions(gdef->versions(), TF_GRAPH_DEF_VERSION,
                                      TF_GRAPH_DEF_VERSION_MIN_PRODUCER,
                                      "GraphDef", "graph"));
-    GraphConstructor c(opts, gdef, g, refiner);
+    GraphConstructor c(opts, gdef, g, refiner, return_tensors);
     const Status s = c.TryImport();
     if (!s.ok()) c.Undo();
     return s;
@@ -100,12 +103,14 @@ class GraphConstructor {
 
  private:
   GraphConstructor(const Options& opts, const GraphDef* gdef, Graph* g,
-                   ShapeRefiner* refiner)
+                   ShapeRefiner* refiner,
+                   std::vector<std::pair<Node*, int>>* return_tensors)
       : opts_(opts),
         gdef_(gdef),
         g_(g),
         original_versions_(g->versions()),
-        refiner_(refiner) {}
+        refiner_(refiner),
+        return_tensors_(return_tensors) {}
 
   Status TryImport() {
     TF_RETURN_IF_ERROR(EnsureNoNameCollisions());
@@ -115,6 +120,7 @@ class GraphConstructor {
     TF_RETURN_IF_ERROR(Convert());
     TF_RETURN_IF_ERROR(AddBackEdges());
     TF_RETURN_IF_ERROR(UpdateVersionDef());
+    TF_RETURN_IF_ERROR(PopulateReturnTensors());
     FixupSourceAndSinkEdges(g_);
     return Status::OK();
   }
@@ -126,6 +132,7 @@ class GraphConstructor {
   Status Convert();
   Status AddBackEdges();
   Status UpdateVersionDef();
+  Status PopulateReturnTensors();
 
   void Undo();
 
@@ -156,6 +163,9 @@ class GraphConstructor {
 
   ShapeRefiner* refiner_;
 
+  // May be null. Not owned.
+  std::vector<std::pair<Node*, int>>* return_tensors_;
+
   // Mapping from node name to the index within gdef_
   struct NodeInfo {
     explicit NodeInfo(int i) : gdef_index(i), node(nullptr) {}
@@ -752,6 +762,36 @@ Status GraphConstructor::UpdateVersionDef() {
   return Status::OK();
 }
 
+Status GraphConstructor::PopulateReturnTensors() {
+  // Resolves every requested TensorId, in order, to a (node, output) pair.
+  for (const TensorId& id : opts_.return_tensors) {
+    const auto remap_it = opts_.input_map.find(id);
+    if (remap_it != opts_.input_map.end()) {
+      // id was remapped, so hand back the pre-existing tensor in g_ instead.
+      const TensorId& target = remap_it->second;
+      DCHECK_GT(existing_nodes_.count(target.first), 0);
+      Node* existing = existing_nodes_[target.first];
+      return_tensors_->push_back({existing, target.second});
+      continue;
+    }
+    // Otherwise id must name one of the nodes imported from gdef_.
+    const auto node_it = gdef_nodes_.find(id.first);
+    if (node_it == gdef_nodes_.end()) {
+      return errors::InvalidArgument(
+          "Requested return node '", id.first, "' not found in graph def");
+    }
+    const int num_outputs = node_it->second.node->num_outputs();
+    const bool is_data_output = id.second >= 0 && id.second < num_outputs;
+    if (!is_data_output && id.second != Graph::kControlSlot) {
+      return errors::InvalidArgument(
+          "Invalid return output ", id.second, " of node '", id.first,
+          "', which has ", num_outputs, " outputs");
+    }
+    return_tensors_->push_back({node_it->second.node, id.second});
+  }
+  return Status::OK();
+}
+
 void GraphConstructor::Undo() {
   for (const auto& iter : gdef_nodes_) {
     if (iter.second.node != nullptr) {
@@ -780,16 +820,30 @@ Status GraphConstructor::MakeEdge(Node* src, int output_index, Node* dst,
 Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts,
                               const GraphDef& gdef, Graph* g) {
   ShapeRefiner refiner(g->op_registry());
-  return GraphConstructor::Construct(opts, &gdef, g, &refiner);
+  return GraphConstructor::Construct(opts, &gdef, g, &refiner, nullptr);
 }
 
 Status ImportGraphDef(const ImportGraphDefOptions& opts, const GraphDef& gdef,
-                      Graph* g, ShapeRefiner* refiner) {
+                      Graph* g, ShapeRefiner* refiner,
+                      std::vector<std::pair<Node*, int>>* return_tensors) {
   ShapeRefiner default_refiner(g->op_registry());
   if (refiner == nullptr) {
     refiner = &default_refiner;
   }
-  return GraphConstructor::Construct(opts, &gdef, g, refiner);
+
+  if (!opts.return_tensors.empty()) {
+    if (return_tensors == nullptr) {
+      return errors::InvalidArgument(
+          "return_tensors argument to ImportNodeDef() must be non-null if "
+          "opts.return_tensors is non-empty");
+    }
+    if (!return_tensors->empty()) {
+      return errors::InvalidArgument(
+          "return_tensors argument to ImportNodeDef() should be empty (has "
+          "size ", return_tensors->size(), ")");
+    }
+  }
+  return GraphConstructor::Construct(opts, &gdef, g, refiner, return_tensors);
 }
 
 void CopyGraph(const Graph& src, Graph* dest) {
diff --git a/tensorflow/core/graph/graph_constructor.h b/tensorflow/core/graph/graph_constructor.h
index 61704913c3..186859d132 100644
--- a/tensorflow/core/graph/graph_constructor.h
+++ b/tensorflow/core/graph/graph_constructor.h
@@ -97,14 +97,31 @@ struct ImportGraphDefOptions {
   // other nodes in `gdef`.
   std::vector<string> control_dependencies;
 
+  // Tensors in `gdef` that will be returned via the `return_tensors` output
+  // parameter of `ImportGraphDef()`. If this list is non-empty, the caller must
+  // pass an empty vector to `ImportGraphDef()`. The vector will be populated
+  // with the imported nodes in `g`.
+  //
+  // Entries should not include `prefix`, i.e., each TensorId's name should be
+  // the name as it originally appears in `gdef`.
+  //
+  // If this contains a tensor that's also being remapped via `input_map`, the
+  // corresponding existing tensor in `g` will be returned.
+  std::vector<TensorId> return_tensors;
+
   // TODO(ashankar): Enable handling of GraphDefs produced by newer binaries
   // with ops that are not defined in the binary calling ImportGraphDef.
   // Similar to the producer_op_list argument to import_graph_def in the
   // python API.
 };
-extern Status ImportGraphDef(const ImportGraphDefOptions& opts,
-                             const GraphDef& gdef, Graph* g,
-                             ShapeRefiner* refiner);
+
+// Each `return_tensors` entry is the requested node and output index. The index
+// is included in case the returned tensor has been remapped according to
+// `input_map`.
+extern Status ImportGraphDef(
+    const ImportGraphDefOptions& opts, const GraphDef& gdef, Graph* g,
+    ShapeRefiner* refiner,
+    std::vector<std::pair<Node*, int>>* return_tensors = nullptr);
 
 // Make a copy of "src" into "*dest".
 //
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index a173d3a627..9ce7a0fdf8 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -65,14 +65,17 @@ class GraphConstructorTest : public ::testing::Test {
     EXPECT_EQ(original_graph_description, GraphDebugString());
   }
 
-  void ExpectError(const string& gdef_ascii, const ImportGraphDefOptions& opts,
-                   const std::vector<string>& expected_error_strs,
-                   ShapeRefiner* refiner = nullptr) {
+  void ExpectError(
+      const string& gdef_ascii, const ImportGraphDefOptions& opts,
+      const std::vector<string>& expected_error_strs,
+      ShapeRefiner* refiner = nullptr,
+      std::vector<std::pair<Node*, int>>* return_tensors = nullptr) {
     // Used to verify that errors don't change graph
     const string original_graph_description = GraphDebugString();
 
     Convert(gdef_ascii);
-    Status status = ImportGraphDef(opts, gdef_, &graph_, refiner);
+    Status status =
+        ImportGraphDef(opts, gdef_, &graph_, refiner, return_tensors);
     EXPECT_FALSE(status.ok());
 
     for (const string& error : expected_error_strs) {
@@ -90,9 +93,10 @@ class GraphConstructorTest : public ::testing::Test {
   }
 
   void ExpectOK(const string& gdef_ascii, const ImportGraphDefOptions& opts,
-                ShapeRefiner* refiner = nullptr) {
+                ShapeRefiner* refiner = nullptr,
+                std::vector<std::pair<Node*, int>>* return_tensors = nullptr) {
     Convert(gdef_ascii);
-    Status s = ImportGraphDef(opts, gdef_, &graph_, refiner);
+    Status s = ImportGraphDef(opts, gdef_, &graph_, refiner, return_tensors);
     EXPECT_EQ(Status::OK(), s) << s;
   }
 
@@ -981,6 +985,104 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapDuplicateNodeNames) {
       &refiner);
 }
 
+TEST_F(GraphConstructorTest, ImportGraphDef_ReturnTensors) {
+  ShapeRefiner refiner(graph_.op_registry());
+
+  ImportGraphDefOptions opts;
+  opts.return_tensors.push_back({"input", 1});
+  opts.return_tensors.push_back({"t1", 0});
+  opts.return_tensors.push_back({"input", 0});
+  std::vector<std::pair<Node*, int>> return_tensors;
+  ExpectOK(
+      "node { name: 'input' op: 'TestInput' }"
+      "node { name: 't1' op: 'TestMul' input: ['input:0', 'input:1'] }",
+      opts, &refiner, &return_tensors);
+
+  // Sanity checks
+  EXPECT_TRUE(HasNode("input"));
+  EXPECT_TRUE(HasNode("t1"));
+  EXPECT_TRUE(HasEdge("input", 0, "t1", 0));
+  EXPECT_TRUE(HasEdge("input", 1, "t1", 1));
+
+  // Check return tensors
+  ASSERT_EQ(return_tensors.size(), 3);
+  EXPECT_EQ(return_tensors[0].first->name(), "input");
+  EXPECT_EQ(return_tensors[0].second, 1);
+  EXPECT_EQ(return_tensors[1].first->name(), "t1");
+  EXPECT_EQ(return_tensors[1].second, 0);
+  EXPECT_EQ(return_tensors[2].first->name(), "input");
+  EXPECT_EQ(return_tensors[2].second, 0);
+
+  // Test using prefix and returning element from input_map
+  opts.return_tensors.clear();
+  return_tensors.clear();
+  opts.prefix = "import";
+  opts.input_map[{"new_input", 1}] = {"input", 0};
+  opts.return_tensors.push_back({"new_input", 0});
+  opts.return_tensors.push_back({"new_input", 1});
+  ExpectOK("node { name: 'new_input' op: 'TestInput' }", opts, &refiner,
+           &return_tensors);
+
+  EXPECT_TRUE(HasNode("import/new_input"));
+
+  ASSERT_EQ(return_tensors.size(), 2);
+  EXPECT_EQ(return_tensors[0].first->name(), "import/new_input");
+  EXPECT_EQ(return_tensors[0].second, 0);
+  EXPECT_EQ(return_tensors[1].first->name(), "input");
+  EXPECT_EQ(return_tensors[1].second, 0);
+
+  // Test returning node remapped to source node
+  opts.prefix.clear();
+  opts.input_map.clear();
+  opts.return_tensors.clear();
+  return_tensors.clear();
+  opts.input_map[{"new_input", 0}] = {"_SOURCE", 0};
+  opts.return_tensors.push_back({"new_input", 0});
+  ExpectOK("node { name: 'new_input' op: 'TestInput' }", opts, &refiner,
+           &return_tensors);
+
+  EXPECT_TRUE(HasNode("new_input"));
+
+  ASSERT_EQ(return_tensors.size(), 1);
+  EXPECT_EQ(return_tensors[0].first->name(), "_SOURCE");
+  EXPECT_EQ(return_tensors[0].second, 0);
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef_ReturnTensorsErrors) {
+  // Passing in return_tensors with empty opts.return_tensors is OK
+  ImportGraphDefOptions opts;
+  std::vector<std::pair<Node*, int>> return_tensors;
+  ExpectOK("node { name: 'input' op: 'TestInput' }", opts, nullptr,
+           &return_tensors);
+
+  // Null return_tensors with non-empty opts.return_tensors
+  opts.return_tensors.push_back({"new_input", 0});
+  ExpectError("node { name: 'new_input' op: 'TestInput' }", opts,
+              {"return_tensors argument to ImportNodeDef() must be non-null "
+               "if opts.return_tensors is non-empty"});
+
+  // Non-empty return_tensors
+  return_tensors.push_back({nullptr, 0});
+  ExpectError("node { name: 'new_input' op: 'TestInput' }", opts,
+              {"return_tensors argument to ImportNodeDef() should be empty "
+               "(has size 1)"},
+              nullptr, &return_tensors);
+
+  // Requesting tensor that isn't in graph def
+  return_tensors.clear();
+  ExpectError("node { name: 'W1' op: 'TestParams' }", opts,
+              {"Requested return node 'new_input' not found in graph def"},
+              nullptr, &return_tensors);
+
+  // Requesting invalid node index
+  opts.return_tensors.clear();
+  opts.return_tensors.push_back({"new_input", 2});
+  ExpectError("node { name: 'new_input' op: 'TestInput' }", opts,
+              {"Invalid return output 2 of node 'new_input', which has 2 "
+               "outputs"},
+              nullptr, &return_tensors);
+}
+
 TEST_F(GraphConstructorTest, ImportGraphDef_WithCycle) {
   // Test graph produced in python using:
   /*
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index d8322e6077..6d3dbc0abb 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -128,13 +128,13 @@ void CheckLoopConstruction(const GraphDef& graph_def) {
   }
 }
 
-REGISTER_OP("Input").Output("o: float");
+REGISTER_OP("FloatInput").Output("o: float");
 REGISTER_OP("BoolInput").Output("o: bool");
 REGISTER_OP("Combine").Input("a: float").Input("b: float").Output("o: float");
 
-ops::Output ConstructOp(const Scope& scope, const string& op_type,
-                        const gtl::ArraySlice<ops::Input>& inputs) {
-  if (!scope.ok()) return ops::Output();
+Output ConstructOp(const Scope& scope, const string& op_type,
+                   const gtl::ArraySlice<Input>& inputs) {
+  if (!scope.ok()) return Output();
   const string unique_name = scope.GetUniqueNameForOp(op_type);
   auto builder = NodeBuilder(unique_name, op_type);
   for (auto const& input : inputs) {
@@ -143,19 +143,19 @@ ops::Output ConstructOp(const Scope& scope, const string& op_type,
   scope.UpdateBuilder(&builder);
   Node* ret;
   scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
-  if (!scope.ok()) return ops::Output();
-  return ops::Output(ret);
+  if (!scope.ok()) return Output();
+  return Output(ret);
 }
 
-ops::Output Input(const Scope& scope) {
-  return ConstructOp(scope, "Input", {});
+Output FloatInput(const Scope& scope) {
+  return ConstructOp(scope, "FloatInput", {});
 }
 
-ops::Output BoolInput(const Scope& scope) {
+Output BoolInput(const Scope& scope) {
   return ConstructOp(scope, "BoolInput", {});
 }
 
-ops::Output Combine(const Scope& scope, ops::Input a, ops::Input b) {
+Output Combine(const Scope& scope, Input a, Input b) {
   return ConstructOp(scope, "Combine", {a, b});
 }
 
@@ -196,21 +196,21 @@ class GraphPartitionTest : public ::testing::Test {
 
 TEST_F(GraphPartitionTest, SingleDevice) {
   using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-  auto a1 = Input(in_.WithOpName("A1"));
+  auto a1 = FloatInput(in_.WithOpName("A1"));
   Combine(in_.WithOpName("A2"), a1, a1);
 
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(1, partitions_.size());
 
-  a1 = Input(scope_a_.WithOpName("A1"));
+  a1 = FloatInput(scope_a_.WithOpName("A1"));
   Combine(scope_a_.WithOpName("A2"), a1, a1);
   ExpectMatchA();
 }
 
 TEST_F(GraphPartitionTest, CrossDeviceData) {
   using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-  auto a1 = Input(in_.WithOpName("A1"));
-  auto b1 = Input(in_.WithOpName("B1"));
+  auto a1 = FloatInput(in_.WithOpName("A1"));
+  auto b1 = FloatInput(in_.WithOpName("B1"));
   Combine(in_.WithOpName("B2"), a1, b1);
 
   Partition(ToGraphDef(), &partitions_);
@@ -218,11 +218,11 @@ TEST_F(GraphPartitionTest, CrossDeviceData) {
 
   string a = "/job:a/replica:0/task:0/cpu:0";
   string b = "/job:a/replica:0/task:0/cpu:1";
-  a1 = Input(scope_a_.WithOpName("A1"));
+  a1 = FloatInput(scope_a_.WithOpName("A1"));
   _Send(scope_a_.WithOpName("A1/_0"), a1, "edge_1_A1", a, 82, b);
   ExpectMatchA();
 
-  b1 = Input(scope_b_.WithOpName("B1"));
+  b1 = FloatInput(scope_b_.WithOpName("B1"));
   auto recv =
       _Recv(scope_b_.WithOpName("A1/_1"), DT_FLOAT, "edge_1_A1", a, 82, b);
   Combine(scope_b_.WithOpName("B2"), recv, b1);
@@ -231,8 +231,8 @@ TEST_F(GraphPartitionTest, CrossDeviceData) {
 
 TEST_F(GraphPartitionTest, CrossDeviceControl) {
   using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-  auto a1 = Input(in_.WithOpName("A1"));
-  auto b1 = Input(in_.WithOpName("B1"));
+  auto a1 = FloatInput(in_.WithOpName("A1"));
+  auto b1 = FloatInput(in_.WithOpName("B1"));
   Combine(in_.WithOpName("B2").WithControlDependencies(a1), b1, b1);
 
   Partition(ToGraphDef(), &partitions_);
@@ -240,7 +240,7 @@ TEST_F(GraphPartitionTest, CrossDeviceControl) {
 
   string a = "/job:a/replica:0/task:0/cpu:0";
   string b = "/job:a/replica:0/task:0/cpu:1";
-  a1 = Input(scope_a_.WithOpName("A1"));
+  a1 = FloatInput(scope_a_.WithOpName("A1"));
   auto c = Const(scope_a_.WithOpName("A1/_0").WithControlDependencies(a1), {});
   _Send(scope_a_.WithOpName("A1/_1"), c, "edge_3_A1", a, 82, b);
   ExpectMatchA();
@@ -248,15 +248,15 @@ TEST_F(GraphPartitionTest, CrossDeviceControl) {
   auto recv =
       _Recv(scope_b_.WithOpName("A1/_2"), DT_FLOAT, "edge_3_A1", a, 82, b);
   auto id = Identity(scope_b_.WithOpName("A1/_3"), recv);
-  b1 = Input(scope_b_.WithOpName("B1"));
+  b1 = FloatInput(scope_b_.WithOpName("B1"));
   Combine(scope_b_.WithOpName("B2").WithControlDependencies(id), b1, b1);
   ExpectMatchB();
 }
 
 TEST_F(GraphPartitionTest, CrossDeviceData_MultiUse) {
   using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-  auto a1 = Input(in_.WithOpName("A1"));
-  auto b1 = Input(in_.WithOpName("B1"));
+  auto a1 = FloatInput(in_.WithOpName("A1"));
+  auto b1 = FloatInput(in_.WithOpName("B1"));
   Combine(in_.WithOpName("B2"), a1, b1);
   Combine(in_.WithOpName("B3"), a1, a1);
 
@@ -265,13 +265,13 @@ TEST_F(GraphPartitionTest, CrossDeviceData_MultiUse) {
 
   string a = "/job:a/replica:0/task:0/cpu:0";
   string b = "/job:a/replica:0/task:0/cpu:1";
-  a1 = Input(scope_a_.WithOpName("A1"));
+  a1 = FloatInput(scope_a_.WithOpName("A1"));
   _Send(scope_a_.WithOpName("A1/_0"), a1, "edge_1_A1", a, 82, b);
   ExpectMatchA();
 
   auto recv =
       _Recv(scope_b_.WithOpName("A1/_1"), DT_FLOAT, "edge_1_A1", a, 82, b);
-  b1 = Input(scope_b_.WithOpName("B1"));
+  b1 = FloatInput(scope_b_.WithOpName("B1"));
   Combine(scope_b_.WithOpName("B2"), recv, b1);
   Combine(scope_b_.WithOpName("B3"), recv, recv);
   ExpectMatchB();
@@ -279,17 +279,17 @@ TEST_F(GraphPartitionTest, CrossDeviceData_MultiUse) {
 
 TEST_F(GraphPartitionTest, CrossDeviceControl_MultiUse) {
   using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-  auto a1 = Input(in_.WithOpName("A1"));
-  auto b1 = Input(in_.WithOpName("B1"));
+  auto a1 = FloatInput(in_.WithOpName("A1"));
+  auto b1 = FloatInput(in_.WithOpName("B1"));
   Combine(in_.WithOpName("B2").WithControlDependencies(a1), b1, b1);
-  Input(in_.WithOpName("B3").WithControlDependencies(a1));
+  FloatInput(in_.WithOpName("B3").WithControlDependencies(a1));
 
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(2, partitions_.size());
 
   string a = "/job:a/replica:0/task:0/cpu:0";
   string b = "/job:a/replica:0/task:0/cpu:1";
-  a1 = Input(scope_a_.WithOpName("A1"));
+  a1 = FloatInput(scope_a_.WithOpName("A1"));
   auto c = Const(scope_a_.WithOpName("A1/_0").WithControlDependencies(a1), {});
   _Send(scope_a_.WithOpName("A1/_1"), c, "edge_1_A1", a, 82, b);
   ExpectMatchA();
@@ -297,25 +297,25 @@ TEST_F(GraphPartitionTest, CrossDeviceControl_MultiUse) {
   auto recv =
       _Recv(scope_b_.WithOpName("A1/_2"), DT_FLOAT, "edge_1_A1", a, 82, b);
   auto id = Identity(scope_b_.WithOpName("A1/_3"), recv);
-  b1 = Input(scope_b_.WithOpName("B1"));
+  b1 = FloatInput(scope_b_.WithOpName("B1"));
   Combine(scope_b_.WithOpName("B2").WithControlDependencies(id), b1, b1);
-  Input(scope_b_.WithOpName("B3").WithControlDependencies(id));
+  FloatInput(scope_b_.WithOpName("B3").WithControlDependencies(id));
   ExpectMatchB();
 }
 
 TEST_F(GraphPartitionTest, CrossDevice_DataControl) {
   using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-  auto a1 = Input(in_.WithOpName("A1"));
-  auto b1 = Input(in_.WithOpName("B1"));
+  auto a1 = FloatInput(in_.WithOpName("A1"));
+  auto b1 = FloatInput(in_.WithOpName("B1"));
   Combine(in_.WithOpName("B2"), a1, b1);
-  Input(in_.WithOpName("B3").WithControlDependencies(a1));
+  FloatInput(in_.WithOpName("B3").WithControlDependencies(a1));
 
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(2, partitions_.size());
 
   string a = "/job:a/replica:0/task:0/cpu:0";
   string b = "/job:a/replica:0/task:0/cpu:1";
-  a1 = Input(scope_a_.WithOpName("A1"));
+  a1 = FloatInput(scope_a_.WithOpName("A1"));
   auto c = Const(scope_a_.WithOpName("A1/_0").WithControlDependencies(a1), {});
   // NOTE: Send 0 A1/_1 -> A1/_2 is not necessarily needed. We could
   // use A1/_0 -> A1/_4 as the control as a minor optimization.
@@ -328,9 +328,9 @@ TEST_F(GraphPartitionTest, CrossDevice_DataControl) {
   auto id1 = Identity(scope_b_.WithOpName("A1/_3"), recv1);
   auto recv2 =
       _Recv(scope_b_.WithOpName("A1/_5"), DT_FLOAT, "edge_2_A1", a, 82, b);
-  b1 = Input(scope_b_.WithOpName("B1"));
+  b1 = FloatInput(scope_b_.WithOpName("B1"));
   Combine(scope_b_.WithOpName("B2"), recv2, b1);
-  Input(scope_b_.WithOpName("B3").WithControlDependencies(id1));
+  FloatInput(scope_b_.WithOpName("B3").WithControlDependencies(id1));
   ExpectMatchB();
 }
 
@@ -338,8 +338,7 @@ TEST_F(GraphPartitionTest, CrossDeviceLoop) {
   using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
   auto a1 = BoolInput(in_.WithOpName("A1"));
   auto a2 = Enter(in_.WithOpName("A2"), a1, "foo");
-  auto a3 =
-      Merge(in_.WithOpName("A3"), {a2, ops::Input("A5", 0, DT_BOOL)}).output;
+  auto a3 = Merge(in_.WithOpName("A3"), {a2, Input("A5", 0, DT_BOOL)}).output;
   LoopCond(in_.WithOpName("A4"), a3);
   auto b1 = Identity(in_.WithOpName("B1"), a3);
   NextIteration(in_.WithOpName("A5"), b1);
@@ -351,8 +350,7 @@ TEST_F(GraphPartitionTest, CrossDeviceLoop1) {
   using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
   auto a1 = BoolInput(in_.WithOpName("A1"));
   auto a2 = Enter(in_.WithOpName("B2"), a1, "foo");
-  auto a3 =
-      Merge(in_.WithOpName("A3"), {a2, ops::Input("B5", 0, DT_BOOL)}).output;
+  auto a3 = Merge(in_.WithOpName("A3"), {a2, Input("B5", 0, DT_BOOL)}).output;
   LoopCond(in_.WithOpName("A4"), a3);
   auto b1 = Identity(in_.WithOpName("B1"), a3);
   NextIteration(in_.WithOpName("B5"), b1);
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 82ed7d6b42..fb663e5f58 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -318,6 +318,19 @@ cc_library(
 )
 
 cc_library(
+    name = "record_input_op",
+    srcs = [
+        "record_input_op.cc",
+        "record_yielder.cc",
+        "record_yielder.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
     name = "save_restore_tensor",
     srcs = ["save_restore_tensor.cc"],
     hdrs = ["save_restore_tensor.h"],
@@ -1177,6 +1190,7 @@ cc_library(
         ":priority_queue_op",
         ":queue_ops",
         ":random_shuffle_queue_op",
+        ":record_input_op",
         ":session_ops",
         ":sparse_conditional_accumulator_op",
         ":stack_ops",
@@ -1679,6 +1693,7 @@ tf_cc_tests(
         ":ops_util",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -3735,10 +3750,7 @@ filegroup(
             "ctc_loss_op.*",
             # Excluded due to experimental status:
             "debug_ops.*",
-            # Ops excluded because they do not build correctly for Android.
-            # See b/29213790
             "scatter_nd_op*",
-            "sparse_matmul_op.*",
             # Lib CURL is not supported on Android.
             "bigquery*",
         ],
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index 1222093a7a..9263c062ba 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -72,12 +72,14 @@ tf_cc_test(
 tf_kernel_library(
     name = "graph_transferer",
     srcs = [
+        "graph_transfer_utils.cc",
         "graph_transferer.cc",
         "hexagon_control_wrapper.cc",
         "hexagon_ops_definitions.cc",
         "i_graph_transfer_ops_definitions.cc",
     ],
     hdrs = [
+        "graph_transfer_utils.h",
         "graph_transferer.h",
         "hexagon_control_wrapper.h",
         "hexagon_ops_definitions.h",
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
new file mode 100644
index 0000000000..c37e49f242
--- /dev/null
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
@@ -0,0 +1,49 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/hexagon/graph_transfer_utils.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+/* static */ std::priority_queue<std::tuple<float, int, string>>
+GraphTransferUtils::GetTopNFloatResults(const float *const data,
+                                        const string *const labels,
+                                        const int element_count) {
+  // Max-heap of (value, index, label): the largest value is popped first.
+  CHECK(data != nullptr);
+  CHECK(labels != nullptr);
+  std::priority_queue<std::tuple<float, int, string>> ranked;
+  for (int idx = 0; idx < element_count; ++idx)
+    ranked.emplace(data[idx], idx, labels[idx]);
+  return ranked;
+}
+
+/* static */ void GraphTransferUtils::DumpTopNFloatResults(
+    const float *const data, const string *const labels,
+    const int element_count, const int top_n) {
+  auto queue = GetTopNFloatResults(data, labels, element_count);
+  LOG(INFO) << "=== Dump ranking ===";
+  // Bounded by queue size: top()/pop() on an empty queue is undefined.
+  for (int i = 0; i < top_n && !queue.empty(); ++i) {
+    const std::tuple<float, int, string> &entry = queue.top();
+    LOG(INFO) << i << ": " << std::get<1>(entry) << ", " << std::get<2>(entry)
+              << ", " << std::get<0>(entry);
+    queue.pop();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.h b/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
new file mode 100644
index 0000000000..85af9b5ce3
--- /dev/null
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
@@ -0,0 +1,41 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PLATFORM_HEXAGON_GRAPH_TRANSFER_UTILS_H_
+#define TENSORFLOW_PLATFORM_HEXAGON_GRAPH_TRANSFER_UTILS_H_
+
+#include <queue>
+
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+class GraphTransferUtils {
+ public:
+  static std::priority_queue<std::tuple<float, int, string>>
+  GetTopNFloatResults(const float *const data, const string *const labels,
+                      const int element_count);
+  static void DumpTopNFloatResults(const float *const data,
+                                   const string *const labels,
+                                   const int element_count, const int top_n);
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(GraphTransferUtils);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PLATFORM_HEXAGON_GRAPH_TRANSFER_UTILS_H_
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc
index 5b2a95a371..662b935b90 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc
@@ -38,14 +38,11 @@ const string INPUTS_NODE_PREFIX = "inputs_for_";
 const string OUTPUTS_NODE_PREFIX = "outputs_for_";
 const string DATA_NODE_PREFIX = "data_for_op_";
 const string CONST_SHAPE_PREFIX = "const_shape_";
-const string PADDING_PREFIX = "NN_PAD_";
 const string PADDING_ATTR_NAME = "padding";
 const string STRIDES_ATTR_NAME = "strides";
 const string KSIZE_ATTR_NAME = "ksize";
-const string PADDING_VALID_STR = "VALID";
-const string PADDING_SAME_STR = "SAME";
-const string PADDING_NA = "NA";
 const string NULL_OUTPUT_NAME = "NULL";
+const int PADDING_NA_ID = 0;  // VALID = 1, SAME = 2
 
 // This is a temporary workaround to support android build
 // where std::string is not supported even with c++11 option.
@@ -413,7 +410,6 @@ void GraphTransferer::RegisterConstantNode(
   VLOG(1) << "Register constant node: " << node.name();
   CHECK(node_name_to_id_cache_map_.count(node.name()) == 1);
   const int id = node_name_to_id_cache_map_[node.name()];
-  const string data_name = DATA_NODE_PREFIX + ToString(id);
   const int output_node_size = node.num_outputs();
   CHECK(output_node_size == 1);
   // TODO(satok): support multiple outputs?
@@ -448,7 +444,6 @@ void GraphTransferer::RegisterConstantNode(
       ConstNodeTransferParams{node.name(),
                               id,
                               {{shape[0], shape[1], shape[2], shape[3]}},
-                              data_name,
                               data_size});
   // TODO(satok): Remove. Determine constant value without dryrun
   if (!output_tensor_map.empty() && data_size != 0) {
@@ -474,7 +469,7 @@ int GraphTransferer::RegisterConstantShape(const std::vector<int>& shape) {
     const int id = node_name_cache_list_.size() - 1;
     node_name_to_id_cache_map_.emplace(shape_name, id);
     const_node_transfer_params_list_.emplace_back(ConstNodeTransferParams{
-        shape_name, id, {{shape[0], shape[1], shape[2], shape[3]}}, "", 0});
+        shape_name, id, {{shape[0], shape[1], shape[2], shape[3]}}, 0});
   }
   return node_name_to_id_cache_map_[shape_name];
 }
@@ -545,17 +540,17 @@ void GraphTransferer::RegisterNodeWithPaddingAndStrides(
     const int ksize_id = RegisterConstantShape(kernel_sizes);
     extra_inputs.insert(extra_inputs.begin(), ksize_id);
   }
-  const std::string padding_str =
-      padding == VALID ? PADDING_VALID_STR : PADDING_SAME_STR;
   const int op_type_id = ops_definitions.GetOpIdFor(node.type_string());
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount())
       << "Op " << node.type_string() << " not found in map(id = " << op_type_id
       << ")";
-  AppendNodeParamsWithIoParams(shape_refiner, output_tensor_map, node,
-                               node.name(), id, node.type_string(), op_type_id,
-                               padding_str, node.num_inputs(), extra_inputs,
-                               node.num_outputs(), true /* append_input */,
-                               true /* append_output */);
+  // Safety check of padding id: must be VALID (1) or SAME (2), never NA (0)
+  CHECK(static_cast<int>(padding) == (padding == Padding::VALID ? 1 : 2));
+  AppendNodeParamsWithIoParams(
+      shape_refiner, output_tensor_map, node, node.name(), id,
+      node.type_string(), op_type_id, static_cast<int>(padding),
+      node.num_inputs(), extra_inputs, node.num_outputs(),
+      true /* append_input */, true /* append_output */);
 }
 
 void GraphTransferer::RegisterInputNode(
@@ -570,7 +565,7 @@ void GraphTransferer::RegisterInputNode(
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount());
   AppendNodeParamsWithIoParams(
       shape_refiner, output_tensor_map, node, node.name(), id,
-      node.type_string(), op_type_id, PADDING_NA, node.num_inputs(), {},
+      node.type_string(), op_type_id, PADDING_NA_ID, node.num_inputs(), {},
       node.num_outputs(), true /* append_input */, true /* append_output */);
 }
 
@@ -587,7 +582,7 @@ void GraphTransferer::RegisterOutputNode(
   // TODO(satok): Set output for output node?
   AppendNodeParamsWithIoParams(
       shape_refiner, output_tensor_map, node, node.name(), id,
-      node.type_string(), op_type_id, PADDING_NA, node.num_inputs(), {},
+      node.type_string(), op_type_id, PADDING_NA_ID, node.num_inputs(), {},
       0 /* outputs_size */, true /* append_input */, false /* append_output */);
 }
 
@@ -604,7 +599,7 @@ void GraphTransferer::RegisterFlattenNode(
 
   AppendNodeParamsWithIoParams(
       shape_refiner, output_tensor_map, node, node.name(), id,
-      node.type_string(), op_type_id, PADDING_NA, node.num_inputs(), {},
+      node.type_string(), op_type_id, PADDING_NA_ID, node.num_inputs(), {},
       node.num_outputs(), true /* append_input */, true /* append_output */);
 }
 
@@ -620,7 +615,7 @@ void GraphTransferer::RegisterGenericNode(
 
   AppendNodeParamsWithIoParams(
       shape_refiner, output_tensor_map, node, node.name(), id,
-      node.type_string(), op_type_id, PADDING_NA, node.num_inputs(), {},
+      node.type_string(), op_type_id, PADDING_NA_ID, node.num_inputs(), {},
       node.num_outputs(), true /* append_input */, true /* append_output */);
 }
 
@@ -644,18 +639,13 @@ Status GraphTransferer::RegisterNodeIfAllInputsAreCached(
 // CAVEAT: Append inputs and outputs params accordingly
 void GraphTransferer::AppendNodeParams(const string& name, const int id,
                                        const string& type, const int type_id,
-                                       const string& padding_str,
-                                       const int inputs_size,
+                                       const int padding, const int inputs_size,
                                        const std::vector<int>& extra_inputs,
                                        const int outputs_size) {
   VLOG(1) << "Append node params: " << name;
-  // TODO(satok): store padding as Padding?
-  const string output_name = OUTPUTS_NODE_PREFIX + ToString(id);
   node_transfer_params_list_.emplace_back(
-      NodeTransferParams{name, id, type, type_id, PADDING_PREFIX + padding_str,
-                         INPUTS_NODE_PREFIX + ToString(id),
+      NodeTransferParams{name, id, type, type_id, padding,
                          inputs_size + static_cast<int>(extra_inputs.size()),
-                         outputs_size <= 0 ? NULL_OUTPUT_NAME : output_name,
                          static_cast<int>(outputs_size)});
 }
 
@@ -738,7 +728,7 @@ void GraphTransferer::AppendNodeOutputParams(
 void GraphTransferer::AppendNodeParamsWithIoParams(
     const ShapeRefiner& shape_refiner, const OutputTensorMap& output_tensor_map,
     const Node& node, const string& name, const int id, const string& type,
-    const int type_id, const string& padding_str, const int inputs_size,
+    const int type_id, const int padding, const int inputs_size,
     const std::vector<int>& extra_inputs, const int outputs_size,
     const bool append_input_params, const bool append_output_params) {
   VLOG(1) << "Append node with io params: " << node.name();
@@ -748,8 +738,8 @@ void GraphTransferer::AppendNodeParamsWithIoParams(
   if (append_output_params) {
     AppendNodeOutputParams(shape_refiner, output_tensor_map, id, node);
   }
-  AppendNodeParams(name, id, type, type_id, padding_str, inputs_size,
-                   extra_inputs, outputs_size);
+  AppendNodeParams(name, id, type, type_id, padding, inputs_size, extra_inputs,
+                   outputs_size);
 }
 
 /* static */ std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>
@@ -808,6 +798,20 @@ GraphTransferer::ToTensorShapeArray(const TensorShape& shape) {
   }
 }
 
+/* static */ string GraphTransferer::ToPaddingDebugString(const int padding) {
+  switch (padding) {
+    case 0:  // No padding; rendered as the NA label.
+      return "NN_PAD_NA";
+    case Padding::VALID:
+      return "NN_PAD_VALID";
+    case Padding::SAME:
+      return "NN_PAD_SAME";
+    default:  // Unknown id: abort and report the value that was passed in.
+      CHECK(false) << "Unsupported padding id: " << padding;
+      return "";
+  }
+}
+
 /* static */ void GraphTransferer::CheckShape(
     const OutputTensorMap& output_tensor_map, const string& node_name,
     const std::array<int64, SHAPE_ARRAY_SIZE>& expected) {
@@ -903,7 +907,10 @@ void GraphTransferer::DumpNodeTransferParams() const {
     LOG(INFO) << "[ " << params.node_id << " \"" << params.name << "\" (Const)";
     LOG(INFO) << "  shape: " << params.shape[0] << params.shape[1]
               << params.shape[2] << params.shape[3];
-    LOG(INFO) << "  data_name: " << params.data_name;
+    LOG(INFO) << "  data_name: "
+              << (params.data_size <= 0
+                      ? ""
+                      : DATA_NODE_PREFIX + ToString(params.node_id));
     LOG(INFO) << "  data_size: " << params.data_size << " bytes"
               << " ]";
   }
@@ -911,11 +918,14 @@ void GraphTransferer::DumpNodeTransferParams() const {
   LOG(INFO) << "*** Op Nodes ***";
   for (const NodeTransferParams& params : node_transfer_params_list_) {
     LOG(INFO) << "[ " << params.node_id << " \"" << params.name;
-    LOG(INFO) << "  type: " << params.type;
-    LOG(INFO) << "  padding: " << params.padding;
-    LOG(INFO) << "  inputs: " << params.inputs_name
+    LOG(INFO) << "  type: " << params.type_name;
+    LOG(INFO) << "  padding: " << ToPaddingDebugString(params.padding);
+    LOG(INFO) << "  inputs: " << INPUTS_NODE_PREFIX + ToString(params.node_id)
               << ", size = " << params.inputs_size;
-    LOG(INFO) << "  outputs: " << params.outputs_name
+    LOG(INFO) << "  outputs: "
+              << (params.outputs_size <= 0
+                      ? NULL_OUTPUT_NAME
+                      : (OUTPUTS_NODE_PREFIX + ToString(params.node_id)))
               << ", size = " << params.outputs_size << " ]";
   }
   LOG(INFO) << "******\n";
@@ -946,8 +956,10 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
     sstream << "---(CONST) [" << std::hex << params.node_id << std::dec << ","
             << params.shape[0] << "," << params.shape[1] << ","
             << params.shape[2] << "," << params.shape[3] << ","
-            << params.data_name << "," << params.data_size << "," << params.name
-            << "]";
+            << (params.data_size <= 0
+                    ? ""
+                    : DATA_NODE_PREFIX + ToString(params.node_id))
+            << "," << params.data_size << "," << params.name << "]";
     LOG(INFO) << sstream.str();
   }
   LOG(INFO) << "Const node count = " << const_node_transfer_params_list_.size();
@@ -955,9 +967,13 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
     std::stringstream sstream;
     sstream << "---(OP) [" << params.name.c_str() << "," << std::hex
             << params.node_id << std::dec << "," << params.soc_op_id << ","
-            << params.padding << "," << params.inputs_name << ","
-            << params.inputs_size << "," << params.outputs_name << ","
-            << params.outputs_size << "," << params.type << "]";
+            << ToPaddingDebugString(params.padding) << ","
+            << INPUTS_NODE_PREFIX + ToString(params.node_id) << ","
+            << params.inputs_size << ","
+            << (params.outputs_size <= 0
+                    ? NULL_OUTPUT_NAME
+                    : (OUTPUTS_NODE_PREFIX + ToString(params.node_id)))
+            << "," << params.outputs_size << "," << params.type_name << "]";
     LOG(INFO) << sstream.str();
   }
   LOG(INFO) << "Op node count = " << node_transfer_params_list_.size();
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h
index 7bc6293be8..d86452905f 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.h
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.h
@@ -52,21 +52,18 @@ class GraphTransferer {
   struct NodeTransferParams {
     string name;
     int node_id;
-    string type;  // for debug info
+    string type_name;
     int soc_op_id;
-    string padding;
-    string inputs_name;  // for debug info TODO(satok): remove
+    int padding;
     int inputs_size;
-    string outputs_name;  // for debug info TODO(satok): remove
     int outputs_size;
   };
 
   // Const node parameters for transfer
   struct ConstNodeTransferParams {
-    string name;  // for debug info
+    string name;
     int node_id;
     std::array<int64, MAX_SUPPORTED_RANK> shape;
-    string data_name;  // for debug info TODO(satok): remove
     int data_size;
     std::vector<uint8> data;
   };
@@ -215,7 +212,7 @@ class GraphTransferer {
       const OutputTensorMap& output_tensor_map);
 
   void AppendNodeParams(const string& name, const int id, const string& type,
-                        const int type_id, const string& padding_str,
+                        const int type_id, const int padding,
                         const int inputs_size,
                         const std::vector<int>& extra_inputs,
                         const int outputs_size);
@@ -235,13 +232,15 @@ class GraphTransferer {
       const ShapeRefiner& shape_refiner,
       const OutputTensorMap& output_tensor_map, const Node& node,
       const string& name, const int id, const string& type, const int type_id,
-      const string& padding_str, const int inputs_size,
+      const int padding, const int inputs_size,
       const std::vector<int>& extra_inputs, const int outputs_size,
       const bool append_input_params, const bool append_output_params);
 
   static std::array<int64, SHAPE_ARRAY_SIZE> ToTensorShapeArray(
       const TensorShape& shape);
 
+  static string ToPaddingDebugString(int padding);
+
   static void CheckShape(const OutputTensorMap& output_tensor_map,
                          const string& node_name,
                          const std::array<int64, SHAPE_ARRAY_SIZE>& actual);
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
index b9a4c8aff0..92b58083b9 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
@@ -69,10 +69,9 @@ class TestGraphTransferOpsDefinitions : public IGraphTransferOpsDefinitions {
 
 static GraphDef CreateAddGraphDef() {
   Scope root = Scope::NewRootScope();
-  ops::Output node_a = ops::Const(root.WithOpName(NAME_A), NODE_A_VAL);
-  ops::Output node_b = ops::Const(root.WithOpName(NAME_B), NODE_B_VAL);
-  ops::Output node_add =
-      ops::Add(root.WithOpName(NAME_A_PLUS_B), node_a, node_b);
+  Output node_a = ops::Const(root.WithOpName(NAME_A), NODE_A_VAL);
+  Output node_b = ops::Const(root.WithOpName(NAME_B), NODE_B_VAL);
+  Output node_add = ops::Add(root.WithOpName(NAME_A_PLUS_B), node_a, node_b);
   GraphDef def;
   TF_CHECK_OK(root.ToGraphDef(&def));
   return def;
@@ -82,16 +81,16 @@ static GraphDef CreateConvGraphDef() {
   Scope root = Scope::NewRootScope();
   Tensor input_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
   test::FillIota<float>(&input_data, 1.0f);
-  ops::Output input =
-      ops::Const(root.WithOpName("input"), ops::Input::Initializer(input_data));
+  Output input =
+      ops::Const(root.WithOpName("input"), Input::Initializer(input_data));
   Tensor filter_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
   test::FillIota<float>(&filter_data, 1.0f);
-  ops::Output filter = ops::Const(root.WithOpName("filter"),
-                                  ops::Input::Initializer(filter_data));
+  Output filter =
+      ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data));
   const std::vector<int> strides{1, 1, 1, 1};
-  ops::Output conv =
+  Output conv =
       ops::Conv2D(root.WithOpName("conv"), input, filter, strides, "SAME");
-  ops::Output softmax = ops::Softmax(root.WithOpName("softmax"), conv);
+  Output softmax = ops::Softmax(root.WithOpName("softmax"), conv);
   GraphDef def;
   TF_CHECK_OK(root.ToGraphDef(&def));
   return def;
@@ -101,18 +100,18 @@ static GraphDef CreatePoolGraphDef() {
   Scope root = Scope::NewRootScope();
   Tensor input_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
   test::FillIota<float>(&input_data, 1.0f);
-  ops::Output input =
-      ops::Const(root.WithOpName("input"), ops::Input::Initializer(input_data));
+  Output input =
+      ops::Const(root.WithOpName("input"), Input::Initializer(input_data));
   Tensor filter_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
   test::FillIota<float>(&filter_data, 1.0f);
-  ops::Output filter = ops::Const(root.WithOpName("filter"),
-                                  ops::Input::Initializer(filter_data));
+  Output filter =
+      ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data));
   const std::vector<int> ksize{1, 1, 1, 1};
   const std::vector<int> padding{0, 0, 0, 0};
   const std::vector<int> strides{1, 1, 1, 1};
-  ops::Output max_pool =
+  Output max_pool =
       ops::MaxPool(root.WithOpName("maxpool"), input, ksize, strides, "SAME");
-  ops::Output softmax = ops::Softmax(root.WithOpName("softmax"), max_pool);
+  Output softmax = ops::Softmax(root.WithOpName("softmax"), max_pool);
   GraphDef def;
   TF_CHECK_OK(root.ToGraphDef(&def));
   return def;
@@ -352,10 +351,10 @@ TEST_F(GraphTransfererTest, LoadConvGraph) {
   ASSERT_TRUE(params_conv != nullptr);
   const int id = params_conv->node_id;
   EXPECT_GE(id, 0);
-  EXPECT_EQ("Conv2D", params_conv->type);
+  EXPECT_EQ("Conv2D", params_conv->type_name);
   EXPECT_EQ(3, params_conv->inputs_size);
   EXPECT_EQ(1, params_conv->outputs_size);
-  EXPECT_EQ("NN_PAD_SAME", params_conv->padding);
+  EXPECT_EQ(Padding::SAME, params_conv->padding);
 }
 
 TEST_F(GraphTransfererTest, LoadMaxPoolGraph) {
@@ -378,10 +377,10 @@ TEST_F(GraphTransfererTest, LoadMaxPoolGraph) {
   ASSERT_TRUE(params_max_pool != nullptr);
   const int id = params_max_pool->node_id;
   EXPECT_GE(id, 0);
-  EXPECT_EQ("MaxPool", params_max_pool->type);
+  EXPECT_EQ("MaxPool", params_max_pool->type_name);
   EXPECT_EQ(3, params_max_pool->inputs_size);
   EXPECT_EQ(1, params_max_pool->outputs_size);
-  EXPECT_EQ("NN_PAD_SAME", params_max_pool->padding);
+  EXPECT_EQ(Padding::SAME, params_max_pool->padding);
 }
 
 TEST(HexagonOpsDefinitions, CheckOpsDefinitions) {
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
index ecebd3c599..ca29fcdd47 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
@@ -15,12 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h"
 
-#include <queue>
-
 #ifdef USE_HEXAGON_LIBS
 #include "tensorflow/core/platform/hexagon/soc_interface.h"
 #include "tensorflow/core/platform/profile_utils/cpu_utils.h"
-#include "tensorflow/core/platform/types.h"
 #endif
 
 namespace tensorflow {
@@ -28,7 +25,6 @@ namespace tensorflow {
 const bool SHOW_DBG_IN_SOC = false;
 const bool DBG_USE_DUMMY_INPUT = false;
 const bool DBG_USE_SAMPLE_INPUT = false;
-const bool DBG_SHOW_RESULT = false;
 const int64 FLAG_ENABLE_PANDA_BINARY_INPUT = 0x01;
 
 #ifdef USE_HEXAGON_LIBS
@@ -145,18 +141,15 @@ bool HexagonControlWrapper::SetupGraph(
       output_count = std::get<1>(output_ptr_and_count);
       CHECK(output_count > 0);
     }
-
-    // TODO(satok): Do not use string. Use enum instead.
-    const string padding = params.padding;
     int padding_id = -1;
-    if (padding == "NN_PAD_NA") {
+    if (params.padding == 0) {
       padding_id = 0;
-    } else if (padding == "NN_PAD_SAME") {
+    } else if (params.padding == Padding::SAME) {
       padding_id = 1;
-    } else if (padding == "NN_PAD_VALID") {
+    } else if (params.padding == Padding::VALID) {
       padding_id = 2;
     } else {
-      CHECK(false) << "Unsupported padding " << padding;
+      CHECK(false) << "Unsupported padding " << params.padding;
     }
     soc_interface_AppendNode(params.name.c_str(), node_id + NODE_ID_OFFSET,
                              op_id, padding_id, input_ptr, input_count,
@@ -213,12 +206,6 @@ bool HexagonControlWrapper::ReadOutputNode(
   // TODO: Accept all results
   std::get<2>(output) = DT_FLOAT;
   outputs->emplace_back(output);
-  if (DBG_SHOW_RESULT) {
-    const int byte_size = std::get<1>(output);
-    const int element_count = byte_size / sizeof(float);
-    const float* float_array = reinterpret_cast<float*>(std::get<0>(output));
-    DumpTopNFloatResults(float_array, element_count, 10 /* top_n */);
-  }
   return true;
 }
 
@@ -240,19 +227,4 @@ bool HexagonControlWrapper::ReadOutputNode(const string,
 }
 #endif
 
-void HexagonControlWrapper::DumpTopNFloatResults(const float* data,
-                                                 const float element_count,
-                                                 const int top_n) {
-  std::priority_queue<std::tuple<float, int>> queue;
-  for (int i = 0; i < element_count; ++i) {
-    queue.emplace(data[i], i);
-  }
-  LOG(INFO) << "=== Dump ranking ===";
-  for (int i = 0; i < top_n; ++i) {
-    const std::tuple<float, int>& entry = queue.top();
-    LOG(INFO) << i << ": " << std::get<1>(entry) << ", " << std::get<0>(entry);
-    queue.pop();
-  }
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
index dfae5aa5e2..0ba0b323cb 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
@@ -46,9 +46,6 @@ class HexagonControlWrapper final : public ISocControlWrapper {
   // CAVEAT: Need offset as HVX library reserves some ids
   static constexpr int NODE_ID_OFFSET = 0x10000;
 
-  void DumpTopNFloatResults(const float *data, const float element_count,
-                            const int top_n);
-
   // Dummy float array for input node.
   // TODO(satok): Use actual data passed by FillInputNode and remove
   std::vector<float> dummy_input_float_;
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index d06fb5fabc..81e49bd147 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -17,10 +17,15 @@ limitations under the License.
 // -o /tmp/tensorflow_inception_v3_stripped_optimized_quantized.pb
 // adb push /tmp/tensorflow_inception_v3_stripped_optimized_quantized.pb \
 // /data/local/tmp
+// $ curl
+// https://storage.googleapis.com/download.tensorflow.org/models/imagenet_comp_graph_label_strings.txt
+// -o /tmp/imagenet_comp_graph_label_strings.txt
+// adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp
 
 #include <memory>
 
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/kernels/hexagon/graph_transfer_utils.h"
 #include "tensorflow/core/kernels/hexagon/graph_transferer.h"
 #include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h"
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
@@ -29,7 +34,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/profile_utils/clock_cycle_profiler.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -40,6 +47,43 @@ const bool DBG_DUMP_FLOAT_DATA = false;
 const int WIDTH = 299;
 const int HEIGHT = 299;
 const int DEPTH = 3;
+const int EXPECTED_FIRST_RESULT_ID = 59;
+const int EXECUTION_REPEAT_COUNT = 3;
+
+static void DumpTop10Results(
+    const std::vector<ISocControlWrapper::ByteArray>& outputs) {
+  CHECK(outputs.size() == 1);
+  const int byte_size = std::get<1>(outputs.at(0));
+  const int element_count = byte_size / sizeof(float);
+  const float* float_array =
+      reinterpret_cast<float*>(std::get<0>(outputs.at(0)));
+  const string label_filename =
+      "/data/local/tmp/imagenet_comp_graph_label_strings.txt";
+  string label_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), label_filename, &label_str));
+  std::vector<string> labels = str_util::Split(label_str, '\n');
+  GraphTransferUtils::DumpTopNFloatResults(
+      float_array, labels.data(),
+      std::min(element_count, static_cast<int>(labels.size())),
+      10 /* show top_n results */);
+}
+
+static void CheckFirstResult(
+    const std::vector<ISocControlWrapper::ByteArray>& outputs,
+    const int expected_first_id) {
+  ASSERT_GE(outputs.size(), 1);  // Fatal: outputs.at(0) is read below.
+  const int byte_size = std::get<1>(outputs.at(0));
+  const int element_count = byte_size / sizeof(float);
+  const float* float_array =
+      reinterpret_cast<float*>(std::get<0>(outputs.at(0)));
+  ASSERT_GE(element_count, 1);  // Fatal: top() is called on the result.
+  std::vector<string> labels(element_count);
+  std::priority_queue<std::tuple<float, int, string>> queue =
+      GraphTransferUtils::GetTopNFloatResults(float_array, labels.data(),
+                                              element_count);
+  const std::tuple<float, int, string>& entry = queue.top();
+  EXPECT_EQ(expected_first_id, std::get<1>(entry));
+}
 
 // CAVEAT: This test only runs when you specify hexagon library using
 // makefile.
@@ -77,12 +121,17 @@ TEST(GraphTransferer, RunInceptionV3OnHexagonExample) {
   const int fsize = bmp.size();
   LOG(INFO) << "Read " << image_filename << ", size = " << fsize << "bytes";
   const int64 pixel_count = WIDTH * HEIGHT * DEPTH;
+  CHECK(fsize >= 22 /* pos of height */ + sizeof(int));
+  CHECK(bmp.data() != nullptr);
   uint8* const img_bytes = bit_cast<uint8*>(bmp.data());
   const int header_size = *(reinterpret_cast<int*>(img_bytes + 10));
+  LOG(INFO) << "header size = " << header_size;
   const int size = *(reinterpret_cast<int*>(img_bytes + 14));
+  LOG(INFO) << "image size = " << size;
   const int width = *(reinterpret_cast<int*>(img_bytes + 18));
+  LOG(INFO) << "width = " << width;
   const int height = *(reinterpret_cast<int*>(img_bytes + 22));
-  LOG(INFO) << header_size << ", " << size << ", " << width << ", " << height;
+  LOG(INFO) << "height = " << height;
   CHECK(fsize >= (WIDTH + 1) * WIDTH * 3 + header_size);
 
   uint8* const bmp_pixels = &img_bytes[header_size];
@@ -129,12 +178,23 @@ TEST(GraphTransferer, RunInceptionV3OnHexagonExample) {
   hexagon_control_wrapper.FillInputNode("Mul", ba);
 
   // 4. Execute graph
-  hexagon_control_wrapper.ExecuteGraph();
+  profile_utils::CpuUtils::EnableClockCycleProfiling(true);
+  ClockCycleProfiler prof;
+  for (int i = 0; i < EXECUTION_REPEAT_COUNT; ++i) {
+    prof.Start();
+    hexagon_control_wrapper.ExecuteGraph();
+    prof.Stop();
+  }
 
-  // 5. Read output node's outputs
+  // 5-1. Read output node's outputs
   std::vector<ISocControlWrapper::ByteArray> outputs;
   hexagon_control_wrapper.ReadOutputNode("softmax", &outputs);
 
+  // 5-2. Dump results
+  DumpTop10Results(outputs);
+  CheckFirstResult(outputs, EXPECTED_FIRST_RESULT_ID);
+  prof.DumpStatistics("Graph Execution");
+
   // 6. Teardown graph in hexagon
   hexagon_control_wrapper.TeardownGraph();
 
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index 8870937422..33383d16a8 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -90,6 +90,18 @@ struct ImageResizerState {
         errors::InvalidArgument("input image must be of non-zero size"));
     height_scale = CalculateResizeScale(in_height, out_height, align_corners_);
     width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
+
+    // Guard against overflows
+    OP_REQUIRES(context,
+                ceilf((out_height - 1) * height_scale) <=
+                    static_cast<float>(std::numeric_limits<int64>::max()),
+                errors::InvalidArgument(
+                    "input image height scale would cause an overflow"));
+    OP_REQUIRES(
+        context,
+        ceilf((out_width - 1) * width_scale) <= static_cast<float>(INT_MAX),
+        errors::InvalidArgument(
+            "input image width scale would cause an overflow"));
   }
 
   // Calculates all the required variables, and allocates the output.
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 5f1f5b652c..b44f2f5465 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -29,39 +29,24 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 namespace functor {
 
 template <typename T>
-Status DoInplaceUpdate(const CPUDevice& d, InplaceOpType op,
-                       const Tensor& value, const Tensor& loc, Tensor* output) {
-  auto Tloc = loc.flat<int64>();
+Status DoParallelConcatUpdate(const CPUDevice& d, const Tensor& value,
+                              int32 loc, Tensor* output) {
   auto Tvalue = value.flat_outer_dims<T>();
   auto Toutput = output->flat_outer_dims<T>();
   auto nrows = Toutput.dimension(0);
-  for (int64 j = 0; j < Tloc.size(); ++j) {
-    auto r = (Tloc(j) % nrows + nrows) % nrows;  // Guard index range.
-    switch (op) {
-      case I_UPDATE:
-        Toutput.template chip<0>(r).device(d) = Tvalue.template chip<0>(j);
-        break;
-      case I_ADD:
-        Toutput.template chip<0>(r).device(d) += Tvalue.template chip<0>(j);
-        break;
-      case I_SUB:
-        Toutput.template chip<0>(r).device(d) -= Tvalue.template chip<0>(j);
-        break;
-      default:
-        return errors::InvalidArgument("Unsupported inplace operation", op);
-    }
-  }
+  auto r = (loc % nrows + nrows) % nrows;  // Guard index range.
+  Toutput.template chip<0>(r).device(d) = Tvalue.template chip<0>(0);
   return Status::OK();
 }
 
 template <>
-Status DoInplace(const CPUDevice& d, InplaceOpType op, const Tensor& value,
-                 const Tensor& loc, Tensor* output) {
+Status DoParallelConcat(const CPUDevice& d, const Tensor& value, int32 loc,
+                        Tensor* output) {
   CHECK_EQ(value.dtype(), output->dtype());
   switch (value.dtype()) {
 #define CASE(type)                  \
   case DataTypeToEnum<type>::value: \
-    return DoInplaceUpdate<type>(d, op, value, loc, output);
+    return DoParallelConcatUpdate<type>(d, value, loc, output);
     TF_CALL_NUMBER_TYPES(CASE);
 #undef CASE
     default:
@@ -73,19 +58,17 @@ Status DoInplace(const CPUDevice& d, InplaceOpType op, const Tensor& value,
 
 namespace {
 
-// TODO(apassos): validate the shapes better.
-class InplaceOpBase : public OpKernel {
+template <typename Device>
+class ParallelConcatUpdate : public OpKernel {
  public:
-  explicit InplaceOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  explicit ParallelConcatUpdate(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("loc", &loc_));
+  }
 
   void Compute(OpKernelContext* ctx) override {
     auto value = ctx->input(0);
-    auto loc = ctx->input(1);
-    auto update = ctx->input(2);
+    auto update = ctx->input(1);
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(loc.shape()),
-                errors::InvalidArgument("loc must be a vector. ",
-                                        loc.shape().DebugString()));
     OP_REQUIRES(
         ctx, value.dims() == update.dims(),
         errors::InvalidArgument("value and update shape doesn't match: ",
@@ -98,67 +81,39 @@ class InplaceOpBase : public OpKernel {
                                   value.shape().DebugString(), " vs. ",
                                   update.shape().DebugString()));
     }
-    OP_REQUIRES(ctx, loc.dim_size(0) == update.dim_size(0),
-                errors::InvalidArgument("loc and update shape doesn't match: ",
-                                        loc.shape().DebugString(), " vs. ",
+    OP_REQUIRES(ctx, 1 == update.dim_size(0),
+                errors::InvalidArgument("update shape doesn't match: ",
                                         update.shape().DebugString()));
 
     Tensor output = value;  // This creates an alias intentionally.
-    OP_REQUIRES_OK(ctx, DoCompute(ctx, update, loc, &output));
+    const auto& d = ctx->eigen_device<Device>();
+    OP_REQUIRES_OK(
+        ctx, ::tensorflow::functor::DoParallelConcat(d, update, loc_, &output));
     ctx->set_output(0, output);
   }
 
- protected:
-  virtual Status DoCompute(OpKernelContext* ctx, const Tensor& value,
-                           const Tensor& loc, Tensor* output) = 0;
-};
-
-template <typename Device, functor::InplaceOpType op>
-class InplaceOp : public InplaceOpBase {
- public:
-  explicit InplaceOp(OpKernelConstruction* ctx) : InplaceOpBase(ctx) {}
-
- protected:
-  Status DoCompute(OpKernelContext* ctx, const Tensor& value, const Tensor& loc,
-                   Tensor* output) override {
-    const auto& d = ctx->eigen_device<Device>();
-    return ::tensorflow::functor::DoInplace(d, op, value, loc, output);
-  }
+ private:
+  int32 loc_;
 };
 
 template <typename Device, typename T>
-class EmptyOp : public OpKernel {
+class ParallelConcatStart : public OpKernel {
  public:
-  explicit EmptyOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("init", &init_));
+  explicit ParallelConcatStart(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &shape_));
   }
 
   void Compute(OpKernelContext* ctx) override {
-    const Tensor& shape = ctx->input(0);
-    OP_REQUIRES(
-        ctx, TensorShapeUtils::IsVector(shape.shape()),
-        errors::InvalidArgument("shape must be a vector of int32, got shape ",
-                                shape.shape().DebugString()));
-    auto dims = shape.flat<int32>();
-    TensorShape out_shape;
-    OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
-                            reinterpret_cast<const int32*>(dims.data()),
-                            dims.size(), &out_shape));
     Tensor* out = nullptr;
     // We do not know whether the output will be used on GPU. Setting it to be
     // gpu-compatible for now.
     AllocatorAttributes attr;
     attr.set_gpu_compatible(true);
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out, attr));
-
-    if (init_) {
-      functor::SetZeroFunctor<Device, T>()(ctx->eigen_device<Device>(),
-                                           out->flat<T>());
-    }
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape_, &out, attr));
   }
 
  private:
-  bool init_;
+  TensorShape shape_;
 };
 
 class FailureKernel : public OpKernel {
@@ -176,16 +131,15 @@ class FailureKernel : public OpKernel {
   REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")   \
                               .Device(DEVICE_CPU)         \
                               .TypeConstraint<type>("T"), \
-                          InplaceOp<CPUDevice, functor::I_UPDATE>);
+                          ParallelConcatUpdate<CPUDevice>);
 TF_CALL_NUMBER_TYPES(REGISTER)
 #undef REGISTER
 
 #define REGISTER_EMPTY(type)                                  \
   REGISTER_KERNEL_BUILDER(Name("_ParallelConcatStart")        \
                               .Device(DEVICE_CPU)             \
-                              .HostMemory("shape")            \
                               .TypeConstraint<type>("dtype"), \
-                          EmptyOp<CPUDevice, type>)
+                          ParallelConcatStart<CPUDevice, type>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_EMPTY)
 #undef REGISTER_EMPTY
@@ -204,9 +158,8 @@ typedef Eigen::GpuDevice GPUDevice;
 #define REGISTER_EMPTY(type)                                  \
   REGISTER_KERNEL_BUILDER(Name("_ParallelConcatStart")        \
                               .Device(DEVICE_GPU)             \
-                              .HostMemory("shape")            \
                               .TypeConstraint<type>("dtype"), \
-                          EmptyOp<GPUDevice, type>);
+                          ParallelConcatStart<GPUDevice, type>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_EMPTY)
 #undef REGISTER_EMPTY
 
@@ -221,7 +174,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_PARALLEL_CONCAT);
   REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")   \
                               .Device(DEVICE_GPU)         \
                               .TypeConstraint<type>("T"), \
-                          InplaceOp<GPUDevice, functor::I_UPDATE>);
+                          ParallelConcatUpdate<GPUDevice>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER)
 #undef REGISTER
 
@@ -231,11 +184,10 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER)
 REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
                             .Device(DEVICE_GPU)
                             .HostMemory("value")
-                            .HostMemory("loc")
                             .HostMemory("update")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
-                        InplaceOp<CPUDevice, functor::I_UPDATE>);
+                        ParallelConcatUpdate<CPUDevice>);
 #endif
 
 }  // end namespace
diff --git a/tensorflow/core/kernels/inplace_ops_functor.h b/tensorflow/core/kernels/inplace_ops_functor.h
index 6cb15eda91..53529f5165 100644
--- a/tensorflow/core/kernels/inplace_ops_functor.h
+++ b/tensorflow/core/kernels/inplace_ops_functor.h
@@ -22,19 +22,9 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
-// Inplace update/add/sub values in 'y'. It computes
-//   y[i, :] = v if op is I_UPDATE
-//   y[i, :] += v if op is I_ADD
-//   y[i, :] -= v if op is I_SUB
-enum InplaceOpType {
-  I_UPDATE,  // x = y
-  I_ADD,     // x += y
-  I_SUB,     // x -= y
-};
-
 template <typename Device>
-Status DoInplace(const Device& device, InplaceOpType op, const Tensor& value,
-                 const Tensor& loc, Tensor* output);
+Status DoParallelConcat(const Device& device, const Tensor& value, int32 loc,
+                        Tensor* output);
 
 }  // end namespace functor
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
index 8e70f4575d..8467360435 100644
--- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
@@ -26,72 +26,43 @@ namespace functor {
 
 typedef Eigen::GpuDevice Device;
 
-template <typename T, InplaceOpType op>
-__global__ void DoInplaceOpKernel(int nthreads, const int64 rows,
-                                  const int64 cols, const int64 n, const T* src,
-                                  const int64* rowids, T* dst) {
+template <typename T>
+__global__ void DoParallelConcatOpKernel(int nthreads, const int64 rows,
+                                         const int64 cols, int32 loc,
+                                         const T* src, T* dst) {
   CUDA_1D_KERNEL_LOOP(idx, nthreads) {
-    int64 r = idx / cols;
     int64 c = idx % cols;
-    r = (rowids[r] % rows + rows) % rows;  // Guard index range.
+    int64 r = (loc % rows + rows) % rows;  // Guard index range.
     T* p = dst + r * cols + c;
     const T* q = src + idx;
-    switch (op) {
-      case I_UPDATE:
-        *p = ldg(q);
-        break;
-      case I_ADD:
-        *p += ldg(q);
-        break;
-      case I_SUB:
-        *p -= ldg(q);
-        break;
-    }
+    *p = ldg(q);
   }
 }
 
 template <typename T>
-Status DoInplaceUpdate(const Device& d, InplaceOpType op, const Tensor& value,
-                       const Tensor& loc, Tensor* output) {
+Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc,
+                              Tensor* output) {
   const int64 nelem = value.NumElements();
   CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d);
   auto Toutput = output->flat_outer_dims<T>();
   const int64 nrows = Toutput.dimension(0);
   const int64 ncols = Toutput.dimension(1);
-  const int64 n = loc.NumElements();
   const T* src = value.flat<T>().data();
-  const int64* rowids = loc.flat<int64>().data();
   T* dst = output->flat<T>().data();
-  switch (op) {
-    case I_UPDATE:
-      DoInplaceOpKernel<T, I_UPDATE>
-          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
-      break;
-    case I_ADD:
-      DoInplaceOpKernel<T, I_ADD>
-          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
-      break;
-    case I_SUB:
-      DoInplaceOpKernel<T, I_SUB>
-          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
-      break;
-    default:
-      return errors::InvalidArgument("Unsupported operation type", op);
-  }
+  DoParallelConcatOpKernel<T>
+      <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+          cfg.virtual_thread_count, nrows, ncols, loc, src, dst);
   return Status::OK();
 }
 
 template <>
-Status DoInplace(const Device& d, InplaceOpType op, const Tensor& value,
-                 const Tensor& loc, Tensor* output) {
+Status DoParallelConcat(const Device& d, const Tensor& value, int32 loc,
+                        Tensor* output) {
   CHECK_EQ(value.dtype(), output->dtype());
   switch (value.dtype()) {
-#define CASE(type)                                           \
-  case DataTypeToEnum<type>::value:                          \
-    return DoInplaceUpdate<type>(d, op, value, loc, output); \
+#define CASE(type)                                              \
+  case DataTypeToEnum<type>::value:                             \
+    return DoParallelConcatUpdate<type>(d, value, loc, output); \
     break;
 
     CASE(float)
diff --git a/tensorflow/core/kernels/record_input_op.cc b/tensorflow/core/kernels/record_input_op.cc
new file mode 100644
index 0000000000..878996c9d6
--- /dev/null
+++ b/tensorflow/core/kernels/record_input_op.cc
@@ -0,0 +1,67 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/record_yielder.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+class RecordInputOp : public OpKernel {
+ public:
+  explicit RecordInputOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+#define GETATTR(TYPE, FIELD) \
+  TYPE FIELD;                \
+  OP_REQUIRES_OK(ctx, ctx->GetAttr(#FIELD, &FIELD));
+
+    GETATTR(string, file_pattern);
+    GETATTR(int64, file_random_seed);
+    GETATTR(float, file_shuffle_shift_ratio);
+    GETATTR(int64, file_buffer_size);
+    GETATTR(int64, file_parallelism);
+    GETATTR(int64, batch_size);
+#undef GETATTR
+
+    RecordYielder::Options yopts;
+    yopts.file_pattern = file_pattern;
+    yopts.seed = file_random_seed;
+    yopts.bufsize = file_buffer_size;
+    yopts.file_shuffle_shift_ratio = file_shuffle_shift_ratio;
+    yopts.parallelism = file_parallelism;
+    yielder_ = std::unique_ptr<RecordYielder>(new RecordYielder(ctx, yopts));
+
+    batch_size_ = batch_size;
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    Tensor out(DT_STRING, {batch_size_});
+    auto t_out = out.flat<string>();
+    for (int i = 0; i < batch_size_; ++i) {
+      OP_REQUIRES_OK(ctx, yielder_->YieldOne(&t_out(i)));
+    }
+    ctx->set_output(0, out);
+  }
+
+ private:
+  int64 batch_size_;
+  std::unique_ptr<RecordYielder> yielder_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("RecordInput").Device(DEVICE_CPU), RecordInputOp);
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/record_yielder.cc b/tensorflow/core/kernels/record_yielder.cc
new file mode 100644
index 0000000000..e391752289
--- /dev/null
+++ b/tensorflow/core/kernels/record_yielder.cc
@@ -0,0 +1,216 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/record_yielder.h"
+
+#include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+RecordYielder::RecordYielder(OpKernelConstruction* context,
+                             const RecordYielder::Options& opts)
+    : opts_(opts),
+      thread_(new thread::ThreadPool(context->env(), "record_yielder",
+                                     1 + opts.parallelism)),
+      epoch_(0),
+      rnd_(opts.seed) {
+  thread_->Schedule([this]() { MainLoop(); });
+}
+
+RecordYielder::~RecordYielder() {
+  {
+    mutex_lock l(mu_);
+    stop_ = true;
+    buf_empty_.notify_all();
+    buf_enough_.notify_all();
+    buf_not_full_.notify_all();
+  }
+  main_loop_done_.WaitForNotification();
+  delete thread_;
+}
+
+Status RecordYielder::YieldOne(string* value) {
+  mutex_lock l(mu_);
+  while (!BufEnough()) {
+    buf_enough_.wait(l);
+  }
+  if (status_.ok()) {
+    bool notify_no_longer_full = !BufNotFull();
+    CHECK(!stop_ && !buf_.empty());
+    *value = std::move(buf_.back());
+    buf_.pop_back();
+    ++num_records_yielded_in_epoch_;
+    // Assumption is that an epoch always has something in the buffer
+    // until it ends.  If the input pipeline was slower than the consumers
+    // by a lot this might not be true.  Not sure how to handle.
+    if (buf_.empty()) {
+      buf_empty_.notify_all();
+    }
+    if (notify_no_longer_full) {
+      buf_not_full_.notify_all();
+    }
+  }
+  return status_;
+}
+
+struct RecordYielder::Shard {
+  int index;                      // Shard index.
+  std::vector<string> filenames;  // File names given to this shard.
+  Notification done;              // Notified when this shard is done.
+  Status status;                  // Shard status.
+};
+
+bool RecordYielder::ShouldFinish(const Status& s) {
+  mutex_lock l(mu_);
+  status_.Update(s);
+  return stop_ || !status_.ok();
+}
+
+static Status MatchFiles(const string& patterns,
+                         std::vector<string>* filenames) {
+  for (const auto& file_pattern : str_util::Split(patterns, ',')) {
+    std::vector<string> tmp_filenames;
+    TF_RETURN_IF_ERROR(
+        Env::Default()->GetMatchingPaths(file_pattern, &tmp_filenames));
+    filenames->insert(filenames->end(),
+                      std::make_move_iterator(tmp_filenames.begin()),
+                      std::make_move_iterator(tmp_filenames.end()));
+  }
+  return Status::OK();
+}
+
+void RecordYielder::MainLoop() {
+  while (true) {
+    ++epoch_;
+    num_records_yielded_in_epoch_ = 0;
+
+    // Finds all files.
+    std::vector<string> filenames;
+    Status s = MatchFiles(opts_.file_pattern, &filenames);
+    if (ShouldFinish(s)) break;
+
+    if (filenames.empty()) {
+      s = errors::NotFound("Found no files at ", opts_.file_pattern);
+      if (ShouldFinish(s)) break;
+    }
+
+    // Shuffles these files according to the epoch # and random seed.
+    std::mt19937_64 shuffle_rnd(
+        Hash64(reinterpret_cast<char*>(&epoch_), sizeof(epoch_), opts_.seed));
+    std::shuffle(filenames.begin(), filenames.end(), shuffle_rnd);
+
+    // Left-shift the filename list.
+    const int64 num = filenames.size();
+    int64 shift;
+    if (0 <= opts_.file_shuffle_shift_ratio &&
+        opts_.file_shuffle_shift_ratio < 1) {
+      shift = opts_.file_shuffle_shift_ratio * num;
+      std::rotate(filenames.begin(), filenames.begin() + shift,
+                  filenames.end());
+    }
+
+    // Shards files and use one thread to go through each shard.
+    const int N = opts_.parallelism;
+    std::vector<Shard> shards(N);
+    for (int i = 0; i < N; ++i) {
+      Shard* shard = &shards[i];
+      shard->index = i;
+      for (int j = i; j < filenames.size(); j += N) {
+        shard->filenames.push_back(filenames[j]);
+      }
+      thread_->Schedule([this, shard]() { ShardLoop(shard); });
+    }
+    for (int i = 0; i < N; ++i) {
+      shards[i].done.WaitForNotification();
+      s.Update(shards[i].status);
+    }
+    if (ShouldFinish(s)) break;
+
+    // Starts the next epoch once all buffered records are consumed.
+    {
+      mutex_lock l(mu_);
+      epoch_end_ = true;
+      while (!BufEmpty()) {
+        buf_empty_.wait(l);
+      }
+      epoch_end_ = false;
+    }
+  }
+  main_loop_done_.Notify();
+}
+
+bool RecordYielder::Add(std::vector<string>* values) {
+  mutex_lock l(mu_);
+  while (!BufNotFull()) {
+    buf_not_full_.wait(l);
+  }
+  while (BufNotFull() && !values->empty()) {
+    // Adds values->back(). Swaps its position with another random
+    // element.
+    auto index = rnd_() % (buf_.size() + 1);
+    if (index == buf_.size()) {
+      buf_.push_back(std::move(values->back()));
+    } else {
+      buf_.push_back(std::move(buf_[index]));
+      buf_[index] = std::move(values->back());
+    }
+    values->pop_back();
+  }
+  if (BufEnough()) {
+    buf_enough_.notify_all();
+  }
+  return stop_;
+}
+
+void RecordYielder::ShardLoop(Shard* shard) {
+  std::vector<string> values;
+  const int64 kRecords = 16;
+  for (const string& filename : shard->filenames) {
+    std::unique_ptr<RandomAccessFile> file;
+    if (ShouldFinish(Status::OK())) break;
+    Status s = Env::Default()->NewRandomAccessFile(filename, &file);
+    if (!s.ok()) {
+      shard->status = errors::InvalidArgument("Can't open ", filename);
+      break;
+    }
+    io::RecordReader rdr(file.get());
+    uint64 offset = 0;
+    string record;
+    while (true) {
+      Status s = rdr.ReadRecord(&offset, &record);
+      if (s.ok()) {
+        values.emplace_back(std::move(record));
+        if (values.size() >= kRecords && Add(&values)) {
+          shard->status = errors::Aborted("stopped");
+          break;
+        }
+      } else if (errors::IsOutOfRange(s)) {
+        break;
+      } else {
+        shard->status = s;
+        break;
+      }
+    }
+  }
+  // Adds the remaining values of this shard to buf_.
+  while (!values.empty()) {
+    Add(&values);
+  }
+  shard->done.Notify();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/record_yielder.h b/tensorflow/core/kernels/record_yielder.h
new file mode 100644
index 0000000000..503644f3b8
--- /dev/null
+++ b/tensorflow/core/kernels/record_yielder.h
@@ -0,0 +1,157 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_RECORD_YIELDER_H_
+#define TENSORFLOW_KERNELS_RECORD_YIELDER_H_
+
+#include <atomic>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+
+namespace tensorflow {
+
+// RecordYielder produces value records from a set of tfrecord files
+// in a random order.
+//
+// It guarantees that:
+//   1) all records in tfrecords are yielded within every epoch;
+//   2) each record is yielded only once within every epoch;
+//   3) the order in which records are yielded is highly randomized;
+//   4) the peak memory usage is roughly avg record size *
+//      (opts.bufsize + opts.parallelism * 16).
+//
+// Usage example:
+//   RecordYielder::Options opts;
+//   opts.file_pattern = "input-*";
+//   opts.seed = 301;
+//   opts.bufsize = 1000000;    // A randomized buffer with 1M records.
+//   opts.parallelism = 8;      // Uses 8 tfrecord iterators to iterate
+//                              // through all files.
+//   RecordYielder yielder(ctx, opts);  // ctx is an OpKernelConstruction*.
+//   string val;
+//   while (true) {
+//     yielder.YieldOne(&val);
+//     // process val
+//   }
+//
+// RecordYielder can be accessed by multiple threads concurrently.
+class RecordYielder {
+ public:
+  struct Options {
+    // Glob pattern for tfrecords.
+    string file_pattern;
+
+    // Random seed. It determines how data files are shuffled and how
+    // records are shuffled.
+    int64 seed = 0;
+
+    // Each epoch, all files are first shuffled according to the
+    // random seed and the epoch number, and then all files are
+    // left-shifted by file_shuffle_shift_ratio * num_files slots.  If
+    // file_shuffle_shift_ratio is not within [0, 1), the implementation
+    // skips the shift rather than clipping the ratio.
+    float file_shuffle_shift_ratio = 0;
+
+    // Randomization buffer keeps these many records.
+    uint64 bufsize = 1;
+
+    // Uses these many concurrent tfrecord iterators to iterate through
+    // tfrecords.
+    int32 parallelism = 1;
+  };
+
+  explicit RecordYielder(OpKernelConstruction* context,
+                         const RecordYielder::Options& opts);
+  ~RecordYielder();
+
+  RecordYielder(const RecordYielder&) = delete;
+  RecordYielder& operator=(const RecordYielder&) = delete;
+
+  // Yields one 'value'.
+  Status YieldOne(string* value);
+
+  // Returns the current epoch number.
+  int64 current_epoch() const { return epoch_; }
+
+ private:
+  typedef RecordYielder ME;
+
+  Options opts_;
+
+  // Background threads. Owned.
+  thread::ThreadPool* thread_;
+
+  // Epoch number.
+  std::atomic<int64> epoch_;
+
+  mutex mu_;
+
+  // Turned to true when this is deleted.
+  bool stop_ GUARDED_BY(mu_) = false;
+  Status status_ GUARDED_BY(mu_);
+
+  // PRG used for randomization.
+  std::mt19937_64 rnd_ GUARDED_BY(mu_);
+
+  // Randomization buffer.
+  std::vector<string> buf_ GUARDED_BY(mu_);
+
+  // True iff we are draining an epoch.
+  bool epoch_end_ = false;
+
+  int64 num_records_yielded_in_epoch_ = 0;
+
+  // Notified when the main loop has exited.
+  Notification main_loop_done_;
+
+  // condition_variables.
+  condition_variable buf_empty_;
+  bool BufEmpty() const SHARED_LOCKS_REQUIRED(mu_) {
+    return stop_ || buf_.empty();
+  }
+
+  condition_variable buf_not_full_;
+  bool BufNotFull() const SHARED_LOCKS_REQUIRED(mu_) {
+    return stop_ || buf_.size() < opts_.bufsize;
+  }
+
+  condition_variable buf_enough_;
+  bool BufEnough() const SHARED_LOCKS_REQUIRED(mu_) {
+    // NOTE: Unless we are finishing an epoch, we want to make sure
+    // the buf_ contains enough randomized elements before yielding
+    // any.
+    return stop_ || !status_.ok() || (epoch_end_ && !buf_.empty()) ||
+           (!epoch_end_ &&
+            buf_.size() >= std::max<int64>(1, opts_.bufsize / 2));
+  }
+
+  void MainLoop();
+  struct Shard;
+  void ShardLoop(Shard* shard);
+  bool ShouldFinish(const Status& s);
+  bool Add(std::vector<string>* values);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_RECORD_YIELDER_H_
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc
index 6dfe871c52..85d28d2c64 100644
--- a/tensorflow/core/kernels/resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op.cc
@@ -64,6 +64,201 @@ class ResizeBilinearOp : public OpKernel {
   bool align_corners_;
 };
 
+namespace {
+// Compute the interpolation indices only once.
+struct CachedInterpolation {
+  int64 lower;  // Lower source index used in the interpolation
+  int64 upper;  // Upper source index used in the interpolation
+  // 1-D linear interpolation scale (see:
+  // https://en.wikipedia.org/wiki/Bilinear_interpolation)
+  float lerp;
+  // How many consecutive points use the same lower & upper indices
+  int consecutive;
+};
+
+enum ImageScalePattern { SCALE_UP, SIMILAR, SCALE_DOWN };
+
+inline ImageScalePattern compute_image_scale_pattern(const int64 out_height,
+                                                     const int64 out_width,
+                                                     const int64 in_height,
+                                                     const int64 in_width) {
+  if (in_height * 2 < out_height || in_width * 2 < out_width) {
+    return SCALE_UP;
+  } else if (out_height * 2 < in_height || out_width * 2 < in_width) {
+    return SCALE_DOWN;
+  } else {
+    return SIMILAR;
+  }
+}
+
+inline int compute_scratch_size(const int64 out_height, const int64 out_width,
+                                const int64 in_height, const int64 in_width,
+                                const int channels,
+                                const ImageScalePattern scale_pattern) {
+  // Allocate a CachedInterpolation for each y in out_height and each x in
+  // out_width, plus 2 extra (one sentinel per axis) to avoid extra branches
+  // in the CachedInterpolation.consecutive computation.
+  const int cached_computation_size =
+      sizeof(CachedInterpolation) * (out_height + out_width + 2);
+  if (scale_pattern == SCALE_DOWN) {
+    return cached_computation_size;
+  } else {
+    // In order to avoid paying the cost of data type conversion multiple times,
+    // we must allocate a temporary image as well.
+    const int tmp_image_size = sizeof(float) * in_height * in_width * channels;
+    // We batch up all memory allocations into a single malloc call for
+    // performance reasons.
+    return cached_computation_size + tmp_image_size;
+  }
+}
+
+inline void compute_interpolation_weights(const ImageScalePattern scale_pattern,
+                                          const int64 out_size,
+                                          const int64 in_size,
+                                          const float scale,
+                                          CachedInterpolation* interpolation) {
+  interpolation[out_size].lower = 0;
+  interpolation[out_size].upper = 0;
+  interpolation[out_size].consecutive = 0;
+  for (int64 i = out_size - 1; i >= 0; --i) {
+    const float in = i * scale;
+    interpolation[i].lower = static_cast<int64>(in);
+    interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1);
+    interpolation[i].lerp = in - interpolation[i].lower;
+    interpolation[i].consecutive =
+        interpolation[i + 1].lower == interpolation[i].lower &&
+                interpolation[i + 1].upper == interpolation[i].upper
+            ? interpolation[i + 1].consecutive + 1
+            : 1;
+  }
+}
+
+template <typename T>
+struct Converter {
+  static inline const float* convert_image_to_float(
+      typename TTypes<T, 4>::ConstTensor images, const int batch_index,
+      const int64 in_height, const int64 in_width, const int channels,
+      std::vector<float>* converted_image_v) {
+    converted_image_v->resize(in_height * in_width * channels);
+    float* converted_image = converted_image_v->data();
+    for (int64 y = 0; y < in_height; ++y) {
+      for (int64 x = 0; x < in_width; ++x) {
+        for (int c = 0; c < channels; ++c) {
+          converted_image[y * in_width * channels + x * channels + c] =
+              static_cast<float>(images(batch_index, y, x, c));
+        }
+      }
+    }
+    return converted_image;
+  }
+};
+
+template <>
+struct Converter<float> {
+  static inline const float* convert_image_to_float(
+      typename TTypes<float, 4>::ConstTensor images, const int b,
+      const int64 in_height, const int64 in_width, const int channels,
+      std::vector<float>* converted_image_v) {
+    return images.data() + (b * in_height * in_width * channels);
+  }
+};
+
+/**
+ * Computes the bilinear interpolation from the appropriate 4 float points
+ * and the linear interpolation weights.
+ */
+inline float compute_lerp(const float top_left, const float top_right,
+                          const float bottom_left, const float bottom_right,
+                          const float x_lerp, const float y_lerp) {
+  const float top = top_left + (top_right - top_left) * x_lerp;
+  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
+  return top + (bottom - top) * y_lerp;
+}
+
+template <typename T>
+inline void scale_down_image(typename TTypes<T, 4>::ConstTensor images,
+                             const int batch_size, const int64 out_height,
+                             const int64 out_width, const int channels,
+                             const std::vector<CachedInterpolation>& xs,
+                             const std::vector<CachedInterpolation>& ys,
+                             typename TTypes<float, 4>::Tensor output) {
+  // Do not eagerly convert all input data points, as we ignore most.
+  for (int b = 0; b < batch_size; ++b) {
+    // Compute the interpolation
+    for (int64 y = 0; y < out_height; ++y) {
+      for (int64 x = 0; x < out_width; ++x) {
+        for (int c = 0; c < channels; ++c) {
+          const float top_left(images(b, ys[y].lower, xs[x].lower, c));
+          const float top_right(images(b, ys[y].lower, xs[x].upper, c));
+          const float bottom_left(images(b, ys[y].upper, xs[x].lower, c));
+          const float bottom_right(images(b, ys[y].upper, xs[x].upper, c));
+          output(b, y, x, c) =
+              compute_lerp(top_left, top_right, bottom_left, bottom_right,
+                           xs[x].lerp, ys[y].lerp);
+        }
+      }
+    }
+  }
+}
+
+inline void scale_up_image(const float* input_image, const int batch_index,
+                           const int64 out_height, const int64 out_width,
+                           const int channels, const int64 in_height,
+                           const int64 in_width,
+                           const std::vector<CachedInterpolation>& xs,
+                           const std::vector<CachedInterpolation>& ys,
+                           typename TTypes<float, 4>::Tensor output) {
+  for (int64 y = 0; y < out_height; y += ys[y].consecutive) {
+    const int64 in_y_lower = ys[y].lower * in_width * channels;
+    const int64 in_y_upper = ys[y].upper * in_width * channels;
+    for (int64 x = 0; x < out_width; x += xs[x].consecutive) {
+      const int64 in_x_lower = xs[x].lower * channels;
+      const int64 in_x_upper = xs[x].upper * channels;
+      for (int c = 0; c < channels; ++c) {
+        const float top_left = input_image[in_y_lower + in_x_lower + c];
+        const float top_right = input_image[in_y_lower + in_x_upper + c];
+        const float bottom_left = input_image[in_y_upper + in_x_lower + c];
+        const float bottom_right = input_image[in_y_upper + in_x_upper + c];
+        for (int64 y_inner = y; y_inner < y + ys[y].consecutive; ++y_inner) {
+          for (int64 x_inner = x; x_inner < x + xs[x].consecutive; ++x_inner) {
+            output(batch_index, y_inner, x_inner, c) =
+                compute_lerp(top_left, top_right, bottom_left, bottom_right,
+                             xs[x_inner].lerp, ys[y_inner].lerp);
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void scale_similar_image(const float* input_image, const int b,
+                                const int64 out_height, const int64 out_width,
+                                const int channels, const int64 in_height,
+                                const int64 in_width,
+                                const std::vector<CachedInterpolation>& xs,
+                                const std::vector<CachedInterpolation>& ys,
+                                typename TTypes<float, 4>::Tensor output) {
+  // Compute the interpolation
+  for (int64 y = 0; y < out_height; ++y) {
+    const int64 in_y_lower = ys[y].lower * in_width * channels;
+    const int64 in_y_upper = ys[y].upper * in_width * channels;
+    // Similar-sized images do not have a set of inner loops.
+    for (int64 x = 0; x < out_width; ++x) {
+      const int64 in_x_lower = xs[x].lower * channels;
+      const int64 in_x_upper = xs[x].upper * channels;
+      for (int c = 0; c < channels; ++c) {
+        const float top_left = input_image[in_y_lower + in_x_lower + c];
+        const float top_right = input_image[in_y_lower + in_x_upper + c];
+        const float bottom_left = input_image[in_y_upper + in_x_lower + c];
+        const float bottom_right = input_image[in_y_upper + in_x_upper + c];
+        output(b, y, x, c) = compute_lerp(top_left, top_right, bottom_left,
+                                          bottom_right, xs[x].lerp, ys[y].lerp);
+      }
+    }
+  }
+}
+}  // namespace
+
 // Partial specialization of ResizeBilinear functor for a CPUDevice.
 namespace functor {
 template <typename T>
@@ -71,7 +266,7 @@ struct ResizeBilinear<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor images,
                   const float height_scale, const float width_scale,
                   typename TTypes<float, 4>::Tensor output) {
-    const int batch = images.dimension(0);
+    const int batch_size = images.dimension(0);
     const int64 in_height = images.dimension(1);
     const int64 in_width = images.dimension(2);
     const int channels = images.dimension(3);
@@ -79,31 +274,41 @@ struct ResizeBilinear<CPUDevice, T> {
     const int64 out_height = output.dimension(1);
     const int64 out_width = output.dimension(2);
 
-    for (int b = 0; b < batch; ++b) {
-      for (int y = 0; y < out_height; ++y) {
-        const float in_y = y * height_scale;
-        const int64 top_y_index = static_cast<int64>(floorf(in_y));
-        const int64 bottom_y_index =
-            std::min(static_cast<int64>(ceilf(in_y)), in_height - 1);
-        const float y_lerp = in_y - top_y_index;
-        for (int x = 0; x < out_width; ++x) {
-          const float in_x = x * width_scale;
-          const int64 left_x_index = static_cast<int64>(floorf(in_x));
-          const int64 right_x_index =
-              std::min(static_cast<int64>(ceilf(in_x)), in_width - 1);
-          const float x_lerp = in_x - left_x_index;
-          for (int c = 0; c < channels; ++c) {
-            const float top_left(images(b, top_y_index, left_x_index, c));
-            const float top_right(images(b, top_y_index, right_x_index, c));
-            const float bottom_left(images(b, bottom_y_index, left_x_index, c));
-            const float bottom_right(
-                images(b, bottom_y_index, right_x_index, c));
-            const float top = top_left + (top_right - top_left) * x_lerp;
-            const float bottom =
-                bottom_left + (bottom_right - bottom_left) * x_lerp;
-            output(b, y, x, c) = top + (bottom - top) * y_lerp;
-          }
-        }
+    // Handle no-op resizes efficiently.
+    if (out_height == in_height && out_width == in_width) {
+      output = images.template cast<float>();
+      return;
+    }
+
+    const ImageScalePattern scale_pattern =
+        compute_image_scale_pattern(out_height, out_width, in_height, in_width);
+    std::vector<CachedInterpolation> ys(out_height + 1);
+    std::vector<CachedInterpolation> xs(out_width + 1);
+    std::vector<float> converted_image_v;
+
+    // Compute the cached interpolation weights on the x and y dimensions.
+    compute_interpolation_weights(scale_pattern, out_height, in_height,
+                                  height_scale, ys.data());
+    compute_interpolation_weights(scale_pattern, out_width, in_width,
+                                  width_scale, xs.data());
+
+    if (scale_pattern == SCALE_UP) {
+      for (int b = 0; b < batch_size; ++b) {
+        const float* converted_image = Converter<T>::convert_image_to_float(
+            images, b, in_height, in_width, channels, &converted_image_v);
+        scale_up_image(converted_image, b, out_height, out_width, channels,
+                       in_height, in_width, xs, ys, output);
+      }
+    } else if (scale_pattern == SCALE_DOWN) {
+      // Do not eagerly convert all input data points, as we ignore most.
+      scale_down_image<T>(images, batch_size, out_height, out_width, channels,
+                          xs, ys, output);
+    } else {
+      for (int b = 0; b < batch_size; ++b) {
+        const float* converted_image = Converter<T>::convert_image_to_float(
+            images, b, in_height, in_width, channels, &converted_image_v);
+        scale_similar_image(converted_image, b, out_height, out_width, channels,
+                            in_height, in_width, xs, ys, output);
       }
     }
   }
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc
index 32acdf2df8..a4f1120578 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -39,6 +40,74 @@ class ResizeBilinearOpTest : public OpsTestBase {
                      .Finalize(node_def()));
     TF_EXPECT_OK(InitOp());
   }
+
+  const Tensor* AddRandomImageInput(const TensorShape& shape) {
+    CHECK_GT(input_types_.size(), inputs_.size())
+        << "Adding more inputs than types; perhaps you need to call MakeOp";
+    CHECK_EQ(shape.dims(), 4) << "All images must have 4 dimensions.";
+    bool is_ref = IsRefType(input_types_[inputs_.size()]);
+    Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()),
+                               DataTypeToEnum<float>::v(), shape);
+    input->flat<float>().setRandom();
+    tensors_.push_back(input);
+    if (is_ref) {
+      CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]),
+               DataTypeToEnum<float>::v());
+      inputs_.push_back({&lock_for_refs_, input});
+    } else {
+      CHECK_EQ(input_types_[inputs_.size()], DataTypeToEnum<float>::v());
+      inputs_.push_back({nullptr, input});
+    }
+    return input;
+  }
+
+  // This is the straightforward unoptimized implementation of resize bilinear.
+  // We use this to confirm that the optimized version is exactly identical.
+  void ResizeBilinearBaseline(TTypes<float, 4>::ConstTensor images,
+                              TTypes<float, 4>::Tensor output) {
+    const int batch = images.dimension(0);
+    const int64 in_height = images.dimension(1);
+    const int64 in_width = images.dimension(2);
+    const int channels = images.dimension(3);
+
+    ASSERT_EQ(batch, output.dimension(0));
+    ASSERT_EQ(channels, output.dimension(3));
+
+    const int64 out_height = output.dimension(1);
+    const int64 out_width = output.dimension(2);
+
+    const float height_scale = in_height / static_cast<float>(out_height);
+    const float width_scale = in_width / static_cast<float>(out_width);
+
+    for (int b = 0; b < batch; ++b) {
+      for (int64 y = 0; y < out_height; ++y) {
+        const float in_y = y * height_scale;
+        const int64 top_y_index = static_cast<int64>(floorf(in_y));
+        const int64 bottom_y_index =
+            std::min(static_cast<int64>(ceilf(in_y)), in_height - 1);
+        const float y_lerp = in_y - top_y_index;
+        for (int64 x = 0; x < out_width; ++x) {
+          const float in_x = x * width_scale;
+          const int64 left_x_index = static_cast<int64>(floorf(in_x));
+          const int64 right_x_index =
+              std::min(static_cast<int64>(ceilf(in_x)), in_width - 1);
+          const float x_lerp = in_x - left_x_index;
+          for (int c = 0; c < channels; ++c) {
+            const float top_left = images(b, top_y_index, left_x_index, c);
+            const float top_right = images(b, top_y_index, right_x_index, c);
+            const float bottom_left =
+                images(b, bottom_y_index, left_x_index, c);
+            const float bottom_right =
+                images(b, bottom_y_index, right_x_index, c);
+            const float top = top_left + (top_right - top_left) * x_lerp;
+            const float bottom =
+                bottom_left + (bottom_right - bottom_left) * x_lerp;
+            output(b, y, x, c) = top + (bottom - top) * y_lerp;
+          }
+        }
+      }
+    }
+  }
 };
 
 class ResizeBilinearOpAlignCornersTest : public OpsTestBase {
@@ -68,6 +137,23 @@ TEST_F(ResizeBilinearOpTest, TestBilinear2x2To1x1) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(ResizeBilinearOpTest, TestBilinearRandom2x2To1x1) {
+  const Tensor* input = AddRandomImageInput(TensorShape({1, 2, 2, 1}));
+  AddInputFromArray<int32>(TensorShape({2}), {1, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // When scaling down, we have to arbitrarily pick a pixel from the
+  // original input. In this case, we choose the top-left pixel.
+  Tensor* output = GetOutput(0);
+  std::unique_ptr<Tensor> expected(
+      new Tensor(device_->GetAllocator(AllocatorAttributes()),
+                 DataTypeToEnum<float>::v(), TensorShape({1, 1, 1, 1})));
+  ResizeBilinearBaseline(input->tensor<float, 4>(),
+                         expected->tensor<float, 4>());
+  EXPECT_EQ(input->flat<float>()(0), output->flat<float>()(0));
+  test::ExpectTensorEqual<float>(*expected, *output);
+}
+
 TEST_F(ResizeBilinearOpAlignCornersTest, TestBilinearAlignCorners2x2To1x1) {
   // Input:
   //  1, 2
@@ -302,6 +388,62 @@ TEST_F(ResizeBilinearOpTest, TestBilinear2x2To4x4) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(ResizeBilinearOpTest, TestBilinearRandom183x299To299x299) {
+  const TensorShape shape({1, 183, 299, 1});
+  const Tensor* input = AddRandomImageInput(shape);
+  AddInputFromArray<int32>(TensorShape({2}), {299, 299});
+  TF_ASSERT_OK(RunOpKernel());
+
+  std::unique_ptr<Tensor> expected(
+      new Tensor(device_->GetAllocator(AllocatorAttributes()),
+                 DataTypeToEnum<float>::v(), TensorShape({1, 299, 299, 1})));
+  ResizeBilinearBaseline(input->tensor<float, 4>(),
+                         expected->tensor<float, 4>());
+  test::ExpectTensorEqual<float>(*expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinearRandom141x186To299x299) {
+  const TensorShape shape({1, 141, 186, 1});
+  const Tensor* input = AddRandomImageInput(shape);
+  AddInputFromArray<int32>(TensorShape({2}), {299, 299});
+  TF_ASSERT_OK(RunOpKernel());
+
+  std::unique_ptr<Tensor> expected(
+      new Tensor(device_->GetAllocator(AllocatorAttributes()),
+                 DataTypeToEnum<float>::v(), TensorShape({1, 299, 299, 1})));
+  ResizeBilinearBaseline(input->tensor<float, 4>(),
+                         expected->tensor<float, 4>());
+  test::ExpectTensorEqual<float>(*expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinearRandom749x603To299x299) {
+  const TensorShape shape({1, 749, 603, 1});
+  const Tensor* input = AddRandomImageInput(shape);
+  AddInputFromArray<int32>(TensorShape({2}), {299, 299});
+  TF_ASSERT_OK(RunOpKernel());
+
+  std::unique_ptr<Tensor> expected(
+      new Tensor(device_->GetAllocator(AllocatorAttributes()),
+                 DataTypeToEnum<float>::v(), TensorShape({1, 299, 299, 1})));
+  ResizeBilinearBaseline(input->tensor<float, 4>(),
+                         expected->tensor<float, 4>());
+  test::ExpectTensorEqual<float>(*expected, *GetOutput(0));
+}
+
+TEST_F(ResizeBilinearOpTest, TestBilinearRandom299x299To299x299) {
+  const TensorShape shape({1, 299, 299, 1});
+  const Tensor* input = AddRandomImageInput(shape);
+  AddInputFromArray<int32>(TensorShape({2}), {299, 299});
+  TF_ASSERT_OK(RunOpKernel());
+
+  std::unique_ptr<Tensor> expected(
+      new Tensor(device_->GetAllocator(AllocatorAttributes()),
+                 DataTypeToEnum<float>::v(), TensorShape({1, 299, 299, 1})));
+  ResizeBilinearBaseline(input->tensor<float, 4>(),
+                         expected->tensor<float, 4>());
+  test::ExpectTensorEqual<float>(*expected, *GetOutput(0));
+}
+
 TEST_F(ResizeBilinearOpTest, TestInvalidOutputSize) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 6c4f20a23a..6a3f3dfc77 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -1386,21 +1386,21 @@ void wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
 void wrapper_libxsmm_spmdm_compute_generic_thread(
     empty_type_wrapper<bfloat16>, const libxsmm_spmdm_handle* handle,
     char transA, char transB, const bfloat16* alpha,
-    libxsmm_CSR_sparseslice* A_sparse, const bfloat16* B, const bfloat16* beta,
-    float* C, int block_id, int tid, int nthreads) {
+    libxsmm_CSR_sparseslice* A_sparse, const bfloat16* B, char transC,
+    const bfloat16* beta, float* C, int block_id, int tid, int nthreads) {
   return libxsmm_spmdm_compute_bfloat16_thread(
       handle, transA, transB, reinterpret_cast<const uint16*>(alpha), A_sparse,
-      reinterpret_cast<const uint16*>(B), 'N', reinterpret_cast<const uint16*>(beta),
-      C, block_id, tid, nthreads);
+      reinterpret_cast<const uint16*>(B), transC,
+      reinterpret_cast<const uint16*>(beta), C, block_id, tid, nthreads);
 }
 void wrapper_libxsmm_spmdm_compute_generic_thread(
     empty_type_wrapper<float>, const libxsmm_spmdm_handle* handle, char transA,
     char transB, const float* alpha, libxsmm_CSR_sparseslice* A_sparse,
-    const float* B, const float* beta, float* C, int block_id, int tid,
-    int nthreads) {
+    const float* B, char transC, const float* beta, float* C, int block_id,
+    int tid, int nthreads) {
   return libxsmm_spmdm_compute_fp32_thread(handle, transA, transB, alpha,
-                                           A_sparse, B, 'N', beta, C, block_id, tid,
-                                           nthreads);
+                                           A_sparse, B, transC, beta, C,
+                                           block_id, tid, nthreads);
 }
 
 class PinnedToCurrentCPU {
@@ -1438,7 +1438,7 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
     const typename LibxsmmSparseMatMul<TL, TR>::ConstMatrixMapR& right,
     bool transpose_left, const DeviceBase::CpuWorkerThreads* thread_pool,
     bool transpose_output, MatrixMap* output) {
-  if (transpose_output || transpose_left) {
+  if (false) {
     // Not handled by libxsmm currently
     SparseMatMul<TL, TR>::Compute(
         nullptr /* Assumes no cached data for fallback */, left, right,
@@ -1455,7 +1455,6 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
            (transpose_output ? output->dimension(1) : output->dimension(0)));
   CHECK_EQ(right_dim1,
            (transpose_output ? output->dimension(0) : output->dimension(1)));
-  CHECK(!transpose_output);
   if (left_dim0 < 32 || left_dim1 < 32 || right_dim1 < 32) {
     // Causes problems in libxsmm
     SparseMatMul<TL, TR>::Compute(
@@ -1482,7 +1481,7 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
       if (work_item >= total_num_creation_blocks) break;
       wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
           empty_type_wrapper<TL>{}, &entry->handle,
-          (transpose_left ? 'T' : 'N'), left_data, entry->output_csr, work_item,
+          (transpose_left ? 'Y' : 'N'), left_data, entry->output_csr, work_item,
           i, num_threads);
     }
   });
@@ -1504,8 +1503,9 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
       const TL beta(0.0);   // Stored in a variable so we can get a pointer
       wrapper_libxsmm_spmdm_compute_generic_thread(
           empty_type_wrapper<TL>{}, &entry->handle,
-          (transpose_left ? 'T' : 'N'), 'N', &alpha, entry->output_csr,
-          right_data, &beta, output_data, work_item, i, num_threads);
+          (transpose_left ? 'Y' : 'N'), 'N', &alpha, entry->output_csr,
+          right_data, (transpose_output ? 'Y' : 'N'), &beta, output_data,
+          work_item, i, num_threads);
     }
   });
   // Put handle + CSR storage back into cache
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index d8d8831702..2839c3d8cf 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -53,29 +53,29 @@ class VariableOp : public OpKernel {
     dtype_ = RemoveRefType(context->output_type(0));
   }
 
-  ~VariableOp() override {
-    if (var_) var_->Unref();
-  }
-
   void Compute(OpKernelContext* ctx) override {
     mutex_lock l(init_mu_);
-    if (var_ == nullptr) {
-      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
-                                      true /* use name() */));
-      auto creator = [this](Var** var) {
-        *var = new Var(dtype_);
-        (*var)->tensor()->set_shape(shape_);
-        return Status::OK();
-      };
-      OP_REQUIRES_OK(ctx,
-                     cinfo_.resource_manager()->LookupOrCreate<Var>(
-                         cinfo_.container(), cinfo_.name(), &var_, creator));
+    if (!initialized_) {
+      OP_REQUIRES_OK(
+          ctx,
+          cinfo_.Init(ctx->resource_manager(), def(), true /* use name() */));
+      initialized_ = true;
     }
+    auto creator = [this](Var** var) {
+      *var = new Var(dtype_);
+      (*var)->tensor()->set_shape(shape_);
+      return Status::OK();
+    };
+    Var* var;
+    OP_REQUIRES_OK(ctx,
+                   cinfo_.resource_manager()->LookupOrCreate<Var>(
+                       cinfo_.container(), cinfo_.name(), &var, creator));
     // Output a reference to our tensor, so it may be updated.
     //
-    // As long as *this is alive, the ref we return here is valid
-    // because *this owns a ref on var_.
-    ctx->set_output_ref(0, var_->mu(), var_->tensor());
+    // As long as the resource manager hasn't been cleared, the ref we return
+    // here is valid because the resource manager owns a ref on var.
+    ctx->set_output_ref(0, var->mu(), var->tensor());
+    var->Unref();
   }
 
  private:
@@ -84,7 +84,7 @@ class VariableOp : public OpKernel {
 
   mutex init_mu_;
   ContainerInfo cinfo_ GUARDED_BY(init_mu_);
-  Var* var_ GUARDED_BY(init_mu_) = nullptr;
+  bool initialized_ GUARDED_BY(init_mu_){false};
 
   TF_DISALLOW_COPY_AND_ASSIGN(VariableOp);
 };
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 7ce667675d..d61e7b32de 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -1226,11 +1226,9 @@ Equivalent to np.full
 
 // --------------------------------------------------------------------------
 REGISTER_OP("_ParallelConcatStart")
-    .Input("shape: Tshape")
     .Output("output: dtype")
+    .Attr("shape: shape")
     .Attr("dtype: type")
-    .Attr("Tshape: {int32, int64} = DT_INT32")
-    .Attr("init: bool = false")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle out;
@@ -1246,44 +1244,27 @@ conjunction with inplace operations.
 
 shape: 1-D `Tensor` indicating the shape of the output.
 dtype: The element type of the returned tensor.
-init: `bool` indicating whether or not to zero the allocated memory.
 output: An empty Tensor of the specified type.
 )doc");
 
 // --------------------------------------------------------------------------
 REGISTER_OP("_ParallelConcatUpdate")
     .Input("value: T")
-    .Input("loc: Tshape")
     .Input("update: T")
     .Output("output: T")
     .Attr("T: type")
-    .Attr("Tshape: {int32, int64} = DT_INT32")
+    .Attr("loc: int")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Updates input `value` at `loc` with `update`.
 
-If `loc` is None, `value` and `update` must be the same size.
-```
-value = update
-```
-
-If `loc` is a scalar, `value` has rank 1 higher than `update`
-```
-value[i, :] = update
-```
-
-If `loc` is a vector, `value` has the same rank as `update`
-```
-value[loc, :] = update
-```
-
 If you use this function you will almost certainly want to add
 a control dependency as done in the implementation of parallel_stack to
 avoid race conditions.
 
 value: A `Tensor` object that will be updated in-place.
-loc: A scalar or 1-D `Tensor` indicating the indices of the first dimension
-     such that value[loc, :] is updated.
+loc: A scalar indicating the index of the first dimension such that
+         value[loc, :] is updated.
 update: A `Tensor` of rank one less than `value` if `loc` is a scalar,
         otherwise of rank equal to `value` that contains the new values
         for `value`.
@@ -1917,7 +1898,7 @@ This op first slices `input` along the dimension `batch_dim`, and for each
 slice `i`, reverses the first `seq_lengths[i]` elements along
 the dimension `seq_dim`.
 
-The elements of `seq_lengths` must obey `seq_lengths[i] < input.dims[seq_dim]`,
+The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
 and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
 
 The output slice `i` along dimension `batch_dim` is then given by input
@@ -1970,7 +1951,7 @@ output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
 
 input: The input to reverse.
 seq_lengths: 1-D with length `input.dims(batch_dim)` and
-  `max(seq_lengths) < input.dims(seq_dim)`
+  `max(seq_lengths) <= input.dims(seq_dim)`
 seq_dim: The dimension which is partially reversed.
 batch_dim: The dimension along which reversal is performed.
 output: The partially reversed input. It has the same shape as `input`.
diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt
index cfb7504664..49297ae409 100644
--- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt
@@ -23219,6 +23219,53 @@ op {
   }
 }
 op {
+  name: "RecordInput"
+  output_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  attr {
+    name: "file_pattern"
+    type: "string"
+  }
+  attr {
+    name: "file_random_seed"
+    type: "int"
+    default_value {
+      i: 301
+    }
+  }
+  attr {
+    name: "file_shuffle_shift_ratio"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "file_buffer_size"
+    type: "int"
+    default_value {
+      i: 10000
+    }
+  }
+  attr {
+    name: "file_parallelism"
+    type: "int"
+    default_value {
+      i: 16
+    }
+  }
+  attr {
+    name: "batch_size"
+    type: "int"
+    default_value {
+      i: 32
+    }
+  }
+  is_stateful: true
+}
+op {
   name: "ReduceJoin"
   input_arg {
     name: "inputs"
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index 54e766e8e9..a19d9483a1 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -2211,4 +2211,27 @@ dequeue with many fewer capabilities and options.  This Op is optimized for
 performance.
     )doc");
 
+REGISTER_OP("RecordInput")
+    .Output("records: string")
+    .Attr("file_pattern: string")
+    .Attr("file_random_seed: int = 301")
+    .Attr("file_shuffle_shift_ratio: float = 0")
+    .Attr("file_buffer_size: int = 10000")
+    .Attr("file_parallelism: int = 16")
+    .Attr("batch_size: int = 32")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Emits randomized records.
+
+records: A tensor of shape [batch_size].
+file_pattern: Glob pattern for the data files.
+file_random_seed: Random seeds used to produce randomized records.
+file_shuffle_shift_ratio: Shifts the list of files after the list is randomly
+    shuffled.
+file_buffer_size: The randomization shuffling buffer.
+file_parallelism: How many sstables are opened and concurrently iterated over.
+batch_size: The batch size.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index e631c289c6..937e9f588c 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -15028,6 +15028,61 @@ op {
   description: "Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`\nis the corresponding input gradient."
 }
 op {
+  name: "RecordInput"
+  output_arg {
+    name: "records"
+    description: "A tensor of shape [batch_size]."
+    type: DT_STRING
+  }
+  attr {
+    name: "file_pattern"
+    type: "string"
+    description: "Glob pattern for the data files."
+  }
+  attr {
+    name: "file_random_seed"
+    type: "int"
+    default_value {
+      i: 301
+    }
+    description: "Random seeds used to produce randomized records."
+  }
+  attr {
+    name: "file_shuffle_shift_ratio"
+    type: "float"
+    default_value {
+      f: 0
+    }
+    description: "Shifts the list of files after the list is randomly\nshuffled."
+  }
+  attr {
+    name: "file_buffer_size"
+    type: "int"
+    default_value {
+      i: 10000
+    }
+    description: "The randomization shuffling buffer."
+  }
+  attr {
+    name: "file_parallelism"
+    type: "int"
+    default_value {
+      i: 16
+    }
+    description: "How many sstables are opened and concurrently iterated over."
+  }
+  attr {
+    name: "batch_size"
+    type: "int"
+    default_value {
+      i: 32
+    }
+    description: "The batch size."
+  }
+  summary: "Emits randomized records."
+  is_stateful: true
+}
+op {
   name: "ReduceJoin"
   input_arg {
     name: "inputs"
@@ -17453,7 +17508,7 @@ op {
   }
   input_arg {
     name: "seq_lengths"
-    description: "1-D with length `input.dims(batch_dim)` and\n`max(seq_lengths) < input.dims(seq_dim)`"
+    description: "1-D with length `input.dims(batch_dim)` and\n`max(seq_lengths) <= input.dims(seq_dim)`"
     type_attr: "Tlen"
   }
   output_arg {
@@ -17492,7 +17547,7 @@ op {
     }
   }
   summary: "Reverses variable length slices."
-  description: "This op first slices `input` along the dimension `batch_dim`, and for each\nslice `i`, reverses the first `seq_lengths[i]` elements along\nthe dimension `seq_dim`.\n\nThe elements of `seq_lengths` must obey `seq_lengths[i] < input.dims[seq_dim]`,\nand `seq_lengths` must be a vector of length `input.dims[batch_dim]`.\n\nThe output slice `i` along dimension `batch_dim` is then given by input\nslice `i`, with the first `seq_lengths[i]` slices along dimension\n`seq_dim` reversed.\n\nFor example:\n\n```prettyprint\n# Given this:\nbatch_dim = 0\nseq_dim = 1\ninput.dims = (4, 8, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]\noutput[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]\noutput[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]\noutput[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[0, 7:, :, ...] = input[0, 7:, :, ...]\noutput[1, 2:, :, ...] = input[1, 2:, :, ...]\noutput[2, 3:, :, ...] = input[2, 3:, :, ...]\noutput[3, 2:, :, ...] = input[3, 2:, :, ...]\n```\n\nIn contrast, if:\n\n```prettyprint\n# Given this:\nbatch_dim = 2\nseq_dim = 0\ninput.dims = (8, ?, 4, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]\noutput[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]\noutput[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]\noutput[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]\noutput[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]\noutput[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]\noutput[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]\n```"
+  description: "This op first slices `input` along the dimension `batch_dim`, and for each\nslice `i`, reverses the first `seq_lengths[i]` elements along\nthe dimension `seq_dim`.\n\nThe elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,\nand `seq_lengths` must be a vector of length `input.dims[batch_dim]`.\n\nThe output slice `i` along dimension `batch_dim` is then given by input\nslice `i`, with the first `seq_lengths[i]` slices along dimension\n`seq_dim` reversed.\n\nFor example:\n\n```prettyprint\n# Given this:\nbatch_dim = 0\nseq_dim = 1\ninput.dims = (4, 8, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]\noutput[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]\noutput[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]\noutput[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[0, 7:, :, ...] = input[0, 7:, :, ...]\noutput[1, 2:, :, ...] = input[1, 2:, :, ...]\noutput[2, 3:, :, ...] = input[2, 3:, :, ...]\noutput[3, 2:, :, ...] = input[3, 2:, :, ...]\n```\n\nIn contrast, if:\n\n```prettyprint\n# Given this:\nbatch_dim = 2\nseq_dim = 0\ninput.dims = (8, ?, 4, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]\noutput[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]\noutput[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]\noutput[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]\noutput[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]\noutput[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]\noutput[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]\n```"
 }
 op {
   name: "ReverseV2"
diff --git a/tensorflow/core/platform/profile_utils/clock_cycle_profiler.cc b/tensorflow/core/platform/profile_utils/clock_cycle_profiler.cc
new file mode 100644
index 0000000000..6f852a653f
--- /dev/null
+++ b/tensorflow/core/platform/profile_utils/clock_cycle_profiler.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/profile_utils/clock_cycle_profiler.h"
+
+#include <chrono>
+
+namespace tensorflow {
+
+void ClockCycleProfiler::DumpStatistics(const string& tag) {
+  CHECK(!IsStarted());
+  const double average_clock_cycle = GetAverageClockCycle();
+  const double count = GetCount();
+  const std::chrono::duration<double> average_time =
+      profile_utils::CpuUtils::ConvertClockCycleToTime(
+          static_cast<int64>(average_clock_cycle + 0.5));
+  LOG(INFO) << tag << ": average = "
+            << std::chrono::duration_cast<std::chrono::microseconds>(
+                   average_time)
+                   .count()
+            << " us (" << average_clock_cycle << " cycles)"
+            << ", count = " << count;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h b/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h
new file mode 100644
index 0000000000..876bb9c020
--- /dev/null
+++ b/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h
@@ -0,0 +1,104 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_
+#define TENSORFLOW_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/profile_utils/cpu_utils.h"
+
+namespace tensorflow {
+
+class ClockCycleProfiler {
+ public:
+  ClockCycleProfiler() = default;
+
+  // Begins a measurement interval. CHECK-fails if one is already running.
+  inline void Start() {
+    CHECK(!IsStarted()) << "Profiler has been already started.";
+    start_clock_ = GetCurrentClockCycleInternal();
+  }
+
+  // Ends the running interval and folds its length into the statistics.
+  inline void Stop() {
+    CHECK(IsStarted()) << "Profiler is not started yet.";
+    AccumulateClockCycle();
+  }
+
+  // Get how many Start()/Stop() intervals have completed.
+  inline double GetCount() {
+    CHECK(!IsStarted());
+    return count_;
+  }
+
+  // Get average clock cycles per completed interval.
+  inline double GetAverageClockCycle() {
+    CHECK(!IsStarted());
+    return average_clock_cycle_;
+  }
+
+  // TODO(satok): Support more statistics (e.g. standard deviation)
+  // Get worst (largest) clock cycle count of any completed interval.
+  inline double GetWorstClockCycle() {
+    CHECK(!IsStarted());
+    return worst_clock_cycle_;
+  }
+
+  // Log the collected statistics, labelled with `tag`.
+  void DumpStatistics(const string& tag);
+
+ private:
+  // Reads the CPU cycle counter. A reading of 0 is replaced by 1 (warning
+  // once), since start_clock_ == 0 is reserved to mean "not started".
+  inline uint64 GetCurrentClockCycleInternal() {
+    const uint64 clock_cycle = profile_utils::CpuUtils::GetCurrentClockCycle();
+    if (clock_cycle != 0) return clock_cycle;
+    if (valid_) {
+      LOG(WARNING) << "GetCurrentClockCycle is not implemented."
+                   << " Return 1 instead.";
+      valid_ = false;
+    }
+    return 1;
+  }
+
+  inline bool IsStarted() const { return start_clock_ > 0; }
+
+  // Folds the elapsed interval into count/mean/max and marks profiler stopped.
+  inline void AccumulateClockCycle() {
+    const uint64 now = GetCurrentClockCycleInternal();
+    const double clock_diff = static_cast<double>(now - start_clock_);
+    const double next_count = count_ + 1.0;
+    const double next_count_inv = 1.0 / next_count;
+    const double next_ave_cpu_clock =
+        next_count_inv * (average_clock_cycle_ * count_ + clock_diff);
+    count_ = next_count;
+    average_clock_cycle_ = next_ave_cpu_clock;
+    if (clock_diff > worst_clock_cycle_) worst_clock_cycle_ = clock_diff;
+    start_clock_ = 0;
+  }
+
+  uint64 start_clock_{0};
+  double count_{0.0};
+  double average_clock_cycle_{0.0};
+  double worst_clock_cycle_{0.0};
+  bool valid_{true};
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ClockCycleProfiler);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
index 7cbd994661..fccc4d38a7 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/profile_utils/cpu_utils.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/profile_utils/clock_cycle_profiler.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -68,5 +69,18 @@ TEST_F(CpuUtilsTest, CheckMicroSecPerClock) {
   }
 }
 
+TEST_F(CpuUtilsTest, SimpleUsageOfClockCycleProfiler) {
+  // Each Start()/Stop() pair is expected to be counted as exactly one sample.
+  static constexpr int kLoopCount = 10;
+  ClockCycleProfiler profiler;
+  for (int iter = 0; iter != kLoopCount; ++iter) {
+    profiler.Start();
+    profiler.Stop();
+  }
+  // GetCount() returns a double, so round to nearest before comparing.
+  EXPECT_EQ(kLoopCount, static_cast<int>(profiler.GetCount() + 0.5));
+  if (DBG) profiler.DumpStatistics("CpuUtilsTest");
+}
+
 }  // namespace profile_utils
 }  // namespace tensorflow
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index 0c1cea5fc3..c795ba67a8 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -39,7 +39,6 @@ cc_binary(
         "notap",
     ],
     deps = [
-        ":demo_proto_lib_cc",
         "//tensorflow/contrib/android:android_tensorflow_inference_jni",
         "//tensorflow/core:android_tensorflow_lib",
         LINKER_SCRIPT,
@@ -118,20 +117,3 @@ filegroup(
 )
 
 exports_files(["AndroidManifest.xml"])
-
-load(
-    "//tensorflow/core:platform/default/build_config.bzl",
-    "tf_proto_library",
-)
-
-tf_proto_library(
-    name = "demo_proto_lib",
-    srcs = glob(
-        ["**/*.proto"],
-    ),
-    cc_api_version = 2,
-    visibility = ["//visibility:public"],
-)
-
-# -----------------------------------------------------------------------------
-# Google-internal targets go here (must be at the end).
diff --git a/tensorflow/examples/android/jni/box_coder_jni.cc b/tensorflow/examples/android/jni/box_coder_jni.cc
deleted file mode 100644
index be85414fc1..0000000000
--- a/tensorflow/examples/android/jni/box_coder_jni.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// This file loads the box coder mappings.
-
-#include <android/asset_manager.h>
-#include <android/asset_manager_jni.h>
-#include <android/bitmap.h>
-
-#include <jni.h>
-#include <pthread.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <map>
-#include <queue>
-#include <sstream>
-#include <string>
-
-#include "tensorflow/contrib/android/jni/jni_utils.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/types.h"
-
-#include "tensorflow/examples/android/proto/box_coder.pb.h"
-
-#define TENSORFLOW_METHOD(METHOD_NAME) \
-  Java_org_tensorflow_demo_TensorFlowMultiBoxDetector_##METHOD_NAME  // NOLINT
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-JNIEXPORT void JNICALL TENSORFLOW_METHOD(loadCoderOptions)(
-    JNIEnv* env, jobject thiz, jobject java_asset_manager, jstring location,
-    jfloatArray priors);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-
-JNIEXPORT void JNICALL TENSORFLOW_METHOD(loadCoderOptions)(
-    JNIEnv* env, jobject thiz, jobject java_asset_manager, jstring location,
-    jfloatArray priors) {
-  AAssetManager* const asset_manager =
-      AAssetManager_fromJava(env, java_asset_manager);
-  LOG(INFO) << "Acquired AssetManager.";
-
-  const std::string location_str = GetString(env, location);
-
-  org_tensorflow_demo::MultiBoxCoderOptions multi_options;
-
-  LOG(INFO) << "Reading file to proto: " << location_str;
-  ReadFileToProtoOrDie(asset_manager, location_str.c_str(), &multi_options);
-
-  LOG(INFO) << "Read file. " << multi_options.box_coder_size() << " entries.";
-
-  jboolean iCopied = JNI_FALSE;
-  jfloat* values = env->GetFloatArrayElements(priors, &iCopied);
-
-  const int array_length = env->GetArrayLength(priors);
-  LOG(INFO) << "Array length: " << array_length
-            << " (/8 = " << (array_length / 8) << ")";
-  CHECK_EQ(array_length % 8, 0);
-
-  const int num_items =
-      std::min(array_length / 8, multi_options.box_coder_size());
-
-  for (int i = 0; i < num_items; ++i) {
-    const org_tensorflow_demo::BoxCoderOptions& options =
-        multi_options.box_coder(i);
-
-    for (int j = 0; j < 4; ++j) {
-      const org_tensorflow_demo::BoxCoderPrior& prior = options.priors(j);
-      values[i * 8 + j * 2] = prior.mean();
-      values[i * 8 + j * 2 + 1] = prior.stddev();
-    }
-  }
-  env->ReleaseFloatArrayElements(priors, values, 0);
-
-  LOG(INFO) << "Read " << num_items << " options";
-}
diff --git a/tensorflow/examples/android/proto/box_coder.proto b/tensorflow/examples/android/proto/box_coder.proto
deleted file mode 100644
index 8576294110..0000000000
--- a/tensorflow/examples/android/proto/box_coder.proto
+++ /dev/null
@@ -1,42 +0,0 @@
-syntax = "proto2";
-
-package org_tensorflow_demo;
-
-// Prior for a single feature (like minimum x coordinate, width, area, etc.)
-message BoxCoderPrior {
-  optional float mean = 1 [default = 0.0];
-  optional float stddev = 2 [default = 1.0];
-};
-
-// Box encoding/decoding configuration for a single box.
-message BoxCoderOptions {
-  // Number of priors must match the number of values used to encoded
-  // values which is derived from the use_... flags below.
-  repeated BoxCoderPrior priors = 1;
-
-  // Minimum/maximum X/Y of the four corners are used as features.
-  // Order: MinX, MinY, MaxX, MaxY.
-  // Number of values: 4.
-  optional bool use_corners = 2 [default = true];
-
-  // Width and height of the box in this order.
-  // Number of values: 2.
-  optional bool use_width_height = 3 [default = false];
-
-  // Coordinates of the center of the box.
-  // Order: X, Y.
-  // Number of values: 2.
-  optional bool use_center = 4 [default = false];
-
-  // Area of the box.
-  // Number of values: 1.
-  optional bool use_area = 5 [default = false];
-};
-
-// Options for MultiBoxCoder which is a encoder/decoder for a fixed number of
-// boxes.
-// A list of BoxCoderOptions that allows for storing multiple box coder options
-// in a single file.
-message MultiBoxCoderOptions {
-  repeated BoxCoderOptions box_coder = 1;
-};
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
index 9ab5a7108a..d06f2d3c0f 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
@@ -60,7 +60,7 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
   private static final String MB_OUTPUT_NAMES = "output_locations/Reshape,output_scores/Reshape";
   private static final String MB_MODEL_FILE = "file:///android_asset/multibox_model.pb";
   private static final String MB_LOCATION_FILE =
-      "file:///android_asset/multibox_location_priors.pb";
+      "file:///android_asset/multibox_location_priors.txt";
 
   // Configuration values for tiny-yolo-voc. Note that the graph is not included with TensorFlow and
   // must be manually placed in the assets/ directory by the user.
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java
index e438956c7d..34a4361626 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java
@@ -19,10 +19,16 @@ import android.content.res.AssetManager;
 import android.graphics.Bitmap;
 import android.graphics.RectF;
 import android.os.Trace;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;
 import java.util.PriorityQueue;
+import java.util.StringTokenizer;
 import org.tensorflow.contrib.android.TensorFlowInferenceInterface;
 import org.tensorflow.demo.env.Logger;
 
@@ -80,7 +86,7 @@ public class TensorFlowMultiBoxDetector implements Classifier {
       final float imageStd,
       final String inputName,
       final String outputName) {
-    TensorFlowMultiBoxDetector d = new TensorFlowMultiBoxDetector();
+    final TensorFlowMultiBoxDetector d = new TensorFlowMultiBoxDetector();
     d.inputName = inputName;
     d.inputSize = inputSize;
     d.imageMean = imageMean;
@@ -89,7 +95,11 @@ public class TensorFlowMultiBoxDetector implements Classifier {
 
     d.boxPriors = new float[numLocations * 8];
 
-    d.loadCoderOptions(assetManager, locationFilename, d.boxPriors);
+    try {
+      d.loadCoderOptions(assetManager, locationFilename, d.boxPriors);
+    } catch (final IOException e) {
+      throw new RuntimeException("Error initializing box priors from " + locationFilename, e);
+    }
 
     // Pre-allocate buffers.
     d.outputNames = outputName.split(",");
@@ -110,9 +120,42 @@ public class TensorFlowMultiBoxDetector implements Classifier {
 
   private TensorFlowMultiBoxDetector() {}
 
-  // Load BoxCoderOptions from native code.
-  private native void loadCoderOptions(
-      AssetManager assetManager, String locationFilename, float[] boxPriors);
+  private void loadCoderOptions(
+      final AssetManager assetManager, final String locationFilename, final float[] boxPriors)
+      throws IOException {
+    // Open from assets or the filesystem depending on prefix (plain prefix strip, no regex).
+    final String assetPrefix = "file:///android_asset/";
+    final InputStream is = locationFilename.startsWith(assetPrefix)
+        ? assetManager.open(locationFilename.substring(assetPrefix.length()))
+        : new FileInputStream(locationFilename);
+    // Values are separated by commas and/or spaces; exactly boxPriors.length are expected,
+    // ordered mean, std for each consecutive corner of each box (8 per location).
+    final BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+    int priorIndex = 0;
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        final StringTokenizer st = new StringTokenizer(line, ", ");
+        while (st.hasMoreTokens()) {
+          try {
+            final float number = Float.parseFloat(st.nextToken());
+            if (priorIndex < boxPriors.length) {
+              boxPriors[priorIndex] = number;
+            }
+            ++priorIndex; // Counted even when out of range so the check below reports it.
+          } catch (final NumberFormatException e) {
+            // Silently ignore.
+          }
+        }
+      }
+    } finally {
+      reader.close(); // Also closes the wrapped InputStream.
+    }
+    if (priorIndex != boxPriors.length) {
+      throw new RuntimeException(
+          "BoxPrior length mismatch: " + priorIndex + " vs " + boxPriors.length);
+    }
+  }
 
   private float[] decodeLocationsEncoding(final float[] locationEncoding) {
     final float[] locations = new float[locationEncoding.length];
@@ -216,7 +259,7 @@ public class TensorFlowMultiBoxDetector implements Classifier {
   }
 
   @Override
-  public void enableStatLogging(boolean debug) {
+  public void enableStatLogging(final boolean debug) {
     inferenceInterface.enableStatLogging(debug);
   }
 
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index 544b1b2738..08e6e4544a 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -97,7 +97,7 @@ Status ReadTensorFromImageFile(string file_name, const int input_height,
                                                file_name);
   // Now try to figure out what kind of file it is and decode it.
   const int wanted_channels = 3;
-  Output image_reader;
+  tensorflow::Output image_reader;
   if (tensorflow::StringPiece(file_name).ends_with(".png")) {
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
diff --git a/tensorflow/g3doc/api_docs/python/array_ops.md b/tensorflow/g3doc/api_docs/python/array_ops.md
index 2dcf6bcca6..cb30382c6b 100644
--- a/tensorflow/g3doc/api_docs/python/array_ops.md
+++ b/tensorflow/g3doc/api_docs/python/array_ops.md
@@ -1109,7 +1109,7 @@ This op first slices `input` along the dimension `batch_axis`, and for each
 slice `i`, reverses the first `seq_lengths[i]` elements along
 the dimension `seq_axis`.
 
-The elements of `seq_lengths` must obey `seq_lengths[i] < input.dims[seq_dim]`,
+The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
 and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
 
 The output slice `i` along dimension `batch_axis` is then given by input
@@ -1166,7 +1166,7 @@ output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
 *  <b>`input`</b>: A `Tensor`. The input to reverse.
 *  <b>`seq_lengths`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
     1-D with length `input.dims(batch_dim)` and
-    `max(seq_lengths) < input.dims(seq_dim)`
+    `max(seq_lengths) <= input.dims(seq_dim)`
 *  <b>`seq_axis`</b>: An `int`. The dimension which is partially reversed.
 *  <b>`batch_axis`</b>: An optional `int`. Defaults to `0`.
     The dimension along which reversal is performed.
diff --git a/tensorflow/g3doc/api_docs/python/contrib.graph_editor.md b/tensorflow/g3doc/api_docs/python/contrib.graph_editor.md
index 303a99020c..b5aae70911 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.graph_editor.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.graph_editor.md
@@ -1794,6 +1794,9 @@ This handler is typically used to transform a hidden input tensors.
 
 Add the transformed elem to the (renamed) collections of elem.
 
+A collection is renamed only if it is not a known key, as described in
+`tf.GraphKeys`.
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.md b/tensorflow/g3doc/api_docs/python/contrib.learn.md
index 1e515d6490..fb790e2f1e 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.learn.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.learn.md
@@ -485,22 +485,19 @@ The signature of the input_fn accepted by export is changing to be consistent wi
 
 - - -
 
-#### `tf.contrib.learn.Estimator.export_savedmodel(*args, **kwargs)` {#Estimator.export_savedmodel}
-
-Exports inference graph as a SavedModel into given dir. (experimental)
-
-THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning.
+#### `tf.contrib.learn.Estimator.export_savedmodel(export_dir_base, serving_input_fn, default_output_alternative_key=None, assets_extra=None, as_text=False)` {#Estimator.export_savedmodel}
 
+Exports inference graph as a SavedModel into given dir.
 
 ##### Args:
 
 
 *  <b>`export_dir_base`</b>: A string containing a directory to write the exported
     graph and checkpoints.
-*  <b>`input_fn`</b>: A function that takes no argument and
+*  <b>`serving_input_fn`</b>: A function that takes no argument and
     returns an `InputFnOps`.
 *  <b>`default_output_alternative_key`</b>: the name of the head to serve when none is
-    specified.
+    specified.  Not needed for single-headed models.
 *  <b>`assets_extra`</b>: A dict specifying how to populate the assets.extra directory
     within the exported SavedModel.  Each key should give the destination
     path (including the filename) relative to the assets.extra directory.
@@ -509,7 +506,6 @@ THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and with
     renaming it is specified as
     `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
 *  <b>`as_text`</b>: whether to write the SavedModel proto in text format.
-*  <b>`exports_to_keep`</b>: Number of exports to keep.
 
 ##### Returns:
 
@@ -1038,22 +1034,19 @@ See BaseEstimator.export.
 
 - - -
 
-#### `tf.contrib.learn.DNNClassifier.export_savedmodel(*args, **kwargs)` {#DNNClassifier.export_savedmodel}
-
-Exports inference graph as a SavedModel into given dir. (experimental)
-
-THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning.
+#### `tf.contrib.learn.DNNClassifier.export_savedmodel(export_dir_base, serving_input_fn, default_output_alternative_key=None, assets_extra=None, as_text=False)` {#DNNClassifier.export_savedmodel}
 
+Exports inference graph as a SavedModel into given dir.
 
 ##### Args:
 
 
 *  <b>`export_dir_base`</b>: A string containing a directory to write the exported
     graph and checkpoints.
-*  <b>`input_fn`</b>: A function that takes no argument and
+*  <b>`serving_input_fn`</b>: A function that takes no argument and
     returns an `InputFnOps`.
 *  <b>`default_output_alternative_key`</b>: the name of the head to serve when none is
-    specified.
+    specified.  Not needed for single-headed models.
 *  <b>`assets_extra`</b>: A dict specifying how to populate the assets.extra directory
     within the exported SavedModel.  Each key should give the destination
     path (including the filename) relative to the assets.extra directory.
@@ -1062,7 +1055,6 @@ THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and with
     renaming it is specified as
     `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
 *  <b>`as_text`</b>: whether to write the SavedModel proto in text format.
-*  <b>`exports_to_keep`</b>: Number of exports to keep.
 
 ##### Returns:
 
@@ -1466,22 +1458,19 @@ See BaseEstimator.export.
 
 - - -
 
-#### `tf.contrib.learn.DNNRegressor.export_savedmodel(*args, **kwargs)` {#DNNRegressor.export_savedmodel}
-
-Exports inference graph as a SavedModel into given dir. (experimental)
-
-THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning.
+#### `tf.contrib.learn.DNNRegressor.export_savedmodel(export_dir_base, serving_input_fn, default_output_alternative_key=None, assets_extra=None, as_text=False)` {#DNNRegressor.export_savedmodel}
 
+Exports inference graph as a SavedModel into given dir.
 
 ##### Args:
 
 
 *  <b>`export_dir_base`</b>: A string containing a directory to write the exported
     graph and checkpoints.
-*  <b>`input_fn`</b>: A function that takes no argument and
+*  <b>`serving_input_fn`</b>: A function that takes no argument and
     returns an `InputFnOps`.
 *  <b>`default_output_alternative_key`</b>: the name of the head to serve when none is
-    specified.
+    specified.  Not needed for single-headed models.
 *  <b>`assets_extra`</b>: A dict specifying how to populate the assets.extra directory
     within the exported SavedModel.  Each key should give the destination
     path (including the filename) relative to the assets.extra directory.
@@ -1490,7 +1479,6 @@ THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and with
     renaming it is specified as
     `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
 *  <b>`as_text`</b>: whether to write the SavedModel proto in text format.
-*  <b>`exports_to_keep`</b>: Number of exports to keep.
 
 ##### Returns:
 
@@ -1890,22 +1878,19 @@ See BaseEstimator.export.
 
 - - -
 
-#### `tf.contrib.learn.LinearClassifier.export_savedmodel(*args, **kwargs)` {#LinearClassifier.export_savedmodel}
-
-Exports inference graph as a SavedModel into given dir. (experimental)
-
-THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning.
+#### `tf.contrib.learn.LinearClassifier.export_savedmodel(export_dir_base, serving_input_fn, default_output_alternative_key=None, assets_extra=None, as_text=False)` {#LinearClassifier.export_savedmodel}
 
+Exports inference graph as a SavedModel into given dir.
 
 ##### Args:
 
 
 *  <b>`export_dir_base`</b>: A string containing a directory to write the exported
     graph and checkpoints.
-*  <b>`input_fn`</b>: A function that takes no argument and
+*  <b>`serving_input_fn`</b>: A function that takes no argument and
     returns an `InputFnOps`.
 *  <b>`default_output_alternative_key`</b>: the name of the head to serve when none is
-    specified.
+    specified.  Not needed for single-headed models.
 *  <b>`assets_extra`</b>: A dict specifying how to populate the assets.extra directory
     within the exported SavedModel.  Each key should give the destination
     path (including the filename) relative to the assets.extra directory.
@@ -1914,7 +1899,6 @@ THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and with
     renaming it is specified as
     `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
 *  <b>`as_text`</b>: whether to write the SavedModel proto in text format.
-*  <b>`exports_to_keep`</b>: Number of exports to keep.
 
 ##### Returns:
 
@@ -2276,22 +2260,19 @@ See BaseEstimator.export.
 
 - - -
 
-#### `tf.contrib.learn.LinearRegressor.export_savedmodel(*args, **kwargs)` {#LinearRegressor.export_savedmodel}
-
-Exports inference graph as a SavedModel into given dir. (experimental)
-
-THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning.
+#### `tf.contrib.learn.LinearRegressor.export_savedmodel(export_dir_base, serving_input_fn, default_output_alternative_key=None, assets_extra=None, as_text=False)` {#LinearRegressor.export_savedmodel}
 
+Exports inference graph as a SavedModel into given dir.
 
 ##### Args:
 
 
 *  <b>`export_dir_base`</b>: A string containing a directory to write the exported
     graph and checkpoints.
-*  <b>`input_fn`</b>: A function that takes no argument and
+*  <b>`serving_input_fn`</b>: A function that takes no argument and
     returns an `InputFnOps`.
 *  <b>`default_output_alternative_key`</b>: the name of the head to serve when none is
-    specified.
+    specified.  Not needed for single-headed models.
 *  <b>`assets_extra`</b>: A dict specifying how to populate the assets.extra directory
     within the exported SavedModel.  Each key should give the destination
     path (including the filename) relative to the assets.extra directory.
@@ -2300,7 +2281,6 @@ THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and with
     renaming it is specified as
     `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
 *  <b>`as_text`</b>: whether to write the SavedModel proto in text format.
-*  <b>`exports_to_keep`</b>: Number of exports to keep.
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md b/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md
index c7e32f0437..dae7162a0d 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md
@@ -2384,7 +2384,7 @@ Can do early stopping on validation metrics if `early_stopping_rounds` is
 provided.
 - - -
 
-#### `tf.contrib.learn.monitors.ValidationMonitor.__init__(x=None, y=None, input_fn=None, batch_size=None, eval_steps=None, every_n_steps=100, metrics=None, early_stopping_rounds=None, early_stopping_metric='loss', early_stopping_metric_minimize=True, name=None)` {#ValidationMonitor.__init__}
+#### `tf.contrib.learn.monitors.ValidationMonitor.__init__(x=None, y=None, input_fn=None, batch_size=None, eval_steps=None, every_n_steps=100, metrics=None, hooks=None, early_stopping_rounds=None, early_stopping_metric='loss', early_stopping_metric_minimize=True, name=None)` {#ValidationMonitor.__init__}
 
 Initializes a ValidationMonitor.
 
@@ -2399,6 +2399,8 @@ Initializes a ValidationMonitor.
 *  <b>`every_n_steps`</b>: Check for new checkpoints to evaluate every N steps. If a
       new checkpoint is found, it is evaluated. See `EveryN`.
 *  <b>`metrics`</b>: See `BaseEstimator.evaluate`.
+*  <b>`hooks`</b>: A list of `SessionRunHook` hooks to pass to the
+    `Estimator`'s `evaluate` function.
 *  <b>`early_stopping_rounds`</b>: `int`. If the metric indicated by
       `early_stopping_metric` does not change according to
       `early_stopping_metric_minimize` for this many steps, then training
diff --git a/tensorflow/g3doc/api_docs/python/contrib.linalg.md b/tensorflow/g3doc/api_docs/python/contrib.linalg.md
index cbbffb1e78..509dc10e93 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.linalg.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.linalg.md
@@ -237,7 +237,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperator.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperator.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperator.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperator.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -287,7 +287,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperator.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperator.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperator.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperator.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -380,7 +380,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperator.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperator.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperator.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperator.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -416,7 +416,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperator.shape_dynamic(name='shape_dynamic')` {#LinearOperator.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperator.shape_tensor(name='shape_tensor')` {#LinearOperator.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -497,7 +497,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperator.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperator.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperator.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperator.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
@@ -720,7 +720,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorDiag.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorDiag.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorDiag.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorDiag.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -770,7 +770,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorDiag.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorDiag.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorDiag.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorDiag.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -863,7 +863,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorDiag.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorDiag.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorDiag.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorDiag.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -899,7 +899,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorDiag.shape_dynamic(name='shape_dynamic')` {#LinearOperatorDiag.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorDiag.shape_tensor(name='shape_tensor')` {#LinearOperatorDiag.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -980,7 +980,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorDiag.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorDiag.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorDiag.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorDiag.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
@@ -1237,7 +1237,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorIdentity.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorIdentity.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorIdentity.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorIdentity.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -1287,7 +1287,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorIdentity.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorIdentity.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorIdentity.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorIdentity.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -1380,7 +1380,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorIdentity.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorIdentity.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorIdentity.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorIdentity.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -1416,7 +1416,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorIdentity.shape_dynamic(name='shape_dynamic')` {#LinearOperatorIdentity.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorIdentity.shape_tensor(name='shape_tensor')` {#LinearOperatorIdentity.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -1497,7 +1497,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorIdentity.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorIdentity.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorIdentity.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorIdentity.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
@@ -1728,7 +1728,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorScaledIdentity.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorScaledIdentity.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorScaledIdentity.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorScaledIdentity.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -1778,7 +1778,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorScaledIdentity.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorScaledIdentity.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorScaledIdentity.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorScaledIdentity.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -1878,7 +1878,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorScaledIdentity.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorScaledIdentity.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorScaledIdentity.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorScaledIdentity.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -1914,7 +1914,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorScaledIdentity.shape_dynamic(name='shape_dynamic')` {#LinearOperatorScaledIdentity.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorScaledIdentity.shape_tensor(name='shape_tensor')` {#LinearOperatorScaledIdentity.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -1995,7 +1995,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorScaledIdentity.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorScaledIdentity.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorScaledIdentity.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorScaledIdentity.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
@@ -2209,7 +2209,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorMatrix.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorMatrix.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorMatrix.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorMatrix.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -2259,7 +2259,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorMatrix.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorMatrix.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorMatrix.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorMatrix.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -2352,7 +2352,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorMatrix.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorMatrix.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorMatrix.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorMatrix.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -2388,7 +2388,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorMatrix.shape_dynamic(name='shape_dynamic')` {#LinearOperatorMatrix.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorMatrix.shape_tensor(name='shape_tensor')` {#LinearOperatorMatrix.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -2469,7 +2469,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorMatrix.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorMatrix.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorMatrix.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorMatrix.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
@@ -2685,7 +2685,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorTriL.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorTriL.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorTriL.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorTriL.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -2735,7 +2735,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorTriL.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorTriL.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorTriL.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorTriL.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -2828,7 +2828,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorTriL.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorTriL.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorTriL.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorTriL.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -2864,7 +2864,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorTriL.shape_dynamic(name='shape_dynamic')` {#LinearOperatorTriL.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorTriL.shape_tensor(name='shape_tensor')` {#LinearOperatorTriL.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -2945,7 +2945,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorTriL.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorTriL.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorTriL.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorTriL.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
@@ -3172,7 +3172,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorComposition.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorComposition.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorComposition.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorComposition.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -3222,7 +3222,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorComposition.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorComposition.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorComposition.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorComposition.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -3322,7 +3322,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorComposition.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorComposition.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorComposition.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorComposition.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -3358,7 +3358,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorComposition.shape_dynamic(name='shape_dynamic')` {#LinearOperatorComposition.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorComposition.shape_tensor(name='shape_tensor')` {#LinearOperatorComposition.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -3439,7 +3439,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorComposition.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorComposition.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorComposition.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorComposition.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
diff --git a/tensorflow/g3doc/api_docs/python/contrib.metrics.md b/tensorflow/g3doc/api_docs/python/contrib.metrics.md
index 4d6cf8625c..1537865fc6 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.metrics.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.metrics.md
@@ -1469,10 +1469,10 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 ##### Args:
 
 
-*  <b>`predictions`</b>: The predicted values, a `bool` `Tensor` of arbitrary
-    dimensions.
-*  <b>`labels`</b>: The ground truth values, a `bool` `Tensor` whose dimensions must
-    match `predictions`.
+*  <b>`predictions`</b>: The predicted values, a `Tensor` of arbitrary dimensions. Will
+    be cast to `bool`.
+*  <b>`labels`</b>: The ground truth values, a `Tensor` whose dimensions must match
+    `predictions`. Will be cast to `bool`.
 *  <b>`weights`</b>: Optional `Tensor` whose rank is either 0, or the same rank as
     `labels`, and must be broadcastable to `labels` (i.e., all dimensions
     must be either `1`, or the same as the corresponding `labels`
@@ -1515,10 +1515,10 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 ##### Args:
 
 
-*  <b>`predictions`</b>: The predicted values, a `bool` `Tensor` of arbitrary
-    dimensions.
-*  <b>`labels`</b>: The ground truth values, a `bool` `Tensor` whose dimensions must
-    match `predictions`.
+*  <b>`predictions`</b>: The predicted values, a `Tensor` of arbitrary dimensions. Will
+    be cast to `bool`.
+*  <b>`labels`</b>: The ground truth values, a `Tensor` whose dimensions must match
+    `predictions`. Will be cast to `bool`.
 *  <b>`weights`</b>: Optional `Tensor` whose rank is either 0, or the same rank as
     `labels`, and must be broadcastable to `labels` (i.e., all dimensions
     must be either `1`, or the same as the corresponding `labels`
@@ -1562,10 +1562,10 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 ##### Args:
 
 
-*  <b>`predictions`</b>: The predicted values, a `bool` `Tensor` of arbitrary
-    dimensions.
-*  <b>`labels`</b>: The ground truth values, a `bool` `Tensor` whose dimensions must
-    match `predictions`.
+*  <b>`predictions`</b>: The predicted values, a `Tensor` of arbitrary dimensions. Will
+    be cast to `bool`.
+*  <b>`labels`</b>: The ground truth values, a `Tensor` whose dimensions must match
+    `predictions`. Will be cast to `bool`.
 *  <b>`weights`</b>: Optional `Tensor` whose rank is either 0, or the same rank as
     `labels`, and must be broadcastable to `labels` (i.e., all dimensions
     must be either `1`, or the same as the corresponding `labels`
@@ -1609,10 +1609,10 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 ##### Args:
 
 
-*  <b>`predictions`</b>: The predicted values, a `bool` `Tensor` of arbitrary
-    dimensions.
-*  <b>`labels`</b>: The ground truth values, a `bool` `Tensor` whose dimensions must
-    match `predictions`.
+*  <b>`predictions`</b>: The predicted values, a `Tensor` of arbitrary dimensions. Will
+    be cast to `bool`.
+*  <b>`labels`</b>: The ground truth values, a `Tensor` whose dimensions must match
+    `predictions`. Will be cast to `bool`.
 *  <b>`weights`</b>: Optional `Tensor` whose rank is either 0, or the same rank as
     `labels`, and must be broadcastable to `labels` (i.e., all dimensions
     must be either `1`, or the same as the corresponding `labels`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md
index 6009e8262e..f5b1ca422c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md
@@ -137,22 +137,19 @@ See BaseEstimator.export.
 
 - - -
 
-#### `tf.contrib.learn.LinearRegressor.export_savedmodel(*args, **kwargs)` {#LinearRegressor.export_savedmodel}
-
-Exports inference graph as a SavedModel into given dir. (experimental)
-
-THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning.
+#### `tf.contrib.learn.LinearRegressor.export_savedmodel(export_dir_base, serving_input_fn, default_output_alternative_key=None, assets_extra=None, as_text=False)` {#LinearRegressor.export_savedmodel}
 
+Exports inference graph as a SavedModel into given dir.
 
 ##### Args:
 
 
 *  <b>`export_dir_base`</b>: A string containing a directory to write the exported
     graph and checkpoints.
-*  <b>`input_fn`</b>: A function that takes no argument and
+*  <b>`serving_input_fn`</b>: A function that takes no argument and
     returns an `InputFnOps`.
 *  <b>`default_output_alternative_key`</b>: the name of the head to serve when none is
-    specified.
+    specified.  Not needed for single-headed models.
 *  <b>`assets_extra`</b>: A dict specifying how to populate the assets.extra directory
     within the exported SavedModel.  Each key should give the destination
     path (including the filename) relative to the assets.extra directory.
@@ -161,7 +158,6 @@ THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and with
     renaming it is specified as
     `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
 *  <b>`as_text`</b>: whether to write the SavedModel proto in text format.
-*  <b>`exports_to_keep`</b>: Number of exports to keep.
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.linalg.LinearOperatorDiag.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.linalg.LinearOperatorDiag.md
index a449b2f097..1900385928 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.linalg.LinearOperatorDiag.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.linalg.LinearOperatorDiag.md
@@ -189,7 +189,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorDiag.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorDiag.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorDiag.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorDiag.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -239,7 +239,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorDiag.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorDiag.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorDiag.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorDiag.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -332,7 +332,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorDiag.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorDiag.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorDiag.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorDiag.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -368,7 +368,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorDiag.shape_dynamic(name='shape_dynamic')` {#LinearOperatorDiag.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorDiag.shape_tensor(name='shape_tensor')` {#LinearOperatorDiag.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -449,7 +449,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorDiag.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorDiag.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorDiag.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorDiag.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.reverse_sequence.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.reverse_sequence.md
index b950cd5fe6..c6e8c748bf 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.reverse_sequence.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.reverse_sequence.md
@@ -6,7 +6,7 @@ This op first slices `input` along the dimension `batch_axis`, and for each
 slice `i`, reverses the first `seq_lengths[i]` elements along
 the dimension `seq_axis`.
 
-The elements of `seq_lengths` must obey `seq_lengths[i] < input.dims[seq_dim]`,
+The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
 and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
 
 The output slice `i` along dimension `batch_axis` is then given by input
@@ -63,7 +63,7 @@ output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
 *  <b>`input`</b>: A `Tensor`. The input to reverse.
 *  <b>`seq_lengths`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
     1-D with length `input.dims(batch_dim)` and
-    `max(seq_lengths) < input.dims(seq_dim)`
+    `max(seq_lengths) <= input.dims(seq_dim)`
 *  <b>`seq_axis`</b>: An `int`. The dimension which is partially reversed.
 *  <b>`batch_axis`</b>: An optional `int`. Defaults to `0`.
     The dimension along which reversal is performed.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md
index d649e42181..08de000315 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md
@@ -165,22 +165,19 @@ See BaseEstimator.export.
 
 - - -
 
-#### `tf.contrib.learn.LinearClassifier.export_savedmodel(*args, **kwargs)` {#LinearClassifier.export_savedmodel}
-
-Exports inference graph as a SavedModel into given dir. (experimental)
-
-THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning.
+#### `tf.contrib.learn.LinearClassifier.export_savedmodel(export_dir_base, serving_input_fn, default_output_alternative_key=None, assets_extra=None, as_text=False)` {#LinearClassifier.export_savedmodel}
 
+Exports inference graph as a SavedModel into given dir.
 
 ##### Args:
 
 
 *  <b>`export_dir_base`</b>: A string containing a directory to write the exported
     graph and checkpoints.
-*  <b>`input_fn`</b>: A function that takes no argument and
+*  <b>`serving_input_fn`</b>: A function that takes no argument and
     returns an `InputFnOps`.
 *  <b>`default_output_alternative_key`</b>: the name of the head to serve when none is
-    specified.
+    specified.  Not needed for single-headed models.
 *  <b>`assets_extra`</b>: A dict specifying how to populate the assets.extra directory
     within the exported SavedModel.  Each key should give the destination
     path (including the filename) relative to the assets.extra directory.
@@ -189,7 +186,6 @@ THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and with
     renaming it is specified as
     `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
 *  <b>`as_text`</b>: whether to write the SavedModel proto in text format.
-*  <b>`exports_to_keep`</b>: Number of exports to keep.
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.monitors.ValidationMonitor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.monitors.ValidationMonitor.md
index 2bafff8cdf..b24a86f1e1 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.monitors.ValidationMonitor.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.monitors.ValidationMonitor.md
@@ -7,7 +7,7 @@ Can do early stopping on validation metrics if `early_stopping_rounds` is
 provided.
 - - -
 
-#### `tf.contrib.learn.monitors.ValidationMonitor.__init__(x=None, y=None, input_fn=None, batch_size=None, eval_steps=None, every_n_steps=100, metrics=None, early_stopping_rounds=None, early_stopping_metric='loss', early_stopping_metric_minimize=True, name=None)` {#ValidationMonitor.__init__}
+#### `tf.contrib.learn.monitors.ValidationMonitor.__init__(x=None, y=None, input_fn=None, batch_size=None, eval_steps=None, every_n_steps=100, metrics=None, hooks=None, early_stopping_rounds=None, early_stopping_metric='loss', early_stopping_metric_minimize=True, name=None)` {#ValidationMonitor.__init__}
 
 Initializes a ValidationMonitor.
 
@@ -22,6 +22,8 @@ Initializes a ValidationMonitor.
 *  <b>`every_n_steps`</b>: Check for new checkpoints to evaluate every N steps. If a
       new checkpoint is found, it is evaluated. See `EveryN`.
 *  <b>`metrics`</b>: See `BaseEstimator.evaluate`.
+*  <b>`hooks`</b>: A list of `SessionRunHook` hooks to pass to the
+    `Estimator`'s `evaluate` function.
 *  <b>`early_stopping_rounds`</b>: `int`. If the metric indicated by
       `early_stopping_metric` does not change according to
       `early_stopping_metric_minimize` for this many steps, then training
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.linalg.LinearOperatorComposition.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.linalg.LinearOperatorComposition.md
index 5e051e5ba8..ee7140922c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.linalg.LinearOperatorComposition.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.linalg.LinearOperatorComposition.md
@@ -193,7 +193,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorComposition.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorComposition.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorComposition.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorComposition.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -243,7 +243,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorComposition.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorComposition.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorComposition.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorComposition.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -343,7 +343,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorComposition.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorComposition.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorComposition.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorComposition.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -379,7 +379,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorComposition.shape_dynamic(name='shape_dynamic')` {#LinearOperatorComposition.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorComposition.shape_tensor(name='shape_tensor')` {#LinearOperatorComposition.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -460,7 +460,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorComposition.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorComposition.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorComposition.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorComposition.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.linalg.LinearOperatorIdentity.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.linalg.LinearOperatorIdentity.md
index 37e711c819..f4d68516dc 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.linalg.LinearOperatorIdentity.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.linalg.LinearOperatorIdentity.md
@@ -226,7 +226,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorIdentity.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorIdentity.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorIdentity.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorIdentity.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -276,7 +276,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorIdentity.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorIdentity.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorIdentity.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorIdentity.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -369,7 +369,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorIdentity.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorIdentity.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorIdentity.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorIdentity.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -405,7 +405,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorIdentity.shape_dynamic(name='shape_dynamic')` {#LinearOperatorIdentity.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorIdentity.shape_tensor(name='shape_tensor')` {#LinearOperatorIdentity.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -486,7 +486,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorIdentity.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorIdentity.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorIdentity.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorIdentity.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.LoggingTensorHook.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.LoggingTensorHook.md
index 519d5f253e..e76b7838ed 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.LoggingTensorHook.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.LoggingTensorHook.md
@@ -3,7 +3,7 @@ Prints the given tensors once every N local steps or once every N seconds.
 The tensors will be printed to the log, with `INFO` severity.
 - - -
 
-#### `tf.train.LoggingTensorHook.__init__(tensors, every_n_iter=None, every_n_secs=None)` {#LoggingTensorHook.__init__}
+#### `tf.train.LoggingTensorHook.__init__(tensors, every_n_iter=None, every_n_secs=None, formatter=None)` {#LoggingTensorHook.__init__}
 
 Initializes a LoggingHook monitor.
 
@@ -17,6 +17,8 @@ Initializes a LoggingHook monitor.
 *  <b>`every_n_secs`</b>: `int` or `float`, print the values of `tensors` once every N
       seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
       provided.
+*  <b>`formatter`</b>: function, takes dict of `tag`->`Tensor` and returns a string.
+      If `None`, all tensors are printed using the default format.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.MonitoredTrainingSession.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.MonitoredTrainingSession.md
index 19cec59080..254e28a70a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.MonitoredTrainingSession.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.train.MonitoredTrainingSession.md
@@ -1,4 +1,4 @@
-### `tf.train.MonitoredTrainingSession(master='', is_chief=True, checkpoint_dir=None, scaffold=None, hooks=None, chief_only_hooks=None, save_checkpoint_secs=600, save_summaries_steps=100, config=None)` {#MonitoredTrainingSession}
+### `tf.train.MonitoredTrainingSession(master='', is_chief=True, checkpoint_dir=None, scaffold=None, hooks=None, chief_only_hooks=None, save_checkpoint_secs=600, save_summaries_steps=100, save_summaries_secs=None, config=None)` {#MonitoredTrainingSession}
 
 Creates a `MonitoredSession` for training.
 
@@ -26,8 +26,12 @@ inialize/restore.
     using a default checkpoint saver. If `save_checkpoint_secs` is set to
     `None`, then the default checkpoint saver isn't used.
 *  <b>`save_summaries_steps`</b>: The frequency, in number of global steps, that the
-    summaries are written to disk using a default summary saver. If
-    `save_summaries_steps` is set to `None`, then the default summary saver
+    summaries are written to disk using a default summary saver. If both
+    `save_summaries_steps` and `save_summaries_secs` are set to `None`, then
+    the default summary saver isn't used.
+*  <b>`save_summaries_secs`</b>: The frequency, in secs, that the summaries are written
+    to disk using a default summary saver.  If both `save_summaries_steps` and
+    `save_summaries_secs` are set to `None`, then the default summary saver
     isn't used.
 *  <b>`config`</b>: an instance of `tf.ConfigProto` proto used to configure the session.
     It's the `config` argument of constructor of `tf.Session`.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf_debug.LocalCLIDebugHook.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf_debug.LocalCLIDebugHook.md
index 851a1d2210..eeb4226633 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf_debug.LocalCLIDebugHook.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf_debug.LocalCLIDebugHook.md
@@ -34,12 +34,18 @@ Create a local debugger command-line interface (CLI) hook.
 
 Add a tensor filter.
 
+See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
+Override default behavior to accommodate the possibility of this method being
+called prior to the initialization of the underlying
+`LocalCLIDebugWrapperSession` object.
+
 ##### Args:
 
 
-*  <b>`filter_name`</b>: (`str`) name of the filter.
-*  <b>`tensor_filter`</b>: (`callable`) the filter callable. See the doc string of
-    `DebugDumpDir.find()` for more details about its signature.
+*  <b>`filter_name`</b>: See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()`
+    for details.
+*  <b>`tensor_filter`</b>: See doc of
+    `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
 
 
 - - -
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.neg.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.neg.md
deleted file mode 100644
index 519fd9a875..0000000000
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.neg.md
+++ /dev/null
@@ -1,16 +0,0 @@
-### `tf.neg(x, name=None)` {#neg}
-
-Computes numerical negative value element-wise.
-
-I.e., \\(y = -x\\).
-
-##### Args:
-
-
-*  <b>`x`</b>: A `Tensor`. Must be one of the following types: `half`, `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-*  <b>`name`</b>: A name for the operation (optional).
-
-##### Returns:
-
-  A `Tensor`. Has the same type as `x`.
-
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md
index e1caff4de8..9b900ac378 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md
@@ -157,22 +157,19 @@ The signature of the input_fn accepted by export is changing to be consistent wi
 
 - - -
 
-#### `tf.contrib.learn.Estimator.export_savedmodel(*args, **kwargs)` {#Estimator.export_savedmodel}
-
-Exports inference graph as a SavedModel into given dir. (experimental)
-
-THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning.
+#### `tf.contrib.learn.Estimator.export_savedmodel(export_dir_base, serving_input_fn, default_output_alternative_key=None, assets_extra=None, as_text=False)` {#Estimator.export_savedmodel}
 
+Exports inference graph as a SavedModel into given dir.
 
 ##### Args:
 
 
 *  <b>`export_dir_base`</b>: A string containing a directory to write the exported
     graph and checkpoints.
-*  <b>`input_fn`</b>: A function that takes no argument and
+*  <b>`serving_input_fn`</b>: A function that takes no argument and
     returns an `InputFnOps`.
 *  <b>`default_output_alternative_key`</b>: the name of the head to serve when none is
-    specified.
+    specified.  Not needed for single-headed models.
 *  <b>`assets_extra`</b>: A dict specifying how to populate the assets.extra directory
     within the exported SavedModel.  Each key should give the destination
     path (including the filename) relative to the assets.extra directory.
@@ -181,7 +178,6 @@ THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and with
     renaming it is specified as
     `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
 *  <b>`as_text`</b>: whether to write the SavedModel proto in text format.
-*  <b>`exports_to_keep`</b>: Number of exports to keep.
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.linalg.LinearOperatorScaledIdentity.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.linalg.LinearOperatorScaledIdentity.md
index 9cef244fe4..f37278eb55 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.linalg.LinearOperatorScaledIdentity.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.linalg.LinearOperatorScaledIdentity.md
@@ -200,7 +200,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorScaledIdentity.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorScaledIdentity.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorScaledIdentity.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorScaledIdentity.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -250,7 +250,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorScaledIdentity.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorScaledIdentity.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorScaledIdentity.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorScaledIdentity.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -350,7 +350,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorScaledIdentity.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorScaledIdentity.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorScaledIdentity.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorScaledIdentity.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -386,7 +386,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorScaledIdentity.shape_dynamic(name='shape_dynamic')` {#LinearOperatorScaledIdentity.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorScaledIdentity.shape_tensor(name='shape_tensor')` {#LinearOperatorScaledIdentity.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -467,7 +467,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorScaledIdentity.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorScaledIdentity.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorScaledIdentity.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorScaledIdentity.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
index dd5d361619..b1f95ca2ae 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
@@ -165,22 +165,19 @@ See BaseEstimator.export.
 
 - - -
 
-#### `tf.contrib.learn.DNNClassifier.export_savedmodel(*args, **kwargs)` {#DNNClassifier.export_savedmodel}
-
-Exports inference graph as a SavedModel into given dir. (experimental)
-
-THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning.
+#### `tf.contrib.learn.DNNClassifier.export_savedmodel(export_dir_base, serving_input_fn, default_output_alternative_key=None, assets_extra=None, as_text=False)` {#DNNClassifier.export_savedmodel}
 
+Exports inference graph as a SavedModel into given dir.
 
 ##### Args:
 
 
 *  <b>`export_dir_base`</b>: A string containing a directory to write the exported
     graph and checkpoints.
-*  <b>`input_fn`</b>: A function that takes no argument and
+*  <b>`serving_input_fn`</b>: A function that takes no argument and
     returns an `InputFnOps`.
 *  <b>`default_output_alternative_key`</b>: the name of the head to serve when none is
-    specified.
+    specified.  Not needed for single-headed models.
 *  <b>`assets_extra`</b>: A dict specifying how to populate the assets.extra directory
     within the exported SavedModel.  Each key should give the destination
     path (including the filename) relative to the assets.extra directory.
@@ -189,7 +186,6 @@ THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and with
     renaming it is specified as
     `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
 *  <b>`as_text`</b>: whether to write the SavedModel proto in text format.
-*  <b>`exports_to_keep`</b>: Number of exports to keep.
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_true_positives.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_true_positives.md
index aa3019dbf4..a022639c94 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_true_positives.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_true_positives.md
@@ -7,10 +7,10 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 ##### Args:
 
 
-*  <b>`predictions`</b>: The predicted values, a `bool` `Tensor` of arbitrary
-    dimensions.
-*  <b>`labels`</b>: The ground truth values, a `bool` `Tensor` whose dimensions must
-    match `predictions`.
+*  <b>`predictions`</b>: The predicted values, a `Tensor` of arbitrary dimensions. Will
+    be cast to `bool`.
+*  <b>`labels`</b>: The ground truth values, a `Tensor` whose dimensions must match
+    `predictions`. Will be cast to `bool`.
 *  <b>`weights`</b>: Optional `Tensor` whose rank is either 0, or the same rank as
     `labels`, and must be broadcastable to `labels` (i.e., all dimensions
     must be either `1`, or the same as the corresponding `labels`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md
index bcf0156924..0aa696ba2f 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md
@@ -28,13 +28,13 @@ this function.**
 
   _sentinel: Used to prevent positional parameters. Internal, do not use.
 
-*  <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
-    `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
-    Other values will raise an exception when this op is run on CPU, and
-    return `NaN` for corresponding corresponding loss and gradient rows
-    on GPU.
-*  <b>`logits`</b>: Unscaled log probabilities of rank `r` and shape
-    `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
+*  <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
+    `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
+    must be an index in `[0, num_classes)`. Other values will raise an
+    exception when this op is run on CPU, and return `NaN` for corresponding
+    loss and gradient rows on GPU.
+*  <b>`logits`</b>: Unscaled log probabilities of shape
+    `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float32` or `float64`.
 *  <b>`name`</b>: A name for the operation (optional).
 
 ##### Returns:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.train.write_graph.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.train.write_graph.md
index 872705a482..33e1f1c591 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.train.write_graph.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.train.write_graph.md
@@ -27,3 +27,7 @@ tf.train.write_graph(sess.graph, '/tmp/my-model', 'train.pbtxt')
 *  <b>`name`</b>: Filename for the graph.
 *  <b>`as_text`</b>: If `True`, writes the graph as an ASCII proto.
 
+##### Returns:
+
+  The path of the output proto file.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_false_positives.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_false_positives.md
index c31a7c68dc..d3f748fec7 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_false_positives.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_false_positives.md
@@ -7,10 +7,10 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 ##### Args:
 
 
-*  <b>`predictions`</b>: The predicted values, a `bool` `Tensor` of arbitrary
-    dimensions.
-*  <b>`labels`</b>: The ground truth values, a `bool` `Tensor` whose dimensions must
-    match `predictions`.
+*  <b>`predictions`</b>: The predicted values, a `Tensor` of arbitrary dimensions. Will
+    be cast to `bool`.
+*  <b>`labels`</b>: The ground truth values, a `Tensor` whose dimensions must match
+    `predictions`. Will be cast to `bool`.
 *  <b>`weights`</b>: Optional `Tensor` whose rank is either 0, or the same rank as
     `labels`, and must be broadcastable to `labels` (i.e., all dimensions
     must be either `1`, or the same as the corresponding `labels`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.graph_editor.assign_renamed_collections_handler.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.graph_editor.assign_renamed_collections_handler.md
index 05b2eba532..153da470ea 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.graph_editor.assign_renamed_collections_handler.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.graph_editor.assign_renamed_collections_handler.md
@@ -2,6 +2,9 @@
 
 Add the transformed elem to the (renamed) collections of elem.
 
+A collection is renamed only if it is not a known key, as described in
+`tf.GraphKeys`.
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.linalg.LinearOperatorMatrix.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.linalg.LinearOperatorMatrix.md
index 40bb846034..af1ab47660 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.linalg.LinearOperatorMatrix.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.linalg.LinearOperatorMatrix.md
@@ -183,7 +183,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorMatrix.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorMatrix.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorMatrix.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorMatrix.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -233,7 +233,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorMatrix.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorMatrix.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorMatrix.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorMatrix.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -326,7 +326,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorMatrix.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorMatrix.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorMatrix.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorMatrix.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -362,7 +362,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorMatrix.shape_dynamic(name='shape_dynamic')` {#LinearOperatorMatrix.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorMatrix.shape_tensor(name='shape_tensor')` {#LinearOperatorMatrix.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -443,7 +443,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorMatrix.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorMatrix.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorMatrix.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorMatrix.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.linalg.LinearOperatorTriL.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.linalg.LinearOperatorTriL.md
index 5454b65f26..13e8d3395a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.linalg.LinearOperatorTriL.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.linalg.LinearOperatorTriL.md
@@ -185,7 +185,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorTriL.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperatorTriL.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorTriL.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperatorTriL.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -235,7 +235,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorTriL.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperatorTriL.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorTriL.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperatorTriL.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -328,7 +328,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorTriL.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperatorTriL.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperatorTriL.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperatorTriL.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -364,7 +364,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorTriL.shape_dynamic(name='shape_dynamic')` {#LinearOperatorTriL.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperatorTriL.shape_tensor(name='shape_tensor')` {#LinearOperatorTriL.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -445,7 +445,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperatorTriL.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperatorTriL.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperatorTriL.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperatorTriL.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_true_negatives.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_true_negatives.md
index d8f12ab9eb..5b9dfd33f4 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_true_negatives.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_true_negatives.md
@@ -7,10 +7,10 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 ##### Args:
 
 
-*  <b>`predictions`</b>: The predicted values, a `bool` `Tensor` of arbitrary
-    dimensions.
-*  <b>`labels`</b>: The ground truth values, a `bool` `Tensor` whose dimensions must
-    match `predictions`.
+*  <b>`predictions`</b>: The predicted values, a `Tensor` of arbitrary dimensions. Will
+    be cast to `bool`.
+*  <b>`labels`</b>: The ground truth values, a `Tensor` whose dimensions must match
+    `predictions`. Will be cast to `bool`.
 *  <b>`weights`</b>: Optional `Tensor` whose rank is either 0, or the same rank as
     `labels`, and must be broadcastable to `labels` (i.e., all dimensions
     must be either `1`, or the same as the corresponding `labels`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md
index 5934a587fe..22e7531e78 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md
@@ -129,22 +129,19 @@ See BaseEstimator.export.
 
 - - -
 
-#### `tf.contrib.learn.DNNRegressor.export_savedmodel(*args, **kwargs)` {#DNNRegressor.export_savedmodel}
-
-Exports inference graph as a SavedModel into given dir. (experimental)
-
-THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and without warning.
+#### `tf.contrib.learn.DNNRegressor.export_savedmodel(export_dir_base, serving_input_fn, default_output_alternative_key=None, assets_extra=None, as_text=False)` {#DNNRegressor.export_savedmodel}
 
+Exports inference graph as a SavedModel into given dir.
 
 ##### Args:
 
 
 *  <b>`export_dir_base`</b>: A string containing a directory to write the exported
     graph and checkpoints.
-*  <b>`input_fn`</b>: A function that takes no argument and
+*  <b>`serving_input_fn`</b>: A function that takes no argument and
     returns an `InputFnOps`.
 *  <b>`default_output_alternative_key`</b>: the name of the head to serve when none is
-    specified.
+    specified.  Not needed for single-headed models.
 *  <b>`assets_extra`</b>: A dict specifying how to populate the assets.extra directory
     within the exported SavedModel.  Each key should give the destination
     path (including the filename) relative to the assets.extra directory.
@@ -153,7 +150,6 @@ THIS FUNCTION IS EXPERIMENTAL. It may change or be removed at any time, and with
     renaming it is specified as
     `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
 *  <b>`as_text`</b>: whether to write the SavedModel proto in text format.
-*  <b>`exports_to_keep`</b>: Number of exports to keep.
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.linalg.LinearOperator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.linalg.LinearOperator.md
index a07c373774..41a5a1cb74 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.linalg.LinearOperator.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.linalg.LinearOperator.md
@@ -215,7 +215,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperator.batch_shape_dynamic(name='batch_shape_dynamic')` {#LinearOperator.batch_shape_dynamic}
+#### `tf.contrib.linalg.LinearOperator.batch_shape_tensor(name='batch_shape_tensor')` {#LinearOperator.batch_shape_tensor}
 
 Shape of batch dimensions of this operator, determined at runtime.
 
@@ -265,7 +265,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperator.domain_dimension_dynamic(name='domain_dimension_dynamic')` {#LinearOperator.domain_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperator.domain_dimension_tensor(name='domain_dimension_tensor')` {#LinearOperator.domain_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the domain of this operator.
 
@@ -358,7 +358,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperator.range_dimension_dynamic(name='range_dimension_dynamic')` {#LinearOperator.range_dimension_dynamic}
+#### `tf.contrib.linalg.LinearOperator.range_dimension_tensor(name='range_dimension_tensor')` {#LinearOperator.range_dimension_tensor}
 
 Dimension (in the sense of vector spaces) of the range of this operator.
 
@@ -394,7 +394,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperator.shape_dynamic(name='shape_dynamic')` {#LinearOperator.shape_dynamic}
+#### `tf.contrib.linalg.LinearOperator.shape_tensor(name='shape_tensor')` {#LinearOperator.shape_tensor}
 
 Shape of this `LinearOperator`, determined at runtime.
 
@@ -475,7 +475,7 @@ If this operator acts like the batch matrix `A` with
 
 - - -
 
-#### `tf.contrib.linalg.LinearOperator.tensor_rank_dynamic(name='tensor_rank_dynamic')` {#LinearOperator.tensor_rank_dynamic}
+#### `tf.contrib.linalg.LinearOperator.tensor_rank_tensor(name='tensor_rank_tensor')` {#LinearOperator.tensor_rank_tensor}
 
 Rank (in the sense of tensors) of matrix corresponding to this operator.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.metrics.streaming_false_negatives.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.metrics.streaming_false_negatives.md
index 878ba46941..1464305257 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.metrics.streaming_false_negatives.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.metrics.streaming_false_negatives.md
@@ -7,10 +7,10 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 ##### Args:
 
 
-*  <b>`predictions`</b>: The predicted values, a `bool` `Tensor` of arbitrary
-    dimensions.
-*  <b>`labels`</b>: The ground truth values, a `bool` `Tensor` whose dimensions must
-    match `predictions`.
+*  <b>`predictions`</b>: The predicted values, a `Tensor` of arbitrary dimensions. Will
+    be cast to `bool`.
+*  <b>`labels`</b>: The ground truth values, a `Tensor` whose dimensions must match
+    `predictions`. Will be cast to `bool`.
 *  <b>`weights`</b>: Optional `Tensor` whose rank is either 0, or the same rank as
     `labels`, and must be broadcastable to `labels` (i.e., all dimensions
     must be either `1`, or the same as the corresponding `labels`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_local_variable.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_local_variable.md
index 9026066f66..c425a3e64b 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_local_variable.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_local_variable.md
@@ -14,7 +14,7 @@ for an extensive description of how reusing works. Here is a basic example:
 with tf.variable_scope("foo"):
     v = tf.get_variable("v", [1])  # v.name == "foo/v:0"
     w = tf.get_variable("w", [1])  # w.name == "foo/w:0"
-with tf.variable_scope("foo", reuse=True)
+with tf.variable_scope("foo", reuse=True):
     v1 = tf.get_variable("v")  # The same as v above.
 ```
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md
index c7040d28da..f09098eb51 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md
@@ -11,7 +11,7 @@ for an extensive description of how reusing works. Here is a basic example:
 with tf.variable_scope("foo"):
     v = tf.get_variable("v", [1])  # v.name == "foo/v:0"
     w = tf.get_variable("w", [1])  # w.name == "foo/w:0"
-with tf.variable_scope("foo", reuse=True)
+with tf.variable_scope("foo", reuse=True):
     v1 = tf.get_variable("v")  # The same as v above.
 ```
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.train.FeedFnHook.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.train.FeedFnHook.md
new file mode 100644
index 0000000000..1797a0d3b5
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.train.FeedFnHook.md
@@ -0,0 +1,88 @@
+Runs `feed_fn` and sets the `feed_dict` accordingly.
+- - -
+
+#### `tf.train.FeedFnHook.__init__(feed_fn)` {#FeedFnHook.__init__}
+
+Constructs the FeedFnHook with given `feed_fn`.
+
+##### Args:
+
+
+*  <b>`feed_fn`</b>: function, no arguments and returns `dict` to feed.
+
+
+- - -
+
+#### `tf.train.FeedFnHook.after_create_session(session, coord)` {#FeedFnHook.after_create_session}
+
+Called when new TensorFlow session is created.
+
+This is called to signal the hooks that a new session has been created. This
+has two essential differences with the situation in which `begin` is called:
+
+* When this is called, the graph is finalized and ops can no longer be added
+    to the graph.
+* This method will also be called as a result of recovering a wrapped
+    session, not only at the beginning of the overall session.
+
+##### Args:
+
+
+*  <b>`session`</b>: A TensorFlow Session that has been created.
+*  <b>`coord`</b>: A Coordinator object which keeps track of all threads.
+
+
+- - -
+
+#### `tf.train.FeedFnHook.after_run(run_context, run_values)` {#FeedFnHook.after_run}
+
+Called after each call to run().
+
+The `run_values` argument contains results of requested ops/tensors by
+`before_run()`.
+
+The `run_context` argument is the same one sent to the `before_run` call.
+`run_context.request_stop()` can be called to stop the iteration.
+
+##### Args:
+
+
+*  <b>`run_context`</b>: A `SessionRunContext` object.
+*  <b>`run_values`</b>: A SessionRunValues object.
+
+
+- - -
+
+#### `tf.train.FeedFnHook.before_run(run_context)` {#FeedFnHook.before_run}
+
+
+
+
+- - -
+
+#### `tf.train.FeedFnHook.begin()` {#FeedFnHook.begin}
+
+Called once before using the session.
+
+When called, the default graph is the one that will be launched in the
+session.  The hook can modify the graph by adding new operations to it.
+After the `begin()` call the graph will be finalized and the other callbacks
+can not modify the graph anymore. A second call of `begin()` on the same
+graph should not change the graph.
+
+
+- - -
+
+#### `tf.train.FeedFnHook.end(session)` {#FeedFnHook.end}
+
+Called at the end of session.
+
+The `session` argument can be used in case the hook wants to run final ops,
+such as saving a last checkpoint.
+
+##### Args:
+
+
+*  <b>`session`</b>: A TensorFlow Session that will soon be closed.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.train.FinalOpsHook.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.train.FinalOpsHook.md
new file mode 100644
index 0000000000..bf8e7184b6
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.train.FinalOpsHook.md
@@ -0,0 +1,111 @@
+A run hook which evaluates `Tensors` at the end of a session.
+- - -
+
+#### `tf.train.FinalOpsHook.__init__(final_ops, final_ops_feed_dict=None)` {#FinalOpsHook.__init__}
+
+Constructs the FinalOpsHook with ops to run at the end of the session.
+
+##### Args:
+
+
+*  <b>`final_ops`</b>: A single `Tensor`, a list of `Tensors` or a dictionary of
+    names to `Tensors`.
+*  <b>`final_ops_feed_dict`</b>: A feed dictionary to use when running
+    `final_ops`.
+
+
+- - -
+
+#### `tf.train.FinalOpsHook.after_create_session(session, coord)` {#FinalOpsHook.after_create_session}
+
+Called when new TensorFlow session is created.
+
+This is called to signal the hooks that a new session has been created. This
+has two essential differences with the situation in which `begin` is called:
+
+* When this is called, the graph is finalized and ops can no longer be added
+    to the graph.
+* This method will also be called as a result of recovering a wrapped
+    session, not only at the beginning of the overall session.
+
+##### Args:
+
+
+*  <b>`session`</b>: A TensorFlow Session that has been created.
+*  <b>`coord`</b>: A Coordinator object which keeps track of all threads.
+
+
+- - -
+
+#### `tf.train.FinalOpsHook.after_run(run_context, run_values)` {#FinalOpsHook.after_run}
+
+Called after each call to run().
+
+The `run_values` argument contains results of requested ops/tensors by
+`before_run()`.
+
+The `run_context` argument is the same one sent to the `before_run` call.
+`run_context.request_stop()` can be called to stop the iteration.
+
+##### Args:
+
+
+*  <b>`run_context`</b>: A `SessionRunContext` object.
+*  <b>`run_values`</b>: A SessionRunValues object.
+
+
+- - -
+
+#### `tf.train.FinalOpsHook.before_run(run_context)` {#FinalOpsHook.before_run}
+
+Called before each call to run().
+
+You can return from this call a `SessionRunArgs` object indicating ops or
+tensors to add to the upcoming `run()` call.  These ops/tensors will be run
+together with the ops/tensors originally passed to the original run() call.
+The run args you return can also contain feeds to be added to the run()
+call.
+
+The `run_context` argument is a `SessionRunContext` that provides
+information about the upcoming `run()` call: the originally requested
+op/tensors, the TensorFlow Session.
+
+At this point the graph is finalized and you cannot add ops.
+
+##### Args:
+
+
+*  <b>`run_context`</b>: A `SessionRunContext` object.
+
+##### Returns:
+
+  None or a `SessionRunArgs` object.
+
+
+- - -
+
+#### `tf.train.FinalOpsHook.begin()` {#FinalOpsHook.begin}
+
+Called once before using the session.
+
+When called, the default graph is the one that will be launched in the
+session.  The hook can modify the graph by adding new operations to it.
+After the `begin()` call the graph will be finalized and the other callbacks
+cannot modify the graph anymore. A second call of `begin()` on the same
+graph should not change the graph.
+
+
+- - -
+
+#### `tf.train.FinalOpsHook.end(session)` {#FinalOpsHook.end}
+
+
+
+
+- - -
+
+#### `tf.train.FinalOpsHook.final_ops_values` {#FinalOpsHook.final_ops_values}
+
+
+
+
diff --git a/tensorflow/g3doc/api_docs/python/index.md b/tensorflow/g3doc/api_docs/python/index.md
index cc3dc0a0e5..424448acfd 100644
--- a/tensorflow/g3doc/api_docs/python/index.md
+++ b/tensorflow/g3doc/api_docs/python/index.md
@@ -260,7 +260,6 @@
   * [`minimum`](../../api_docs/python/math_ops.md#minimum)
   * [`mod`](../../api_docs/python/math_ops.md#mod)
   * [`multiply`](../../api_docs/python/math_ops.md#multiply)
-  * [`neg`](../../api_docs/python/math_ops.md#neg)
   * [`negative`](../../api_docs/python/math_ops.md#negative)
   * [`norm`](../../api_docs/python/math_ops.md#norm)
   * [`polygamma`](../../api_docs/python/math_ops.md#polygamma)
@@ -619,6 +618,8 @@
   * [`do_quantize_training_on_graphdef`](../../api_docs/python/train.md#do_quantize_training_on_graphdef)
   * [`exponential_decay`](../../api_docs/python/train.md#exponential_decay)
   * [`ExponentialMovingAverage`](../../api_docs/python/train.md#ExponentialMovingAverage)
+  * [`FeedFnHook`](../../api_docs/python/train.md#FeedFnHook)
+  * [`FinalOpsHook`](../../api_docs/python/train.md#FinalOpsHook)
   * [`FtrlOptimizer`](../../api_docs/python/train.md#FtrlOptimizer)
   * [`generate_checkpoint_state_proto`](../../api_docs/python/train.md#generate_checkpoint_state_proto)
   * [`get_checkpoint_mtimes`](../../api_docs/python/train.md#get_checkpoint_mtimes)
diff --git a/tensorflow/g3doc/api_docs/python/math_ops.md b/tensorflow/g3doc/api_docs/python/math_ops.md
index 92b001f898..76636dc6f0 100644
--- a/tensorflow/g3doc/api_docs/python/math_ops.md
+++ b/tensorflow/g3doc/api_docs/python/math_ops.md
@@ -3720,24 +3720,3 @@ invert_permutation(x) ==> [2, 4, 3, 0, 1]
   A `Tensor`. Has the same type as `x`. 1-D.
 
 
-
-## Other Functions and Classes
-- - -
-
-### `tf.neg(x, name=None)` {#neg}
-
-Computes numerical negative value element-wise.
-
-I.e., \\(y = -x\\).
-
-##### Args:
-
-
-*  <b>`x`</b>: A `Tensor`. Must be one of the following types: `half`, `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-*  <b>`name`</b>: A name for the operation (optional).
-
-##### Returns:
-
-  A `Tensor`. Has the same type as `x`.
-
-
diff --git a/tensorflow/g3doc/api_docs/python/nn.md b/tensorflow/g3doc/api_docs/python/nn.md
index 84aaa5c5c9..5d64aaf072 100644
--- a/tensorflow/g3doc/api_docs/python/nn.md
+++ b/tensorflow/g3doc/api_docs/python/nn.md
@@ -2370,13 +2370,13 @@ this function.**
 
   _sentinel: Used to prevent positional parameters. Internal, do not use.
 
-*  <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
-    `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
-    Other values will raise an exception when this op is run on CPU, and
-    return `NaN` for corresponding corresponding loss and gradient rows
-    on GPU.
-*  <b>`logits`</b>: Unscaled log probabilities of rank `r` and shape
-    `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
+*  <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
+    `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
+    must be an index in `[0, num_classes)`. Other values will raise an
+    exception when this op is run on CPU, and return `NaN` for corresponding
+    loss and gradient rows on GPU.
+*  <b>`logits`</b>: Unscaled log probabilities of shape
+    `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float32` or `float64`.
 *  <b>`name`</b>: A name for the operation (optional).
 
 ##### Returns:
diff --git a/tensorflow/g3doc/api_docs/python/state_ops.md b/tensorflow/g3doc/api_docs/python/state_ops.md
index 2db192fddd..9890892b0f 100644
--- a/tensorflow/g3doc/api_docs/python/state_ops.md
+++ b/tensorflow/g3doc/api_docs/python/state_ops.md
@@ -1943,7 +1943,7 @@ for an extensive description of how reusing works. Here is a basic example:
 with tf.variable_scope("foo"):
     v = tf.get_variable("v", [1])  # v.name == "foo/v:0"
     w = tf.get_variable("w", [1])  # w.name == "foo/w:0"
-with tf.variable_scope("foo", reuse=True)
+with tf.variable_scope("foo", reuse=True):
     v1 = tf.get_variable("v")  # The same as v above.
 ```
 
@@ -2032,7 +2032,7 @@ for an extensive description of how reusing works. Here is a basic example:
 with tf.variable_scope("foo"):
     v = tf.get_variable("v", [1])  # v.name == "foo/v:0"
     w = tf.get_variable("w", [1])  # w.name == "foo/w:0"
-with tf.variable_scope("foo", reuse=True)
+with tf.variable_scope("foo", reuse=True):
     v1 = tf.get_variable("v")  # The same as v above.
 ```
 
diff --git a/tensorflow/g3doc/api_docs/python/tf_debug.md b/tensorflow/g3doc/api_docs/python/tf_debug.md
index 28fc9ec502..9dc35ac82e 100644
--- a/tensorflow/g3doc/api_docs/python/tf_debug.md
+++ b/tensorflow/g3doc/api_docs/python/tf_debug.md
@@ -1216,12 +1216,18 @@ Create a local debugger command-line interface (CLI) hook.
 
 Add a tensor filter.
 
+See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
+Override default behavior to accommodate the possibility of this method being
+called prior to the initialization of the underlying
+`LocalCLIDebugWrapperSession` object.
+
 ##### Args:
 
 
-*  <b>`filter_name`</b>: (`str`) name of the filter.
-*  <b>`tensor_filter`</b>: (`callable`) the filter callable. See the doc string of
-    `DebugDumpDir.find()` for more details about its signature.
+*  <b>`filter_name`</b>: See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()`
+    for details.
+*  <b>`tensor_filter`</b>: See doc of
+    `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
 
 
 - - -
diff --git a/tensorflow/g3doc/api_docs/python/train.md b/tensorflow/g3doc/api_docs/python/train.md
index 098418f7a6..ac5ef5fc9f 100644
--- a/tensorflow/g3doc/api_docs/python/train.md
+++ b/tensorflow/g3doc/api_docs/python/train.md
@@ -1533,6 +1533,7 @@ See [Threading and Queues](../../how_tos/threading_and_queues/index.md)
 for how to use threads and queues.  For documentation on the Queue API,
 see [Queues](../../api_docs/python/io_ops.md#queues).
 
+
 - - -
 
 ### `class tf.train.Coordinator` {#Coordinator}
@@ -1984,6 +1985,233 @@ Converts this `QueueRunner` to a `QueueRunnerDef` protocol buffer.
 
 - - -
 
+### `class tf.train.LooperThread` {#LooperThread}
+
+A thread that runs code repeatedly, optionally on a timer.
+
+This thread class is intended to be used with a `Coordinator`.  It repeatedly
+runs code specified either as `target` and `args` or by the `run_loop()`
+method.
+
+Before each run the thread checks if the coordinator has requested stop.  In
+that case the looper thread terminates immediately.
+
+If the code being run raises an exception, that exception is reported to the
+coordinator and the thread terminates.  The coordinator will then request all
+the other threads it coordinates to stop.
+
+You typically pass looper threads to the supervisor `Join()` method.
+- - -
+
+#### `tf.train.LooperThread.__init__(coord, timer_interval_secs, target=None, args=None, kwargs=None)` {#LooperThread.__init__}
+
+Create a LooperThread.
+
+##### Args:
+
+
+*  <b>`coord`</b>: A Coordinator.
+*  <b>`timer_interval_secs`</b>: Time boundaries at which to call Run(), or None
+    if it should be called back to back.
+*  <b>`target`</b>: Optional callable object that will be executed in the thread.
+*  <b>`args`</b>: Optional arguments to pass to `target` when calling it.
+*  <b>`kwargs`</b>: Optional keyword arguments to pass to `target` when calling it.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: If one of the arguments is invalid.
+
+
+- - -
+
+#### `tf.train.LooperThread.__repr__()` {#LooperThread.__repr__}
+
+
+
+
+- - -
+
+#### `tf.train.LooperThread.daemon` {#LooperThread.daemon}
+
+A boolean value indicating whether this thread is a daemon thread (True) or not (False).
+
+This must be set before start() is called, otherwise RuntimeError is
+raised. Its initial value is inherited from the creating thread; the
+main thread is not a daemon thread and therefore all threads created in
+the main thread default to daemon = False.
+
+The entire Python program exits when no alive non-daemon threads are
+left.
+
+
+- - -
+
+#### `tf.train.LooperThread.getName()` {#LooperThread.getName}
+
+
+
+
+- - -
+
+#### `tf.train.LooperThread.ident` {#LooperThread.ident}
+
+Thread identifier of this thread or None if it has not been started.
+
+This is a nonzero integer. See the thread.get_ident() function. Thread
+identifiers may be recycled when a thread exits and another thread is
+created. The identifier is available even after the thread has exited.
+
+
+- - -
+
+#### `tf.train.LooperThread.isAlive()` {#LooperThread.isAlive}
+
+Return whether the thread is alive.
+
+This method returns True just before the run() method starts until just
+after the run() method terminates. The module function enumerate()
+returns a list of all alive threads.
+
+
+- - -
+
+#### `tf.train.LooperThread.isDaemon()` {#LooperThread.isDaemon}
+
+
+
+
+- - -
+
+#### `tf.train.LooperThread.is_alive()` {#LooperThread.is_alive}
+
+Return whether the thread is alive.
+
+This method returns True just before the run() method starts until just
+after the run() method terminates. The module function enumerate()
+returns a list of all alive threads.
+
+
+- - -
+
+#### `tf.train.LooperThread.join(timeout=None)` {#LooperThread.join}
+
+Wait until the thread terminates.
+
+This blocks the calling thread until the thread whose join() method is
+called terminates -- either normally or through an unhandled exception
+or until the optional timeout occurs.
+
+When the timeout argument is present and not None, it should be a
+floating point number specifying a timeout for the operation in seconds
+(or fractions thereof). As join() always returns None, you must call
+isAlive() after join() to decide whether a timeout happened -- if the
+thread is still alive, the join() call timed out.
+
+When the timeout argument is not present or None, the operation will
+block until the thread terminates.
+
+A thread can be join()ed many times.
+
+join() raises a RuntimeError if an attempt is made to join the current
+thread as that would cause a deadlock. It is also an error to join() a
+thread before it has been started and attempts to do so raises the same
+exception.
+
+
+- - -
+
+#### `tf.train.LooperThread.loop(coord, timer_interval_secs, target, args=None, kwargs=None)` {#LooperThread.loop}
+
+Start a LooperThread that calls a function periodically.
+
+If `timer_interval_secs` is None the thread calls `target(args)`
+repeatedly.  Otherwise `target(args)` is called every `timer_interval_secs`
+seconds.  The thread terminates when a stop of the coordinator is
+requested.
+
+##### Args:
+
+
+*  <b>`coord`</b>: A Coordinator.
+*  <b>`timer_interval_secs`</b>: Number. Time boundaries at which to call `target`.
+*  <b>`target`</b>: A callable object.
+*  <b>`args`</b>: Optional arguments to pass to `target` when calling it.
+*  <b>`kwargs`</b>: Optional keyword arguments to pass to `target` when calling it.
+
+##### Returns:
+
+  The started thread.
+
+
+- - -
+
+#### `tf.train.LooperThread.name` {#LooperThread.name}
+
+A string used for identification purposes only.
+
+It has no semantics. Multiple threads may be given the same name. The
+initial name is set by the constructor.
+
+
+- - -
+
+#### `tf.train.LooperThread.run()` {#LooperThread.run}
+
+
+
+
+- - -
+
+#### `tf.train.LooperThread.run_loop()` {#LooperThread.run_loop}
+
+Called at 'timer_interval_secs' boundaries.
+
+
+- - -
+
+#### `tf.train.LooperThread.setDaemon(daemonic)` {#LooperThread.setDaemon}
+
+
+
+
+- - -
+
+#### `tf.train.LooperThread.setName(name)` {#LooperThread.setName}
+
+
+
+
+- - -
+
+#### `tf.train.LooperThread.start()` {#LooperThread.start}
+
+Start the thread's activity.
+
+It must be called at most once per thread object. It arranges for the
+object's run() method to be invoked in a separate thread of control.
+
+This method will raise a RuntimeError if called more than once on the
+same thread object.
+
+
+- - -
+
+#### `tf.train.LooperThread.start_loop()` {#LooperThread.start_loop}
+
+Called when the thread starts.
+
+
+- - -
+
+#### `tf.train.LooperThread.stop_loop()` {#LooperThread.stop_loop}
+
+Called when the thread stops.
+
+
+
+- - -
+
 ### `tf.train.add_queue_runner(qr, collection='queue_runners')` {#add_queue_runner}
 
 Adds a `QueueRunner` to a collection in the graph.
@@ -3531,7 +3759,7 @@ with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
 
 - - -
 
-### `tf.train.MonitoredTrainingSession(master='', is_chief=True, checkpoint_dir=None, scaffold=None, hooks=None, chief_only_hooks=None, save_checkpoint_secs=600, save_summaries_steps=100, config=None)` {#MonitoredTrainingSession}
+### `tf.train.MonitoredTrainingSession(master='', is_chief=True, checkpoint_dir=None, scaffold=None, hooks=None, chief_only_hooks=None, save_checkpoint_secs=600, save_summaries_steps=100, save_summaries_secs=None, config=None)` {#MonitoredTrainingSession}
 
 Creates a `MonitoredSession` for training.
 
@@ -3559,8 +3787,12 @@ inialize/restore.
     using a default checkpoint saver. If `save_checkpoint_secs` is set to
     `None`, then the default checkpoint saver isn't used.
 *  <b>`save_summaries_steps`</b>: The frequency, in number of global steps, that the
-    summaries are written to disk using a default summary saver. If
-    `save_summaries_steps` is set to `None`, then the default summary saver
+    summaries are written to disk using a default summary saver. If both
+    `save_summaries_steps` and `save_summaries_secs` are set to `None`, then
+    the default summary saver isn't used.
+*  <b>`save_summaries_secs`</b>: The frequency, in secs, that the summaries are written
+    to disk using a default summary saver.  If both `save_summaries_steps` and
+    `save_summaries_secs` are set to `None`, then the default summary saver
     isn't used.
 *  <b>`config`</b>: an instance of `tf.ConfigProto` proto used to configure the session.
     It's the `config` argument of constructor of `tf.Session`.
@@ -4111,232 +4343,312 @@ for more information about their attributes.
 
 
 
-## Training Utilities
+## Training Hooks
+
+Hooks are tools that run in the process of training/evaluation of the model.
 
 - - -
 
-### `tf.train.global_step(sess, global_step_tensor)` {#global_step}
+### `class tf.train.SessionRunHook` {#SessionRunHook}
 
-Small helper to get the global step.
+Hook to extend calls to MonitoredSession.run().
+- - -
 
-```python
-# Creates a variable to hold the global_step.
-global_step_tensor = tf.Variable(10, trainable=False, name='global_step')
-# Creates a session.
-sess = tf.Session()
-# Initializes the variable.
-print('global_step: %s' % tf.train.global_step(sess, global_step_tensor))
+#### `tf.train.SessionRunHook.after_create_session(session, coord)` {#SessionRunHook.after_create_session}
 
-global_step: 10
-```
+Called when new TensorFlow session is created.
 
-##### Args:
+This is called to signal the hooks that a new session has been created. This
+has two essential differences with the situation in which `begin` is called:
 
+* When this is called, the graph is finalized and ops can no longer be added
+    to the graph.
+* This method will also be called as a result of recovering a wrapped
+    session, not only at the beginning of the overall session.
 
-*  <b>`sess`</b>: A TensorFlow `Session` object.
-*  <b>`global_step_tensor`</b>: `Tensor` or the `name` of the operation that contains
-    the global step.
+##### Args:
 
-##### Returns:
 
-  The global step value.
+*  <b>`session`</b>: A TensorFlow Session that has been created.
+*  <b>`coord`</b>: A Coordinator object which keeps track of all threads.
 
 
 - - -
 
-### `tf.train.basic_train_loop(supervisor, train_step_fn, args=None, kwargs=None, master='')` {#basic_train_loop}
-
-Basic loop to train a model.
+#### `tf.train.SessionRunHook.after_run(run_context, run_values)` {#SessionRunHook.after_run}
 
-Calls `train_step_fn` in a loop to train a model.  The function is called as:
+Called after each call to run().
 
-```python
-train_step_fn(session, *args, **kwargs)
-```
+The `run_values` argument contains results of requested ops/tensors by
+`before_run()`.
 
-It is passed a `tf.Session` in addition to `args` and `kwargs`.  The function
-typically runs one training step in the session.
+The `run_context` argument is the same one sent to the `before_run` call.
+`run_context.request_stop()` can be called to stop the iteration.
 
 ##### Args:
 
 
-*  <b>`supervisor`</b>: `tf.Supervisor` to run the training services.
-*  <b>`train_step_fn`</b>: Callable to execute one training step.  Called
-    repeatedly as `train_step_fn(session, *args **kwargs)`.
-*  <b>`args`</b>: Optional positional arguments passed to `train_step_fn`.
-*  <b>`kwargs`</b>: Optional keyword arguments passed to `train_step_fn`.
-*  <b>`master`</b>: Master to use to create the training session.  Defaults to
-    `""` which causes the session to be created in the local process.
+*  <b>`run_context`</b>: A `SessionRunContext` object.
+*  <b>`run_values`</b>: A SessionRunValues object.
 
 
 - - -
 
-### `tf.train.get_global_step(graph=None)` {#get_global_step}
+#### `tf.train.SessionRunHook.before_run(run_context)` {#SessionRunHook.before_run}
 
-Get the global step tensor.
+Called before each call to run().
 
-The global step tensor must be an integer variable. We first try to find it
-in the collection `GLOBAL_STEP`, or by name `global_step:0`.
+You can return from this call a `SessionRunArgs` object indicating ops or
+tensors to add to the upcoming `run()` call.  These ops/tensors will be run
+together with the ops/tensors originally passed to the original run() call.
+The run args you return can also contain feeds to be added to the run()
+call.
+
+The `run_context` argument is a `SessionRunContext` that provides
+information about the upcoming `run()` call: the originally requested
+op/tensors, the TensorFlow Session.
+
+At this point the graph is finalized and you cannot add ops.
 
 ##### Args:
 
 
-*  <b>`graph`</b>: The graph to find the global step in. If missing, use default graph.
+*  <b>`run_context`</b>: A `SessionRunContext` object.
 
 ##### Returns:
 
-  The global step variable, or `None` if none was found.
+  None or a `SessionRunArgs` object.
 
-##### Raises:
 
+- - -
 
-*  <b>`TypeError`</b>: If the global step tensor has a non-integer type, or if it is not
-    a `Variable`.
+#### `tf.train.SessionRunHook.begin()` {#SessionRunHook.begin}
+
+Called once before using the session.
+
+When called, the default graph is the one that will be launched in the
+session.  The hook can modify the graph by adding new operations to it.
+After the `begin()` call the graph will be finalized and the other callbacks
+cannot modify the graph anymore. A second call of `begin()` on the same
+graph should not change the graph.
 
 
 - - -
 
-### `tf.train.assert_global_step(global_step_tensor)` {#assert_global_step}
+#### `tf.train.SessionRunHook.end(session)` {#SessionRunHook.end}
 
-Asserts `global_step_tensor` is a scalar int `Variable` or `Tensor`.
+Called at the end of session.
+
+The `session` argument can be used in case the hook wants to run final ops,
+such as saving a last checkpoint.
 
 ##### Args:
 
 
-*  <b>`global_step_tensor`</b>: `Tensor` to test.
+*  <b>`session`</b>: A TensorFlow Session that will soon be closed.
+
 
 
 - - -
 
-### `tf.train.write_graph(graph_or_graph_def, logdir, name, as_text=True)` {#write_graph}
+### `class tf.train.SessionRunArgs` {#SessionRunArgs}
 
-Writes a graph proto to a file.
+Represents arguments to be added to a `Session.run()` call.
 
-The graph is written as a binary proto unless `as_text` is `True`.
+Args:
+  fetches: Exactly like the 'fetches' argument to Session.Run().
+    Can be a single tensor or op, a list of 'fetches' or a dictionary
+    of fetches.  For example:
+      fetches = global_step_tensor
+      fetches = [train_op, summary_op, global_step_tensor]
+      fetches = {'step': global_step_tensor, 'summ': summary_op}
+    Note that this can recurse as expected:
+      fetches = {'step': global_step_tensor,
+                 'ops': [train_op, check_nan_op]}
+  feed_dict: Exactly like the `feed_dict` argument to `Session.Run()`
+  options: Exactly like the `options` argument to `Session.run()`, i.e., a
+    config_pb2.RunOptions proto.
+- - -
 
-```python
-v = tf.Variable(0, name='my_variable')
-sess = tf.Session()
-tf.train.write_graph(sess.graph_def, '/tmp/my-model', 'train.pbtxt')
-```
+#### `tf.train.SessionRunArgs.__getnewargs__()` {#SessionRunArgs.__getnewargs__}
 
-or
+Return self as a plain tuple.  Used by copy and pickle.
 
-```python
-v = tf.Variable(0, name='my_variable')
-sess = tf.Session()
-tf.train.write_graph(sess.graph, '/tmp/my-model', 'train.pbtxt')
-```
 
-##### Args:
+- - -
 
+#### `tf.train.SessionRunArgs.__getstate__()` {#SessionRunArgs.__getstate__}
 
-*  <b>`graph_or_graph_def`</b>: A `Graph` or a `GraphDef` protocol buffer.
-*  <b>`logdir`</b>: Directory where to write the graph. This can refer to remote
-    filesystems, such as Google Cloud Storage (GCS).
-*  <b>`name`</b>: Filename for the graph.
-*  <b>`as_text`</b>: If `True`, writes the graph as an ASCII proto.
+Exclude the OrderedDict from pickling
 
 
 - - -
 
-### `class tf.train.SessionRunHook` {#SessionRunHook}
+#### `tf.train.SessionRunArgs.__new__(cls, fetches, feed_dict=None, options=None)` {#SessionRunArgs.__new__}
+
+
+
 
-Hook to extend calls to MonitoredSession.run().
 - - -
 
-#### `tf.train.SessionRunHook.after_create_session(session, coord)` {#SessionRunHook.after_create_session}
+#### `tf.train.SessionRunArgs.__repr__()` {#SessionRunArgs.__repr__}
 
-Called when new TensorFlow session is created.
+Return a nicely formatted representation string
 
-This is called to signal the hooks that a new session has been created. This
-has two essential differences with the situation in which `begin` is called:
 
-* When this is called, the graph is finalized and ops can no longer be added
-    to the graph.
-* This method will also be called as a result of recovering a wrapped
-    session, not only at the beginning of the overall session.
+- - -
 
-##### Args:
+#### `tf.train.SessionRunArgs.feed_dict` {#SessionRunArgs.feed_dict}
 
+Alias for field number 1
 
-*  <b>`session`</b>: A TensorFlow Session that has been created.
-*  <b>`coord`</b>: A Coordinator object which keeps track of all threads.
+
+- - -
+
+#### `tf.train.SessionRunArgs.fetches` {#SessionRunArgs.fetches}
+
+Alias for field number 0
 
 
 - - -
 
-#### `tf.train.SessionRunHook.after_run(run_context, run_values)` {#SessionRunHook.after_run}
+#### `tf.train.SessionRunArgs.options` {#SessionRunArgs.options}
 
-Called after each call to run().
+Alias for field number 2
 
-The `run_values` argument contains results of requested ops/tensors by
-`before_run()`.
 
-The `run_context` argument is the same one send to `before_run` call.
-`run_context.request_stop()` can be called to stop the iteration.
 
-##### Args:
+- - -
 
+### `class tf.train.SessionRunContext` {#SessionRunContext}
 
-*  <b>`run_context`</b>: A `SessionRunContext` object.
-*  <b>`run_values`</b>: A SessionRunValues object.
+Provides information about the `session.run()` call being made.
+
+Provides information about original request to `Session.Run()` function.
+SessionRunHook objects can stop the loop by calling `request_stop()` of
+`run_context`. In the future we may use this object to add more information
+about run without changing the Hook API.
+- - -
+
+#### `tf.train.SessionRunContext.__init__(original_args, session)` {#SessionRunContext.__init__}
+
+Initializes SessionRunContext.
 
 
 - - -
 
-#### `tf.train.SessionRunHook.before_run(run_context)` {#SessionRunHook.before_run}
+#### `tf.train.SessionRunContext.original_args` {#SessionRunContext.original_args}
 
-Called before each call to run().
+A `SessionRunArgs` object holding the original arguments of `run()`.
 
-You can return from this call a `SessionRunArgs` object indicating ops or
-tensors to add to the upcoming `run()` call.  These ops/tensors will be run
-together with the ops/tensors originally passed to the original run() call.
-The run args you return can also contain feeds to be added to the run()
-call.
+If user called `MonitoredSession.run(fetches=a, feed_dict=b)`, then this
+field is equal to SessionRunArgs(a, b).
 
-The `run_context` argument is a `SessionRunContext` that provides
-information about the upcoming `run()` call: the originally requested
-op/tensors, the TensorFlow Session.
+##### Returns:
 
-At this point graph is finalized and you can not add ops.
+ A `SessionRunArgs` object
 
-##### Args:
 
+- - -
+
+#### `tf.train.SessionRunContext.request_stop()` {#SessionRunContext.request_stop}
 
-*  <b>`run_context`</b>: A `SessionRunContext` object.
+Sets stop requested field.
+
+Hooks can use this function to request stop of iterations.
+`MonitoredSession` checks whether this is called or not.
+
+
+- - -
+
+#### `tf.train.SessionRunContext.session` {#SessionRunContext.session}
+
+A TensorFlow session object which will execute the `run`.
+
+
+- - -
+
+#### `tf.train.SessionRunContext.stop_requested` {#SessionRunContext.stop_requested}
+
+Returns whether a stop is requested or not.
+
+If true, `MonitoredSession` stops iterations.
 
 ##### Returns:
 
-  None or a `SessionRunArgs` object.
+  A `bool`
+
 
 
 - - -
 
-#### `tf.train.SessionRunHook.begin()` {#SessionRunHook.begin}
+### `class tf.train.SessionRunValues` {#SessionRunValues}
 
-Called once before using the session.
+Contains the results of `Session.run()`.
 
-When called, the default graph is the one that will be launched in the
-session.  The hook can modify the graph by adding new operations to it.
-After the `begin()` call the graph will be finalized and the other callbacks
-can not modify the graph anymore. Second call of `begin()` on the same
-graph, should not change the graph.
+In the future we may use this object to add more information about result of
+run without changing the Hook API.
+
+Args:
+  results: The return values from `Session.run()` corresponding to the fetches
+    attribute returned in the RunArgs. Note that this has the same shape as
+    the RunArgs fetches.  For example:
+      fetches = global_step_tensor
+      => results = nparray(int)
+      fetches = [train_op, summary_op, global_step_tensor]
+      => results = [None, nparray(string), nparray(int)]
+      fetches = {'step': global_step_tensor, 'summ': summary_op}
+      => results = {'step': nparray(int), 'summ': nparray(string)}
+  options: `RunOptions` from the `Session.run()` call.
+  run_metadata: `RunMetadata` from the `Session.run()` call.
+- - -
+
+#### `tf.train.SessionRunValues.__getnewargs__()` {#SessionRunValues.__getnewargs__}
+
+Return self as a plain tuple.  Used by copy and pickle.
 
 
 - - -
 
-#### `tf.train.SessionRunHook.end(session)` {#SessionRunHook.end}
+#### `tf.train.SessionRunValues.__getstate__()` {#SessionRunValues.__getstate__}
 
-Called at the end of session.
+Exclude the OrderedDict from pickling
 
-The `session` argument can be used in case the hook wants to run final ops,
-such as saving a last checkpoint.
 
-##### Args:
+- - -
 
+#### `tf.train.SessionRunValues.__new__(_cls, results, options, run_metadata)` {#SessionRunValues.__new__}
+
+Create new instance of SessionRunValues(results, options, run_metadata)
+
+
+- - -
+
+#### `tf.train.SessionRunValues.__repr__()` {#SessionRunValues.__repr__}
+
+Return a nicely formatted representation string
+
+
+- - -
+
+#### `tf.train.SessionRunValues.options` {#SessionRunValues.options}
+
+Alias for field number 1
+
+
+- - -
+
+#### `tf.train.SessionRunValues.results` {#SessionRunValues.results}
+
+Alias for field number 0
+
+
+- - -
+
+#### `tf.train.SessionRunValues.run_metadata` {#SessionRunValues.run_metadata}
+
+Alias for field number 2
 
-*  <b>`session`</b>: A TensorFlow Session that will be soon closed.
 
 
 
@@ -4349,7 +4661,7 @@ Prints the given tensors once every N local steps or once every N seconds.
 The tensors will be printed to the log, with `INFO` severity.
 - - -
 
-#### `tf.train.LoggingTensorHook.__init__(tensors, every_n_iter=None, every_n_secs=None)` {#LoggingTensorHook.__init__}
+#### `tf.train.LoggingTensorHook.__init__(tensors, every_n_iter=None, every_n_secs=None, formatter=None)` {#LoggingTensorHook.__init__}
 
 Initializes a LoggingHook monitor.
 
@@ -4363,6 +4675,8 @@ Initializes a LoggingHook monitor.
 *  <b>`every_n_secs`</b>: `int` or `float`, print the values of `tensors` once every N
       seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
       provided.
+*  <b>`formatter`</b>: function, takes dict of `tag`->`Tensor` and returns a string.
+      If `None` uses default printing all tensors.
 
 ##### Raises:
 
@@ -4953,431 +5267,343 @@ such as saving a last checkpoint.
 
 - - -
 
-### `class tf.train.SessionRunArgs` {#SessionRunArgs}
-
-Represents arguments to be added to a `Session.run()` call.
+### `class tf.train.FinalOpsHook` {#FinalOpsHook}
 
-Args:
-  fetches: Exactly like the 'fetches' argument to Session.Run().
-    Can be a single tensor or op, a list of 'fetches' or a dictionary
-    of fetches.  For example:
-      fetches = global_step_tensor
-      fetches = [train_op, summary_op, global_step_tensor]
-      fetches = {'step': global_step_tensor, 'summ': summary_op}
-    Note that this can recurse as expected:
-      fetches = {'step': global_step_tensor,
-                 'ops': [train_op, check_nan_op]}
-  feed_dict: Exactly like the `feed_dict` argument to `Session.Run()`
-  options: Exactly like the `options` argument to `Session.run()`, i.e., a
-    config_pb2.RunOptions proto.
+A run hook which evaluates `Tensors` at the end of a session.
 - - -
 
-#### `tf.train.SessionRunArgs.__getnewargs__()` {#SessionRunArgs.__getnewargs__}
+#### `tf.train.FinalOpsHook.__init__(final_ops, final_ops_feed_dict=None)` {#FinalOpsHook.__init__}
 
-Return self as a plain tuple.  Used by copy and pickle.
+Constructs the FinalOpsHook with ops to run at the end of the session.
 
+##### Args:
 
-- - -
 
-#### `tf.train.SessionRunArgs.__getstate__()` {#SessionRunArgs.__getstate__}
-
-Exclude the OrderedDict from pickling
+*  <b>`final_ops`</b>: A single `Tensor`, a list of `Tensors` or a dictionary of
+    names to `Tensors`.
+*  <b>`final_ops_feed_dict`</b>: A feed dictionary to use when running
+    `final_ops`.
 
 
 - - -
 
-#### `tf.train.SessionRunArgs.__new__(cls, fetches, feed_dict=None, options=None)` {#SessionRunArgs.__new__}
-
+#### `tf.train.FinalOpsHook.after_create_session(session, coord)` {#FinalOpsHook.after_create_session}
 
+Called when a new TensorFlow session is created.
 
+This is called to signal the hooks that a new session has been created. This
+has two essential differences with the situation in which `begin` is called:
 
-- - -
-
-#### `tf.train.SessionRunArgs.__repr__()` {#SessionRunArgs.__repr__}
-
-Return a nicely formatted representation string
-
+* When this is called, the graph is finalized and ops can no longer be added
+    to the graph.
+* This method will also be called as a result of recovering a wrapped
+    session, not only at the beginning of the overall session.
 
-- - -
+##### Args:
 
-#### `tf.train.SessionRunArgs.feed_dict` {#SessionRunArgs.feed_dict}
 
-Alias for field number 1
+*  <b>`session`</b>: A TensorFlow Session that has been created.
+*  <b>`coord`</b>: A Coordinator object which keeps track of all threads.
 
 
 - - -
 
-#### `tf.train.SessionRunArgs.fetches` {#SessionRunArgs.fetches}
+#### `tf.train.FinalOpsHook.after_run(run_context, run_values)` {#FinalOpsHook.after_run}
 
-Alias for field number 0
+Called after each call to run().
 
+The `run_values` argument contains results of requested ops/tensors by
+`before_run()`.
 
-- - -
+The `run_context` argument is the same one sent to the `before_run` call.
+`run_context.request_stop()` can be called to stop the iteration.
 
-#### `tf.train.SessionRunArgs.options` {#SessionRunArgs.options}
+##### Args:
 
-Alias for field number 2
 
+*  <b>`run_context`</b>: A `SessionRunContext` object.
+*  <b>`run_values`</b>: A SessionRunValues object.
 
 
 - - -
 
-### `class tf.train.SessionRunContext` {#SessionRunContext}
-
-Provides information about the `session.run()` call being made.
-
-Provides information about original request to `Session.Run()` function.
-SessionRunHook objects can stop the loop by calling `request_stop()` of
-`run_context`. In the future we may use this object to add more information
-about run without changing the Hook API.
-- - -
+#### `tf.train.FinalOpsHook.before_run(run_context)` {#FinalOpsHook.before_run}
 
-#### `tf.train.SessionRunContext.__init__(original_args, session)` {#SessionRunContext.__init__}
+Called before each call to run().
 
-Initializes SessionRunContext.
+You can return from this call a `SessionRunArgs` object indicating ops or
+tensors to add to the upcoming `run()` call.  These ops/tensors will be run
+together with the ops/tensors originally passed to the original run() call.
+The run args you return can also contain feeds to be added to the run()
+call.
 
+The `run_context` argument is a `SessionRunContext` that provides
+information about the upcoming `run()` call: the originally requested
+op/tensors, the TensorFlow Session.
 
-- - -
+At this point the graph is finalized and you cannot add ops.
 
-#### `tf.train.SessionRunContext.original_args` {#SessionRunContext.original_args}
+##### Args:
 
-A `SessionRunArgs` object holding the original arguments of `run()`.
 
-If user called `MonitoredSession.run(fetches=a, feed_dict=b)`, then this
-field is equal to SessionRunArgs(a, b).
+*  <b>`run_context`</b>: A `SessionRunContext` object.
 
 ##### Returns:
 
- A `SessionRunArgs` object
+  None or a `SessionRunArgs` object.
 
 
 - - -
 
-#### `tf.train.SessionRunContext.request_stop()` {#SessionRunContext.request_stop}
-
-Sets stop requested field.
-
-Hooks can use this function to request stop of iterations.
-`MonitoredSession` checks whether this is called or not.
-
-
-- - -
+#### `tf.train.FinalOpsHook.begin()` {#FinalOpsHook.begin}
 
-#### `tf.train.SessionRunContext.session` {#SessionRunContext.session}
+Called once before using the session.
 
-A TensorFlow session object which will execute the `run`.
+When called, the default graph is the one that will be launched in the
+session.  The hook can modify the graph by adding new operations to it.
+After the `begin()` call the graph will be finalized and the other callbacks
+cannot modify the graph anymore. A second call of `begin()` on the same
+graph should not change the graph.
 
 
 - - -
 
-#### `tf.train.SessionRunContext.stop_requested` {#SessionRunContext.stop_requested}
+#### `tf.train.FinalOpsHook.end(session)` {#FinalOpsHook.end}
 
-Returns whether a stop is requested or not.
-
-If true, `MonitoredSession` stops iterations.
-
-##### Returns:
-
-  A `bool`
 
 
 
 - - -
 
-### `class tf.train.SessionRunValues` {#SessionRunValues}
+#### `tf.train.FinalOpsHook.final_ops_values` {#FinalOpsHook.final_ops_values}
 
-Contains the results of `Session.run()`.
 
-In the future we may use this object to add more information about result of
-run without changing the Hook API.
 
-Args:
-  results: The return values from `Session.run()` corresponding to the fetches
-    attribute returned in the RunArgs. Note that this has the same shape as
-    the RunArgs fetches.  For example:
-      fetches = global_step_tensor
-      => results = nparray(int)
-      fetches = [train_op, summary_op, global_step_tensor]
-      => results = [None, nparray(string), nparray(int)]
-      fetches = {'step': global_step_tensor, 'summ': summary_op}
-      => results = {'step': nparray(int), 'summ': nparray(string)}
-  options: `RunOptions` from the `Session.run()` call.
-  run_metadata: `RunMetadata` from the `Session.run()` call.
-- - -
-
-#### `tf.train.SessionRunValues.__getnewargs__()` {#SessionRunValues.__getnewargs__}
-
-Return self as a plain tuple.  Used by copy and pickle.
-
-
-- - -
-
-#### `tf.train.SessionRunValues.__getstate__()` {#SessionRunValues.__getstate__}
-
-Exclude the OrderedDict from pickling
 
 
 - - -
 
-#### `tf.train.SessionRunValues.__new__(_cls, results, options, run_metadata)` {#SessionRunValues.__new__}
-
-Create new instance of SessionRunValues(results, options, run_metadata)
-
+### `class tf.train.FeedFnHook` {#FeedFnHook}
 
+Runs `feed_fn` and sets the `feed_dict` accordingly.
 - - -
 
-#### `tf.train.SessionRunValues.__repr__()` {#SessionRunValues.__repr__}
-
-Return a nicely formatted representation string
+#### `tf.train.FeedFnHook.__init__(feed_fn)` {#FeedFnHook.__init__}
 
+Constructs the FeedFnHook with given `feed_fn`.
 
-- - -
+##### Args:
 
-#### `tf.train.SessionRunValues.options` {#SessionRunValues.options}
 
-Alias for field number 1
+*  <b>`feed_fn`</b>: function, no arguments and returns `dict` to feed.
 
 
 - - -
 
-#### `tf.train.SessionRunValues.results` {#SessionRunValues.results}
+#### `tf.train.FeedFnHook.after_create_session(session, coord)` {#FeedFnHook.after_create_session}
 
-Alias for field number 0
+Called when a new TensorFlow session is created.
 
+This is called to signal the hooks that a new session has been created. This
+has two essential differences with the situation in which `begin` is called:
 
-- - -
+* When this is called, the graph is finalized and ops can no longer be added
+    to the graph.
+* This method will also be called as a result of recovering a wrapped
+    session, not only at the beginning of the overall session.
 
-#### `tf.train.SessionRunValues.run_metadata` {#SessionRunValues.run_metadata}
+##### Args:
 
-Alias for field number 2
 
+*  <b>`session`</b>: A TensorFlow Session that has been created.
+*  <b>`coord`</b>: A Coordinator object which keeps track of all threads.
 
 
 - - -
 
-### `class tf.train.LooperThread` {#LooperThread}
-
-A thread that runs code repeatedly, optionally on a timer.
+#### `tf.train.FeedFnHook.after_run(run_context, run_values)` {#FeedFnHook.after_run}
 
-This thread class is intended to be used with a `Coordinator`.  It repeatedly
-runs code specified either as `target` and `args` or by the `run_loop()`
-method.
-
-Before each run the thread checks if the coordinator has requested stop.  In
-that case the looper thread terminates immediately.
-
-If the code being run raises an exception, that exception is reported to the
-coordinator and the thread terminates.  The coordinator will then request all
-the other threads it coordinates to stop.
-
-You typically pass looper threads to the supervisor `Join()` method.
-- - -
+Called after each call to run().
 
-#### `tf.train.LooperThread.__init__(coord, timer_interval_secs, target=None, args=None, kwargs=None)` {#LooperThread.__init__}
+The `run_values` argument contains results of requested ops/tensors by
+`before_run()`.
 
-Create a LooperThread.
+The `run_context` argument is the same one sent to the `before_run` call.
+`run_context.request_stop()` can be called to stop the iteration.
 
 ##### Args:
 
 
-*  <b>`coord`</b>: A Coordinator.
-*  <b>`timer_interval_secs`</b>: Time boundaries at which to call Run(), or None
-    if it should be called back to back.
-*  <b>`target`</b>: Optional callable object that will be executed in the thread.
-*  <b>`args`</b>: Optional arguments to pass to `target` when calling it.
-*  <b>`kwargs`</b>: Optional keyword arguments to pass to `target` when calling it.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If one of the arguments is invalid.
+*  <b>`run_context`</b>: A `SessionRunContext` object.
+*  <b>`run_values`</b>: A SessionRunValues object.
 
 
 - - -
 
-#### `tf.train.LooperThread.__repr__()` {#LooperThread.__repr__}
+#### `tf.train.FeedFnHook.before_run(run_context)` {#FeedFnHook.before_run}
 
 
 
 
 - - -
 
-#### `tf.train.LooperThread.daemon` {#LooperThread.daemon}
-
-A boolean value indicating whether this thread is a daemon thread (True) or not (False).
-
-This must be set before start() is called, otherwise RuntimeError is
-raised. Its initial value is inherited from the creating thread; the
-main thread is not a daemon thread and therefore all threads created in
-the main thread default to daemon = False.
-
-The entire Python program exits when no alive non-daemon threads are
-left.
-
-
-- - -
-
-#### `tf.train.LooperThread.getName()` {#LooperThread.getName}
+#### `tf.train.FeedFnHook.begin()` {#FeedFnHook.begin}
 
+Called once before using the session.
 
+When called, the default graph is the one that will be launched in the
+session.  The hook can modify the graph by adding new operations to it.
+After the `begin()` call the graph will be finalized and the other callbacks
+cannot modify the graph anymore. A second call of `begin()` on the same
+graph should not change the graph.
 
 
 - - -
 
-#### `tf.train.LooperThread.ident` {#LooperThread.ident}
+#### `tf.train.FeedFnHook.end(session)` {#FeedFnHook.end}
 
-Thread identifier of this thread or None if it has not been started.
-
-This is a nonzero integer. See the thread.get_ident() function. Thread
-identifiers may be recycled when a thread exits and another thread is
-created. The identifier is available even after the thread has exited.
-
-
-- - -
-
-#### `tf.train.LooperThread.isAlive()` {#LooperThread.isAlive}
+Called at the end of session.
 
-Return whether the thread is alive.
+The `session` argument can be used in case the hook wants to run final ops,
+such as saving a last checkpoint.
 
-This method returns True just before the run() method starts until just
-after the run() method terminates. The module function enumerate()
-returns a list of all alive threads.
+##### Args:
 
 
-- - -
+*  <b>`session`</b>: A TensorFlow Session that will be soon closed.
 
-#### `tf.train.LooperThread.isDaemon()` {#LooperThread.isDaemon}
 
 
 
+## Training Utilities
 
 - - -
 
-#### `tf.train.LooperThread.is_alive()` {#LooperThread.is_alive}
-
-Return whether the thread is alive.
-
-This method returns True just before the run() method starts until just
-after the run() method terminates. The module function enumerate()
-returns a list of all alive threads.
-
+### `tf.train.global_step(sess, global_step_tensor)` {#global_step}
 
-- - -
+Small helper to get the global step.
 
-#### `tf.train.LooperThread.join(timeout=None)` {#LooperThread.join}
+```python
+# Creates a variable to hold the global_step.
+global_step_tensor = tf.Variable(10, trainable=False, name='global_step')
+# Creates a session.
+sess = tf.Session()
+# Initializes the variable.
+print('global_step: %s' % tf.train.global_step(sess, global_step_tensor))
 
-Wait until the thread terminates.
+global_step: 10
+```
 
-This blocks the calling thread until the thread whose join() method is
-called terminates -- either normally or through an unhandled exception
-or until the optional timeout occurs.
+##### Args:
 
-When the timeout argument is present and not None, it should be a
-floating point number specifying a timeout for the operation in seconds
-(or fractions thereof). As join() always returns None, you must call
-isAlive() after join() to decide whether a timeout happened -- if the
-thread is still alive, the join() call timed out.
 
-When the timeout argument is not present or None, the operation will
-block until the thread terminates.
+*  <b>`sess`</b>: A TensorFlow `Session` object.
+*  <b>`global_step_tensor`</b>: `Tensor` or the `name` of the operation that contains
+    the global step.
 
-A thread can be join()ed many times.
+##### Returns:
 
-join() raises a RuntimeError if an attempt is made to join the current
-thread as that would cause a deadlock. It is also an error to join() a
-thread before it has been started and attempts to do so raises the same
-exception.
+  The global step value.
 
 
 - - -
 
-#### `tf.train.LooperThread.loop(coord, timer_interval_secs, target, args=None, kwargs=None)` {#LooperThread.loop}
+### `tf.train.basic_train_loop(supervisor, train_step_fn, args=None, kwargs=None, master='')` {#basic_train_loop}
 
-Start a LooperThread that calls a function periodically.
+Basic loop to train a model.
 
-If `timer_interval_secs` is None the thread calls `target(args)`
-repeatedly.  Otherwise `target(args)` is called every `timer_interval_secs`
-seconds.  The thread terminates when a stop of the coordinator is
-requested.
+Calls `train_step_fn` in a loop to train a model.  The function is called as:
 
-##### Args:
+```python
+train_step_fn(session, *args, **kwargs)
+```
 
+It is passed a `tf.Session` in addition to `args` and `kwargs`.  The function
+typically runs one training step in the session.
 
-*  <b>`coord`</b>: A Coordinator.
-*  <b>`timer_interval_secs`</b>: Number. Time boundaries at which to call `target`.
-*  <b>`target`</b>: A callable object.
-*  <b>`args`</b>: Optional arguments to pass to `target` when calling it.
-*  <b>`kwargs`</b>: Optional keyword arguments to pass to `target` when calling it.
+##### Args:
 
-##### Returns:
 
-  The started thread.
+*  <b>`supervisor`</b>: `tf.train.Supervisor` to run the training services.
+*  <b>`train_step_fn`</b>: Callable to execute one training step.  Called
+    repeatedly as `train_step_fn(session, *args, **kwargs)`.
+*  <b>`args`</b>: Optional positional arguments passed to `train_step_fn`.
+*  <b>`kwargs`</b>: Optional keyword arguments passed to `train_step_fn`.
+*  <b>`master`</b>: Master to use to create the training session.  Defaults to
+    `""` which causes the session to be created in the local process.
 
 
 - - -
 
-#### `tf.train.LooperThread.name` {#LooperThread.name}
-
-A string used for identification purposes only.
+### `tf.train.get_global_step(graph=None)` {#get_global_step}
 
-It has no semantics. Multiple threads may be given the same name. The
-initial name is set by the constructor.
+Get the global step tensor.
 
+The global step tensor must be an integer variable. We first try to find it
+in the collection `GLOBAL_STEP`, or by name `global_step:0`.
 
-- - -
+##### Args:
 
-#### `tf.train.LooperThread.run()` {#LooperThread.run}
 
+*  <b>`graph`</b>: The graph to find the global step in. If missing, use default graph.
 
+##### Returns:
 
+  The global step variable, or `None` if none was found.
 
-- - -
+##### Raises:
 
-#### `tf.train.LooperThread.run_loop()` {#LooperThread.run_loop}
 
-Called at 'timer_interval_secs' boundaries.
+*  <b>`TypeError`</b>: If the global step tensor has a non-integer type, or if it is not
+    a `Variable`.
 
 
 - - -
 
-#### `tf.train.LooperThread.setDaemon(daemonic)` {#LooperThread.setDaemon}
-
-
-
+### `tf.train.assert_global_step(global_step_tensor)` {#assert_global_step}
 
-- - -
+Asserts `global_step_tensor` is a scalar int `Variable` or `Tensor`.
 
-#### `tf.train.LooperThread.setName(name)` {#LooperThread.setName}
+##### Args:
 
 
+*  <b>`global_step_tensor`</b>: `Tensor` to test.
 
 
 - - -
 
-#### `tf.train.LooperThread.start()` {#LooperThread.start}
-
-Start the thread's activity.
-
-It must be called at most once per thread object. It arranges for the
-object's run() method to be invoked in a separate thread of control.
+### `tf.train.write_graph(graph_or_graph_def, logdir, name, as_text=True)` {#write_graph}
 
-This method will raise a RuntimeError if called more than once on the
-same thread object.
+Writes a graph proto to a file.
 
+The graph is written as a binary proto unless `as_text` is `True`.
 
-- - -
+```python
+v = tf.Variable(0, name='my_variable')
+sess = tf.Session()
+tf.train.write_graph(sess.graph_def, '/tmp/my-model', 'train.pbtxt')
+```
 
-#### `tf.train.LooperThread.start_loop()` {#LooperThread.start_loop}
+or
 
-Called when the thread starts.
+```python
+v = tf.Variable(0, name='my_variable')
+sess = tf.Session()
+tf.train.write_graph(sess.graph, '/tmp/my-model', 'train.pbtxt')
+```
 
+##### Args:
 
-- - -
 
-#### `tf.train.LooperThread.stop_loop()` {#LooperThread.stop_loop}
+*  <b>`graph_or_graph_def`</b>: A `Graph` or a `GraphDef` protocol buffer.
+*  <b>`logdir`</b>: Directory where to write the graph. This can refer to remote
+    filesystems, such as Google Cloud Storage (GCS).
+*  <b>`name`</b>: Filename for the graph.
+*  <b>`as_text`</b>: If `True`, writes the graph as an ASCII proto.
 
-Called when the thread stops.
+##### Returns:
 
+  The path of the output proto file.
 
 
 
diff --git a/tensorflow/g3doc/tutorials/tflearn/index.md b/tensorflow/g3doc/tutorials/tflearn/index.md
index b6e26ee351..9f6485e30b 100644
--- a/tensorflow/g3doc/tutorials/tflearn/index.md
+++ b/tensorflow/g3doc/tutorials/tflearn/index.md
@@ -202,8 +202,8 @@ The code above first defines the model's feature columns, which specify the data
 type for the features in the data set. All the feature data is continuous, so
 `tf.contrib.layers.real_valued_column` is the appropriate function to use to
 construct the feature columns. There are four features in the data set (sepal
-width, sepal height, petal width, and petal height), so `dimensions` must be set
-accordingly to `4` to hold all the data.
+width, sepal length, petal width, and petal length), so accordingly `dimension`
+must be set to `4` to hold all the data.
 
 Then, the code creates a `DNNClassifier` model using the following arguments:
 
diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go
index 75c111e957..d9ebec0f8c 100644
--- a/tensorflow/go/genop/internal/genop.go
+++ b/tensorflow/go/genop/internal/genop.go
@@ -395,7 +395,7 @@ func goType(tfType string) (string, error) {
 	case "type":
 		gotype = "tf.DataType"
 	case "shape":
-		gotype = "[]int64"
+		gotype = "tf.Shape"
 	case "tensor":
 		gotype = "tf.Tensor"
 	case "string":
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index 2eb1194610..c0f91ffb30 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -259,13 +259,38 @@ func setAttr(cdesc *C.TF_OperationDescription, status *status, name string, valu
 		if err := status.Err(); err != nil {
 			return fmt.Errorf("bad value for attribute %q: %v", name, err)
 		}
+	case Shape:
+		ndims, dims := cshape(value)
+		var dimsp *C.int64_t
+		if ndims > 0 {
+			dimsp = &dims[0]
+		}
+		C.TF_SetAttrShape(cdesc, cAttrName, dimsp, ndims)
+	case []Shape:
+		ndims := make([]C.int, len(value))
+		dims := make([][]C.int64_t, len(value))
+		dimsp := make([]*C.int64_t, len(value))
+		for i, s := range value {
+			ndims[i], dims[i] = cshape(s)
+			if ndims[i] > 0 {
+				dimsp[i] = &dims[i][0]
+			}
+		}
+		C.TF_SetAttrShapeList(cdesc, cAttrName, &dimsp[0], &ndims[0], C.int(len(value)))
 	default:
-		// Shapes can be done, but will require that it be
-		// distinguishable from []int64. Which is fine, it
-		// probably makes sense to define a Shape type anyway,
-		// since that should handle partially known shapes as
-		// well and hide the special meaning of -1?
 		return fmt.Errorf("attribute %q has a type (%T) which is not valid for operation attributes", name, value)
 	}
 	return nil
 }
+
+func cshape(s Shape) (C.int, []C.int64_t) {
+	rank := s.NumDimensions()
+	if rank < 0 { // unknown number of dimensions: no dims array to hand to C
+		return -1, nil
+	}
+	cdims := make([]C.int64_t, rank)
+	for i, size := range s.dims {
+		cdims[i] = C.int64_t(size)
+	}
+	return C.int(rank), cdims
+}
diff --git a/tensorflow/go/op/op_test.go b/tensorflow/go/op/op_test.go
new file mode 100644
index 0000000000..eaa27bfcd0
--- /dev/null
+++ b/tensorflow/go/op/op_test.go
@@ -0,0 +1,33 @@
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Tests for the generated code of some operations.
+
+package op
+
+import (
+	"testing"
+
+	tf "github.com/tensorflow/tensorflow/tensorflow/go"
+)
+
+func TestPlaceholder(t *testing.T) {
+	s := NewScope()
+	Placeholder(s.SubScope("x"), tf.Float, PlaceholderShape(tf.MakeShape(-1, 10))) // first dimension unknown
+	Placeholder(s.SubScope("y"), tf.Float, PlaceholderShape(tf.ScalarShape()))     // scalar
+	Placeholder(s.SubScope("z"), tf.Float, PlaceholderShape(tf.Shape{}))           // zero-value Shape
+	if _, err := s.Finalize(); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/tensorflow/go/operation_test.go b/tensorflow/go/operation_test.go
index 8080515ee9..4c4c960448 100644
--- a/tensorflow/go/operation_test.go
+++ b/tensorflow/go/operation_test.go
@@ -81,6 +81,21 @@ func TestOperationOutputListSize(t *testing.T) {
 	}
 }
 
+func TestOperationShapeAttribute(t *testing.T) {
+	g := NewGraph()
+	_, err := g.AddOperation(OpSpec{
+		Type: "Placeholder",
+		Attrs: map[string]interface{}{
+			"dtype": Float,
+			"shape": MakeShape(-1, 3), // Shape-typed attribute value; first dimension unknown
+		},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	// If and when the API to get attributes is added, check that here.
+}
+
 func TestOutputShape(t *testing.T) {
 	graph := NewGraph()
 	testdata := []struct {
diff --git a/tensorflow/go/shape.go b/tensorflow/go/shape.go
new file mode 100644
index 0000000000..c48bbf29a3
--- /dev/null
+++ b/tensorflow/go/shape.go
@@ -0,0 +1,102 @@
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tensorflow
+
+import (
+	"fmt"
+	"strings"
+)
+
+// Shape represents the (possibly partially known) shape of a tensor that will
+// be produced by an operation.
+//
+// The zero-value of a Shape represents a shape with an unknown number of
+// dimensions.
+type Shape struct {
+	dims []int64 // nil: unknown number of dimensions; an entry of -1: unknown size
+}
+
+// ScalarShape returns the Shape of a scalar: zero dimensions, all of them known.
+func ScalarShape() Shape {
+	return Shape{dims: []int64{}} // empty but non-nil, so the rank is known to be 0
+}
+
+// MakeShape builds a Shape from the given per-dimension sizes.
+//
+// Pass -1 for any dimension whose size is not known. The sizes are copied,
+// so the returned Shape does not alias the caller's slice.
+func MakeShape(shape ...int64) Shape {
+	dims := make([]int64, 0, len(shape))
+	dims = append(dims, shape...)
+	return Shape{dims: dims}
+}
+
+// NumDimensions reports how many dimensions s has, or -1 when the number of
+// dimensions is unknown.
+func (s Shape) NumDimensions() int {
+	if s.dims != nil {
+		return len(s.dims)
+	}
+	return -1
+}
+
+// Size returns the size of the dim-th dimension of the shape, or -1 if it
+// is unknown or if dim lies outside the range [0, s.NumDimensions()).
+//
+// REQUIRES: 0 <= dim < s.NumDimensions()
+func (s Shape) Size(dim int) int64 {
+	if dim < 0 || dim >= s.NumDimensions() { // >=: dim == rank would index past the end of s.dims
+		return -1
+	}
+	return s.dims[dim]
+}
+
+// IsFullySpecified returns true iff the number of dimensions of s is known
+// and the size of every dimension is known (i.e. non-negative).
+func (s Shape) IsFullySpecified() bool {
+	if s.dims == nil {
+		return false
+	}
+	for _, size := range s.dims {
+		if size < 0 { // only negative sizes (-1) mean "unknown"; 0 and 1 are valid known sizes
+			return false
+		}
+	}
+	return true
+}
+
+// ToSlice returns a copy of the (possibly partially known) dimension sizes of
+// s, or an error if the number of dimensions is not known.
+func (s Shape) ToSlice() ([]int64, error) {
+	if s.dims == nil {
+		return nil, fmt.Errorf("cannot create a slice for a Shape with an unknown number of dimensions")
+	}
+	out := make([]int64, 0, len(s.dims)) // fresh backing array, so callers cannot mutate s
+	out = append(out, s.dims...)
+	return out, nil
+}
+
+func (s Shape) String() string {
+	if s.dims == nil {
+		return "?"
+	}
+	parts := make([]string, len(s.dims)) // one rendered entry per dimension
+	for i, size := range s.dims {
+		if parts[i] = "?"; size >= 0 { // negative (unknown) sizes stay rendered as "?"
+			parts[i] = fmt.Sprint(size)
+		}
+	}
+	return "[" + strings.Join(parts, ", ") + "]"
+}
diff --git a/tensorflow/go/shape_test.go b/tensorflow/go/shape_test.go
new file mode 100644
index 0000000000..f8f3d4e94b
--- /dev/null
+++ b/tensorflow/go/shape_test.go
@@ -0,0 +1,83 @@
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tensorflow
+
+import (
+	"fmt"
+	"reflect"
+	"testing"
+)
+
+func TestShape(t *testing.T) {
+	tests := []struct { // table of shapes and the results expected from each accessor
+		shape Shape   // shape under test
+		slice []int64 // expected ToSlice() result; its length is the expected NumDimensions()
+		full  bool    // expected IsFullySpecified() result
+		str   string  // expected String() result
+	}{
+		{
+			shape: ScalarShape(), // zero dimensions
+			slice: make([]int64, 0),
+			full:  true,
+			str:   "[]",
+		},
+		{
+			shape: MakeShape(-1, 2, -1, 4), // dimensions 0 and 2 unknown
+			slice: []int64{-1, 2, -1, 4},
+			full:  false,
+			str:   "[?, 2, ?, 4]",
+		},
+		{
+			shape: MakeShape(2, 3), // every dimension known
+			slice: []int64{2, 3},
+			full:  true,
+			str:   "[2, 3]",
+		},
+	}
+	for _, test := range tests {
+		t.Run(fmt.Sprintf("%#v", test.shape), func(t *testing.T) { // one subtest per table entry
+			if got, want := test.shape.NumDimensions(), len(test.slice); got != want {
+				t.Errorf("Got %v, want %v", got, want)
+			}
+			if gotSlice, err := test.shape.ToSlice(); err != nil || !reflect.DeepEqual(gotSlice, test.slice) {
+				t.Errorf("Got (%#v, %v), want (%#v, nil)", gotSlice, err, test.slice)
+			}
+			if got, want := test.shape.IsFullySpecified(), test.full; got != want {
+				t.Errorf("Got %v, want %v", got, want)
+			}
+			if got, want := test.shape.String(), test.str; got != want {
+				t.Errorf("Got %v, want %v", got, want)
+			}
+		})
+	}
+
+}
+
+func TestZeroShape(t *testing.T) {
+	var s Shape // zero value: a shape with an unknown number of dimensions
+	if s.NumDimensions() != -1 {
+		t.Error(s.NumDimensions())
+	}
+	if _, err := s.ToSlice(); err == nil {
+		t.Error("ToSlice() on a Shape of unknown number of dimensions should fail")
+	}
+	if s.IsFullySpecified() {
+		t.Error("Shape of unknown number of dimensions should not be fully specified")
+	}
+	if got, want := s.String(), "?"; got != want {
+		t.Errorf("Got %q, want %q", got, want)
+	}
+
+}
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
index b13f830631..cb3de5f744 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
@@ -17,6 +17,7 @@ package org.tensorflow;
 
 import static org.junit.Assert.fail;
 
+import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -26,7 +27,8 @@ import org.junit.runners.JUnit4;
 public class OperationBuilderTest {
   // TODO(ashankar): Restore this test once the C API gracefully handles mixing graphs and
   // operations instead of segfaulting.
-  // @Test
+  @Test
+  @Ignore
   public void failWhenMixingOperationsOnDifferentGraphs() {
     try (Graph g1 = new Graph();
         Graph g2 = new Graph()) {
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index c1a8191def..248d4c9b81 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -1308,7 +1308,12 @@ class InteractiveSession(BaseSession):
       config: (Optional) `ConfigProto` proto used to configure the session.
     """
     if not config:
-      config = config_pb2.ConfigProto()
+      # If config is not provided, choose some reasonable defaults for
+      # interactive use:
+      #
+      #   - Grow GPU memory as needed at the cost of fragmentation.
+      gpu_options = config_pb2.GPUOptions(allow_growth=True)
+      config = config_pb2.ConfigProto(gpu_options=gpu_options)
     # Interactive sessions always place pruned graphs.
     config.graph_options.place_pruned_graph = True
 
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 0aa5ce0a60..9ad8a1121f 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -592,6 +592,18 @@ py_test(
     ],
 )
 
+sh_test(
+    name = "examples_test",
+    size = "small",
+    srcs = ["examples/examples_test.sh"],
+    data = [
+        ":debug_errors",
+        ":debug_fibonacci",
+        ":debug_mnist",
+        ":debug_tflearn_iris",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/python/debug/examples/debug_fibonacci.py b/tensorflow/python/debug/examples/debug_fibonacci.py
index 14722ecd08..6fdc78b605 100644
--- a/tensorflow/python/debug/examples/debug_fibonacci.py
+++ b/tensorflow/python/debug/examples/debug_fibonacci.py
@@ -45,7 +45,7 @@ def main(_):
   sess.run(tf.global_variables_initializer())
 
   # Wrap the TensorFlow Session object for debugging.
-  sess = tf_debug.LocalCLIDebugWrapperSession(sess)
+  sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type)
 
   sess.run(n1)
 
@@ -66,5 +66,10 @@ if __name__ == "__main__":
       type=int,
       default=20,
       help="Length of the fibonacci sequence to compute.")
+  parser.add_argument(
+      "--ui_type",
+      type=str,
+      default="curses",
+      help="Command-line user interface type (curses | readline)")
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py
index d8195a6847..73d398c086 100644
--- a/tensorflow/python/debug/examples/debug_mnist.py
+++ b/tensorflow/python/debug/examples/debug_mnist.py
@@ -41,11 +41,14 @@ RAND_SEED = 42
 
 def main(_):
   # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+  mnist = input_data.read_data_sets(FLAGS.data_dir,
+                                    one_hot=True,
+                                    fake_data=FLAGS.fake_data)
 
   def feed_dict(train):
-    if train:
-      xs, ys = mnist.train.next_batch(FLAGS.train_batch_size, fake_data=False)
+    if train or FLAGS.fake_data:
+      xs, ys = mnist.train.next_batch(FLAGS.train_batch_size,
+                                      fake_data=FLAGS.fake_data)
     else:
       xs, ys = mnist.test.images, mnist.test.labels
 
@@ -157,6 +160,13 @@ if __name__ == "__main__":
       default="curses",
       help="Command-line user interface type (curses | readline)")
   parser.add_argument(
+      "--fake_data",
+      type="bool",
+      nargs="?",
+      const=True,
+      default=False,
+      help="Use fake MNIST data for unit testing")
+  parser.add_argument(
       "--debug",
       type="bool",
       nargs="?",
diff --git a/tensorflow/python/debug/examples/debug_tflearn_iris.py b/tensorflow/python/debug/examples/debug_tflearn_iris.py
index 009885b9ea..57ebba689d 100644
--- a/tensorflow/python/debug/examples/debug_tflearn_iris.py
+++ b/tensorflow/python/debug/examples/debug_tflearn_iris.py
@@ -80,15 +80,22 @@ def iris_input_fn():
 
 
 def main(_):
-  training_data_path, test_data_path = maybe_download_data(FLAGS.data_dir)
-
   # Load datasets.
-  training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=training_data_path,
-      target_dtype=np.int,
-      features_dtype=np.float32)
-  test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=test_data_path, target_dtype=np.int, features_dtype=np.float32)
+  if FLAGS.fake_data:
+    training_set = tf.contrib.learn.datasets.base.Dataset(
+        np.random.random([120, 4]),
+        np.random.randint(3, size=[120]))
+    test_set = tf.contrib.learn.datasets.base.Dataset(
+        np.random.random([30, 4]),
+        np.random.randint(3, size=[30]))
+  else:
+    training_data_path, test_data_path = maybe_download_data(FLAGS.data_dir)
+    training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
+        filename=training_data_path,
+        target_dtype=np.int,
+        features_dtype=np.float32)
+    test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
+        filename=test_data_path, target_dtype=np.int, features_dtype=np.float32)
 
   # Specify that all features have real-value data
   feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
@@ -102,8 +109,11 @@ def main(_):
       n_classes=3,
       model_dir=model_dir)
 
-  hooks = ([tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type)] if FLAGS.debug
-           else None)
+  hooks = None
+  if FLAGS.debug:
+    debug_hook = tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type)
+    debug_hook.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
+    hooks = [debug_hook]
 
   if not FLAGS.use_experiment:
     # Fit model.
@@ -163,6 +173,13 @@ if __name__ == "__main__":
       default="curses",
       help="Command-line user interface type (curses | readline)")
   parser.add_argument(
+      "--fake_data",
+      type="bool",
+      nargs="?",
+      const=True,
+      default=False,
+      help="Use fake iris data for unit testing")
+  parser.add_argument(
       "--debug",
       type="bool",
       nargs="?",
diff --git a/tensorflow/python/debug/examples/examples_test.sh b/tensorflow/python/debug/examples/examples_test.sh
new file mode 100755
index 0000000000..397078b91d
--- /dev/null
+++ b/tensorflow/python/debug/examples/examples_test.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Bash unit tests for TensorFlow Debugger (tfdbg) Python examples that do not
+# involve downloading data.
+
+set -e
+
+
+DEBUG_FIBONACCI_BIN="$TEST_SRCDIR/org_tensorflow/tensorflow/python/debug/debug_fibonacci"
+
+# Override the default ui_type=curses to allow the test to pass in a tty-less
+# test environment.
+cat << EOF | "${DEBUG_FIBONACCI_BIN}" --ui_type=readline
+run
+exit
+EOF
+
+
+DEBUG_ERRORS_BIN="$TEST_SRCDIR/org_tensorflow/tensorflow/python/debug/debug_errors"
+
+cat << EOF | "${DEBUG_ERRORS_BIN}" --error=no_error --ui_type=readline
+run
+exit
+EOF
+
+
+DEBUG_MNIST_BIN="$TEST_SRCDIR/org_tensorflow/tensorflow/python/debug/debug_mnist"
+
+# Use a large enough "run -t" number to let the process end properly.
+cat << EOF | "${DEBUG_MNIST_BIN}" --debug --fake_data --ui_type=readline
+run -f has_inf_or_nan
+run -t 1000
+EOF
+
+
+DEBUG_TFLEARN_IRIS_BIN="$TEST_SRCDIR/org_tensorflow/tensorflow/python/debug/debug_tflearn_iris"
+
+cat << EOF | "${DEBUG_TFLEARN_IRIS_BIN}" --debug --fake_data --train_steps=2 --ui_type=readline
+run -f has_inf_or_nan
+EOF
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index cda2becc6e..30f0e117e6 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -44,6 +44,28 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook,
 
     self._ui_type = ui_type
     self._wrapper_initialized = False
+    self._pending_tensor_filters = {}
+
+  def add_tensor_filter(self, filter_name, tensor_filter):
+    """Add a tensor filter.
+
+    See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
+    Override default behavior to accommodate the possibility of this method
+    being called prior to the initialization of the underlying
+    `LocalCLIDebugWrapperSession` object.
+
+    Args:
+      filter_name: See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()`
+        for details.
+      tensor_filter: See doc of
+        `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
+    """
+
+    if self._wrapper_initialized:
+      local_cli_wrapper.LocalCLIDebugWrapperSession.add_tensor_filter(
+          self, filter_name, tensor_filter)
+    else:
+      self._pending_tensor_filters[filter_name] = tensor_filter
 
   def begin(self):
     pass
@@ -52,6 +74,13 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook,
     if not self._wrapper_initialized:
       local_cli_wrapper.LocalCLIDebugWrapperSession.__init__(
           self, run_context.session, ui_type=self._ui_type)
+
+      # Actually register tensor filters registered prior to the construction
+      # of the underlying LocalCLIDebugWrapperSession object.
+      for filter_name in self._pending_tensor_filters:
+        local_cli_wrapper.LocalCLIDebugWrapperSession.add_tensor_filter(
+            self, filter_name, self._pending_tensor_filters[filter_name])
+
       self._wrapper_initialized = True
 
     # Increment run call counter.
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 49406eedf3..cc33c20f87 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -476,7 +476,8 @@ def import_scoped_meta_graph(meta_graph_or_file,
             sorted(input_map)):
           raise ValueError("Graph contains unbound inputs: %s. Must "
                            "provide these inputs through input_map." %
-                           ",".join([compat.as_str(v) for v in field.value]))
+                           ",".join([s for s in map(compat.as_str, field.value)
+                                     if not input_map or s not in input_map]))
         break
 
   # Sets graph to default graph if it's not passed in.
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index f4c3dcf99f..13b6923c3c 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -342,6 +342,18 @@ tf_py_test(
 )
 
 tf_py_test(
+    name = "record_input_test",
+    size = "small",
+    srcs = ["record_input_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+tf_py_test(
     name = "io_ops_test",
     size = "small",
     srcs = ["io_ops_test.py"],
diff --git a/tensorflow/python/kernel_tests/argmax_op_test.py b/tensorflow/python/kernel_tests/argmax_op_test.py
index ac9a78d0fa..a5352561aa 100644
--- a/tensorflow/python/kernel_tests/argmax_op_test.py
+++ b/tensorflow/python/kernel_tests/argmax_op_test.py
@@ -90,6 +90,12 @@ class ArgMaxTest(test.TestCase):
             r"Reduction axis 0 is empty in shape \[0\]"):
           op([], 0).eval()
 
+  def testDefaultAxis(self):
+    with self.test_session():
+      for op in math_ops.argmin, math_ops.argmax:
+        ans = op([1]).eval()
+        self.assertAllEqual(ans, 0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index cf88209148..2d116df2ff 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import confusion_matrix
 from tensorflow.python.ops import math_ops
@@ -215,5 +216,239 @@ class ConfusionMatrixTest(test.TestCase):
     self.assertEqual(tf_cm.dtype, np.int64)
 
 
+class RemoveSqueezableDimensionsTest(test.TestCase):
+
+  def testBothScalarShape(self):
+    label_values = 1.0
+    prediction_values = 0.0
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    with self.test_session():
+      self.assertAllEqual(label_values, static_labels.eval())
+      self.assertAllEqual(prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSameShape(self):
+    label_values = np.ones(shape=(2, 3, 1))
+    prediction_values = np.zeros_like(label_values)
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    with self.test_session():
+      self.assertAllEqual(label_values, static_labels.eval())
+      self.assertAllEqual(prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSameShapeExpectedRankDiff0(self):
+    label_values = np.ones(shape=(2, 3, 1))
+    prediction_values = np.zeros_like(label_values)
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values, expected_rank_diff=0))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder, expected_rank_diff=0))
+
+    with self.test_session():
+      self.assertAllEqual(label_values, static_labels.eval())
+      self.assertAllEqual(prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSqueezableLabels(self):
+    label_values = np.ones(shape=(2, 3, 1))
+    prediction_values = np.zeros(shape=(2, 3))
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    expected_label_values = np.reshape(label_values, newshape=(2, 3))
+    with self.test_session():
+      self.assertAllEqual(expected_label_values, static_labels.eval())
+      self.assertAllEqual(prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          expected_label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSqueezableLabelsExpectedRankDiffPlus1(self):
+    label_values = np.ones(shape=(2, 3, 1))
+    prediction_values = np.zeros(shape=(2, 3, 5))
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values, expected_rank_diff=1))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder, expected_rank_diff=1))
+
+    expected_label_values = np.reshape(label_values, newshape=(2, 3))
+    with self.test_session():
+      self.assertAllEqual(expected_label_values, static_labels.eval())
+      self.assertAllEqual(prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          expected_label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSqueezablePredictions(self):
+    label_values = np.ones(shape=(2, 3))
+    prediction_values = np.zeros(shape=(2, 3, 1))
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    expected_prediction_values = np.reshape(prediction_values, newshape=(2, 3))
+    with self.test_session():
+      self.assertAllEqual(label_values, static_labels.eval())
+      self.assertAllEqual(expected_prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          expected_prediction_values,
+          dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSqueezablePredictionsExpectedRankDiffMinus1(self):
+    label_values = np.ones(shape=(2, 3, 5))
+    prediction_values = np.zeros(shape=(2, 3, 1))
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values, expected_rank_diff=-1))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder, expected_rank_diff=-1))
+
+    expected_prediction_values = np.reshape(prediction_values, newshape=(2, 3))
+    with self.test_session():
+      self.assertAllEqual(label_values, static_labels.eval())
+      self.assertAllEqual(expected_prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          expected_prediction_values,
+          dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testUnsqueezableLabels(self):
+    label_values = np.ones(shape=(2, 3, 2))
+    prediction_values = np.zeros(shape=(2, 3))
+    with self.assertRaisesRegexp(ValueError, r"Can not squeeze dim\[2\]"):
+      confusion_matrix.remove_squeezable_dimensions(
+          label_values, prediction_values)
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    with self.test_session():
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError,
+          "Tried to explicitly squeeze dimension 2"):
+        dynamic_labels.eval(feed_dict=feed_dict)
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testUnsqueezablePredictions(self):
+    label_values = np.ones(shape=(2, 3))
+    prediction_values = np.zeros(shape=(2, 3, 2))
+    with self.assertRaisesRegexp(ValueError, r"Can not squeeze dim\[2\]"):
+      confusion_matrix.remove_squeezable_dimensions(
+          label_values, prediction_values)
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    with self.test_session():
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError,
+          "Tried to explicitly squeeze dimension 2"):
+        dynamic_predictions.eval(feed_dict=feed_dict)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index 125d353df3..f3ae092b6f 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -72,7 +72,7 @@ class AbsoluteDifferenceLossTest(test.TestCase):
       self.assertAlmostEqual(5.5 * weights, loss.eval(), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
-    weights = constant_op.constant([1.2, 0.0], shape=[2,])
+    weights = constant_op.constant((1.2, 0.0), shape=(2, 1))
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.test_session():
       self.assertAlmostEqual(5.6, loss.eval(), 3)
@@ -154,7 +154,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
     labels = constant_op.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0]])
-    weights = constant_op.constant([1.2, 3.4, 5.6], shape=[3])
+    weights = constant_op.constant((1.2, 3.4, 5.6))
     with self.test_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
       self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
@@ -296,8 +296,6 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
                                                  constant_op.constant(weights))
       self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
 
-  # TODO(b/33556118): Bug: this should be averaged across all dimensions, not
-  # summed across dim 0.
   def testNonZeroLossWith1DTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -305,25 +303,25 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = 2.3
     with self.test_session():
       loss = losses.sparse_softmax_cross_entropy(
-          labels, logits, constant_op.constant(weights, shape=(1,)))
-      self.assertAlmostEqual(weights * 3.0 * 10.0, loss.eval(), 2)
+          labels, logits, constant_op.constant((weights,)))
+      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
 
   def testNonZeroLossWithPlaceholderForWeights(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0],
                                    [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
     labels = constant_op.constant([[2], [0], [1]])
-    weights = array_ops.placeholder(dtypes.float32, shape=(None,))
+    weights = array_ops.placeholder(dtypes.float32)
     with self.test_session() as sess:
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
       loss_val = sess.run(loss,
-                          feed_dict={weights: [1.2, 3.4, 5.6]})
+                          feed_dict={weights: ((1.2,), (3.4,), (5.6,))})
       self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss_val, 3)
 
   def testNonZeroLossWithPlaceholderForLogitsLabelsAndWeights(self):
     logits = array_ops.placeholder(dtypes.float32, shape=(None, 3))
     labels = array_ops.placeholder(dtypes.int32, shape=(None, 1))
-    weights = array_ops.placeholder(dtypes.float32, shape=(None,))
+    weights = array_ops.placeholder(dtypes.float32)
     with self.test_session() as sess:
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
       loss_val = sess.run(loss,
@@ -332,7 +330,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
                                        [0.0, 10.0, 0.0],
                                        [0.0, 0.0, 10.0]],
                               labels: [[2], [0], [1]],
-                              weights: [1.2, 3.4, 5.6],
+                              weights: ((1.2,), (3.4,), (5.6,)),
                           })
       self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss_val, 3)
 
@@ -340,7 +338,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
     labels = constant_op.constant([[2], [0], [1]])
-    weights = constant_op.constant([1.2, 3.4, 5.6], shape=[3])
+    weights = constant_op.constant([1.2, 3.4, 5.6], shape=(3, 1))
     with self.test_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
       self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
@@ -358,7 +356,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
     labels = constant_op.constant([[2], [0], [1]])
-    weights = constant_op.constant([0, 0, 0], shape=[3])
+    weights = constant_op.constant([0, 0, 0], shape=(3, 1))
     with self.test_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
       self.assertAlmostEqual(0.0, loss.eval(), 3)
@@ -367,7 +365,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
     labels = constant_op.constant([[2], [0], [1]])
-    weights = constant_op.constant([1.2, 0, 0], shape=[3])
+    weights = constant_op.constant([1.2, 0, 0], shape=(3, 1))
     with self.test_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
       self.assertAlmostEqual(12.0, loss.eval(), 3)
@@ -432,9 +430,9 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
                                      [-100.0, -100.0, 100.0, -100.0],
                                      [-100.0, -100.0, -100.0, 100.0]])
       labels = constant_op.constant([[0, 1], [2, 3]])
-      weights = constant_op.constant([1.2, 3.4, 5.6, 7.8])
+      weights = constant_op.constant(1.2)
 
-      with self.assertRaises(errors_impl.InvalidArgumentError):
+      with self.assertRaisesRegexp(ValueError, 'dimension'):
         losses.sparse_softmax_cross_entropy(
             labels, logits, weights=weights).eval()
 
@@ -629,7 +627,7 @@ class LogLossTest(test.TestCase):
                              loss, 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
-    weights = constant_op.constant([1.2, 3.4], shape=[2])
+    weights = constant_op.constant((1.2, 3.4), shape=(2, 1))
     expected_losses = np.multiply(
         self._expected_losses,
         np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
@@ -638,7 +636,7 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(-np.sum(expected_losses) / 6.0, loss.eval(), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeightsSomeZero(self):
-    weights = constant_op.constant([1.2, 0], shape=[2])
+    weights = constant_op.constant((1.2, 0), shape=(2, 1))
     expected_losses = np.multiply(self._expected_losses,
                                   np.asarray([1.2, 1.2, 1.2, 0, 0, 0]).reshape(
                                       (2, 3)))
@@ -797,7 +795,7 @@ class MeanSquaredErrorTest(test.TestCase):
       self.assertAlmostEqual(49.5 * weights, loss.eval(), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
-    weights = constant_op.constant([1.2, 3.4], shape=[2,])
+    weights = constant_op.constant([1.2, 3.4], shape=(2, 1))
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.test_session():
       self.assertAlmostEqual(767.8 / 6.0, loss.eval(), 3)
@@ -855,7 +853,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
             labels=constant_op.constant(self._labels),
             weights=None)
 
-  def _test_mean_pairwise_squared_error(
+  def _test_valid_weights(
       self, labels, predictions, expected_loss, weights=1.0):
     with self.test_session():
       static_inputs_op = losses.mean_pairwise_squared_error(
@@ -881,11 +879,11 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
           expected_loss, dynamic_inputs_op.eval(feed_dict=feed_dict), places=3)
 
   def testAllCorrectNoLossWeight(self):
-    self._test_mean_pairwise_squared_error(
+    self._test_valid_weights(
         self._labels, self._labels, expected_loss=0.0)
 
   def testNonZeroLoss(self):
-    self._test_mean_pairwise_squared_error(
+    self._test_valid_weights(
         self._labels, self._predictions,
         expected_loss=np.sum(self._expected_losses))
 
@@ -916,7 +914,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
 
   def testNonZeroLossWithPythonScalarWeight(self):
     weight = 2.3
-    self._test_mean_pairwise_squared_error(
+    self._test_valid_weights(
         self._labels, self._predictions,
         expected_loss=weight * np.sum(self._expected_losses),
         weights=weight)
@@ -932,16 +930,9 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
                              loss.eval(), 3)
 
   def testNonZeroLossWithScalarZeroWeight(self):
-    self._test_mean_pairwise_squared_error(
+    self._test_valid_weights(
         self._labels, self._predictions, expected_loss=0.0, weights=0.0)
 
-  def testNonZeroLossWithOneDimBatchSpecificWeights(self):
-    weights = np.asarray((1.2, 3.4))
-    self._test_mean_pairwise_squared_error(
-        self._labels, self._predictions,
-        expected_loss=np.sum(np.multiply(weights, self._expected_losses)),
-        weights=weights)
-
   def test3d(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -951,7 +942,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         [[4, 8, 12], [1, 2, 3], [4, 5, 6]],
         [[8, 1, 3], [7, 8, 9], [10, 11, 12]],
     ])
-    self._test_mean_pairwise_squared_error(
+    self._test_valid_weights(
         labels, predictions, expected_loss=122.22222)
 
   def test3dWeightedScalar(self):
@@ -964,11 +955,36 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         [[8, 1, 3], [7, 8, 9], [10, 11, 12]],
     ])
     weight = 3.0
-    self._test_mean_pairwise_squared_error(
+    self._test_valid_weights(
         labels, predictions, expected_loss=weight * 122.22222,
         weights=weight)
 
-  def test3dWeighted2x0(self):
+  def _test_invalid_weights(
+      self, labels, predictions, weights=1.0):
+    expected_error_msg = 'weights can not be broadcast to values'
+
+    # Static check.
+    with self.assertRaisesRegexp(ValueError, expected_error_msg):
+      losses.mean_pairwise_squared_error(
+          predictions=predictions, labels=labels, weights=weights)
+
+    # Dynamic check.
+    predictions_placeholder = array_ops.placeholder(dtypes.float32)
+    labels_placeholder = array_ops.placeholder(dtypes.int32)
+    weights_placeholder = array_ops.placeholder(dtypes.float32)
+    dynamic_inputs_op = losses.mean_pairwise_squared_error(
+        predictions=predictions_placeholder,
+        labels=labels_placeholder,
+        weights=weights_placeholder)
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.OpError, expected_error_msg):
+        dynamic_inputs_op.eval(feed_dict={
+            predictions_placeholder: predictions,
+            labels_placeholder: labels,
+            weights_placeholder: weights,
+        })
+
+  def testInvalid3dWeighted2x0(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
         [[-5, -5, 7], [6, 5, 4], [3, 2, 1]],
@@ -977,11 +993,9 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         [[4, 8, 12], [1, 2, 3], [4, 5, 6]],
         [[8, 1, 3], [7, 8, 9], [10, 11, 12]],
     ])
-    self._test_mean_pairwise_squared_error(
-        labels, predictions, expected_loss=253.24445,
-        weights=np.asarray((1.2, 3.4)))
+    self._test_invalid_weights(
+        labels, predictions, weights=np.asarray((1.2, 3.4)))
 
-  # TODO(ptucker): According to the pydoc, this should work.
   def test3dWeighted2x3x3(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -991,19 +1005,13 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         [[4, 8, 12], [1, 2, 3], [4, 5, 6]],
         [[8, 1, 3], [7, 8, 9], [10, 11, 12]],
     ])
-    with self.assertRaisesRegexp(
-        ValueError, 'Dimensions must be equal, but are 2 and 3'):
-      losses.mean_pairwise_squared_error(
-          predictions=predictions, labels=labels,
-          weights=np.ones((2, 3, 3)))
-
-  def testZeroLossWithOneDimBatchZeroWeights(self):
-    self._test_mean_pairwise_squared_error(
-        self._labels, self._predictions, expected_loss=0.0,
-        weights=np.zeros((2,)))
+    self._test_valid_weights(
+        # TODO(ptucker): This doesn't look right.
+        labels, predictions, expected_loss=9 * 122.22222,
+        weights=np.ones((2, 3, 3)))
 
   def testLossWithAllZeroBatchSpecificWeights(self):
-    self._test_mean_pairwise_squared_error(
+    self._test_valid_weights(
         self._labels, self._predictions, expected_loss=0.0,
         weights=np.zeros((2, 1)))
 
@@ -1071,7 +1079,7 @@ class CosineDistanceLossTest(test.TestCase):
         predictions=constant_op.constant(self._predictions),
         labels=constant_op.constant(self._labels),
         dim=2,
-        weights=constant_op.constant([1, 0, 0]))
+        weights=np.asarray((1, 0, 0)).reshape((3, 1, 1)))
     with self.test_session():
       self.assertEqual(1.0, loss.eval())
 
@@ -1081,21 +1089,10 @@ class CosineDistanceLossTest(test.TestCase):
         labels=constant_op.constant(self._labels),
         dim=2,
         weights=constant_op.constant(
-            [1, 0, 0, 1, 1, 1], shape=(3, 2)))
+            [1, 0, 0, 1, 1, 1], shape=(3, 2, 1)))
     with self.test_session():
       self.assertEqual(3.0 / 4.0, loss.eval())
 
-  def testValueErrorThrownWithShapelessPlaceholder(self):
-    tf_predictions = array_ops.placeholder(dtypes.float32)
-    with self.test_session():
-      with self.assertRaises(ValueError):
-        losses.cosine_distance(
-            predictions=tf_predictions,
-            labels=constant_op.constant(self._labels),
-            dim=2,
-            weights=constant_op.constant(
-                [1, 0, 0, 1, 1, 1], shape=(3, 2)))
-
   def testMeasurementSpecificWeightsWithPlaceholderWithShape(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._labels.shape)
@@ -1104,7 +1101,7 @@ class CosineDistanceLossTest(test.TestCase):
         labels=constant_op.constant(self._labels),
         dim=2,
         weights=constant_op.constant(
-            [1, 0, 0, 1, 1, 1], shape=(3, 2)))
+            [1, 0, 0, 1, 1, 1], shape=(3, 2, 1)))
     with self.test_session() as sess:
       loss = sess.run(loss, feed_dict={tf_predictions: self._predictions})
       self.assertEqual(3.0 / 4.0, loss)
@@ -1114,7 +1111,7 @@ class CosineDistanceLossTest(test.TestCase):
         predictions=constant_op.constant(self._predictions),
         labels=constant_op.constant(self._labels),
         dim=2,
-        weights=array_ops.zeros((3,)))
+        weights=array_ops.zeros((3, 1, 1)))
     with self.test_session():
       self.assertEqual(0, loss.eval())
 
@@ -1123,7 +1120,7 @@ class CosineDistanceLossTest(test.TestCase):
         predictions=constant_op.constant(self._predictions),
         labels=constant_op.constant(self._labels),
         dim=2,
-        weights=array_ops.zeros((3, 2)))
+        weights=array_ops.zeros((3, 2, 1)))
     with self.test_session():
       self.assertEqual(0, loss.eval())
 
@@ -1161,17 +1158,18 @@ class ComputeWeightedLossTest(test.TestCase):
     with ops.Graph().as_default():
       self.assertEqual(0, len(util.get_losses()))
       raw_losses = self._raw_losses
-      shape = self._shape
-      unweighted_losses = (losses.compute_weighted_loss(raw_losses),
-                           losses.compute_weighted_loss(
-                               raw_losses, weights=1.0),
-                           losses.compute_weighted_loss(
-                               raw_losses, weights=np.ones(shape=shape[0:1])),
-                           losses.compute_weighted_loss(
-                               raw_losses, weights=np.ones(shape=shape[0:2])),
-                           losses.compute_weighted_loss(
-                               raw_losses, weights=np.ones(shape=shape)))
-      self.assertEqual(5, len(util.get_losses()))
+      unweighted_losses = (
+          losses.compute_weighted_loss(raw_losses),
+          losses.compute_weighted_loss(raw_losses, weights=np.ones((1, 1, 1))),
+          losses.compute_weighted_loss(raw_losses, weights=np.ones((1, 1, 4))),
+          losses.compute_weighted_loss(raw_losses, weights=np.ones((1, 2, 1))),
+          losses.compute_weighted_loss(raw_losses, weights=np.ones((1, 2, 4))),
+          losses.compute_weighted_loss(raw_losses, weights=np.ones((3, 1, 1))),
+          losses.compute_weighted_loss(raw_losses, weights=np.ones((3, 1, 4))),
+          losses.compute_weighted_loss(raw_losses, weights=np.ones((3, 2, 1))),
+          losses.compute_weighted_loss(raw_losses, weights=np.ones(self._shape))
+      )
+      self.assertEqual(9, len(util.get_losses()))
       with self.test_session():
         for unweighted_loss in unweighted_losses:
           self.assertAllClose(self._unweighted_loss, unweighted_loss.eval())
@@ -1187,215 +1185,114 @@ class ComputeWeightedLossTest(test.TestCase):
         self.assertAllClose(
             np.mean(weight * self._raw_losses), weighted_loss.eval())
 
-  # TODO(b/33556118): Bug: `loss1` should be the same as `testUnweighted`, and
-  # `loss17` should be the same as `testScalarWeight`.
-  def testScalar1DWeight(self):
+  def _test_invalid_weights(self, weights):
     with ops.Graph().as_default():
       self.assertEqual(0, len(util.get_losses()))
-      loss1 = losses.compute_weighted_loss(self._raw_losses, weights=(1.0,))
+      expected_error_msg = 'weights can not be broadcast to values'
+
+      # Static check.
+      with self.assertRaisesRegexp(ValueError, expected_error_msg):
+        losses.compute_weighted_loss(self._raw_losses, weights=weights)
+
+      # Dynamic check.
+      weights_placeholder = array_ops.placeholder(dtypes.float32)
+      weighted_loss = losses.compute_weighted_loss(
+          self._raw_losses, weights=weights_placeholder)
       self.assertEqual(1, len(util.get_losses()))
-      weight = 17.0
-      loss17 = losses.compute_weighted_loss(self._raw_losses, weights=(weight,))
-      self.assertEqual(2, len(util.get_losses()))
       with self.test_session():
-        self.assertAllClose(self._unweighted_loss * self._shape[0],
-                            loss1.eval())
-        self.assertAllClose(
-            np.mean(weight * self._raw_losses) * self._shape[0], loss17.eval())
+        with self.assertRaisesRegexp(errors_impl.OpError, expected_error_msg):
+          weighted_loss.eval(feed_dict={weights_placeholder: weights})
 
-  def testInvalid1DWeight(self):
-    with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, 'Dimensions must be equal'):
-        losses.compute_weighted_loss(self._raw_losses, weights=(17.0, 31.0))
-
-  def testInvalid4DWeight(self):
-    with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, 'Invalid weights shape'):
-        losses.compute_weighted_loss(
-            self._raw_losses, weights=np.zeros(shape=(2, 2, 2, 2)))
+  def testInvalidWeightTooManyDims(self):
+    self._test_invalid_weights(np.zeros(shape=(2, 2, 2, 2)))
 
-  def testInvalid4DWeight2(self):
+  def testInvalidWeightMismatchedDim(self):
     with ops.Graph().as_default():
       raw_losses = array_ops.reshape(self._raw_losses, shape=(3, 2, 4, 1))
       weights = np.ones(shape=(3, 2, 4, 2))
-      with self.assertRaisesRegexp(ValueError, 'Invalid weights shape'):
+      expected_error_msg = 'weights can not be broadcast to values'
+      self.assertEqual(0, len(util.get_losses()))
+
+      # Static check.
+      with self.assertRaisesRegexp(ValueError, expected_error_msg):
         losses.compute_weighted_loss(raw_losses, weights=weights)
 
-  def test3Weight(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weights3 = (17.0, 5.0, 2.0)
+      # Dynamic check.
+      weights_placeholder = array_ops.placeholder(dtypes.float32)
       weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights3)
+          raw_losses, weights=weights_placeholder)
       self.assertEqual(1, len(util.get_losses()))
       with self.test_session():
-        weights3x1x1 = np.reshape(weights3, (3, 1, 1))
-        self.assertAllClose(
-            np.mean(weights3x1x1 * self._raw_losses), weighted_loss.eval())
+        with self.assertRaisesRegexp(errors_impl.OpError, expected_error_msg):
+          weighted_loss.eval(feed_dict={weights_placeholder: weights})
 
-  def test3x1Weight(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weights3x1 = (
-          (17.0,),
-          (5.0,),
-          (2.0,),)
-      weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights3x1)
-      self.assertEqual(1, len(util.get_losses()))
-      with self.test_session():
-        weights3x1x1 = np.reshape(weights3x1, (3, 1, 1))
-        self.assertAllClose(
-            np.mean(weights3x1x1 * self._raw_losses), weighted_loss.eval())
+  def testInvalid3Weight(self):
+    self._test_invalid_weights((17.0, 5.0, 2.0))
 
-  # TODO(ptucker): Bug: this should be the same as `test3x1Weight`.
-  def test3x1x1Weight(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weights3x1x1 = (
-          ((17.0,),),
-          ((5.0,),),
-          ((2.0,),),)
-      weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights3x1x1)
-      self.assertEqual(1, len(util.get_losses()))
-      with self.test_session():
-        self.assertAllClose(
-            np.mean(weights3x1x1 * self._raw_losses) * self._shape[1],
-            weighted_loss.eval())
+  def testInvalid3x1Weight(self):
+    self._test_invalid_weights(((17.0,), (5.0,), (2.0,),))
 
-  def test3x2Weight(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weights3x2 = (
-          (17.0, 3.0),
-          (5.0, 31.0),
-          (2.0, 7.0),)
-      weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights3x2)
-      self.assertEqual(1, len(util.get_losses()))
-      with self.test_session():
-        weights3x2x1 = np.reshape(weights3x2, (3, 2, 1))
-        self.assertAllClose(
-            np.mean(weights3x2x1 * self._raw_losses), weighted_loss.eval())
+  def testInvalid3x2Weight(self):
+    self._test_invalid_weights((
+        (17.0, 3.0),
+        (5.0, 31.0),
+        (2.0, 7.0),))
+
+  def testInvalid1x2Weight(self):
+    self._test_invalid_weights(((17.0, 3.0,),))
 
-  # TODO(b/33556118): Bug: this should be averaged across all dimensions, not
-  # summed across dim 0.
-  def test1x2Weight(self):
+  def testInvalidScalar1DWeight(self):
+    self._test_invalid_weights((17.0,),)
+
+  def _test_valid_weights(self, weights):
     with ops.Graph().as_default():
       self.assertEqual(0, len(util.get_losses()))
-      weights1x2 = ((
-          17.0,
-          3.0,),)
       weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights1x2)
+          self._raw_losses, weights=weights)
       self.assertEqual(1, len(util.get_losses()))
       with self.test_session():
-        weights1x2x1 = np.reshape(weights1x2, (1, 2, 1))
         self.assertAllClose(
-            np.mean(weights1x2x1 * self._raw_losses) * self._shape[0],
+            np.mean(weights * self._raw_losses),
             weighted_loss.eval())
 
-  # TODO(b/33556118): Bug: this should be averaged across all dimensions, not
-  # summed across dim 0.
+  def test1x1x1Weight(self):
+    self._test_valid_weights((((17.0,),),))
+
   def test1x2x1Weight(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weights1x2x1 = ((
-          (17.0,),
-          (3.0,),),)
-      weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights1x2x1)
-      self.assertEqual(1, len(util.get_losses()))
-      with self.test_session():
-        self.assertAllClose(
-            np.mean(weights1x2x1 * self._raw_losses) * self._shape[0],
-            weighted_loss.eval())
+    self._test_valid_weights((((17.0,), (3.0,),),))
 
-  # TODO(b/33556118): Bug: this should be averaged across all dimensions, not
-  # summed across dims 0 & 1.
   def test1x1x4Weight(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weights1x1x4 = (((17.0, 13.0, 2.0, 5.0),),)
-      weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights1x1x4)
-      self.assertEqual(1, len(util.get_losses()))
-      shape = self._shape
-      with self.test_session():
-        self.assertAllClose(
-            np.mean(weights1x1x4 * self._raw_losses) * shape[0] * shape[1],
-            weighted_loss.eval())
+    self._test_valid_weights((((17.0, 13.0, 2.0, 5.0),),))
+
+  def test3x1x1Weight(self):
+    self._test_valid_weights((((17.0,),), ((5.0,),), ((2.0,),),))
 
   def test3x2x1Weight(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weights3x2x1 = (
-          ((17.0,), (3.0,)),
-          ((5.0,), (31.0,)),
-          ((2.0,), (7.0,)),
-      )
-      weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights3x2x1)
-      self.assertEqual(1, len(util.get_losses()))
-      with self.test_session():
-        self.assertAllClose(
-            np.mean(weights3x2x1 * self._raw_losses),
-            weighted_loss.eval())
+    self._test_valid_weights((
+        ((17.0,), (3.0,)),
+        ((5.0,), (31.0,)),
+        ((2.0,), (7.0,)),
+    ))
 
-  # TODO(b/33556118): Bug: this should be averaged across all dimensions, not
-  # summed across dim 1.
   def test3x1x4Weight(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weights3x1x4 = (
-          ((17.0, 13.0, 2.0, 5.0),),
-          ((5.0, 31.0, 17.0, 5.0),),
-          ((7.0, 3.0, 11.0, 5.0),),
-      )
-      weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights3x1x4)
-      self.assertEqual(1, len(util.get_losses()))
-      with self.test_session():
-        self.assertAllClose(
-            np.mean(weights3x1x4 * self._raw_losses) * self._shape[1],
-            weighted_loss.eval())
+    self._test_valid_weights((
+        ((17.0, 13.0, 2.0, 5.0),),
+        ((5.0, 31.0, 17.0, 5.0),),
+        ((7.0, 3.0, 11.0, 5.0),),
+    ))
 
-  # TODO(b/33556118): Bug: this should be averaged across all dimensions, not
-  # summed across dim 0.
   def test1x2x4Weight(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weights1x2x4 = ((
-          (17.0, 13.0, 2.0, 5.0),
-          (3.0, 13.0, 11.0, 2.0),),)
-      weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights1x2x4)
-      self.assertEqual(1, len(util.get_losses()))
-      with self.test_session():
-        self.assertAllClose(
-            np.mean(weights1x2x4 * self._raw_losses) * self._shape[0],
-            weighted_loss.eval())
+    self._test_valid_weights(((
+        (17.0, 13.0, 2.0, 5.0),
+        (3.0, 13.0, 11.0, 2.0),
+    ),))
 
   def test3x2x4Weight(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weights3x2x4 = (
-          (
-              (17.0, 13.0, 2.0, 5.0),
-              (3.0, 13.0, 11.0, 2.0),),
-          (
-              (5.0, 31.0, 17.0, 5.0),
-              (13.0, 3.0, 1.0, 11.0),),
-          (
-              (7.0, 3.0, 11.0, 5.0),
-              (13.0, 11.0, 1.0, 7.0),),)
-      weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights3x2x4)
-      self.assertEqual(1, len(util.get_losses()))
-      with self.test_session():
-        self.assertAllClose(
-            np.mean(weights3x2x4 * self._raw_losses), weighted_loss.eval())
+    self._test_valid_weights((
+        ((17.0, 13.0, 2.0, 5.0), (3.0, 13.0, 11.0, 2.0),),
+        ((5.0, 31.0, 17.0, 5.0), (13.0, 3.0, 1.0, 11.0),),
+        ((7.0, 3.0, 11.0, 5.0), (13.0, 11.0, 1.0, 7.0),),
+    ))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index fc021c897a..4fbde86aec 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -721,15 +722,18 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(1, sess.run(update_op))
       self.assertAlmostEqual(1, precision.eval())
 
-  def testSomeCorrect(self):
-    predictions = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
-    labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    precision, update_op = metrics.precision(labels, predictions)
+  def testSomeCorrect_multipleInputDtypes(self):
+    for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions = math_ops.cast(
+          constant_op.constant([1, 0, 1, 0], shape=(1, 4)), dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant([0, 1, 1, 0], shape=(1, 4)), dtype=dtype)
+      precision, update_op = metrics.precision(labels, predictions)
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.5, update_op.eval())
-      self.assertAlmostEqual(0.5, precision.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertAlmostEqual(0.5, update_op.eval())
+        self.assertAlmostEqual(0.5, precision.eval())
 
   def testWeighted1d(self):
     predictions = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
@@ -885,15 +889,18 @@ class RecallTest(test.TestCase):
       sess.run(update_op)
       self.assertEqual(1, recall.eval())
 
-  def testSomeCorrect(self):
-    predictions = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
-    labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    recall, update_op = metrics.recall(labels, predictions)
+  def testSomeCorrect_multipleInputDtypes(self):
+    for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions = math_ops.cast(
+          constant_op.constant([1, 0, 1, 0], shape=(1, 4)), dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant([0, 1, 1, 0], shape=(1, 4)), dtype=dtype)
+      recall, update_op = metrics.recall(labels, predictions)
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.5, update_op.eval())
-      self.assertAlmostEqual(0.5, recall.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertAlmostEqual(0.5, update_op.eval())
+        self.assertAlmostEqual(0.5, recall.eval())
 
   def testWeighted1d(self):
     predictions = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
@@ -1008,17 +1015,20 @@ class AUCTest(test.TestCase):
 
       self.assertEqual(1, auc.eval())
 
-  def testSomeCorrect(self):
+  def testSomeCorrect_multipleLabelDtypes(self):
     with self.test_session() as sess:
-      predictions = constant_op.constant(
-          [1, 0, 1, 0], shape=(1, 4), dtype=dtypes_lib.float32)
-      labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-      auc, update_op = metrics.auc(labels, predictions)
+      for label_dtype in (
+          dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+        predictions = constant_op.constant(
+            [1, 0, 1, 0], shape=(1, 4), dtype=dtypes_lib.float32)
+        labels = math_ops.cast(
+            constant_op.constant([0, 1, 1, 0], shape=(1, 4)), dtype=label_dtype)
+        auc, update_op = metrics.auc(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.5, sess.run(update_op))
+        sess.run(variables.local_variables_initializer())
+        self.assertAlmostEqual(0.5, sess.run(update_op))
 
-      self.assertAlmostEqual(0.5, auc.eval())
+        self.assertAlmostEqual(0.5, auc.eval())
 
   def testWeighted1d(self):
     with self.test_session() as sess:
@@ -1297,23 +1307,24 @@ class SpecificityAtSensitivityTest(test.TestCase):
       self.assertAlmostEqual(0.6, sess.run(update_op))
       self.assertAlmostEqual(0.6, specificity.eval())
 
-  def testWeighted1d(self):
-    predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.2, 0.2, 0.26, 0.26]
-    labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-    weights_values = [3]
+  def testWeighted1d_multipleLabelDtypes(self):
+    for label_dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.2, 0.2, 0.26, 0.26]
+      labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+      weights_values = [3]
 
-    predictions = constant_op.constant(
-        predictions_values, dtype=dtypes_lib.float32)
-    labels = constant_op.constant(labels_values)
-    weights = constant_op.constant(weights_values)
-    specificity, update_op = metrics.specificity_at_sensitivity(
-        labels, predictions, weights=weights, sensitivity=0.4)
+      predictions = constant_op.constant(
+          predictions_values, dtype=dtypes_lib.float32)
+      labels = math_ops.cast(labels_values, dtype=label_dtype)
+      weights = constant_op.constant(weights_values)
+      specificity, update_op = metrics.specificity_at_sensitivity(
+          labels, predictions, weights=weights, sensitivity=0.4)
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
 
-      self.assertAlmostEqual(0.6, sess.run(update_op))
-      self.assertAlmostEqual(0.6, specificity.eval())
+        self.assertAlmostEqual(0.6, sess.run(update_op))
+        self.assertAlmostEqual(0.6, specificity.eval())
 
   def testWeighted2d(self):
     predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.2, 0.2, 0.26, 0.26]
@@ -1432,22 +1443,24 @@ class SensitivityAtSpecificityTest(test.TestCase):
       self.assertAlmostEqual(0.6, sess.run(update_op))
       self.assertAlmostEqual(0.6, specificity.eval())
 
-  def testWeighted(self):
-    predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
-    labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-    weights_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+  def testWeighted_multipleLabelDtypes(self):
+    for label_dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions_values = [
+          0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+      labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+      weights_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 
-    predictions = constant_op.constant(
-        predictions_values, dtype=dtypes_lib.float32)
-    labels = constant_op.constant(labels_values)
-    weights = constant_op.constant(weights_values)
-    specificity, update_op = metrics.sensitivity_at_specificity(
-        labels, predictions, weights=weights, specificity=0.4)
+      predictions = constant_op.constant(
+          predictions_values, dtype=dtypes_lib.float32)
+      labels = math_ops.cast(labels_values, dtype=label_dtype)
+      weights = constant_op.constant(weights_values)
+      specificity, update_op = metrics.sensitivity_at_specificity(
+          labels, predictions, weights=weights, specificity=0.4)
 
-    with self.test_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.675, sess.run(update_op))
-      self.assertAlmostEqual(0.675, specificity.eval())
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertAlmostEqual(0.675, sess.run(update_op))
+        self.assertAlmostEqual(0.675, specificity.eval())
 
 
 # TODO(nsilberman): Break this up into two sets of tests.
@@ -1536,22 +1549,25 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       self.assertEqual(1, prec.eval())
       self.assertEqual(1, rec.eval())
 
-  def testSomeCorrect(self):
+  def testSomeCorrect_multipleLabelDtypes(self):
     with self.test_session() as sess:
-      predictions = constant_op.constant(
-          [1, 0, 1, 0], shape=(1, 4), dtype=dtypes_lib.float32)
-      labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-      thresholds = [0.5]
-      prec, prec_op = metrics.precision_at_thresholds(labels, predictions,
-                                                      thresholds)
-      rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
-                                                 thresholds)
+      for label_dtype in (
+          dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+        predictions = constant_op.constant(
+            [1, 0, 1, 0], shape=(1, 4), dtype=dtypes_lib.float32)
+        labels = math_ops.cast(
+            constant_op.constant([0, 1, 1, 0], shape=(1, 4)), dtype=label_dtype)
+        thresholds = [0.5]
+        prec, prec_op = metrics.precision_at_thresholds(labels, predictions,
+                                                        thresholds)
+        rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
+                                                   thresholds)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+        sess.run(variables.local_variables_initializer())
+        sess.run([prec_op, rec_op])
 
-      self.assertAlmostEqual(0.5, prec.eval())
-      self.assertAlmostEqual(0.5, rec.eval())
+        self.assertAlmostEqual(0.5, prec.eval())
+        self.assertAlmostEqual(0.5, rec.eval())
 
   def testAllIncorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
diff --git a/tensorflow/python/kernel_tests/record_input_test.py b/tensorflow/python/kernel_tests/record_input_test.py
new file mode 100644
index 0000000000..9b5de4fcdb
--- /dev/null
+++ b/tensorflow/python/kernel_tests/record_input_test.py
@@ -0,0 +1,80 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for record_input_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.platform import test
+
+
+class RecordInputOpTest(test.TestCase):
+
+  def generateTestData(self, prefix, n, m):
+    """Writes `n` files named `<prefix>.<i>`, each holding `m` records."""
+    for i in range(n):
+      f = os.path.join(self.get_temp_dir(), prefix + "." + str(i))
+      w = tf_record.TFRecordWriter(f)
+      for j in range(m):
+        w.write("{0:0{width}}".format(i * m + j, width=10).encode("utf-8"))
+      # Close every writer (not only the last) so each file is flushed.
+      w.close()
+
+  def testRecordInputSimple(self):
+    with self.test_session() as sess:
+      self.generateTestData("basic", 1, 1)
+
+      yield_op = data_flow_ops.RecordInput(
+          file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
+          parallelism=1,
+          buffer_size=1,
+          batch_size=1,
+          name="record_input").get_yield_op()
+
+      self.assertEqual(sess.run(yield_op), b"0000000000")
+
+  def testRecordInputEpochs(self):
+    files = 100
+    records_per_file = 100
+    with self.test_session() as sess:
+      self.generateTestData("basic", files, records_per_file)
+
+      records = data_flow_ops.RecordInput(
+          file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
+          parallelism=2,
+          buffer_size=2000,
+          batch_size=1,
+          shift_ratio=0.33,
+          seed=10,
+          name="record_input")
+
+      yield_op = records.get_yield_op()
+
+      # Cycle over 3 epochs; no record may repeat within a single epoch.
+      for _ in range(3):
+        epoch_set = set()
+        for _ in range(files * records_per_file):
+          r = sess.run(yield_op)
+          self.assertNotIn(r[0], epoch_set)
+          epoch_set.add(r[0])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 74a6052ff6..853b08b2a5 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -121,6 +121,14 @@ class _Layer(object):
     return self._non_trainable_variables if self.trainable else self.variables
 
   @property
+  def trainable_weights(self):
+    return self.trainable_variables
+
+  @property
+  def non_trainable_weights(self):
+    return self.non_trainable_variables
+
+  @property
   def variables(self):
     """Returns the list of all layer variables/weights.
 
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index a476b0f72a..3b96d4362f 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -268,7 +268,7 @@ def conv1d(inputs,
            activity_regularizer=None,
            trainable=True,
            name=None,
-           reuse=False):
+           reuse=None):
   """Functional interface for 1D convolution layer (e.g. temporal convolution).
 
   This layer creates a convolution kernel that is convolved
@@ -435,7 +435,7 @@ def conv2d(inputs,
            activity_regularizer=None,
            trainable=True,
            name=None,
-           reuse=False):
+           reuse=None):
   """Functional interface for the 2D convolution layer.
 
   This layer creates a convolution kernel that is convolved
@@ -608,7 +608,7 @@ def conv3d(inputs,
            activity_regularizer=None,
            trainable=True,
            name=None,
-           reuse=False):
+           reuse=None):
   """Functional interface for the 3D convolution layer.
 
   This layer creates a convolution kernel that is convolved
@@ -867,7 +867,7 @@ def separable_conv2d(inputs,
                      activity_regularizer=None,
                      trainable=True,
                      name=None,
-                     reuse=False):
+                     reuse=None):
   """Functional interface for the depthwise separable 2D convolution layer.
 
   This layer performs a depthwise convolution that acts separately on
@@ -1128,7 +1128,7 @@ def conv2d_transpose(inputs,
                      activity_regularizer=None,
                      trainable=True,
                      name=None,
-                     reuse=False):
+                     reuse=None):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index c47e92c582..1a5fe5c9b7 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -18,11 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import convolutional as conv_layers
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -191,21 +196,45 @@ class ConvTest(test.TestCase):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
     conv_layers.conv2d(images, 32, [3, 3], name='conv1')
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
+    self.assertEqual(len(variables.trainable_variables()), 2)
     conv_layers.conv2d(images, 32, [3, 3], name='conv1', reuse=True)
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
+    self.assertEqual(len(variables.trainable_variables()), 2)
+
+  def testFunctionalConv2DReuseFromScope(self):
+    with variable_scope.variable_scope('scope'):
+      height, width = 7, 9
+      images = random_ops.random_uniform((5, height, width, 3), seed=1)
+      conv_layers.conv2d(images, 32, [3, 3], name='conv1')
+      self.assertEqual(len(variables.trainable_variables()), 2)
+    with variable_scope.variable_scope('scope', reuse=True):
+      conv_layers.conv2d(images, 32, [3, 3], name='conv1')
+      self.assertEqual(len(variables.trainable_variables()), 2)
+
+  def testFunctionalConv2DInitializerFromScope(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          'scope', initializer=init_ops.ones_initializer()):
+        height, width = 7, 9
+        images = random_ops.random_uniform((5, height, width, 3), seed=1)
+        conv_layers.conv2d(images, 32, [3, 3], name='conv1')
+        weights = variables.trainable_variables()
+        # Check the names of weights in order.
+        self.assertTrue('kernel' in weights[0].name)
+        self.assertTrue('bias' in weights[1].name)
+        sess.run(variables.global_variables_initializer())
+        weights = sess.run(weights)
+        # Check that the kernel weights got initialized to ones (from scope)
+        self.assertAllClose(weights[0], np.ones((3, 3, 3, 32)))
+        # Check that the bias still got initialized to zeros.
+        self.assertAllClose(weights[1], np.zeros((32)))
 
   def testFunctionalConv2DNoReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
     conv_layers.conv2d(images, 32, [3, 3])
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
+    self.assertEqual(len(variables.trainable_variables()), 2)
     conv_layers.conv2d(images, 32, [3, 3])
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 4)
+    self.assertEqual(len(variables.trainable_variables()), 4)
 
 
 class SeparableConv2DTest(test.TestCase):
@@ -323,22 +352,48 @@ class SeparableConv2DTest(test.TestCase):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
     conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1')
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 3)
+    self.assertEqual(len(variables.trainable_variables()), 3)
     conv_layers.separable_conv2d(
         images, 32, [3, 3], name='sepconv1', reuse=True)
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 3)
+    self.assertEqual(len(variables.trainable_variables()), 3)
+
+  def testFunctionalConv2DReuseFromScope(self):
+    with variable_scope.variable_scope('scope'):
+      height, width = 7, 9
+      images = random_ops.random_uniform((5, height, width, 3), seed=1)
+      conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1')
+      self.assertEqual(len(variables.trainable_variables()), 3)
+    with variable_scope.variable_scope('scope', reuse=True):
+      conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1')
+      self.assertEqual(len(variables.trainable_variables()), 3)
+
+  def testFunctionalConv2DInitializerFromScope(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          'scope', initializer=init_ops.ones_initializer()):
+        height, width = 7, 9
+        images = random_ops.random_uniform((5, height, width, 3), seed=1)
+        conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1')
+        weights = variables.trainable_variables()
+        # Check the names of weights in order.
+        self.assertTrue('depthwise_kernel' in weights[0].name)
+        self.assertTrue('pointwise_kernel' in weights[1].name)
+        self.assertTrue('bias' in weights[2].name)
+        sess.run(variables.global_variables_initializer())
+        weights = sess.run(weights)
+        # Check that the kernel weights got initialized to ones (from scope)
+        self.assertAllClose(weights[0], np.ones((3, 3, 3, 1)))
+        self.assertAllClose(weights[1], np.ones((1, 1, 3, 32)))
+        # Check that the bias still got initialized to zeros.
+        self.assertAllClose(weights[2], np.zeros((32)))
 
   def testFunctionalConv2DNoReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
     conv_layers.separable_conv2d(images, 32, [3, 3])
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 3)
+    self.assertEqual(len(variables.trainable_variables()), 3)
     conv_layers.separable_conv2d(images, 32, [3, 3])
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 6)
+    self.assertEqual(len(variables.trainable_variables()), 6)
 
   def testSeparableConv2DDepthwiseRegularizer(self):
     height, width = 7, 9
@@ -511,21 +566,45 @@ class Conv2DTransposeTest(test.TestCase):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
     conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1')
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
+    self.assertEqual(len(variables.trainable_variables()), 2)
     conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1', reuse=True)
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
+    self.assertEqual(len(variables.trainable_variables()), 2)
+
+  def testFunctionalConv2DTransposeReuseFromScope(self):
+    with variable_scope.variable_scope('scope'):
+      height, width = 7, 9
+      images = random_ops.random_uniform((5, height, width, 3), seed=1)
+      conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1')
+      self.assertEqual(len(variables.trainable_variables()), 2)
+    with variable_scope.variable_scope('scope', reuse=True):
+      conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1')
+      self.assertEqual(len(variables.trainable_variables()), 2)
+
+  def testFunctionalConv2DTransposeInitializerFromScope(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          'scope', initializer=init_ops.ones_initializer()):
+        height, width = 7, 9
+        images = random_ops.random_uniform((5, height, width, 3), seed=1)
+        conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1')
+        weights = variables.trainable_variables()
+        # Check the names of weights in order.
+        self.assertTrue('kernel' in weights[0].name)
+        self.assertTrue('bias' in weights[1].name)
+        sess.run(variables.global_variables_initializer())
+        weights = sess.run(weights)
+        # Check that the kernel weights got initialized to ones (from scope)
+        self.assertAllClose(weights[0], np.ones((3, 3, 32, 3)))
+        # Check that the bias still got initialized to zeros.
+        self.assertAllClose(weights[1], np.zeros((32)))
 
   def testFunctionalConv2DTransposeNoReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
     conv_layers.conv2d_transpose(images, 32, [3, 3])
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
+    self.assertEqual(len(variables.trainable_variables()), 2)
     conv_layers.conv2d_transpose(images, 32, [3, 3])
-    self.assertEqual(
-        len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 4)
+    self.assertEqual(len(variables.trainable_variables()), 4)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index c662478ccc..92894e1447 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -41,10 +41,12 @@ from tensorflow.python.layers import utils
 class Dense(base._Layer):  # pylint: disable=protected-access
   """Densely-connected layer class.
 
-  This layer implements the operation `outputs = activation(inputs.w + b)`
+  This layer implements the operation:
+  `outputs = activation(inputs.kernel + bias)`
   Where `activation` is the activation function passed as the `activation`
-  argument (if not `None`), `w` is a weights matrix created by the layer,
-  and `b` is a bias vector created by the layer (only if `use_bias` is `True`).
+  argument (if not `None`), `kernel` is a weights matrix created by the layer,
+  and `bias` is a bias vector created by the layer
+  (only if `use_bias` is `True`).
 
   Note: if the input to the layer has a rank greater than 2, then it is
   flattened prior to the initial matrix multiply by `w`.
@@ -54,9 +56,9 @@ class Dense(base._Layer):  # pylint: disable=protected-access
     activation: Activation function (callable). Set it to None to maintain a
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
-    weights_initializer: Initializer function for the weight matrix.
+    kernel_initializer: Initializer function for the weight matrix.
     bias_initializer: Initializer function for the bias.
-    weights_regularizer: Regularizer function for the weight matrix.
+    kernel_regularizer: Regularizer function for the weight matrix.
     bias_regularizer: Regularizer function for the bias.
     activity_regularizer: Regularizer function for the output.
     trainable: Boolean, if `True` also add variables to the graph collection
@@ -70,21 +72,21 @@ class Dense(base._Layer):  # pylint: disable=protected-access
     units: Python integer, dimensionality of the output space.
     activation: Activation function (callable).
     use_bias: Boolean, whether the layer uses a bias.
-    weights_initializer: Initializer instance (or name) for the weight matrix.
+    kernel_initializer: Initializer instance (or name) for the weight matrix.
     bias_initializer: Initializer instance (or name) for the bias.
-    weights_regularizer: Regularizer instance for the weight matrix (callable)
+    kernel_regularizer: Regularizer instance for the weight matrix (callable)
     bias_regularizer: Regularizer instance for the bias (callable).
     activity_regularizer: Regularizer instance for the output (callable)
-    weights: Weight matrix (TensorFlow variable or tensor).
+    kernel: Weight matrix (TensorFlow variable or tensor).
     bias: Bias vector, if applicable (TensorFlow variable or tensor).
   """
 
   def __init__(self, units,
                activation=None,
                use_bias=True,
-               weights_initializer=None,
+               kernel_initializer=None,
                bias_initializer=init_ops.zeros_initializer(),
-               weights_regularizer=None,
+               kernel_regularizer=None,
                bias_regularizer=None,
                activity_regularizer=None,
                trainable=True,
@@ -94,9 +96,9 @@ class Dense(base._Layer):  # pylint: disable=protected-access
     self.units = units
     self.activation = activation
     self.use_bias = use_bias
-    self.weights_initializer = weights_initializer
+    self.kernel_initializer = kernel_initializer
     self.bias_initializer = bias_initializer
-    self.weights_regularizer = weights_regularizer
+    self.kernel_regularizer = kernel_regularizer
     self.bias_regularizer = bias_regularizer
     self.activity_regularizer = activity_regularizer
 
@@ -113,12 +115,12 @@ class Dense(base._Layer):  # pylint: disable=protected-access
     # weight of the layer. If the layer is not trainable
     # (self.trainable = False), the variable will not be added to
     # tf.trainable_variables(), and self.trainable_weights will be empty.
-    self.w = vs.get_variable('weights',
-                             shape=[input_shape[-1].value, self.units],
-                             initializer=self.weights_initializer,
-                             regularizer=self.weights_regularizer,
-                             dtype=self.dtype,
-                             trainable=True)
+    self.kernel = vs.get_variable('kernel',
+                                  shape=[input_shape[-1].value, self.units],
+                                  initializer=self.kernel_initializer,
+                                  regularizer=self.kernel_regularizer,
+                                  dtype=self.dtype,
+                                  trainable=True)
     if self.use_bias:
       self.bias = vs.get_variable('bias',
                                   shape=[self.units,],
@@ -140,7 +142,7 @@ class Dense(base._Layer):  # pylint: disable=protected-access
       output_shape_tensor = array_ops.stack(output_shape_tensors)
       inputs = array_ops.reshape(inputs, [-1, input_dim])
 
-    outputs = standard_ops.matmul(inputs, self.w)
+    outputs = standard_ops.matmul(inputs, self.kernel)
     if self.use_bias:
       outputs = nn.bias_add(outputs, self.bias)
 
@@ -158,20 +160,22 @@ def dense(
     inputs, units,
     activation=None,
     use_bias=True,
-    weights_initializer=None,
+    kernel_initializer=None,
     bias_initializer=init_ops.zeros_initializer(),
-    weights_regularizer=None,
+    kernel_regularizer=None,
     bias_regularizer=None,
     activity_regularizer=None,
     trainable=True,
     name=None,
-    reuse=False):
+    reuse=None):
   """Functional interface for the densely-connected layer.
 
-  This layer implements the operation `outputs = activation(inputs.w + b)`
+  This layer implements the operation:
+  `outputs = activation(inputs.kernel + bias)`
   Where `activation` is the activation function passed as the `activation`
-  argument (if not `None`), `w` is a weights matrix created by the layer,
-  and `b` is a bias vector created by the layer (only if `use_bias` is `True`).
+  argument (if not `None`), `kernel` is a weights matrix created by the layer,
+  and `bias` is a bias vector created by the layer
+  (only if `use_bias` is `True`).
 
   Note: if the `inputs` tensor has a rank greater than 2, then it is
   flattened prior to the initial matrix multiply by `w`.
@@ -182,9 +186,9 @@ def dense(
     activation: Activation function (callable). Set it to None to maintain a
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
-    weights_initializer: Initializer function for the weight matrix.
+    kernel_initializer: Initializer function for the weight matrix.
     bias_initializer: Initializer function for the bias.
-    weights_regularizer: Regularizer function for the weight matrix.
+    kernel_regularizer: Regularizer function for the weight matrix.
     bias_regularizer: Regularizer function for the bias.
     activity_regularizer: Regularizer function for the output.
     trainable: Boolean, if `True` also add variables to the graph collection
@@ -199,9 +203,9 @@ def dense(
   layer = Dense(units,
                 activation=activation,
                 use_bias=use_bias,
-                weights_initializer=weights_initializer,
+                kernel_initializer=kernel_initializer,
                 bias_initializer=bias_initializer,
-                weights_regularizer=weights_regularizer,
+                kernel_regularizer=kernel_regularizer,
                 bias_regularizer=bias_regularizer,
                 activity_regularizer=activity_regularizer,
                 trainable=trainable,
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index c1fbe957df..cfcee7b788 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -39,7 +39,7 @@ class DenseTest(test.TestCase):
     dense = core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')
     self.assertEqual(dense.units, 2)
     self.assertEqual(dense.activation, nn_ops.relu)
-    self.assertEqual(dense.weights_regularizer, None)
+    self.assertEqual(dense.kernel_regularizer, None)
     self.assertEqual(dense.bias_regularizer, None)
     self.assertEqual(dense.activity_regularizer, None)
     self.assertEqual(dense.use_bias, True)
@@ -55,36 +55,37 @@ class DenseTest(test.TestCase):
     dense = core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')
     inputs = random_ops.random_uniform((5, 2), seed=1)
     _ = dense(inputs)
-    self.assertListEqual(dense.variables, [dense.w, dense.bias])
-    self.assertListEqual(dense.trainable_variables, [dense.w, dense.bias])
+    self.assertListEqual(dense.variables, [dense.kernel, dense.bias])
+    self.assertListEqual(dense.trainable_variables, [dense.kernel, dense.bias])
     self.assertListEqual(dense.non_trainable_variables, [])
-    self.assertListEqual(dense._trainable_variables, [dense.w, dense.bias])
+    self.assertListEqual(dense._trainable_variables, [dense.kernel, dense.bias])
     self.assertListEqual(dense._non_trainable_variables, [])
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
-    self.assertEqual(dense.w.name, 'my_dense/weights:0')
+    self.assertEqual(dense.kernel.name, 'my_dense/kernel:0')
     self.assertEqual(dense.bias.name, 'my_dense/bias:0')
 
   def testNoBias(self):
     dense = core_layers.Dense(2, use_bias=False, name='my_dense')
     inputs = random_ops.random_uniform((5, 2), seed=1)
     _ = dense(inputs)
-    self.assertListEqual(dense.variables, [dense.w])
-    self.assertListEqual(dense.trainable_variables, [dense.w])
+    self.assertListEqual(dense.variables, [dense.kernel])
+    self.assertListEqual(dense.trainable_variables, [dense.kernel])
     self.assertListEqual(dense.non_trainable_variables, [])
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 1)
-    self.assertEqual(dense.w.name, 'my_dense/weights:0')
+    self.assertEqual(dense.kernel.name, 'my_dense/kernel:0')
     self.assertEqual(dense.bias, None)
 
   def testNonTrainable(self):
     dense = core_layers.Dense(2, trainable=False, name='my_dense')
     inputs = random_ops.random_uniform((5, 2), seed=1)
     _ = dense(inputs)
-    self.assertListEqual(dense.variables, [dense.w, dense.bias])
-    self.assertListEqual(dense.non_trainable_variables, [dense.w, dense.bias])
+    self.assertListEqual(dense.variables, [dense.kernel, dense.bias])
+    self.assertListEqual(dense.non_trainable_variables,
+                         [dense.kernel, dense.bias])
     self.assertListEqual(dense.trainable_variables, [])
-    self.assertListEqual(dense._trainable_variables, [dense.w, dense.bias])
+    self.assertListEqual(dense._trainable_variables, [dense.kernel, dense.bias])
     self.assertListEqual(dense._non_trainable_variables, [])
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 0)
@@ -149,25 +150,25 @@ class DenseTest(test.TestCase):
     self.assertEqual(len(loss_keys), 1)
     self.assertListEqual(dense.losses, loss_keys)
 
-  def testWeightsRegularizer(self):
+  def testKernelRegularizer(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     dense = core_layers.Dense(
-        2, name='my_dense', weights_regularizer=regularizer)
+        2, name='my_dense', kernel_regularizer=regularizer)
     inputs = random_ops.random_uniform((5, 3), seed=1)
     _ = dense(inputs)
     loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
     self.assertEqual(len(loss_keys), 1)
     self.assertListEqual(dense.losses, loss_keys)
 
-  def testWeightsRegularizerWithReuse(self):
+  def testKernelRegularizerWithReuse(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     inputs = random_ops.random_uniform((5, 3), seed=1)
     _ = core_layers.dense(
-        inputs, 2, name='my_dense', weights_regularizer=regularizer)
+        inputs, 2, name='my_dense', kernel_regularizer=regularizer)
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
     _ = core_layers.dense(
-        inputs, 2, name='my_dense', weights_regularizer=regularizer, reuse=True)
+        inputs, 2, name='my_dense', kernel_regularizer=regularizer, reuse=True)
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
 
@@ -206,6 +207,16 @@ class DenseTest(test.TestCase):
     vars2 = variables.trainable_variables()
     self.assertEqual(vars1, vars2)
 
+  def testFunctionalDenseTwiceReuseFromScope(self):
+    with variable_scope.variable_scope('scope'):
+      inputs = random_ops.random_uniform((5, 3), seed=1)
+      core_layers.dense(inputs, 2, name='my_dense')
+      vars1 = variables.trainable_variables()
+    with variable_scope.variable_scope('scope', reuse=True):
+      core_layers.dense(inputs, 2, name='my_dense')
+      vars2 = variables.trainable_variables()
+    self.assertEqual(vars1, vars2)
+
   def testFunctionalDenseInitializerFromScope(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
@@ -237,17 +248,17 @@ class DenseTest(test.TestCase):
       inputs = random_ops.random_uniform((5, 3), seed=1)
       core_layers.dense(inputs, 2, name='my_dense')
       var = variables.trainable_variables()[0]
-      self.assertEqual(var.name, 'test/my_dense/weights:0')
+      self.assertEqual(var.name, 'test/my_dense/kernel:0')
     with variable_scope.variable_scope('test1') as scope:
       inputs = random_ops.random_uniform((5, 3), seed=1)
       core_layers.dense(inputs, 2, name=scope)
       var = variables.trainable_variables()[2]
-      self.assertEqual(var.name, 'test1/weights:0')
+      self.assertEqual(var.name, 'test1/kernel:0')
     with variable_scope.variable_scope('test2'):
       inputs = random_ops.random_uniform((5, 3), seed=1)
       core_layers.dense(inputs, 2)
       var = variables.trainable_variables()[4]
-      self.assertEqual(var.name, 'test2/dense/weights:0')
+      self.assertEqual(var.name, 'test2/dense/kernel:0')
 
 
 class DropoutTest(test.TestCase):
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index fcbc69f2c5..4a59d77948 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -257,7 +257,7 @@ def batch_normalization(inputs,
                         training=False,
                         trainable=True,
                         name=None,
-                        reuse=False):
+                        reuse=None):
   """Functional interface for the batch normalization layer.
 
   Reference: http://arxiv.org/abs/1502.03167
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index 93efc09ca0..91b7cb6f48 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.layers import normalization as normalization_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
@@ -454,6 +455,20 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.mean(normed_np_output), 0., places=2)
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
+  def testFunctionalReuseFromScope(self):
+    inputs = variables.Variable(
+        np.random.random((5, 4, 3, 6)), dtype=dtypes.float32)
+    epsilon = 1e-3
+    training = array_ops.placeholder(dtype='bool')
+    with variable_scope.variable_scope('scope'):
+      _ = normalization_layers.batch_norm(
+          inputs, axis=-1, momentum=0.9, epsilon=epsilon, training=training)
+      self.assertEqual(len(variables.global_variables()), 5)
+    with variable_scope.variable_scope('scope', reuse=True):
+      _ = normalization_layers.batch_norm(
+          inputs, axis=-1, momentum=0.9, epsilon=epsilon, training=training)
+      self.assertEqual(len(variables.global_variables()), 5)
+
   def testNoCenter(self):
     bn = normalization_layers.BatchNormalization(axis=1, center=False)
     inputs = random_ops.random_uniform((5, 4, 3), seed=1)
diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index 628853545e..95247ea125 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -32,8 +32,19 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 
 
-def remove_squeezable_dimensions(labels, predictions, name=None):
-  """Squeeze last dim if ranks of `predictions` and `labels` differ by 1.
+def remove_squeezable_dimensions(
+    labels, predictions, expected_rank_diff=0, name=None):
+  """Squeeze last dim if ranks differ from expected by exactly 1.
+
+  In the common case where we expect shapes to match, `expected_rank_diff`
+  defaults to 0, and we squeeze the last dimension of the larger rank if they
+  differ by 1.
+
+  But, for example, if `labels` contains class IDs and `predictions` contains 1
+  probability per class, we expect `predictions` to have 1 more dimension than
+  `labels`, so `expected_rank_diff` would be 1. In this case, we'd squeeze
+  `labels` if `rank(predictions) - rank(labels) == 0`, and
+  `predictions` if `rank(predictions) - rank(labels) == 2`.
 
   This will use static shape if available. Otherwise, it will add graph
   operations, which could result in a performance hit.
@@ -41,6 +52,7 @@ def remove_squeezable_dimensions(labels, predictions, name=None):
   Args:
     labels: Label values, a `Tensor` whose dimensions match `predictions`.
     predictions: Predicted values, a `Tensor` of arbitrary dimensions.
+    expected_rank_diff: Expected result of `rank(predictions) - rank(labels)`.
     name: Name of the op.
 
   Returns:
@@ -57,10 +69,10 @@ def remove_squeezable_dimensions(labels, predictions, name=None):
     if (labels_rank is not None) and (predictions_rank is not None):
       # Use static rank.
       rank_diff = predictions_rank - labels_rank
-      if rank_diff == -1:
-        labels = array_ops.squeeze(labels, [-1])
-      elif rank_diff == 1:
+      if rank_diff == expected_rank_diff + 1:
         predictions = array_ops.squeeze(predictions, [-1])
+      elif rank_diff == expected_rank_diff - 1:
+        labels = array_ops.squeeze(labels, [-1])
       return labels, predictions
 
     # Use dynamic rank.
@@ -68,13 +80,13 @@ def remove_squeezable_dimensions(labels, predictions, name=None):
     if (predictions_rank is None) or (
         predictions_shape.dims[-1].is_compatible_with(1)):
       predictions = control_flow_ops.cond(
-          math_ops.equal(1, rank_diff),
+          math_ops.equal(expected_rank_diff + 1, rank_diff),
           lambda: array_ops.squeeze(predictions, [-1]),
           lambda: predictions)
     if (labels_rank is None) or (
         labels_shape.dims[-1].is_compatible_with(1)):
       labels = control_flow_ops.cond(
-          math_ops.equal(-1, rank_diff),
+          math_ops.equal(expected_rank_diff - 1, rank_diff),
           lambda: array_ops.squeeze(labels, [-1]),
           lambda: labels)
     return labels, predictions
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 72f0454e30..037c3a8187 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -1613,3 +1613,65 @@ class StagingArea(object):
       output.set_shape(shape)
 
     return self._get_return_value(ret)
+
+
+class RecordInput(object):
+  """RecordInput asynchronously reads and randomly yields TFRecords.
+
+  A RecordInput Op will continuously read a batch of records asynchronously
+  into a buffer of some fixed capacity. It can also asynchronously yield
+  random records from this buffer.
+
+  It will not start yielding until at least `buffer_size / 2` elements have been
+  placed into the buffer so that sufficient randomization can take place.
+
+  The order the files are read will be shifted each epoch by `shift_ratio` so
+  that the data is presented in a different order every epoch.
+  """
+
+  def __init__(self,
+               file_pattern,
+               batch_size=1,
+               buffer_size=1,
+               parallelism=1,
+               shift_ratio=0,
+               seed=0,
+               name=None):
+    """Constructs a RecordInput Op.
+
+    Args:
+      file_pattern: File path to the dataset, possibly containing wildcards.
+        All matching files will be iterated over each epoch.
+      batch_size: How many records to return at a time.
+      buffer_size: The maximum number of records the buffer will contain.  This
+        _must_ be smaller than the total number of records in an epoch or
+        deadlock can occur.
+      parallelism: How many reader threads to use for reading from files.
+      shift_ratio: What percentage of the total number of files to move the
+        start file forward by each epoch.
+      seed: Specify the random number seed used by the generator that randomizes
+        records.
+      name: Optional name for the operation.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+    """
+
+    self._batch_size = batch_size
+    self._file_pattern = file_pattern
+    self._buffer_size = buffer_size
+    self._parallelism = parallelism
+    self._shift_ratio = shift_ratio
+    self._seed = seed
+    self._name = name
+
+  def get_yield_op(self):
+    """Add a node that yields a minibatch every time it is executed."""
+    return gen_data_flow_ops.record_input(
+        file_pattern=self._file_pattern,
+        file_buffer_size=self._buffer_size,
+        file_parallelism=self._parallelism,
+        file_shuffle_shift_ratio=self._shift_ratio,
+        batch_size=self._batch_size,
+        file_random_seed=self._seed,
+        name=self._name)
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index 4b1b9815ca..16068e57d8 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -237,6 +237,7 @@ Max
 Mean
 Min
 Mul
+Neg
 Pow
 Prod
 Range
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index b6da60770d..c231ca56bb 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -449,6 +449,42 @@ class AdjustSaturationBenchmark(test.Benchmark):
     self._benchmarkAdjustSaturation(test.gpu_device_name(), None)
 
 
+class ResizeBilinearBenchmark(test.Benchmark):
+
+  def _benchmarkResize(self, image_size):
+    # 4D float tensor (10 images per batch, 3 channels per image)
+    img = variables.Variable(
+        random_ops.random_normal([10, image_size[0], image_size[1], 3]),
+        name='img')
+
+    deps = []
+    for _ in xrange(100):
+      with ops.control_dependencies(deps):
+        resize_op = image_ops.resize_bilinear(
+            img, [299, 299], align_corners=False)
+        deps = [resize_op]
+      benchmark_op = control_flow_ops.group(*deps)
+
+    with session.Session() as sess:
+      sess.run(variables.global_variables_initializer())
+      print('Variables initalized for resize_bilinear image size: %s.' %
+            (image_size,))
+      benchmark_values = self.run_op_benchmark(
+          sess,
+          benchmark_op,
+          name=('bilinear_%s_%s' % image_size),)
+      print('Benchmark values:\n%s' % benchmark_values)
+
+  def benchmarkSimilar(self):
+    self._benchmarkResize((183, 229))
+
+  def benchmarkScaleUp(self):
+    self._benchmarkResize((141, 186))
+
+  def benchmarkScaleDown(self):
+    self._benchmarkResize((749, 603))
+
+
 class ResizeBicubicBenchmark(test.Benchmark):
 
   def _benchmarkResize(self, image_size):
diff --git a/tensorflow/python/ops/losses/BUILD b/tensorflow/python/ops/losses/BUILD
index 47d4d594d6..c4ce11ce0f 100644
--- a/tensorflow/python/ops/losses/BUILD
+++ b/tensorflow/python/ops/losses/BUILD
@@ -22,12 +22,15 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:confusion_matrix",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:util",
+        "//tensorflow/python:weights_broadcast_ops",
     ],
 )
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 486e25afc7..89daa9594a 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -20,11 +20,13 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import confusion_matrix
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import util
-from tensorflow.python.platform import tf_logging as logging
 
 
 def _scale_losses(losses, weights):
@@ -46,13 +48,8 @@ def _scale_losses(losses, weights):
     A scalar tf.float32 `Tensor` whose value represents the sum of the scaled
       `losses`.
   """
-  # First, compute the sum of the losses over all elements:
-  start_index = max(0, weights.get_shape().ndims)
-  reduction_indices = list(range(start_index, losses.get_shape().ndims))
-  reduced_losses = math_ops.reduce_sum(losses,
-                                       reduction_indices=reduction_indices)
-  reduced_losses = math_ops.multiply(reduced_losses, weights)
-  return math_ops.reduce_sum(reduced_losses)
+  weighted_losses = math_ops.multiply(losses, weights)
+  return math_ops.reduce_sum(weighted_losses)
 
 
 def _safe_div(numerator, denominator, name="value"):
@@ -117,51 +114,29 @@ def _num_present(losses, weights, per_batch=False):
       `per_batch` is `True`, the value is returned as a tensor of size
       `[batch_size]`. Otherwise, a single scalar tensor is returned.
   """
-  # If weights is a scalar, its easy to compute:
-  if weights.get_shape().ndims == 0:
-    if losses.get_shape().ndims == 0:
-      batch_size = 1
-    else:
-      batch_size = array_ops.reshape(array_ops.slice(array_ops.shape(losses),
-                                                     [0], [1]), [])
-    num_per_batch = math_ops.div(math_ops.to_float(array_ops.size(losses)),
-                                 math_ops.to_float(batch_size))
-    num_per_batch = array_ops.where(math_ops.equal(weights, 0),
-                                    0.0, num_per_batch)
-    num_per_batch = math_ops.multiply(array_ops.ones(
-        array_ops.reshape(batch_size, [1])), num_per_batch)
-    return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch)
-
-  # First, count the number of nonzero weights.
-  if weights.get_shape().ndims >= 1:
-    reduction_indices = list(range(1, weights.get_shape().ndims))
-    num_nonzero_per_batch = math_ops.reduce_sum(
-        math_ops.to_float(math_ops.not_equal(weights, 0)),
-        reduction_indices=reduction_indices)
-
-  # Next, determine the number of elements that weight would broadcast to:
-  broadcast_dims = array_ops.slice(array_ops.shape(losses),
-                                   [weights.get_shape().ndims], [-1])
-  num_to_broadcast = math_ops.to_float(math_ops.reduce_prod(broadcast_dims))
-
-  num_per_batch = math_ops.multiply(num_nonzero_per_batch, num_to_broadcast)
-  return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch)
+  with ops.name_scope(None, "num_present", (losses, weights)) as scope:
+    weights = math_ops.to_float(weights)
+    present = array_ops.where(
+        math_ops.equal(weights, 0.0),
+        array_ops.zeros_like(weights),
+        array_ops.ones_like(weights))
+    present = weights_broadcast_ops.broadcast_weights(present, losses)
+    if per_batch:
+      return math_ops.reduce_sum(
+          present, axis=math_ops.range(1, array_ops.rank(present)),
+          keep_dims=True, name=scope)
+    return math_ops.reduce_sum(present, name=scope)
 
 
 def compute_weighted_loss(
     losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES):
   """Computes the weighted loss.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
-    weights: `Tensor` of shape `[]`, `[batch_size]` or
-      `[batch_size, d1, ... dK]`, where K < N.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `losses`, and must be broadcastable to `losses` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
     scope: the scope for the operations performed in computing the loss.
     loss_collection: the loss will be added to these collections.
 
@@ -173,52 +148,20 @@ def compute_weighted_loss(
       `losses`, or if the number of dimensions (rank) of either `losses` or
       `weights` is missing.
   """
-  with ops.name_scope(scope, "weighted_loss", [losses, weights]):
-    losses = ops.convert_to_tensor(losses)
-    input_dtype = losses.dtype
-    losses = math_ops.to_float(losses)
-    weights = math_ops.to_float(ops.convert_to_tensor(weights))
-
-    losses_shape = losses.get_shape()
-    if losses_shape.ndims is None:
-      raise ValueError("losses.get_shape().ndims cannot be None")
-    weights_shape = weights.get_shape()
-    if weights_shape.ndims is None:
-      raise ValueError("weight.get_shape().ndims cannot be None")
-
-    # TODO(b/33556118): Remove `ndims > 1` check so shapes [] and [1] behave the
-    # same.
-    if weights_shape.ndims > 1 and weights_shape.dims[-1].is_compatible_with(1):
-      weights = array_ops.squeeze(weights, [-1])
-
-    # TODO(b/33556118): Remove this when we require weights shape be either
-    # scalar or the same as losses.
-    weights_dims = weights_shape.as_list()
-    losses_dims = losses_shape.as_list()
-    if len(weights_dims) > len(losses_dims):
-      raise ValueError(
-          "Invalid weights shape %s can not be broadcast to losses %s." % (
-              weights_shape, losses_shape))
-    for i in range(len(weights_dims)):
-      if ((losses_dims[i] is not None) and (losses_dims[i] == 1) and
-          (weights_dims[i] is not None) and (weights_dims[i] != 1)):
-        raise ValueError(
-            "Invalid weights shape %s can not be broadcast to losses %s." % (
-                weights_shape, losses_shape))
-    for i in range(len(weights_dims)):
-      if ((losses_dims[i] is not None) and (losses_dims[i] != 1) and
-          (weights_dims[i] is not None) and (weights_dims[i] == 1)):
-        logging.warn(
-            "WARNING: Weights %s with dimension 1 will result in a sum"
-            ", not average, across dimension %d.", weights_shape, i)
-
-    total_loss = _scale_losses(losses, weights)
-    num_present = _num_present(losses, weights)
-    mean_loss = _safe_mean(total_loss, num_present)
-    # Convert the result back to the input type.
-    mean_loss = math_ops.cast(mean_loss, input_dtype)
-    util.add_loss(mean_loss, loss_collection)
-    return mean_loss
+  with ops.name_scope(scope, "weighted_loss", (losses, weights)):
+    with ops.control_dependencies((
+        weights_broadcast_ops.assert_broadcastable(weights, losses),)):
+      losses = ops.convert_to_tensor(losses)
+      input_dtype = losses.dtype
+      losses = math_ops.to_float(losses)
+      weights = math_ops.to_float(weights)
+      total_loss = _scale_losses(losses, weights)
+      num_present = _num_present(losses, weights)
+      mean_loss = _safe_mean(total_loss, num_present)
+      # Convert the result back to the input type.
+      mean_loss = math_ops.cast(mean_loss, input_dtype)
+      util.add_loss(mean_loss, loss_collection)
+      return mean_loss
 
 
 def absolute_difference(
@@ -234,17 +177,12 @@ def absolute_difference(
   measurable element of `predictions` is scaled by the corresponding value of
   `weights`.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     labels: The ground truth output tensor, same dimensions as 'predictions'.
     predictions: The predicted outputs.
-    weights: Coefficients for the loss a scalar, a tensor of shape
-      `[batch_size]` or a tensor whose shape matches `predictions`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `labels` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which this loss will be added.
 
@@ -272,18 +210,13 @@ def cosine_distance(
   Note that the function assumes that `predictions` and `labels` are already
   unit-normalized.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     labels: `Tensor` whose shape matches 'predictions'
     predictions: An arbitrary matrix.
     dim: The dimension along which the cosine distance is computed.
-    weights: Coefficients for the loss a scalar, a tensor of shape
-      `[batch_size]` or a tensor whose shape matches `predictions`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which this loss will be added.
 
@@ -303,7 +236,7 @@ def cosine_distance(
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
     radial_diffs = math_ops.multiply(predictions, labels)
-    losses = 1 - math_ops.reduce_sum(radial_diffs, reduction_indices=[dim,])
+    losses = 1 - math_ops.reduce_sum(radial_diffs, axis=(dim,), keep_dims=True)
     return compute_weighted_loss(losses, weights, scope, loss_collection)
 
 
@@ -311,18 +244,13 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES):
   """Adds a hinge loss to the training procedure.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     labels: The ground truth output tensor. Its shape should match the shape of
       logits. The values of the tensor are expected to be 0.0 or 1.0.
     logits: The logits, a float tensor.
-    weights: Coefficients for the loss a scalar, a tensor of shape
-      `[batch_size]` or a tensor whose shape matches `predictions`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `labels` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
 
@@ -356,17 +284,12 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
   measurable element of `predictions` is scaled by the corresponding value of
   `weights`.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     labels: The ground truth output tensor, same dimensions as 'predictions'.
     predictions: The predicted outputs.
-    weights: Coefficients for the loss a scalar, a tensor of shape
-      `[batch_size]` or a tensor whose shape matches `predictions`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `labels` dimension).
     epsilon: A small increment to add to avoid taking a log of zero.
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
@@ -434,41 +357,39 @@ def mean_pairwise_squared_error(labels, predictions, weights=1.0, scope=None,
   """
   with ops.name_scope(scope, "mean_pairwise_squared_error",
                       (predictions, labels, weights)) as scope:
-    predictions = math_ops.to_float(predictions)
+    weights = math_ops.to_float(weights)
     labels = math_ops.to_float(labels)
-    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    weights = math_ops.to_float(ops.convert_to_tensor(weights))
-
-    diffs = math_ops.subtract(predictions, labels)
+    with ops.control_dependencies((
+        weights_broadcast_ops.assert_broadcastable(weights, labels),)):
+      predictions = math_ops.to_float(predictions)
+      predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
-    # Need to verify here since the function doesn't use compute_weighted_loss
-    if diffs.get_shape().ndims is None:
-      raise ValueError("diffs.get_shape().ndims cannot be None")
-    if weights.get_shape().ndims is None:
-      raise ValueError("weights.get_shape().ndims cannot be None")
+      diffs = math_ops.subtract(predictions, labels)
 
-    reduction_indices = list(range(1, diffs.get_shape().ndims))
+      reduction_indices = math_ops.range(1, array_ops.rank(diffs))
 
-    sum_squares_diff_per_batch = math_ops.reduce_sum(
-        math_ops.square(diffs),
-        reduction_indices=reduction_indices)
-    num_present_per_batch = _num_present(diffs, weights, per_batch=True)
+      sum_squares_diff_per_batch = math_ops.reduce_sum(
+          math_ops.square(diffs), reduction_indices=reduction_indices,
+          keep_dims=True)
+      num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
-    term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                            num_present_per_batch)
+      term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
+                              num_present_per_batch)
 
-    sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices)
-    term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                            math_ops.square(num_present_per_batch))
+      sum_diff = math_ops.reduce_sum(
+          diffs, reduction_indices=reduction_indices, keep_dims=True)
+      term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
+                              math_ops.square(num_present_per_batch))
 
-    loss = _scale_losses(term1 - term2, weights)
+      loss = _scale_losses(term1 - term2, weights)
 
-    mean_loss = array_ops.where(math_ops.reduce_sum(num_present_per_batch) > 0,
-                                loss,
-                                array_ops.zeros_like(loss),
-                                name="value")
-    util.add_loss(mean_loss, loss_collection)
-    return mean_loss
+      mean_loss = array_ops.where(
+          math_ops.reduce_sum(num_present_per_batch) > 0,
+          loss,
+          array_ops.zeros_like(loss),
+          name="value")
+      util.add_loss(mean_loss, loss_collection)
+      return mean_loss
 
 
 def mean_squared_error(labels, predictions, weights=1.0, scope=None,
@@ -483,17 +404,12 @@ def mean_squared_error(labels, predictions, weights=1.0, scope=None,
   measurable element of `predictions` is scaled by the corresponding value of
   `weights`.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     labels: The ground truth output tensor, same dimensions as 'predictions'.
     predictions: The predicted outputs.
-    weights: Coefficients for the loss a scalar, a tensor of shape
-      `[batch_size]` or a tensor whose shape matches `predictions`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `labels` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
 
@@ -523,12 +439,6 @@ def sigmoid_cross_entropy(
   tensor of shape `[batch_size]`, then the loss weights apply to each
   corresponding sample.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   If `label_smoothing` is nonzero, smooth the labels towards 1/2:
 
       new_multiclass_labels = multiclass_labels * (1 - label_smoothing)
@@ -538,8 +448,9 @@ def sigmoid_cross_entropy(
     multi_class_labels: `[batch_size, num_classes]` target integer labels in
       `(0, 1)`.
     logits: `[batch_size, num_classes]` logits outputs of the network.
-    weights: Coefficients for the loss. This must be of shape `[]`,
-      `[batch_size]` or `[batch_size, num_classes]`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
     label_smoothing: If greater than `0` then smooth the labels.
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
@@ -578,12 +489,6 @@ def softmax_cross_entropy(
   tensor of shape `[batch_size]`, then the loss weights apply to each
   corresponding sample.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   If `label_smoothing` is nonzero, smooth the labels towards 1/num_classes:
       new_onehot_labels = onehot_labels * (1 - label_smoothing)
                           + label_smoothing / num_classes
@@ -591,8 +496,10 @@ def softmax_cross_entropy(
   Args:
     onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
     logits: [batch_size, num_classes] logits outputs of the network .
-    weights: Coefficients for the loss. This must be of shape `[]`,
-      `[batch_size]` or `[batch_size, num_classes]`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `onehot_labels`, and must be broadcastable to `onehot_labels` (i.e., all
+      dimensions must be either `1`, or the same as the corresponding `losses`
+      dimension).
     label_smoothing: If greater than 0 then smooth the labels.
     scope: the scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
@@ -623,6 +530,57 @@ def softmax_cross_entropy(
     return compute_weighted_loss(losses, weights, scope, loss_collection)
 
 
+# TODO(ptucker): Merge this with similar method in metrics_impl.
+def _remove_squeezable_dimensions(
+    labels, predictions, weights=None, expected_rank_diff=0):
+  """Internal version of `remove_squeezable_dimensions` which handles weights.
+
+  Squeezes `predictions` and `labels` if their ranks differ from expected by
+  exactly 1.
+  Squeezes `weights` if its rank is 1 more than the new rank of `labels`.
+
+  This will use static shape if available. Otherwise, it will add graph
+  operations, which could result in a performance hit.
+
+  Args:
+    labels: Label values, a `Tensor` whose dimensions match `predictions`.
+    predictions: Predicted values, a `Tensor` of arbitrary dimensions.
+    weights: Optional weight `Tensor`. It will be squeezed if it's not scalar,
+      and its rank is 1 more than the new rank of `labels`.
+    expected_rank_diff: Expected result of `rank(predictions) - rank(labels)`.
+
+  Returns:
+    Tuple of `labels`, `predictions` and `weights`, possibly with the last
+    dimension squeezed.
+  """
+  labels, predictions = confusion_matrix.remove_squeezable_dimensions(
+      labels, predictions, expected_rank_diff=expected_rank_diff)
+
+  if weights is not None:
+    weights = ops.convert_to_tensor(weights)
+    labels_rank = labels.get_shape().ndims
+    weights_shape = weights.get_shape()
+    weights_rank = weights_shape.ndims
+
+    if (labels_rank is not None) and (weights_rank is not None):
+      # Use static rank.
+      rank_diff = weights_rank - labels_rank
+      if rank_diff == 1:
+        weights = array_ops.squeeze(weights, [-1])
+      return labels, predictions, weights
+
+    # Use dynamic rank.
+    rank_diff = array_ops.rank(weights) - array_ops.rank(labels)
+    if (weights_rank is None) or (
+        weights_shape.dims[-1].is_compatible_with(1)):
+      weights = control_flow_ops.cond(
+          math_ops.equal(1, rank_diff),
+          lambda: array_ops.squeeze(weights, [-1]),
+          lambda: weights)
+
+  return labels, predictions, weights
+
+
 def sparse_softmax_cross_entropy(labels, logits, weights=1.0, scope=None,
                                  loss_collection=ops.GraphKeys.LOSSES):
   """Cross-entropy loss using `tf.nn.sparse_softmax_cross_entropy_with_logits`.
@@ -632,18 +590,16 @@ def sparse_softmax_cross_entropy(labels, logits, weights=1.0, scope=None,
   tensor of shape [`batch_size`], then the loss weights apply to each
   corresponding sample.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
-    labels: [batch_size, 1] or [batch_size] target labels of dtype `int32` or
-      `int64` in the range `[0, num_classes)`.
-    logits: [batch_size, num_classes] logits outputs of the network .
-    weights: Coefficients for the loss. This must be of shape `[batch_size]` or
-      `[batch_size, 1]`.
+    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
+      `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
+      must be an index in `[0, num_classes)`. Other values will raise an
+      exception when this op is run on CPU, and return `NaN` for corresponding
+      loss and gradient rows on GPU.
+    logits: Unscaled log probabilities of shape
+      `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float32` or `float64`.
+    weights: Coefficients for the loss. This must be scalar or of same rank as
+      `labels`.
     scope: the scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
 
@@ -655,12 +611,13 @@ def sparse_softmax_cross_entropy(labels, logits, weights=1.0, scope=None,
       if `weights` is None.
   """
   with ops.name_scope(scope, "sparse_softmax_cross_entropy_loss",
-                      [logits, labels, weights]) as scope:
-    labels = array_ops.reshape(labels, shape=[array_ops.shape(labels)[0]])
-
+                      (logits, labels, weights)) as scope:
+    # As documented above in Args, labels contain class IDs and logits contain
+    # 1 logit per class ID, so we expect rank(logits) - rank(labels) == 1;
+    # therefore, expected_rank_diff=1.
+    labels, logits, weights = _remove_squeezable_dimensions(
+        labels, logits, weights, expected_rank_diff=1)
     losses = nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                          logits=logits,
                                                          name="xentropy")
-    # Reshape losses to [batch_size, 1] to be consistent with weights.
-    losses = array_ops.reshape(losses, shape=[array_ops.shape(losses)[0], 1])
     return compute_weighted_loss(losses, weights, scope, loss_collection)
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index c9ad0936a5..11e7d8382f 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -260,6 +260,8 @@ def argmax(input, axis=None, name=None, dimension=None):
     if axis is not None:
       raise ValueError("Cannot specify both 'axis' and 'dimension'")
     axis = dimension
+  elif axis is None:
+    axis = 0
   return gen_math_ops.arg_max(input, axis, name)
 
 
@@ -273,6 +275,8 @@ def argmin(input, axis=None, name=None, dimension=None):
     if axis is not None:
       raise ValueError("Cannot specify both 'axis' and 'dimension'")
     axis = dimension
+  elif axis is None:
+    axis = 0
   return gen_math_ops.arg_min(input, axis, name)
 
 
@@ -399,11 +403,11 @@ def negative(x, name=None):
   """
   with ops.name_scope(name, "Neg", [x]) as name:
     if isinstance(x, sparse_tensor.SparseTensor):
-      x_neg = gen_math_ops.neg(x.values, name=name)
+      x_neg = gen_math_ops._neg(x.values, name=name)
       return sparse_tensor.SparseTensor(
           indices=x.indices, values=x_neg, dense_shape=x.dense_shape)
     else:
-      return gen_math_ops.neg(x, name=name)
+      return gen_math_ops._neg(x, name=name)
 # pylint: enable=g-docstring-has-escape
 
 
@@ -857,7 +861,7 @@ def to_bfloat16(x, name="ToBFloat16"):
   return cast(x, dtypes.bfloat16, name=name)
 
 
-ops.Tensor._override_operator("__neg__", gen_math_ops.neg)
+ops.Tensor._override_operator("__neg__", gen_math_ops._neg)
 ops.Tensor._override_operator("__abs__", abs)
 # __invert__ corresponds to the ~ operator.  Here we follow the numpy convention
 # ~ marks an elementwise bit-wise inverse.  This is only implemented for boolean
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index a00625d083..0a109eb99b 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -87,7 +87,7 @@ def _remove_squeezable_dimensions(labels, predictions, weights):
         weights = array_ops.squeeze(weights, [-1])
     elif (weights_rank is None) or (
         weights_shape.dims[-1].is_compatible_with(1)):
-      # Use dynamic rank
+      # Use dynamic rank.
       weights = control_flow_ops.cond(
           math_ops.equal(array_ops.rank(weights),
                          math_ops.add(array_ops.rank(predictions), 1)),
@@ -354,8 +354,8 @@ def _confusion_matrix_at_thresholds(
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    labels: A `Tensor` whose shape matches `predictions`. `labels` will be cast
-      to `bool`.
+    labels: A `Tensor` whose shape matches `predictions`. Will be cast to
+      `bool`.
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
     thresholds: A python list or tuple of float thresholds in `[0, 1]`.
@@ -384,6 +384,8 @@ def _confusion_matrix_at_thresholds(
       if include not in all_includes:
         raise ValueError('Invaild key: %s.' % include)
 
+  labels = math_ops.cast(labels, dtype=dtypes.bool)
+  predictions = math_ops.to_float(predictions)
   labels, predictions, weights = _remove_squeezable_dimensions(
       labels, predictions, weights)
   predictions.get_shape().assert_is_compatible_with(labels.get_shape())
@@ -503,7 +505,8 @@ def auc(labels, predictions, weights=None, num_thresholds=200,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    labels: A `bool` `Tensor` whose shape matches `predictions`.
+    labels: A `Tensor` whose shape matches `predictions`. Will be cast to
+      `bool`.
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
@@ -1101,10 +1104,10 @@ def true_positives(labels, predictions, weights=None,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
-      match `predictions`.
-    predictions: The predicted values, a `bool` `Tensor` of arbitrary
-      dimensions.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
+    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
+      be cast to `bool`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
       be either `1`, or the same as the corresponding `labels` dimension).
@@ -1127,11 +1130,11 @@ def true_positives(labels, predictions, weights=None,
   with variable_scope.variable_scope(
       name, 'true_positives', (predictions, labels, weights)):
 
-    predictions = ops.convert_to_tensor(predictions)
-    labels = ops.convert_to_tensor(labels)
+    labels = math_ops.cast(labels, dtype=dtypes.bool)
+    predictions = math_ops.cast(predictions, dtype=dtypes.bool)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    is_true_positive = math_ops.logical_and(math_ops.equal(labels, 1),
-                                            math_ops.equal(predictions, 1))
+    is_true_positive = math_ops.logical_and(math_ops.equal(labels, True),
+                                            math_ops.equal(predictions, True))
     return _count_condition(is_true_positive, weights, metrics_collections,
                             updates_collections)
 
@@ -1145,10 +1148,10 @@ def false_positives(labels, predictions, weights=None,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
-      match `predictions`.
-    predictions: The predicted values, a `bool` `Tensor` of arbitrary
-      dimensions.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
+    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
+      be cast to `bool`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
       be either `1`, or the same as the corresponding `labels` dimension).
@@ -1171,11 +1174,11 @@ def false_positives(labels, predictions, weights=None,
   with variable_scope.variable_scope(
       name, 'false_positives', (predictions, labels, weights)):
 
-    predictions = ops.convert_to_tensor(predictions)
-    labels = ops.convert_to_tensor(labels)
+    labels = math_ops.cast(labels, dtype=dtypes.bool)
+    predictions = math_ops.cast(predictions, dtype=dtypes.bool)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    is_false_positive = math_ops.logical_and(math_ops.equal(labels, 0),
-                                             math_ops.equal(predictions, 1))
+    is_false_positive = math_ops.logical_and(math_ops.equal(labels, False),
+                                             math_ops.equal(predictions, True))
     return _count_condition(is_false_positive, weights, metrics_collections,
                             updates_collections)
 
@@ -1199,9 +1202,10 @@ def precision(labels, predictions, weights=None,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
-      match `predictions`.
-    predictions: The predicted values, a `bool` `Tensor` of arbitrary shape.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
+    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
+      be cast to `bool`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
       be either `1`, or the same as the corresponding `labels` dimension).
@@ -1227,6 +1231,8 @@ def precision(labels, predictions, weights=None,
   with variable_scope.variable_scope(
       name, 'precision', (predictions, labels, weights)):
 
+    labels = math_ops.cast(labels, dtype=dtypes.bool)
+    predictions = math_ops.cast(predictions, dtype=dtypes.bool)
     labels, predictions, weights = _remove_squeezable_dimensions(
         labels, predictions, weights)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
@@ -1279,7 +1285,8 @@ def precision_at_thresholds(labels, predictions, thresholds,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    labels: A `bool` `Tensor` whose shape matches `predictions`.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
     thresholds: A python list or tuple of float thresholds in `[0, 1]`.
@@ -1336,10 +1343,10 @@ def false_negatives(labels, predictions, weights=None,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
-      match `predictions`.
-    predictions: The predicted values, a `bool` `Tensor` of arbitrary
-      dimensions.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
+    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
+      be cast to `bool`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
       be either `1`, or the same as the corresponding `labels` dimension).
@@ -1361,11 +1368,11 @@ def false_negatives(labels, predictions, weights=None,
   with variable_scope.variable_scope(
       name, 'false_negatives', (predictions, labels, weights)):
 
-    predictions = ops.convert_to_tensor(predictions)
-    labels = ops.convert_to_tensor(labels)
+    labels = math_ops.cast(labels, dtype=dtypes.bool)
+    predictions = math_ops.cast(predictions, dtype=dtypes.bool)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    is_false_negative = math_ops.logical_and(math_ops.equal(labels, 1),
-                                             math_ops.equal(predictions, 0))
+    is_false_negative = math_ops.logical_and(math_ops.equal(labels, True),
+                                             math_ops.equal(predictions, False))
     return _count_condition(is_false_negative, weights, metrics_collections,
                             updates_collections)
 
@@ -1387,9 +1394,10 @@ def recall(labels, predictions, weights=None,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
-      match `predictions`.
-    predictions: The predicted values, a `bool` `Tensor` of arbitrary shape.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
+    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
+      be cast to `bool`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
       be either `1`, or the same as the corresponding `labels` dimension).
@@ -1414,6 +1422,8 @@ def recall(labels, predictions, weights=None,
   """
   with variable_scope.variable_scope(
       name, 'recall', (predictions, labels, weights)):
+    labels = math_ops.cast(labels, dtype=dtypes.bool)
+    predictions = math_ops.cast(predictions, dtype=dtypes.bool)
     labels, predictions, weights = _remove_squeezable_dimensions(
         labels, predictions, weights)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
@@ -1817,7 +1827,8 @@ def recall_at_thresholds(labels, predictions, thresholds,
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
   Args:
-    labels: A `bool` `Tensor` whose shape matches `predictions`.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
     thresholds: A python list or tuple of float thresholds in `[0, 1]`.
@@ -1952,7 +1963,8 @@ def sensitivity_at_specificity(
   following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
 
   Args:
-    labels: A `bool` `Tensor` whose shape matches `predictions`.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
     specificity: A scalar value in range `[0, 1]`.
@@ -2515,7 +2527,8 @@ def specificity_at_sensitivity(
   following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
 
   Args:
-    labels: A `bool` `Tensor` whose shape matches `predictions`.
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
     sensitivity: A scalar value in range `[0, 1]`.
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 9ad2bf998b..344a592106 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1663,13 +1663,13 @@ def sparse_softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=
 
   Args:
     _sentinel: Used to prevent positional parameters. Internal, do not use.
-    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
-      `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
-      Other values will raise an exception when this op is run on CPU, and
-      return `NaN` for corresponding corresponding loss and gradient rows
-      on GPU.
-    logits: Unscaled log probabilities of rank `r` and shape
-      `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
+    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
+      `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
+      must be an index in `[0, num_classes)`. Other values will raise an
+      exception when this op is run on CPU, and return `NaN` for corresponding
+      loss and gradient rows on GPU.
+    logits: Unscaled log probabilities of shape
+      `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float32` or `float64`.
     name: A name for the operation (optional).
 
   Returns:
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index bdd59eeb6b..0c266770ab 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -44,6 +44,7 @@ from google.protobuf import text_format
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import importer
@@ -67,6 +68,8 @@ def freeze_graph(input_graph,
                  variable_names_blacklist=""):
   """Converts all variables in a graph and checkpoint into constants."""
 
+  del restore_op_name, filename_tensor_name  # Unused by updated loading code.
+
   if not gfile.Exists(input_graph):
     print("Input graph file '" + input_graph + "' does not exist!")
     return -1
@@ -96,6 +99,7 @@ def freeze_graph(input_graph,
   if clear_devices:
     for node in input_graph_def.node:
       node.device = ""
+
   _ = importer.import_graph_def(input_graph_def, name="")
 
   with session.Session() as sess:
@@ -109,7 +113,19 @@ def freeze_graph(input_graph,
         saver = saver_lib.Saver(saver_def=saver_def)
         saver.restore(sess, input_checkpoint)
     else:
-      sess.run([restore_op_name], {filename_tensor_name: input_checkpoint})
+      var_list = {}
+      reader = pywrap_tensorflow.NewCheckpointReader(input_checkpoint)
+      var_to_shape_map = reader.get_variable_to_shape_map()
+      for key in var_to_shape_map:
+        try:
+          tensor = sess.graph.get_tensor_by_name(key + ":0")
+        except KeyError:
+          # This tensor doesn't exist in the graph (for example it's
+          # 'global_step' or a similar housekeeping element) so skip it.
+          continue
+        var_list[key] = tensor
+      saver = saver_lib.Saver(var_list=var_list)
+      saver.restore(sess, input_checkpoint)
       if initializer_nodes:
         sess.run(initializer_nodes)
 
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index b7dde8aa69..cda4fedec7 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -122,7 +122,8 @@ class LoggingTensorHook(session_run_hook.SessionRunHook):
   The tensors will be printed to the log, with `INFO` severity.
   """
 
-  def __init__(self, tensors, every_n_iter=None, every_n_secs=None):
+  def __init__(self, tensors, every_n_iter=None, every_n_secs=None,
+               formatter=None):
     """Initializes a LoggingHook monitor.
 
     Args:
@@ -133,6 +134,8 @@ class LoggingTensorHook(session_run_hook.SessionRunHook):
       every_n_secs: `int` or `float`, print the values of `tensors` once every N
           seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
           provided.
+      formatter: function, takes dict of `tag`->tensor value, returns a string.
+          If `None`, uses the default format, which prints all tensors.
 
     Raises:
       ValueError: if `every_n_iter` is non-positive.
@@ -143,8 +146,12 @@ class LoggingTensorHook(session_run_hook.SessionRunHook):
     if every_n_iter is not None and every_n_iter <= 0:
       raise ValueError("invalid every_n_iter=%s." % every_n_iter)
     if not isinstance(tensors, dict):
+      self._tag_order = tensors
       tensors = {item: item for item in tensors}
+    else:
+      self._tag_order = tensors.keys()
     self._tensors = tensors
+    self._formatter = formatter
     self._timer = SecondOrStepTimer(every_secs=every_n_secs,
                                     every_steps=every_n_iter)
 
@@ -164,11 +171,27 @@ class LoggingTensorHook(session_run_hook.SessionRunHook):
   def after_run(self, run_context, run_values):
     _ = run_context
     if self._should_trigger:
-      stats = []
-      for tag in self._current_tensors.keys():
-        stats.append("%s = %s" % (tag, run_values.results[tag]))
-      logging.info("%s", ", ".join(stats))
-      self._timer.update_last_triggered_step(self._iter_count)
+      # Temporarily disable scientific notation for logged numpy values.
+      original = np.get_printoptions()
+      np.set_printoptions(suppress=True)
+      try:
+        elapsed_secs, _ = self._timer.update_last_triggered_step(
+            self._iter_count)
+        if self._formatter:
+          logging.info(self._formatter(run_values.results))
+        else:
+          stats = []
+          for tag in self._tag_order:
+            stats.append("%s = %s" % (tag, run_values.results[tag]))
+          if elapsed_secs is not None:
+            logging.info("%s (%.3f sec)", ", ".join(stats), elapsed_secs)
+          else:
+            # The timer reported no elapsed time (presumably the first
+            # trigger -- confirm against the timer), so omit the "%.3f" field.
+            logging.info("%s", ", ".join(stats))
+      finally:
+        # Restore the caller's print options even if logging raised.
+        np.set_printoptions(**original)
     self._iter_count += 1
 
 
@@ -647,6 +660,22 @@ class FinalOpsHook(session_run_hook.SessionRunHook):
                                            feed_dict=self._final_ops_feed_dict)
 
 
+class FeedFnHook(session_run_hook.SessionRunHook):
+  """Hook that calls `feed_fn` before each run and feeds the returned dict."""
+
+  def __init__(self, feed_fn):
+    """Constructs the FeedFnHook with given `feed_fn`.
+
+    Args:
+      feed_fn: callable with no arguments; returns the `feed_dict` for a run.
+    """
+    self.feed_fn = feed_fn
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return session_run_hook.SessionRunArgs(
+        fetches=None, feed_dict=self.feed_fn())
+
+
 def _as_graph_element(obj):
   """Retrieves Graph element."""
   graph = ops.get_default_graph()
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index babc651e6c..6c2945396f 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -251,6 +251,19 @@ class LoggingTensorHookTest(test.TestCase):
       mon_sess.run(train_op)
       self.assertRegexpMatches(str(self.logged_message), t.name)
 
+  def test_print_formatter(self):
+    with ops.Graph().as_default(), session_lib.Session() as sess:
+      t = constant_op.constant(42.0, name='foo')
+      train_op = constant_op.constant(3)
+      hook = basic_session_run_hooks.LoggingTensorHook(
+          tensors=[t.name], every_n_iter=10,
+          formatter=lambda items: 'qqq=%s' % items[t.name])
+      hook.begin()
+      mon_sess = monitored_session._HookedSession(sess, [hook])
+      sess.run(variables_lib.global_variables_initializer())
+      mon_sess.run(train_op)
+      self.assertEqual(self.logged_message[0], 'qqq=42.0')
+
 
 class CheckpointSaverHookTest(test.TestCase):
 
@@ -820,5 +833,18 @@ class FinalOpsHookTest(test.TestCase):
                              hook.final_ops_values.tolist())
 
 
+class FeedFnHookTest(test.TestCase):
+
+  def test_feeding_placeholder(self):
+    with ops.Graph().as_default(), session_lib.Session() as sess:
+      x = array_ops.placeholder(dtype=dtypes.float32)
+      y = x + 1
+      hook = basic_session_run_hooks.FeedFnHook(
+          feed_fn=lambda: {x: 1.0})
+      hook.begin()
+      mon_sess = monitored_session._HookedSession(sess, [hook])
+      self.assertEqual(mon_sess.run(y), 2)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 30b9ccf922..26e52464cb 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -248,6 +248,7 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              chief_only_hooks=None,
                              save_checkpoint_secs=600,
                              save_summaries_steps=100,
+                             save_summaries_secs=None,
                              config=None):
   """Creates a `MonitoredSession` for training.
 
@@ -273,8 +274,12 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
       using a default checkpoint saver. If `save_checkpoint_secs` is set to
       `None`, then the default checkpoint saver isn't used.
     save_summaries_steps: The frequency, in number of global steps, that the
-      summaries are written to disk using a default summary saver. If
-      `save_summaries_steps` is set to `None`, then the default summary saver
+      summaries are written to disk using a default summary saver. If both
+      `save_summaries_steps` and `save_summaries_secs` are set to `None`, then
+      the default summary saver isn't used.
+    save_summaries_secs: The frequency, in secs, that the summaries are written
+      to disk using a default summary saver.  If both `save_summaries_steps` and
+      `save_summaries_secs` are set to `None`, then the default summary saver
       isn't used.
     config: an instance of `tf.ConfigProto` proto used to configure the session.
       It's the `config` argument of constructor of `tf.Session`.
@@ -301,10 +306,12 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
     all_hooks.append(
         basic_session_run_hooks.StepCounterHook(output_dir=checkpoint_dir))
 
-    if save_summaries_steps and save_summaries_steps > 0:
+    if (save_summaries_steps and save_summaries_steps > 0) or (
+        save_summaries_secs and save_summaries_secs > 0):
       all_hooks.append(basic_session_run_hooks.SummarySaverHook(
           scaffold=scaffold,
           save_steps=save_summaries_steps,
+          save_secs=save_summaries_secs,
           output_dir=checkpoint_dir))
     if save_checkpoint_secs and save_checkpoint_secs > 0:
       all_hooks.append(basic_session_run_hooks.CheckpointSaverHook(
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 3b16073166..444ee68cb8 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -215,15 +215,37 @@ class MonitoredTrainingSessionTest(test.TestCase):
           is_chief=True, checkpoint_dir=logdir) as session:
         self.assertEqual(2, session.run(gstep))
 
-  def test_summaries(self):
-    logdir = _test_dir(self.get_temp_dir(), 'test_summaries')
+  def test_summaries_steps(self):
+    logdir = _test_dir(self.get_temp_dir(), 'test_summaries_steps')
     with ops.Graph().as_default():
       gstep = variables_lib.get_or_create_global_step()
       new_gstep = state_ops.assign_add(gstep, 1)
       summary.scalar('my_summary_tag', new_gstep * 2)
       with monitored_session.MonitoredTrainingSession(
-          is_chief=True, checkpoint_dir=logdir) as session:
-        for _ in range(101):  # 100 is default summary writing steps
+          is_chief=True,
+          checkpoint_dir=logdir,
+          save_summaries_steps=100) as session:
+        for _ in range(101):
+          session.run(new_gstep)
+    summaries = util_test.latest_summaries(logdir)
+    tags = [s.summary.value[0].tag for s in summaries]
+    self.assertIn('my_summary_tag', tags)
+    self.assertIn('global_step/sec', tags)
+
+  def test_summaries_secs(self):
+    logdir = _test_dir(self.get_temp_dir(), 'test_summaries_secs')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      summary.scalar('my_summary_tag', new_gstep * 2)
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True,
+          checkpoint_dir=logdir,
+          save_summaries_steps=None,
+          save_summaries_secs=0.1) as session:
+        session.run(new_gstep)
+        time.sleep(0.2)
+        for _ in range(101):
           session.run(new_gstep)
     summaries = util_test.latest_summaries(logdir)
     tags = [s.summary.value[0].tag for s in summaries]
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index 3a2415629a..9f59d270e4 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -88,8 +88,10 @@ See [Threading and Queues](../../how_tos/threading_and_queues/index.md)
 for how to use threads and queues.  For documentation on the Queue API,
 see [Queues](../../api_docs/python/io_ops.md#queues).
 
+
 @@Coordinator
 @@QueueRunner
+@@LooperThread
 @@add_queue_runner
 @@start_queue_runners
 
@@ -119,14 +121,15 @@ overview of summaries, event files, and visualization in TensorBoard.
 
 @@summary_iterator
 
-## Training Utilities
+## Training Hooks
+
+Hooks are tools that run in the process of training/evaluation of the model.
 
-@@global_step
-@@basic_train_loop
-@@get_global_step
-@@assert_global_step
-@@write_graph
 @@SessionRunHook
+@@SessionRunArgs
+@@SessionRunContext
+@@SessionRunValues
+
 @@LoggingTensorHook
 @@StopAtStepHook
 @@CheckpointSaverHook
@@ -136,10 +139,16 @@ overview of summaries, event files, and visualization in TensorBoard.
 @@NanTensorHook
 @@SummarySaverHook
 @@GlobalStepWaiterHook
-@@SessionRunArgs
-@@SessionRunContext
-@@SessionRunValues
-@@LooperThread
+@@FinalOpsHook
+@@FeedFnHook
+
+## Training Utilities
+
+@@global_step
+@@basic_train_loop
+@@get_global_step
+@@assert_global_step
+@@write_graph
 """
 # pylint: enable=line-too-long
 
@@ -190,6 +199,8 @@ from tensorflow.python.training.basic_session_run_hooks import NanLossDuringTrai
 from tensorflow.python.training.basic_session_run_hooks import NanTensorHook
 from tensorflow.python.training.basic_session_run_hooks import SummarySaverHook
 from tensorflow.python.training.basic_session_run_hooks import GlobalStepWaiterHook
+from tensorflow.python.training.basic_session_run_hooks import FinalOpsHook
+from tensorflow.python.training.basic_session_run_hooks import FeedFnHook
 from tensorflow.python.training.basic_loops import basic_train_loop
 from tensorflow.python.training.device_setter import replica_device_setter
 from tensorflow.python.training.monitored_session import Scaffold
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index ac0f15b687..93c312ecfc 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -1212,57 +1212,65 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return false;
 }
 
-/* static */ bool CUDADriver::SynchronousMemcpyD2H(CudaContext* context,
-                                                   void *host_dst,
-                                                   CUdeviceptr gpu_src,
-                                                   uint64 size) {
+// Synchronously copies `size` bytes from device memory `gpu_src` to host
+// memory `host_dst` with `context` active. Returns OK on success, or an
+// InternalError describing the CUDA failure.
+/* static */ port::Status CUDADriver::SynchronousMemcpyD2H(CudaContext *context,
+                                                           void *host_dst,
+                                                           CUdeviceptr gpu_src,
+                                                           uint64 size) {
   ScopedActivateContext activation{context};
   CUresult res = dynload::cuMemcpyDtoH_v2(host_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << port::Printf(
-        "failed to synchronous memcpy from device to host: %s; "
-        "host dst: %p; GPU src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src), size, size);
-    return false;
+    return port::InternalError(
+        port::Printf("failed to synchronous memcpy from device to host: %s; "
+                     "host dst: %p; GPU src: %p; size: %llu=0x%llx",
+                     ToString(res).c_str(), host_dst,
+                     port::bit_cast<void *>(gpu_src), size, size));
   }
   VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
           << host_dst;
-  return true;
+  return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::SynchronousMemcpyH2D(CudaContext* context,
-                                                   CUdeviceptr gpu_dst,
-                                                   const void *host_src,
-                                                   uint64 size) {
+// Synchronously copies `size` bytes from host memory `host_src` to device
+// memory `gpu_dst` with `context` active. Returns OK on success, or an
+// InternalError describing the CUDA failure.
+/* static */ port::Status CUDADriver::SynchronousMemcpyH2D(CudaContext *context,
+                                                           CUdeviceptr gpu_dst,
+                                                           const void *host_src,
+                                                           uint64 size) {
   ScopedActivateContext activation{context};
   CUresult res = dynload::cuMemcpyHtoD_v2(gpu_dst, host_src, size);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << port::Printf(
+    return port::InternalError(port::Printf(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
         " host src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size, size);
-    return false;
+        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size,
+        size));
   }
   VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
-  return true;
+  return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::SynchronousMemcpyD2D(CudaContext* context,
-                                                   CUdeviceptr gpu_dst,
-                                                   CUdeviceptr gpu_src,
-                                                   uint64 size) {
+// Synchronously copies `size` bytes from device memory `gpu_src` to device
+// memory `gpu_dst` with `context` active. Returns OK on success, or an
+// InternalError describing the CUDA failure.
+/* static */ port::Status CUDADriver::SynchronousMemcpyD2D(CudaContext *context,
+                                                           CUdeviceptr gpu_dst,
+                                                           CUdeviceptr gpu_src,
+                                                           uint64 size) {
   ScopedActivateContext activation{context};
   CUresult res = dynload::cuMemcpyDtoD_v2(gpu_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << port::Printf(
-        "failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
-        "GPU src: %p; size: %llu=0x%llx",
+    return port::InternalError(port::Printf(
+        "failed to synchronous memcpy from device to device: %s; "
+        "GPU dst: %p; GPU src: %p; size: %llu=0x%llx",
         ToString(res).c_str(), port::bit_cast<void *>(gpu_dst),
-        port::bit_cast<void *>(gpu_src), size, size);
-    return false;
+        port::bit_cast<void *>(gpu_src), size, size));
   }
   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
-  return true;
+  return port::Status::OK();
 }
 
 /* static */ bool CUDADriver::AsynchronousMemcpyD2H(CudaContext* context,
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index ab118e5d40..c5d7d8b32f 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -251,12 +251,14 @@ class CUDADriver {
   // -- Synchronous memcopies.
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
 
-  static bool SynchronousMemcpyD2H(CudaContext* context, void *host_dst,
-                                   CUdeviceptr gpu_src, uint64 size);
-  static bool SynchronousMemcpyH2D(CudaContext* context, CUdeviceptr gpu_dst,
-                                   const void *host_src, uint64 size);
-  static bool SynchronousMemcpyD2D(CudaContext* context, CUdeviceptr gpu_dst,
-                                   CUdeviceptr gpu_src, uint64 size);
+  static port::Status SynchronousMemcpyD2H(CudaContext* context, void* host_dst,
+                                           CUdeviceptr gpu_src, uint64 size);
+  static port::Status SynchronousMemcpyH2D(CudaContext* context,
+                                           CUdeviceptr gpu_dst,
+                                           const void* host_src, uint64 size);
+  static port::Status SynchronousMemcpyD2D(CudaContext* context,
+                                           CUdeviceptr gpu_dst,
+                                           CUdeviceptr gpu_src, uint64 size);
 
   // -- Asynchronous memcopies.
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index b2da109bf0..ae1bf991a1 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -508,20 +508,21 @@ bool CUDAExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value,
                                             value, size);
 }
 
-bool CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
-                                     const void *host_src, uint64 size) {
+port::Status CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
+                                             const void *host_src,
+                                             uint64 size) {
   return CUDADriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                           host_src, size);
 }
 
-bool CUDAExecutor::SynchronousMemcpy(void *host_dst,
-                                     const DeviceMemoryBase &gpu_src,
-                                     uint64 size) {
+port::Status CUDAExecutor::SynchronousMemcpy(void *host_dst,
+                                             const DeviceMemoryBase &gpu_src,
+                                             uint64 size) {
   return CUDADriver::SynchronousMemcpyD2H(context_, host_dst,
                                           AsCudaDevicePtr(gpu_src), size);
 }
 
-bool CUDAExecutor::SynchronousMemcpyDeviceToDevice(
+port::Status CUDAExecutor::SynchronousMemcpyDeviceToDevice(
     DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) {
   return CUDADriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                           AsCudaDevicePtr(gpu_src), size);
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 3959d04439..a9917cc89f 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -108,15 +108,16 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
   bool SynchronousMemSet(DeviceMemoryBase *location, int value,
                          uint64 size) override;
 
-  bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src,
-                         uint64 size) override;
+  port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
+                                 const void *host_src, uint64 size) override;
 
-  bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src,
-                         uint64 size) override;
+  port::Status SynchronousMemcpy(void *host_dst,
+                                 const DeviceMemoryBase &gpu_src,
+                                 uint64 size) override;
 
-  bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
-                                       const DeviceMemoryBase &gpu_src,
-                                       uint64 size) override;
+  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
+                                               const DeviceMemoryBase &gpu_src,
+                                               uint64 size) override;
 
   bool MemZero(Stream *stream, DeviceMemoryBase *location,
                uint64 size) override;
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index d83d3042d5..5db86cefc3 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -38,6 +38,7 @@ limitations under the License.
 namespace perftools {
 namespace gputools {
 
+class HostBuffer;
 class Stream;
 class ScratchAllocator;
 
@@ -125,6 +126,15 @@ enum class RnnDirectionMode {
   kRnnBidirectional = 1,
 };
 
+// Relevant to DepthToSpace and SpaceToDepth. This is the write layout when
+// performing depth to space and the read layout when performing space to depth.
+// It's specified with most-major dimension first and most-minor dimension last.
+// In DepthToSpace, the D*M² values are read in and then, for DepthHeightWidth,
+// written out to the output patch, by varying first width, then height, then
+// depth. In C array format, it looks like [depth][height][width]. See
+// DepthToSpace comment for more information.
+enum class DepthToSpaceLayout { DepthHeightWidth };
+
 // Specifies the descriptor for a RNN model.
 //
 // An example use case:
@@ -530,6 +540,13 @@ enum class PoolingMode : int64 {
   kAverage,
 };
 
+// Specify the dimension in which to concatenate inputs in space.
+// Specify int64 so there's no padding in SpaceConcatenateMode.
+enum class SpaceConcatenateMode : int64 {
+  XDirection,
+  YDirection,
+};
+
 // Returns a short name for the pooling mode, e.g. "Avg".
 string ShortPoolingModeString(PoolingMode mode);
 
@@ -1319,6 +1336,129 @@ class DnnSupport {
       port::ArraySlice<const DeviceMemory<float>*> input_data,
       DeviceMemory<float>* output_data) = 0;
 
+  // Concatenates several layers into one, by concatenating each in the
+  // x-dimension or y-dimension, based on a user-specified flag.
+  // For x-concatenation, layers are aligned at matching y and depth
+  // coordinates, and for y-concatenation, they are aligned at matching x and
+  // depth coordinates. The inputs must all have the same depth and batch size.
+  // For x-concatenation, the inputs must have the same height (y-size), and the
+  // output will have the same depth and height as the inputs and its width (x-
+  // size) will be the sum of the input widths.  For y-concatenation, the inputs
+  // must have the same width, and the output will have the same depth and width
+  // as the inputs, and its height will be the sum of the input heights.
+  //
+  // Arguments:
+  //  stream: borrowed pointer to the stream that the 'space concatenate'
+  //    operation should be enqueued onto.
+  //  input_dimensions: the dimensions of each input.
+  //  input_data: un-owned device memory region which contains the input data
+  //    for each input layer.
+  //  output_data: un-owned device memory region in which to place the space
+  //    concatenate result.
+  //  concat_direction: either dnn::SpaceConcatenateMode::XDirection or
+  //    dnn::SpaceConcatenateMode::YDirection.
+  virtual bool DoSpaceConcatenate(
+      Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+      port::ArraySlice<const DeviceMemory<float>*> input_data,
+      DeviceMemory<float>* output_data,
+      dnn::SpaceConcatenateMode concat_direction) {
+    return false;
+  }
+
+  // Change the layout of the data by shrinking one dimension (or set of
+  // dimensions) and growing another dimension (or set of dimensions), while
+  // keeping the total number of data elements constant, and maintaining the
+  // current data ordering.
+  //
+  // Currently, the only supported operation is depth into space by a power of
+  // 2. E.g. (y, x, z) -> (y*2, x*2, z/4)
+  //
+  // Note that Reshape may not be a no-op, depending on the platform and which
+  // dimensions are being changed.
+  //
+  // Example: forgetting about batch for the moment, let's take a tensor that's
+  // 2x1x8 (y by x by z) and reshape to a tensor that's 4x2x2. The memory layout
+  // is row-major order: y,x,z. I.e. z changes the fastest, then x, then y. The
+  // elements of the tensor range from 0 to 15. The y,x,z indices are below each
+  // element.
+  //
+  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+  // y0 y0 y0 y0 y0 y0 y0 y0 y1 y1 y1 y1 y1 y1 y1 y1
+  // x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0
+  // z0 z1 z2 z3 z4 z5 z6 z7 z0 z1 z2 z3 z4 z5 z6 z7
+  //
+  // reshape to 4x2x2
+  //
+  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+  // y0 y0 y0 y0 y1 y1 y1 y1 y2 y2 y2 y2 y3 y3 y3 y3
+  // x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1
+  // z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1
+  virtual bool DoReshape(Stream* stream,
+                         const dnn::BatchDescriptor& input_dimensions,
+                         const DeviceMemory<float>& input_data,
+                         const dnn::BatchDescriptor& output_dimensions,
+                         DeviceMemory<float>* output_data) {
+    return false;
+  }
+
+  // Depth to space takes an X by Y image with depth D*M² and changes it to an
+  // MX x MY image with depth D. Each input location (x,y) with depth D*M² in
+  // the input image is changed to an MxM contiguous area in the output image,
+  // with the values being laid out in the raster order by DepthToSpaceLayout,
+  // and will have a new depth of D.
+  //
+  // Example.
+  // M=2, Din =8, Xin=2, Yin=2. Xout=4, Yout=4,  Dout=2
+  // DepthHeightWidth layout
+  // Values within a 'cell' are at different depths and same x & y.
+  // Input:
+  // abcdefgh  ijklmnop
+  // qrstuvwx  yz012345
+  // Output:
+  // ae bf im jn
+  // cg dh ko lp
+  // qu rv y2 z3
+  // sw tx 04 15
+  //
+  // sqrt_depth_reduction: 'M' in the comment above
+  virtual bool DoDepthToSpace(Stream* stream,
+                              const dnn::BatchDescriptor& input_dimensions,
+                              const DeviceMemory<float>& input_data,
+                              const DepthToSpaceLayout& depth_to_space_layout,
+                              const int& sqrt_depth_reduction,
+                              DeviceMemory<float>* output_data) {
+    return false;
+  }
+
+  // Space to depth is the inverse of depth to space. Space to depth takes each
+  // non-overlapping M by M patch (in the X and Y dimensions) with depth D of
+  // the input, and transforms it to a 1 by 1 patch with depth D*M². If the
+  // input has size (MX, MY, D), the output has size (X, Y, D*M²). The number of
+  // data elements is not changed.
+  //
+  // Example.
+  // M=2, Din =2, Xin=4, Yin=4,  Dout=8
+  // DepthHeightWidth layout
+  // Values within a 'cell' are at different depths and same x & y.
+  // Input:
+  // ae bf im jn
+  // cg dh ko lp
+  // qu rv y2 z3
+  // sw tx 04 15
+  // Output:
+  // abcdefgh  ijklmnop
+  // qrstuvwx  yz012345
+  //
+  // sqrt_depth_increase: 'M' in the comment above
+  virtual bool DoSpaceToDepth(Stream* stream,
+                              const dnn::BatchDescriptor& input_dimensions,
+                              const DeviceMemory<float>& input_data,
+                              const DepthToSpaceLayout& space_to_depth_layout,
+                              const int& sqrt_depth_increase,
+                              DeviceMemory<float>* output_data) {
+    return false;
+  }
+
   // Computes the specified operation (e.g. addition or multiplication)
   // between corresponding elements in the inputs and stores the result in the
   // output element.
@@ -1342,6 +1482,37 @@ class DnnSupport {
       const dnn::BatchDescriptor& output_dimensions,
       DeviceMemory<float>* output_data) = 0;
 
+  // Computes the specified operation (e.g. addition or multiplication)
+  // between corresponding elements in the inputs and stores the result in the
+  // output element. Each input is multiplied by a scalar constant and the
+  // result is divided by a scalar constant.
+  // e.g. To perform Z = 0.9*X + 1.1*Y, set the input multiplicands to 9 and 11
+  // and the output divisor to 10.
+  // The inputs and output must all have the same dimensions, but may have
+  // different quantization parameters (min_value and max_value).
+  //
+  // Arguments (all borrowed):
+  //  stream: borrowed pointer to the stream that the 'elementwise operation'
+  // should be enqueued onto.
+  //  operation: The operation to perform.
+  //  input_multiplicands: Amount to scale each input.
+  //  output_divisor: Amount to divide the output.
+  //  input_dimensions: The dimensions of each input.
+  //  input_data: un-owned device memory region which contains the
+  //    input data for each input layer.
+  //  output_dimensions: The dimensions of the output.
+  //  output_data: un-owned device memory region in which to place the
+  //    operation result.
+  virtual bool DoElementwiseOperateScaledQuantized(
+      Stream* stream, ElementwiseOperation operation,
+      port::ArraySlice<int> input_multiplicands, int output_divisor,
+      port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+      port::ArraySlice<const DeviceMemory<float>*> input_data,
+      const dnn::BatchDescriptor& output_dimensions,
+      DeviceMemory<float>* output_data) {
+    return false;
+  }
+
   // Pads the input with zeros in the X and Y dimensions. The feature_map
   // dimension is unchanged.
   //
@@ -1382,6 +1553,43 @@ class DnnSupport {
                     int64 left_trim, int64 right_trim, int64 top_trim,
                     int64 bottom_trim, DeviceMemory<float> *output_data) = 0;
 
+  // Grows the input tensor by replicating the X and Y dimensions. The batch and
+  // depth/feature_map dimensions are unchanged. Currently, the input tensor is
+  // limited to X=1 and Y=1.
+  //
+  // For example, the input has dimensions x=2, y=3, and replicate_x=3,
+  // replicate_y=2. The diagonal elements of the output would be: [x0y0, x1y1,
+  // x0y2, x1y0, x0y1, x1y2].
+  // Here is the example as a picture. input:
+  // AB
+  // CD
+  // EF
+  // broadcast result:
+  // ABABAB
+  // CDCDCD
+  // EFEFEF
+  // ABABAB
+  // CDCDCD
+  // EFEFEF
+  //
+  // Arguments (all borrowed):
+  //  stream: borrowed pointer to the stream that the 'XY broadcast' operation
+  //    should be enqueued onto.
+  //  dimensions: The dimensions of the input.
+  //  input_data: un-owned device memory region which contains the
+  //    input data for the input layer.
+  //  replicate_x: Amount to replicate the input's X dimension.
+  //  replicate_y: Amount to replicate the input's Y dimension.
+  //  output_data: un-owned device memory region in which to place the
+  //    broadcast result.
+  virtual bool DoXYBroadcast(Stream* stream,
+                             const dnn::BatchDescriptor& dimensions,
+                             const DeviceMemory<float>& input_data,
+                             int64 replicate_x, int64 replicate_y,
+                             DeviceMemory<float>* output_data) {
+    return false;
+  }
+
   // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that
   // is, bytes instead of scaled floats) into 'host_dst' if they are available
   // for the underlying DNN implementation. If this quantized output is not
@@ -1425,6 +1633,21 @@ class DnnSupport {
       QuantizedActivationMode mode,
       DeviceMemory<float>* gpu_unquantized_dst) = 0;
 
+  // Enqueues an asynchronous copy of the contents of buffer_src to
+  // gpu_unquantized_dst.
+  virtual bool DoCopyHostBuffer2Device(
+      Stream* stream, HostBuffer* buffer_src,
+      DeviceMemory<float>* gpu_unquantized_dst) {
+    return false;
+  }
+
+  // Enqueues an asynchronous copy of the contents of gpu_unquantized_src to
+  // buffer_dst.
+  virtual bool DoCopyDevice2HostBuffer(
+      Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
+      HostBuffer* buffer_dst) {
+    return false;
+  }
 
   // Create an RNN descriptor based on model shapes and configurations.
   // The caller retains the ownership of the descriptor.
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index ff07432bb7..830bc9a681 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -129,23 +129,24 @@ bool HostExecutor::Memset32(Stream *stream, DeviceMemoryBase *location,
   return true;
 }
 
-bool HostExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
-                                     const void *host_src, uint64 size) {
+port::Status HostExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
+                                             const void *host_src,
+                                             uint64 size) {
   memcpy(gpu_dst->opaque(), host_src, size);
-  return true;
+  return port::Status::OK();
 }
 
-bool HostExecutor::SynchronousMemcpy(void *host_dst,
-                                     const DeviceMemoryBase &gpu_src,
-                                     uint64 size) {
+port::Status HostExecutor::SynchronousMemcpy(void *host_dst,
+                                             const DeviceMemoryBase &gpu_src,
+                                             uint64 size) {
   memcpy(host_dst, gpu_src.opaque(), size);
-  return true;
+  return port::Status::OK();
 }
 
-bool HostExecutor::SynchronousMemcpyDeviceToDevice(
+port::Status HostExecutor::SynchronousMemcpyDeviceToDevice(
     DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) {
   memcpy(gpu_dst->opaque(), gpu_src.opaque(), size);
-  return true;
+  return port::Status::OK();
 }
 
 bool HostExecutor::HostCallback(Stream *stream,
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index f217f7947f..77b07e4a57 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -95,13 +95,14 @@ class HostExecutor : public internal::StreamExecutorInterface {
   bool SynchronousMemSet(DeviceMemoryBase *location, int value,
                          uint64 size) override;
 
-  bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst, const void *host_src,
-                         uint64 size) override;
-  bool SynchronousMemcpy(void *host_dst, const DeviceMemoryBase &gpu_src,
-                         uint64 size) override;
-  bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
-                                       const DeviceMemoryBase &gpu_src,
-                                       uint64 size) override;
+  port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
+                                 const void *host_src, uint64 size) override;
+  port::Status SynchronousMemcpy(void *host_dst,
+                                 const DeviceMemoryBase &gpu_src,
+                                 uint64 size) override;
+  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
+                                               const DeviceMemoryBase &gpu_src,
+                                               uint64 size) override;
 
   bool HostCallback(Stream *stream, std::function<void()> callback) override;
 
diff --git a/tensorflow/stream_executor/host_buffer.h b/tensorflow/stream_executor/host_buffer.h
new file mode 100644
index 0000000000..8fa542e9ff
--- /dev/null
+++ b/tensorflow/stream_executor/host_buffer.h
@@ -0,0 +1,48 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
+
+#include "tensorflow/stream_executor/dnn.h"
+
+namespace perftools {
+namespace gputools {
+
+// A HostBuffer is a block of memory in host memory containing the data for a
+// dnn::BatchDescriptor using a device-dependent memory layout.
+// Derived classes provide methods to construct a HostBuffer for a specific
+// device, and to copy data in and out of the buffer.
+class HostBuffer {
+ public:
+  const dnn::BatchDescriptor& descriptor() const { return descriptor_; }
+
+  // Returns a string describing the HostBuffer.
+  virtual string AsString() const = 0;
+
+ protected:
+  // Construct a HostBuffer from the supplied dnn::BatchDescriptor.
+  explicit HostBuffer(const dnn::BatchDescriptor& descriptor)
+      : descriptor_(descriptor) {}
+  virtual ~HostBuffer() {}
+
+ private:
+  const dnn::BatchDescriptor descriptor_;
+};
+
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
diff --git a/tensorflow/stream_executor/lib/status.h b/tensorflow/stream_executor/lib/status.h
index 493fc656e1..0aec2917dc 100644
--- a/tensorflow/stream_executor/lib/status.h
+++ b/tensorflow/stream_executor/lib/status.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/stream_executor/lib/error.h"  // IWYU pragma: export
+#include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 
 namespace perftools {
@@ -33,6 +34,17 @@ using Status = tensorflow::Status;
 #define SE_ASSERT_OK(val) \
   ASSERT_EQ(::perftools::gputools::port::Status::OK(), (val))
 
+// Define some canonical error helpers.
+inline Status UnimplementedError(StringPiece message) {
+  return Status(error::UNIMPLEMENTED, message);
+}
+inline Status InternalError(StringPiece message) {
+  return Status(error::INTERNAL, message);
+}
+inline Status FailedPreconditionError(StringPiece message) {
+  return Status(error::FAILED_PRECONDITION, message);
+}
+
 }  // namespace port
 }  // namespace gputools
 }  // namespace perftools
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 512e882cad..980d544b01 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 
 #include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/host_buffer.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/platform.h"
@@ -85,6 +86,8 @@ string ToVlogString(const void *ptr) {
   return out.str();
 }
 
+string ToVlogString(const HostBuffer &buffer) { return buffer.AsString(); }
+
 template <class T>
 string ToVlogString(const std::complex<T> &c) {
   // StrCat does not convert std::complex to text.
@@ -149,6 +152,13 @@ string ToVlogString(port::MutableArraySlice<T> elements) {
   return ToVlogString(port::ArraySlice<T>(elements));
 }
 
+// Names the layout for VLOG output; out-of-range values get a placeholder.
+string ToVlogString(dnn::DepthToSpaceLayout depth_to_space_layout) {
+  return depth_to_space_layout == dnn::DepthToSpaceLayout::DepthHeightWidth
+             ? "DepthToSpaceLayout::DepthHeightWidth"
+             : "unknown DepthToSpaceLayout";
+}
+
 // Used together with PARAM to VLOG calls made to the stream. Intended
 // to be used like this:
 //
@@ -299,10 +309,7 @@ Stream &Stream::ThenBatchNormalizationForward(
           saved_inv_var, is_training, std::move(var_to_inv_var),
           std::move(inv_var_to_var)));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -324,10 +331,7 @@ Stream &Stream::ThenBatchNormalizationBackward(
           this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc,
           epsilon, x_backprop, scale_backprop, offset_backprop));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -355,10 +359,7 @@ Stream &Stream::ThenConvolveWithScratch(
           /*scratch_allocator=*/scratch_allocator, dnn::AlgorithmConfig(),
           nullptr));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -385,10 +386,7 @@ Stream &Stream::ThenConvolveWithScratch(
           /*scratch_allocator=*/scratch_allocator, dnn::AlgorithmConfig(),
           nullptr));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -419,10 +417,7 @@ Stream &Stream::ThenConvolveWithAlgorithm(
         SetError();
       }
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -453,10 +448,7 @@ Stream &Stream::ThenConvolveWithAlgorithm(
         SetError();
       }
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -497,10 +489,7 @@ Stream &Stream::ThenSeparableConvolve(
           depth_multiplier, first_weights, second_weights,
           convolution_descriptor, output_descriptor, output));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -528,10 +517,7 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch(
           backward_input_data, scratch_allocator, dnn::AlgorithmConfig(),
           nullptr));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -564,10 +550,7 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
         SetError();
       }
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -600,10 +583,7 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
         SetError();
       }
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -631,10 +611,7 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch(
           backward_input_data, scratch_allocator, dnn::AlgorithmConfig(),
           nullptr));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -676,10 +653,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch(
           backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(),
           nullptr));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -712,10 +686,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
         SetError();
       }
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -743,10 +714,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch(
           backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(),
           nullptr));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -779,10 +747,7 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
         SetError();
       }
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -817,10 +782,7 @@ Stream &Stream::ThenConvolveBackwardBiasImpl(
                                              bias_descriptor,
                                              backward_bias_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -866,10 +828,7 @@ Stream &Stream::ThenMatMul(const DeviceMemory<float> &input_data,
       CheckError(dnn->DoMatMul(this, input_data, weights, input_dimensions,
                                output_dimensions, output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -891,10 +850,7 @@ Stream &Stream::ThenMatMulQuantized(
                                         weight_scales, input_dimensions,
                                         output_dimensions, output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -916,10 +872,7 @@ Stream &Stream::ThenMatMulQuantized(
                                         weight_scales, input_dimensions,
                                         output_dimensions, output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -937,10 +890,7 @@ Stream &Stream::ThenBiasAdd(const DeviceMemory<float> &input_data,
       CheckError(
           dnn->DoBiasAdd(this, input_data, biases, dimensions, output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -961,10 +911,7 @@ Stream &Stream::ThenPoolForward(
                                     input_data, output_dimensions,
                                     output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -985,10 +932,7 @@ Stream &Stream::ThenPoolForward(
                                     input_data, output_dimensions,
                                     output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1012,10 +956,7 @@ Stream &Stream::ThenPoolBackward(
                                      input_data, output_dimensions, output_data,
                                      input_diff_data, output_diff_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1039,10 +980,7 @@ Stream &Stream::ThenPoolBackward(
                                      input_data, output_dimensions, output_data,
                                      input_diff_data, output_diff_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1058,10 +996,7 @@ Stream &Stream::ThenNormalize(
       CheckError(dnn->DoNormalize(this, normalize_descriptor, input_data,
                                   output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1079,10 +1014,7 @@ Stream &Stream::ThenNormalizeWithDimensions(
       CheckError(dnn->DoNormalizeWithDimensions(
           this, normalize_descriptor, dimensions, input_data, output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1104,10 +1036,7 @@ Stream &Stream::ThenNormalizeBackwardWithDimensions(
           this, normalize_descriptor, dimensions, raw_data, normalized_data,
           normalized_variable_gradient, raw_variable_gradient));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1125,10 +1054,7 @@ Stream &Stream::ThenActivate(dnn::ActivationMode activation_mode,
       CheckError(dnn->DoActivate(this, activation_mode, dimensions, input_data,
                                  output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1158,10 +1084,114 @@ Stream &Stream::ThenDepthConcatenate(
       CheckError(dnn->DoDepthConcatenate(this, input_dimensions, input_data,
                                          output_data));
     } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
+Stream &Stream::ThenSpaceConcatenate(
+    port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+    port::ArraySlice<const DeviceMemory<float> *> input_data,
+    DeviceMemory<float> *output_data,
+    dnn::SpaceConcatenateMode concat_direction) {
+  VLOG_CALL(PARAM(input_dimensions), PARAM(input_data), PARAM(output_data));
+
+  // Validate up front: every input must agree with input 0 on batch count,
+  // feature-map count and the extent perpendicular to the concat direction.
+  for (size_t i = 1; i < input_dimensions.size(); ++i) {
+    if ((concat_direction == dnn::SpaceConcatenateMode::XDirection) &&
+        (input_dimensions[i].count() != input_dimensions[0].count() ||
+         input_dimensions[i].height() != input_dimensions[0].height() ||
+         input_dimensions[i].feature_map_count() !=
+             input_dimensions[0].feature_map_count())) {
       SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      LOG(ERROR) << "Incompatible dimensions for X concatenation.\n"
+                 << "input_dimensions[0]: " << input_dimensions[0].ToString()
+                 << "\ninput_dimensions[" << i
+                 << "]: " << input_dimensions[i].ToString();
+      return *this;
+    }
+    // For Y concatenation the widths, rather than the heights, must agree.
+    if ((concat_direction == dnn::SpaceConcatenateMode::YDirection) &&
+        (input_dimensions[i].count() != input_dimensions[0].count() ||
+         input_dimensions[i].width() != input_dimensions[0].width() ||
+         input_dimensions[i].feature_map_count() !=
+             input_dimensions[0].feature_map_count())) {
+      SetError();
+      LOG(ERROR) << "Incompatible dimensions for Y concatenation.\n"
+                 << "input_dimensions[0]: " << input_dimensions[0].ToString()
+                 << "\ninput_dimensions[" << i
+                 << "]: " << input_dimensions[i].ToString();
+      return *this;
+    }
+  }
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoSpaceConcatenate(this, input_dimensions, input_data,
+                                         output_data, concat_direction));
+    } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
+Stream &Stream::ThenReshape(const dnn::BatchDescriptor &input_dimensions,
+                            const DeviceMemory<float> &input_data,
+                            const dnn::BatchDescriptor &output_dimensions,
+                            DeviceMemory<float> *output_data) {
+  VLOG_CALL(PARAM(input_dimensions), PARAM(input_data),
+            PARAM(output_dimensions), PARAM(output_data));
+  // Delegate to DnnSupport::DoReshape; error out if there is no DNN support.
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoReshape(this, input_dimensions, input_data,
+                                output_dimensions, output_data));
+    } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
+Stream &Stream::ThenDepthToSpace(
+    const dnn::BatchDescriptor &input_dimensions,
+    const DeviceMemory<float> &input_data,
+    const dnn::DepthToSpaceLayout &depth_to_space_layout,
+    const int sqrt_depth_reduction, DeviceMemory<float> *output_data) {
+  VLOG_CALL(PARAM(input_dimensions), PARAM(input_data),
+            PARAM(depth_to_space_layout), PARAM(sqrt_depth_reduction),
+            PARAM(output_data));
+  // Delegate to DnnSupport::DoDepthToSpace; error out without DNN support.
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoDepthToSpace(this, input_dimensions, input_data,
+                                     depth_to_space_layout,
+                                     sqrt_depth_reduction, output_data));
+    } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
+Stream &Stream::ThenSpaceToDepth(
+    const dnn::BatchDescriptor &input_dimensions,
+    const DeviceMemory<float> &input_data,
+    const dnn::DepthToSpaceLayout &space_to_depth_layout,
+    const int sqrt_depth_increase, DeviceMemory<float> *output_data) {
+  VLOG_CALL(PARAM(input_dimensions), PARAM(input_data),
+            PARAM(space_to_depth_layout), PARAM(sqrt_depth_increase),
+            PARAM(output_data));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoSpaceToDepth(this, input_dimensions, input_data,
+                                     space_to_depth_layout, sqrt_depth_increase,
+                                     output_data));
+    } else {
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1182,10 +1212,30 @@ Stream &Stream::ThenElementwiseOperate(
                                            input_data, output_dimensions,
                                            output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
+Stream &Stream::ThenElementwiseOperateScaledQuantized(
+    dnn::ElementwiseOperation operation,
+    port::ArraySlice<int> input_multiplicands, int output_divisor,
+    port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+    port::ArraySlice<const DeviceMemory<float> *> input_data,
+    const dnn::BatchDescriptor &output_dimensions,
+    DeviceMemory<float> *output_data) {
+  VLOG_CALL(PARAM(operation), PARAM(input_multiplicands), PARAM(output_divisor),
+            PARAM(input_dimensions), PARAM(input_data),
+            PARAM(output_dimensions), PARAM(output_data));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoElementwiseOperateScaledQuantized(
+          this, operation, input_multiplicands, output_divisor,
+          input_dimensions, input_data, output_dimensions, output_data));
+    } else {
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1204,10 +1254,7 @@ Stream &Stream::ThenXYPad(const dnn::BatchDescriptor &dimensions,
       CheckError(dnn->DoXYPad(this, dimensions, input_data, left_pad, right_pad,
                               top_pad, bottom_pad, output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1228,10 +1275,25 @@ Stream &Stream::ThenXYSlice(const dnn::BatchDescriptor &dimensions,
                                 right_trim, top_trim, bottom_trim,
                                 output_data));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
+Stream &Stream::ThenXYBroadcast(const dnn::BatchDescriptor &dimensions,
+                                const DeviceMemory<float> &input_data,
+                                int64 replicate_x, int64 replicate_y,
+                                DeviceMemory<float> *output_data) {
+  VLOG_CALL(PARAM(dimensions), PARAM(input_data), PARAM(replicate_x),
+            PARAM(replicate_y), PARAM(output_data));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoXYBroadcast(this, dimensions, input_data, replicate_x,
+                                    replicate_y, output_data));
+    } else {
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1248,10 +1310,7 @@ Stream &Stream::ThenMemcpyD2HQuantized(
       CheckError(dnn->DoMemcpyD2HQuantized(this, gpu_unquantized_src, mode,
                                            host_dst, size));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -1268,10 +1327,37 @@ Stream &Stream::ThenMemcpyH2DQuantized(
       CheckError(dnn->DoMemcpyH2DQuantized(this, host_src, size, mode,
                                            gpu_unquantized_dst));
     } else {
-      SetError();
-      LOG(WARNING)
-          << "attempting to perform DNN operation using StreamExecutor "
-             "without DNN support";
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
+Stream &Stream::ThenCopyHostBuffer2Device(
+    HostBuffer *buffer_src, DeviceMemory<float> *gpu_unquantized_dst) {
+  VLOG_CALL(PARAM(*buffer_src), PARAM(gpu_unquantized_dst));
+  // Delegate to DnnSupport::DoCopyHostBuffer2Device; error out without DNN.
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(
+          dnn->DoCopyHostBuffer2Device(this, buffer_src, gpu_unquantized_dst));
+    } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
+Stream &Stream::ThenCopyDevice2HostBuffer(
+    const DeviceMemory<float> &gpu_unquantized_src, HostBuffer *buffer_dst) {
+  VLOG_CALL(PARAM(gpu_unquantized_src), PARAM(*buffer_dst));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(
+          dnn->DoCopyDevice2HostBuffer(this, gpu_unquantized_src, buffer_dst));
+    } else {
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 0d16495a1d..711eb3079a 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -499,6 +499,44 @@ class Stream {
       port::ArraySlice<const DeviceMemory<float> *> input_data,
       DeviceMemory<float> *output_data);
 
+  Stream &ThenSpaceConcatenate(
+      port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+      port::ArraySlice<const DeviceMemory<float> *> input_data,
+      DeviceMemory<float> *output_data,
+      dnn::SpaceConcatenateMode concat_direction);
+
+  // Change the layout of the data by shrinking one dimension (or set of
+  // dimensions) and growing another dimension (or set of dimensions), while
+  // keeping the total number of data elements constant, and maintaining the
+  // current data ordering.
+  Stream &ThenReshape(const dnn::BatchDescriptor &input_dimensions,
+                      const DeviceMemory<float> &input_data,
+                      const dnn::BatchDescriptor &output_dimensions,
+                      DeviceMemory<float> *output_data);
+
+  // Depth to space takes an X by Y image with depth D*M² and changes it to an
+  // MX x MY image with depth D. Each input location (x,y) with depth D*M² in
+  // the input image is changed to an MxM contiguous area in the output image,
+  // with the values being laid out in raster order specified by
+  // DepthToSpaceLayout, and will have a new depth of D.
+  // See the DoDepthToSpace comment for more information.
+  Stream &ThenDepthToSpace(const dnn::BatchDescriptor &input_dimensions,
+                           const DeviceMemory<float> &input_data,
+                           const dnn::DepthToSpaceLayout &depth_to_space_layout,
+                           const int sqrt_depth_reduction,
+                           DeviceMemory<float> *output_data);
+
+  // Space to depth is the inverse of depth to space. Space to depth takes each
+  // non-overlapping M by M patch (in the X and Y dimensions) with depth D of
+  // the input, and transforms it to a 1 by 1 patch with depth D*M². If the
+  // input has size (MX, MY, D), the output has size (X, Y, D*M²). The number of
+  // data elements is not changed.
+  Stream &ThenSpaceToDepth(const dnn::BatchDescriptor &input_dimensions,
+                           const DeviceMemory<float> &input_data,
+                           const dnn::DepthToSpaceLayout &space_to_depth_layout,
+                           const int sqrt_depth_increase,
+                           DeviceMemory<float> *output_data);
+
   Stream &ThenElementwiseOperate(
       dnn::ElementwiseOperation operation,
       port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
@@ -506,6 +544,14 @@ class Stream {
       const dnn::BatchDescriptor &output_dimensions,
       DeviceMemory<float> *output_data);
 
+  Stream &ThenElementwiseOperateScaledQuantized(
+      dnn::ElementwiseOperation operation,
+      port::ArraySlice<int> input_multiplicands, int output_divisor,
+      port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+      port::ArraySlice<const DeviceMemory<float> *> input_data,
+      const dnn::BatchDescriptor &output_dimensions,
+      DeviceMemory<float> *output_data);
+
   Stream &ThenXYPad(const dnn::BatchDescriptor &dimensions,
                     const DeviceMemory<float> &input_data, int64 left_pad,
                     int64 right_pad, int64 top_pad, int64 bottom_pad,
@@ -516,6 +562,14 @@ class Stream {
                       int64 right_trim, int64 top_trim, int64 bottom_trim,
                       DeviceMemory<float> *output_data);
 
+  // Grows the input tensor by replicating the X and Y dimensions. The batch and
+  // depth/feature_map dimensions are unchanged. Currently, the input tensor is
+  // limited to X=1 and Y=1.
+  Stream &ThenXYBroadcast(const dnn::BatchDescriptor &dimensions,
+                          const DeviceMemory<float> &input_data,
+                          int64 replicate_x, int64 replicate_y,
+                          DeviceMemory<float> *output_data);
+
   // See DnnSupport::DoMemcpyD2HQuantized.
   Stream &ThenMemcpyD2HQuantized(const DeviceMemory<float> &gpu_unquantized_src,
                                  dnn::QuantizedActivationMode mode,
@@ -549,6 +603,14 @@ class Stream {
         Quantization<ElementType>::kModeId, gpu_unquantized_dst);
   }
 
+  // See DnnSupport::DoCopyHostBuffer2Device.
+  Stream &ThenCopyHostBuffer2Device(HostBuffer *buffer_src,
+                                    DeviceMemory<float> *gpu_unquantized_dst);
+
+  // See DnnSupport::DoCopyDevice2HostBuffer.
+  Stream &ThenCopyDevice2HostBuffer(
+      const DeviceMemory<float> &gpu_unquantized_src, HostBuffer *buffer_dst);
+
   /////////////////
   // BLAS support
 
@@ -1527,6 +1589,12 @@ class Stream {
 
   void SetError() { CheckError(false /* = operation_retcode */); }
 
+  void SetErrorAndLogNoDnnSupport() {  // Shared path for missing DNN support.
+    SetError();  // Equivalent to CheckError(false).
+    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
+                    "without DNN support";
+  }
+
   // The StreamExecutor that supports the operation of this stream.
   StreamExecutor *parent_;
 
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 57db7775a6..d6d55fd623 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -199,14 +199,14 @@ class StreamExecutorInterface {
   virtual bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) = 0;
   virtual bool SynchronousMemSet(DeviceMemoryBase *location, int value,
                                  uint64 size) = 0;
-  virtual bool SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
-                                 const void *host_src, uint64 size) = 0;
-  virtual bool SynchronousMemcpy(void *host_dst,
-                                 const DeviceMemoryBase &gpu_src,
-                                 uint64 size) = 0;
-  virtual bool SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
-                                               const DeviceMemoryBase &gpu_src,
-                                               uint64 size) = 0;
+  virtual port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
+                                         const void *host_src, uint64 size) = 0;
+  virtual port::Status SynchronousMemcpy(void *host_dst,
+                                         const DeviceMemoryBase &gpu_src,
+                                         uint64 size) = 0;
+  virtual port::Status SynchronousMemcpyDeviceToDevice(
+      DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src,
+      uint64 size) = 0;
   virtual bool MemZero(Stream *stream, DeviceMemoryBase *location,
                        uint64 size) = 0;
   virtual bool Memset(Stream *stream, DeviceMemoryBase *location,
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 7739d31662..71a5a45b67 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -491,7 +491,12 @@ bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
   // Tracing overloaded methods is very difficult due to issues with type
   // inference on template args. Since use of these overloaded methods is
   // discouraged anyway, this isn't a huge deal.
-  return implementation_->SynchronousMemcpy(gpu_dst, host_src, size);
+  port::Status status =
+      implementation_->SynchronousMemcpy(gpu_dst, host_src, size);
+  if (!status.ok()) {
+    LOG(ERROR) << "synchronous memcpy: " << status;
+  }
+  return status.ok();
 }
 
 bool StreamExecutor::SynchronousMemcpy(void *host_dst,
@@ -501,7 +506,12 @@ bool StreamExecutor::SynchronousMemcpy(void *host_dst,
           << ", gpu_src=" << gpu_src.opaque() << ", size=" << size << ") D2H"
           << StackTraceIfVLOG10();
 
-  return implementation_->SynchronousMemcpy(host_dst, gpu_src, size);
+  port::Status status =
+      implementation_->SynchronousMemcpy(host_dst, gpu_src, size);
+  if (!status.ok()) {
+    LOG(ERROR) << "synchronous memcpy: " << status;
+  }
+  return status.ok();
 }
 
 bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
@@ -511,8 +521,12 @@ bool StreamExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
           << gpu_dst->opaque() << ", gpu_src=" << gpu_src.opaque()
           << ", size=" << size << ") D2D" << StackTraceIfVLOG10();
 
-  return implementation_->SynchronousMemcpyDeviceToDevice(gpu_dst, gpu_src,
-                                                          size);
+  port::Status status =
+      implementation_->SynchronousMemcpyDeviceToDevice(gpu_dst, gpu_src, size);
+  if (!status.ok()) {
+    LOG(ERROR) << "synchronous memcpy: " << status;
+  }
+  return status.ok();
 }
 
 port::Status StreamExecutor::SynchronousMemcpyD2H(
@@ -525,13 +539,15 @@ port::Status StreamExecutor::SynchronousMemcpyD2H(
   SCOPED_TRACE(TraceListener::SynchronousMemcpyD2H,
                &result, gpu_src, size, host_dst);
 
-  if (!implementation_->SynchronousMemcpy(host_dst, gpu_src, size)) {
+  port::Status status =
+      implementation_->SynchronousMemcpy(host_dst, gpu_src, size);
+  if (!status.ok()) {
     return port::Status{
         port::error::INTERNAL,
         port::Printf(
             "failed to synchronously memcpy device-to-host: GPU %p to host %p "
-            "size %lld",
-            gpu_src.opaque(), host_dst, size)};
+            "size %lld: %s",
+            gpu_src.opaque(), host_dst, size, status.ToString().c_str())};
   }
 
   return result;
@@ -548,12 +564,15 @@ port::Status StreamExecutor::SynchronousMemcpyH2D(const void *host_src,
   SCOPED_TRACE(TraceListener::SynchronousMemcpyH2D,
                &result, host_src, size, gpu_dst);
 
-  if (!implementation_->SynchronousMemcpy(gpu_dst, host_src, size)) {
+  port::Status status =
+      implementation_->SynchronousMemcpy(gpu_dst, host_src, size);
+  if (!status.ok()) {
     result = port::Status{
         port::error::INTERNAL,
         port::Printf("failed to synchronously memcpy host-to-device: host "
-                     "%p to GPU %p size %lld",
-                     host_src, gpu_dst->opaque(), size)};
+                     "%p to GPU %p size %lld: %s",
+                     host_src, gpu_dst->opaque(), size,
+                     status.ToString().c_str())};
   }
 
   return result;
diff --git a/tensorflow/tensorboard/BUILD b/tensorflow/tensorboard/BUILD
index 21f6519cab..2887fb4362 100644
--- a/tensorflow/tensorboard/BUILD
+++ b/tensorflow/tensorboard/BUILD
@@ -30,6 +30,7 @@ py_binary(
     deps = [
         "//tensorflow/python:platform",
         "//tensorflow/tensorboard/backend:server",
+        "@werkzeug",
     ],
 )
 
diff --git a/tensorflow/tensorboard/tensorboard.py b/tensorflow/tensorboard/tensorboard.py
index 9adcee7e36..42d5aedced 100644
--- a/tensorflow/tensorboard/tensorboard.py
+++ b/tensorflow/tensorboard/tensorboard.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 
 import os
 import socket
+from werkzeug.serving import run_simple
 
 from tensorflow.python.platform import app
 from tensorflow.python.platform import flags
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 2a9fcae5e5..7fa7e4a91d 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -140,28 +140,27 @@ def tf_gen_op_libs(op_lib_names, deps=None):
                       linkstatic=1,)
 
 def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="",
-                         op_gen="//tensorflow/cc:cc_op_gen_main"):
+                         op_gen="//tensorflow/cc:cc_op_gen_main",
+                         deps=None,  # linked into the generator with op_gen
+                         include_internal_ops=0):  # passed as the tool's last arg
   # Construct an op generator binary for these ops.
   tool = out_ops_file + "_gen_cc"
+  if deps == None:  # Default: link only this op's own "<name>_op_lib" target.
+    deps = [pkg + ":" + name + "_op_lib"]
   native.cc_binary(
       name = tool,
       copts = tf_copts(),
       linkopts = ["-lm"],
       linkstatic = 1,   # Faster to link this one-time-use binary dynamically
-      deps = ([op_gen, pkg + ":" + name + "_op_lib"])
+      deps = [op_gen] + deps
   )
 
-  # Run the op generator.
-  if name == "sendrecv_ops" or name == "function_ops":
-    include_internal = "1"
-  else:
-    include_internal = "0"
   native.genrule(
       name=name + "_genrule",
       outs=[out_ops_file + ".h", out_ops_file + ".cc"],
       tools=[":" + tool],
       cmd=("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
-           "$(location :" + out_ops_file + ".cc) " + include_internal))
+           "$(location :" + out_ops_file + ".cc) " + str(include_internal_ops)))
 
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate individual C++ .cc and .h
@@ -192,11 +191,14 @@ def tf_gen_op_wrappers_cc(name,
                               "//tensorflow/cc:const_op",
                           ],
                           op_gen="//tensorflow/cc:cc_op_gen_main",
+                          include_internal_ops=0,
                           visibility=None):
   subsrcs = other_srcs
   subhdrs = other_hdrs
   for n in op_lib_names:
-    tf_gen_op_wrapper_cc(n, "ops/" + n, pkg=pkg, op_gen=op_gen)
+    tf_gen_op_wrapper_cc(
+        n, "ops/" + n, pkg=pkg, op_gen=op_gen,
+        include_internal_ops=include_internal_ops)
     subsrcs += ["ops/" + n + ".cc"]
     subhdrs += ["ops/" + n + ".h"]
 
diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh
index a9989fe504..683ab9f77b 100755
--- a/tensorflow/tools/ci_build/builds/libtensorflow.sh
+++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh
@@ -46,20 +46,17 @@ function build_libtensorflow_tarball() {
   fi
   bazel clean --expunge
   yes "" | ./configure
-  
-  # TODO(ashankar): Once 
-  # https://github.com/tensorflow/tensorflow/commit/1b32b698eddc10c0d85b0b8cf838f42023394de7  
-  # can be undone, i.e., when bazel supports pkg_tar with python3+ then all of this below
-  # can be replaced with something like:
-  # bazel build ${BAZEL_OPTS} //tensorflow/tools/lib_package:libtensorflow.tar.gz
-  
-  bazel build ${BAZEL_OPTS} //tensorflow:libtensorflow.so
+
+  # Remove this test call when
+  # https://github.com/bazelbuild/bazel/issues/2352
+  # and https://github.com/bazelbuild/bazel/issues/1580
+  # have been resolved and the "manual" tags on the BUILD targets
+  # in tensorflow/tools/lib_package/BUILD are removed.
+  # Till then, must manually run the test.
+  bazel test ${BAZEL_OPTS} //tensorflow/tools/lib_package/...
+
+  bazel build ${BAZEL_OPTS} //tensorflow/tools/lib_package:libtensorflow.tar.gz
   DIR=lib_package
-  rm -rf ${DIR}
-  mkdir -p ${DIR}/build/lib
-  mkdir -p ${DIR}/build/include/tensorflow/c
-  cp bazel-bin/tensorflow/libtensorflow.so ${DIR}/build/lib
-  cp tensorflow/c/c_api.h ${DIR}/build/include/tensorflow/c
-  tar -C ${DIR}/build -cvf ${DIR}/libtensorflow${TARBALL_SUFFIX}.tar.gz include/tensorflow/c/c_api.h lib/libtensorflow.so
-  rm -rf ${DIR}/build
+  mkdir -p ${DIR}
+  cp bazel-bin/tensorflow/tools/lib_package/libtensorflow.tar.gz ${DIR}/libtensorflow${TARBALL_SUFFIX}.tar.gz
 }
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 6f6684dcdf..46f97891d3 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -332,6 +332,11 @@ else
   EXTRA_ARGS="${TF_BUILD_APPEND_ARGUMENTS} --test_tag_filters=-benchmark-test"
 fi
 
+# For any "tool" dependencies in genrules, Bazel will build them for host
+# instead of the target configuration. We can save some build time by setting
+# this flag, and it only affects a few tests.
+EXTRA_ARGS="${EXTRA_ARGS} --distinct_host_configuration=false"
+
 # Process PIP install-test option
 if [[ ${TF_BUILD_IS_PIP} == "no_pip" ]] ||
    [[ ${TF_BUILD_IS_PIP} == "both" ]]; then
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 0d890f5684..975a14e7d5 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -292,8 +292,8 @@ do_buildifier(){
 }
 
 do_external_licenses_check(){
-  echo "Running do_external_licenses_check"
-  echo ""
+  BUILD_TARGET="$1"
+  LICENSES_TARGET="$2"
 
   EXTERNAL_LICENSES_CHECK_START_TIME=$(date +'%s')
 
@@ -302,8 +302,8 @@ do_external_licenses_check(){
   MISSING_LICENSES_FILE="$(mktemp)_missing_licenses.log"
   EXTRA_LICENSES_FILE="$(mktemp)_extra_licenses.log"
 
-  echo "Getting external dependencies for //tensorflow/tools/pip_package:build_pip_package."
- bazel query 'attr("licenses", "notice", deps(//tensorflow/tools/pip_package:build_pip_package))' --no_implicit_deps --no_host_deps --keep_going \
+  echo "Getting external dependencies for ${BUILD_TARGET}"
+ bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --no_implicit_deps --no_host_deps --keep_going \
   | egrep -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -311,8 +311,8 @@ do_external_licenses_check(){
   | tee ${EXTERNAL_DEPENDENCIES_FILE}
 
   echo
-  echo "Getting list of external licenses."
-  bazel query 'deps(//tensorflow/tools/pip_package:licenses)' --no_implicit_deps --no_host_deps --keep_going \
+  echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}."
+  bazel query "deps(${LICENSES_TARGET})" --no_implicit_deps --no_host_deps --keep_going \
   | egrep -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -331,7 +331,7 @@ do_external_licenses_check(){
   echo
 
   if [[ -s ${MISSING_LICENSES_FILE} ]] || [[ -s ${EXTRA_LICENSES_FILE} ]] ; then
-    echo "FAIL: pip package external dependencies vs licenses mismatch."
+    echo "FAIL: mismatch in packaged licenses and external dependencies"
     if [[ -s ${MISSING_LICENSES_FILE} ]] ; then
       echo "Missing the licenses for the following external dependencies:"
       cat ${MISSING_LICENSES_FILE}
@@ -355,6 +355,21 @@ do_external_licenses_check(){
   fi
 }
 
+do_pip_package_licenses_check() {
+  # Compare external deps of the pip package with its licenses target.
+  printf '%s\n\n' "Running do_pip_package_licenses_check"
+  do_external_licenses_check \
+    "//tensorflow/tools/pip_package:build_pip_package" \
+    "//tensorflow/tools/pip_package:licenses"
+}
+
+do_lib_package_licenses_check() {
+  # Compare external deps of libtensorflow.so with the C library licenses.
+  printf '%s\n\n' "Running do_lib_package_licenses_check"
+  do_external_licenses_check \
+    "//tensorflow:libtensorflow.so" \
+    "//tensorflow/tools/lib_package:clicenses_generate"
+}
 
 # Run bazel build --nobuild to test the validity of the BUILD files
 do_bazel_nobuild() {
@@ -376,8 +391,8 @@ do_bazel_nobuild() {
 }
 
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_buildifier" "do_bazel_nobuild" "do_external_licenses_check")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "buildifier check" "bazel nobuild" "external dependencies licenses check")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies")
 
 INCREMENTAL_FLAG=""
 
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 71e2a6c852..227b83ab9f 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -21,18 +21,11 @@ ubuntu_version=$(cat /etc/issue | grep -i ubuntu | awk '{print $2}' | \
 # Install dependencies from ubuntu deb repository.
 apt-get update
 
-set +e
-ffmpeg_location=$(which ffmpeg)
-if [[ -z "$ffmpeg_location"  && "$ubuntu_version" == "14" ]]; then
-  set -e
+if [[ "$ubuntu_version" == "14" ]]; then
   # specifically for trusty linked from ffmpeg.org
   add-apt-repository -y ppa:mc3man/trusty-media
   apt-get update
   apt-get dist-upgrade -y
-  apt-get install -y ffmpeg libav-tools
-else
-  set -e
-  apt-get install -y ffmpeg libav-tools
 fi
 
 apt-get install -y --no-install-recommends \
@@ -41,6 +34,7 @@ apt-get install -y --no-install-recommends \
     build-essential \
     cmake \
     curl \
+    ffmpeg \
     git \
     libcurl4-openssl-dev \
     libtool \
diff --git a/tensorflow/tools/graph_transforms/summarize_graph_main.cc b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
index 638296b923..55b55e0a15 100644
--- a/tensorflow/tools/graph_transforms/summarize_graph_main.cc
+++ b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
@@ -65,7 +65,8 @@ Status SummarizeGraph(const GraphDef& graph) {
   MapNodesToOutputs(graph, &output_map);
   std::vector<const NodeDef*> outputs;
   for (const NodeDef& node : graph.node()) {
-    if (output_map.count(node.name()) == 0) {
+    if ((output_map.count(node.name()) == 0) && (node.op() != "Const") &&
+        (node.op() != "Assign")) {
       outputs.push_back(&node);
     }
   }
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
new file mode 100644
index 0000000000..41e7221efe
--- /dev/null
+++ b/tensorflow/tools/lib_package/BUILD
@@ -0,0 +1,107 @@
+# Packaging the TensorFlow C API into a small, standalone archive for use with
+# language bindings and installations without Python.
+#
+# TODO(ashankar): Something similar for the JNI library for Java?
+# TODO(ashankar): Something similar for the C++ API (caveat: ABI compatibility)
+
+package(default_visibility = ["//visibility:private"])
+
+load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
+
+pkg_tar(
+    name = "libtensorflow",
+    extension = "tar.gz",
+    # Mark as "manual" till
+    # https://github.com/bazelbuild/bazel/issues/2352
+    # and https://github.com/bazelbuild/bazel/issues/1580
+    # are resolved, otherwise these rules break when built
+    # with Python 3.
+    tags = ["manual"],
+    deps = [
+        ":cheaders",
+        ":clib",
+        ":clicenses",
+    ],
+)
+
+pkg_tar(
+    name = "cheaders",
+    files = ["//tensorflow/c:headers"],
+    package_dir = "include/tensorflow/c",
+    # Mark as "manual" till
+    # https://github.com/bazelbuild/bazel/issues/2352
+    # and https://github.com/bazelbuild/bazel/issues/1580
+    # are resolved, otherwise these rules break when built
+    # with Python 3.
+    tags = ["manual"],
+)
+
+pkg_tar(
+    name = "clib",
+    files = ["//tensorflow:libtensorflow.so"],
+    package_dir = "lib",
+    # Mark as "manual" till
+    # https://github.com/bazelbuild/bazel/issues/2352
+    # and https://github.com/bazelbuild/bazel/issues/1580
+    # are resolved, otherwise these rules break when built
+    # with Python 3.
+    tags = ["manual"],
+)
+
+pkg_tar(
+    name = "clicenses",
+    files = [":include/tensorflow/c/LICENSE"],
+    package_dir = "include/tensorflow/c",
+    # Mark as "manual" till
+    # https://github.com/bazelbuild/bazel/issues/2352
+    # and https://github.com/bazelbuild/bazel/issues/1580
+    # are resolved, otherwise these rules break when built
+    # with Python 3.
+    tags = ["manual"],
+)
+
+genrule(
+    name = "clicenses_generate",
+    srcs = [
+        "//third_party/hadoop:LICENSE.txt",
+        "//third_party/eigen3:LICENSE",
+        "@boringssl//:LICENSE",
+        "@com_googlesource_code_re2//:LICENSE",
+        "@curl//:COPYING",
+        "@eigen_archive//:COPYING.MPL2",
+        "@farmhash_archive//:COPYING",
+        "@gemmlowp//:LICENSE",
+        "@gif_archive//:COPYING",
+        "@grpc//:LICENSE",
+        "@highwayhash//:LICENSE",
+        "@jemalloc//:COPYING",
+        "@jpeg//:LICENSE.md",
+        "@libxsmm_archive//:LICENSE",
+        "@local_config_sycl//sycl:LICENSE.text",
+        "@nanopb_git//:LICENSE.txt",
+        "@png_archive//:LICENSE",
+        "@protobuf//:LICENSE",
+        "@zlib_archive//:zlib.h",
+    ],
+    outs = ["include/tensorflow/c/LICENSE"],
+    cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
+    tools = [":concat_licenses.sh"],
+)
+
+sh_test(
+    name = "libtensorflow_test",
+    size = "small",
+    srcs = ["libtensorflow_test.sh"],
+    data = [
+        "libtensorflow_test.c",
+        ":libtensorflow.tar.gz",
+    ],
+    # Mark as "manual" till
+    # https://github.com/bazelbuild/bazel/issues/2352
+    # and https://github.com/bazelbuild/bazel/issues/1580
+    # are resolved, otherwise these rules break when built
+    # with Python 3.
+    # Till then, this test is explicitly executed when building
+    # the release by tensorflow/tools/ci_build/builds/libtensorflow.sh
+    tags = ["manual"],
+)
diff --git a/tensorflow/tools/lib_package/README.md b/tensorflow/tools/lib_package/README.md
new file mode 100644
index 0000000000..fbec0a067a
--- /dev/null
+++ b/tensorflow/tools/lib_package/README.md
@@ -0,0 +1,31 @@
+Bazel rules to package the TensorFlow C-library and [header
+files](https://www.tensorflow.org/code/tensorflow/c/c_api.h)
+into an archive.
+
+## TensorFlow C library
+
+The TensorFlow [C
+API](https://www.tensorflow.org/code/tensorflow/c/c_api.h)
+is typically a requirement of TensorFlow APIs in other languages such as
+[Go](https://www.tensorflow.org/code/tensorflow/go)
+and [Rust](https://github.com/tensorflow/rust).
+
+The command:
+
+```sh
+bazel build -c opt //tensorflow/tools/lib_package:libtensorflow
+```
+
+produces `bazel-bin/tensorflow/tools/lib_package/libtensorflow.tar.gz`, which
+can be distributed and installed using something like:
+
+```sh
+tar -C /usr/local -xzf libtensorflow.tar.gz
+```
+
+## Release
+
+Scripts to generate archives using these rules for release are in
+[tensorflow/tools/ci_build/linux](https://www.tensorflow.org/code/tensorflow/tools/ci_build/linux)
+and
+[tensorflow/tools/ci_build/osx](https://www.tensorflow.org/code/tensorflow/tools/ci_build/osx)
diff --git a/tensorflow/tools/lib_package/concat_licenses.sh b/tensorflow/tools/lib_package/concat_licenses.sh
new file mode 100755
index 0000000000..2070f64e9f
--- /dev/null
+++ b/tensorflow/tools/lib_package/concat_licenses.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Concatenates the license files given as arguments into one stream on stdout.
+
+for f in "$@"  # quoted so each argument stays one path (no splitting/globbing)
+do
+  echo "--------------------------------------------------------------------------------"
+  echo "BEGIN LICENSE FOR $f"
+  echo "--------------------------------------------------------------------------------"
+  cat "$f"  # body of the license file, emitted between the two banners
+  echo "--------------------------------------------------------------------------------"
+  echo "END LICENSE FOR $f"
+  echo "--------------------------------------------------------------------------------"
+done
diff --git a/tensorflow/tools/lib_package/libtensorflow_test.c b/tensorflow/tools/lib_package/libtensorflow_test.c
new file mode 100644
index 0000000000..dff6fb77ec
--- /dev/null
+++ b/tensorflow/tools/lib_package/libtensorflow_test.c
@@ -0,0 +1,28 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Companion source file for libtensorflow_test.sh
+
+#include <tensorflow/c/c_api.h>
+
+int main() {
+  TF_Status* s = TF_NewStatus();
+  TF_SetStatus(s, TF_UNKNOWN, "Some error");
+  if (TF_GetCode(s) != TF_UNKNOWN) {
+    return 1;
+  }
+  TF_DeleteStatus(s);
+  return 0;
+}
diff --git a/tensorflow/tools/lib_package/libtensorflow_test.sh b/tensorflow/tools/lib_package/libtensorflow_test.sh
new file mode 100755
index 0000000000..6463ecea70
--- /dev/null
+++ b/tensorflow/tools/lib_package/libtensorflow_test.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -ex
+
+# Sanity test for the package C-library archive.
+# - Unarchive
+# - Compile a trivial C file that uses the archive
+# - Run it
+
+# Tools needed: A C-compiler and tar
+CC="${CC}"
+TAR="${TAR}"
+
+[ -z "${CC}" ] && CC="/usr/bin/gcc"
+[ -z "${TAR}"] && TAR="tar"
+
+# bazel tests run with ${PWD} set to the root of the bazel workspace
+TARFILE="${PWD}/tensorflow/tools/lib_package/libtensorflow.tar.gz"
+CFILE="${PWD}/tensorflow/tools/lib_package/libtensorflow_test.c"
+
+cd ${TEST_TMPDIR}
+
+# Extract the archive into tensorflow/
+mkdir tensorflow
+${TAR} -xzf ${TARFILE} -Ctensorflow
+
+# Compile the test .c file
+${CC} ${CFILE} -Itensorflow/include -Ltensorflow/lib -ltensorflow -oa.out
+
+# Execute it, with the shared library available.
+# DYLD_LIBRARY_PATH is used on OS X, LD_LIBRARY_PATH on Linux
+export DYLD_LIBRARY_PATH=tensorflow/lib
+export LD_LIBRARY_PATH=tensorflow/lib
+./a.out
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 62fb9b9176..0ef09835e9 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -95,6 +95,7 @@ filegroup(
         "@png_archive//:LICENSE",
         "@protobuf//:LICENSE",
         "@six_archive//:LICENSE",
+        "@werkzeug//:LICENSE",
         "@zlib_archive//:zlib.h",
     ] + tf_additional_license_deps(),
 )
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index a03e844ea2..1ad739d6cf 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -76,10 +76,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "libxsmm_archive",
       urls = [
-          "https://github.com/hfp/libxsmm/archive/1.6.4.tar.gz",
+          # "http://bazel-mirror.storage.googleapis.com/github.com/hfp/libxsmm/archive/1.6.1.tar.gz",
+          "https://github.com/hfp/libxsmm/archive/1.6.5.tar.gz",
       ],
-      sha256 = "3788bf1cdb60f119f8a04ed7ed96861322e539ce2d2ea977f00431d6b2b80beb",
-      strip_prefix = "libxsmm-1.6.4",
+      sha256 = "5231419a8e13e7a6d286cf25d32a3aa75c443a625e5ea57024d36468bc3d5936",
+      strip_prefix = "libxsmm-1.6.5",
       build_file = str(Label("//third_party:libxsmm.BUILD")),
   )
 
@@ -191,6 +192,17 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       build_file = str(Label("//third_party:six.BUILD")),
   )
 
+  native.new_http_archive(
+      name = "werkzeug",
+      urls = [
+          "http://bazel-mirror.storage.googleapis.com/pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
+          "https://pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
+      ],
+      strip_prefix = "Werkzeug-0.11.10",
+      sha256 = "cc64dafbacc716cdd42503cf6c44cb5a35576443d82f29f6829e5c49264aeeee",
+      build_file = str(Label("//third_party:werkzeug.BUILD")),
+  )
+
   native.bind(
       name = "six",
       actual = "@six_archive//:six",
@@ -314,7 +326,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   # TODO(phawkins): currently, this rule uses an unofficial LLVM mirror.
   # Switch to an official source of snapshots if/when possible.
-  native.new_http_archive(
+  temp_workaround_http_archive(
       name = "llvm",
       urls = [
           "http://bazel-mirror.storage.googleapis.com/github.com/llvm-mirror/llvm/archive/4e9e4f277ad254e02a0cff33c61cd827e600da62.tar.gz",
@@ -323,6 +335,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       sha256 = "ec67c57dfd85c2bb857fd13011c5c2aa3f1dc9f40c0a5bac13e78e76d6b61aa6",
       strip_prefix = "llvm-4e9e4f277ad254e02a0cff33c61cd827e600da62",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
+      repository = tf_repo_name,
   )
 
   native.new_http_archive(
@@ -395,7 +408,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       actual = "@junit_jar//jar",
   )
 
-  native.new_http_archive(
+  temp_workaround_http_archive(
       name = "jemalloc",
       urls = [
           "http://bazel-mirror.storage.googleapis.com/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
@@ -404,4 +417,5 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
       strip_prefix = "jemalloc-4.4.0",
       build_file = str(Label("//third_party:jemalloc.BUILD")),
+      repository = tf_repo_name,
   )
diff --git a/third_party/jemalloc.BUILD b/third_party/jemalloc.BUILD
index 2496d12627..aabff39d7b 100644
--- a/third_party/jemalloc.BUILD
+++ b/third_party/jemalloc.BUILD
@@ -5,7 +5,7 @@ licenses(["notice"])  # BSD
 
 exports_files(["COPYING"])
 
-load("@//third_party:common.bzl", "template_rule")
+load("@%ws%//third_party:common.bzl", "template_rule")
 
 cc_library(
     name = "jemalloc",
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 0f7ef74545..330d8b79ce 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -7,18 +7,18 @@ licenses(["notice"])
 exports_files(["LICENSE.TXT"])
 
 load(
-    "@//third_party/llvm:llvm.bzl",
+    "@%ws%//third_party/llvm:llvm.bzl",
     "gentbl",
     "expand_cmake_vars",
     "llvm_target_cmake_vars",
     "cmake_var_string",
 )
 load(
-    "@//third_party:common.bzl",
+    "@%ws%//third_party:common.bzl",
     "template_rule",
 )
 
-package(default_visibility = ["@//tensorflow/compiler/xla:internal"])
+package(default_visibility = ["@%ws%//tensorflow/compiler/xla:internal"])
 
 llvm_host_triple = "x86_64-unknown-linux_gnu"
 
@@ -147,7 +147,7 @@ darwin_cmake_vars = {
 # TODO(phawkins): use a better method to select the right host triple, rather
 # than hardcoding x86_64.
 all_cmake_vars = select({
-    "@//tensorflow:darwin": cmake_var_string(
+    "@%ws%//tensorflow:darwin": cmake_var_string(
         cmake_vars + llvm_target_cmake_vars("X86", "x86_64-apple-darwin") +
         darwin_cmake_vars,
     ),
diff --git a/third_party/werkzeug.BUILD b/third_party/werkzeug.BUILD
new file mode 100644
index 0000000000..aaf1614bb9
--- /dev/null
+++ b/third_party/werkzeug.BUILD
@@ -0,0 +1,14 @@
+# Description:
+#   Werkzeug provides utilities for making WSGI applications
+
+licenses(["notice"])  # BSD 3-Clause
+
+exports_files(["LICENSE"])
+
+# Note: this library includes test code. Consider creating a testonly target.
+py_library(
+    name = "werkzeug",
+    srcs = glob(["werkzeug/*.py"]),  # relative to the strip_prefix'ed archive root
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)