-rw-r--r--  tensorflow/compiler/xla/service/cpu/BUILD                            16
-rw-r--r--  tensorflow/compiler/xla/service/cpu/compiler_functor.cc              17
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc                6
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h                 2
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc             7
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h              3
-rw-r--r--  tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc              140
-rw-r--r--  tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h                42
-rw-r--r--  tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc                 2
-rw-r--r--  tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc          44
-rw-r--r--  tensorflow/core/ops/ops.pbtxt                                         2
-rw-r--r--  tensorflow/docs_src/deploy/index.md                                   5
-rw-r--r--  tensorflow/docs_src/deploy/leftnav_files                              1
-rw-r--r--  tensorflow/docs_src/deploy/tfserve.md                                30
-rw-r--r--  tensorflow/docs_src/programmers_guide/leftnav_files                   2
-rw-r--r--  tensorflow/docs_src/programmers_guide/saved_model.md                  4
-rw-r--r--  tensorflow/go/op/wrappers.go                                          2
-rw-r--r--  tensorflow/java/maven/run_inside_container.sh                        45
-rw-r--r--  tensorflow/java/maven/tensorflow-android/pom-android.xml.template    29
-rw-r--r--  tensorflow/java/maven/tensorflow-android/update.py                  139
20 files changed, 467 insertions, 71 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 2ca4af67cd..06647a7bbc 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -285,6 +285,7 @@ cc_library(
":cpu_runtime_avx",
":cpu_runtime_sse4_1",
":disassembler",
+ ":llvm_ir_runtime",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
@@ -346,6 +347,21 @@ cc_library(
)
cc_library(
+ name = "llvm_ir_runtime",
+ srcs = [
+ "llvm_ir_runtime.cc",
+ ],
+ hdrs = [
+ "llvm_ir_runtime.h",
+ ],
+ deps = [
+ "//tensorflow/core:lib",
+ "@llvm//:core",
+ "@llvm//:transform_utils",
+ ],
+)
+
+cc_library(
name = "runtime_conv2d",
srcs = [
"runtime_conv2d.cc",
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 7304953249..2442a0f5dc 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -39,6 +39,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h"
#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h"
+#include "tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
@@ -84,6 +85,8 @@ operator()(llvm::Module& module) const {
CHECK(!llvm::verifyModule(module, &llvm::dbgs()));
+ runtime::RewriteIRRuntimeFunctions(&module);
+
// Buffer for holding machine code prior to constructing the ObjectFile.
llvm::SmallVector<char, 0> stream_buffer;
llvm::raw_svector_ostream ostream(stream_buffer);
@@ -136,9 +139,6 @@ std::vector<llvm::VecDesc> VectorFunctionsForTargetLibraryInfoImpl(
{"logf", runtime::kLogV4F32SymbolName, 4},
{"llvm.log.f32", runtime::kLogV4F32SymbolName, 4},
-
- {"tanhf", runtime::kTanhV4F32SymbolName, 4},
- {"llvm.tanh.f32", runtime::kTanhV4F32SymbolName, 4},
};
const llvm::VecDesc eight_wide_vector_functions[] = {
@@ -147,6 +147,13 @@ std::vector<llvm::VecDesc> VectorFunctionsForTargetLibraryInfoImpl(
{"logf", runtime::kLogV8F32SymbolName, 8},
{"llvm.log.f32", runtime::kLogV8F32SymbolName, 8},
+ };
+
+ // These functions are generated by XLA as LLVM IR, so they're always
+ // available.
+ const llvm::VecDesc ir_vector_functions[] = {
+ {"tanhf", runtime::kTanhV4F32SymbolName, 4},
+ {"llvm.tanh.f32", runtime::kTanhV4F32SymbolName, 4},
{"tanhf", runtime::kTanhV8F32SymbolName, 8},
{"llvm.tanh.f32", runtime::kTanhV8F32SymbolName, 8},
@@ -169,6 +176,10 @@ std::vector<llvm::VecDesc> VectorFunctionsForTargetLibraryInfoImpl(
std::end(eight_wide_vector_functions));
}
}
+
+ vector_functions.insert(vector_functions.end(),
+ std::begin(ir_vector_functions),
+ std::end(ir_vector_functions));
return vector_functions;
}
} // namespace
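
Context for the VecDesc tables above: LLVM's loop vectorizer consults TargetLibraryInfo to learn which scalar library calls have vector equivalents at a given width, and rewrites qualifying scalar calls inside vectorized loops into calls to the named vector symbols. Below is a minimal sketch of how such a table is typically installed, assuming the LLVM C++ API of this era; the helper name MakeTargetLibraryInfo is illustrative and not part of this patch.

#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetLibraryInfo.h"

// Builds a TargetLibraryInfoImpl that maps scalar tanh calls onto the 4-wide
// XLA runtime symbol; the vectorizer may then emit calls to that symbol.
llvm::TargetLibraryInfoImpl MakeTargetLibraryInfo(const llvm::Triple& triple) {
  llvm::TargetLibraryInfoImpl tli(triple);
  const llvm::VecDesc descs[] = {
      // {scalar function name, vector function name, vectorization factor}
      {"tanhf", "__xla_cpu_runtime_TanhV4F32", 4},
      {"llvm.tanh.f32", "__xla_cpu_runtime_TanhV4F32", 4},
  };
  tli.addVectorizableFunctions(descs);
  return tli;
}

Splitting the tanh entries into their own ir_vector_functions table reflects the design choice in this change: the exp and log mappings are only valid when the matching SSE4.1/AVX runtime symbols exist, whereas the tanh bodies are emitted as LLVM IR into the module itself and are therefore always safe to advertise.
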
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
index f6664bb854..9d8e67897c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
@@ -29,11 +29,6 @@ xla::cpu::runtime::V8F32 __xla_cpu_runtime_LogV8F32(
xla::cpu::runtime::V8F32 x) {
return Eigen::internal::plog(x);
}
-
-xla::cpu::runtime::V8F32 __xla_cpu_runtime_TanhV8F32(
- xla::cpu::runtime::V8F32 x) {
- return Eigen::internal::ptanh(x);
-}
#endif // __AVX__
namespace xla {
@@ -42,7 +37,6 @@ namespace runtime {
const char *const kExpV8F32SymbolName = "__xla_cpu_runtime_ExpV8F32";
const char *const kLogV8F32SymbolName = "__xla_cpu_runtime_LogV8F32";
-const char *const kTanhV8F32SymbolName = "__xla_cpu_runtime_TanhV8F32";
} // namespace runtime
} // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
index c15710fb00..62e352f1e4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
@@ -48,8 +48,6 @@ xla::cpu::runtime::V8F32 __xla_cpu_runtime_ExpV8F32(xla::cpu::runtime::V8F32 x)
xla::cpu::runtime::V8F32 __xla_cpu_runtime_LogV8F32(xla::cpu::runtime::V8F32 x)
TF_ATTRIBUTE_WEAK;
-xla::cpu::runtime::V8F32 __xla_cpu_runtime_TanhV8F32(xla::cpu::runtime::V8F32 x)
- TF_ATTRIBUTE_WEAK;
}
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_AVX_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
index 58ec9fc6e8..93a1a3a3c3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
@@ -33,12 +33,6 @@ xla::cpu::runtime::V4F32 __xla_cpu_runtime_LogV4F32(
return Eigen::internal::plog(p);
}
-xla::cpu::runtime::V4F32 __xla_cpu_runtime_TanhV4F32(
- xla::cpu::runtime::V4F32 x) {
- Eigen::internal::Packet4f p = x;
- return Eigen::internal::ptanh(p);
-}
-
#endif // __SSE4_1__
namespace xla {
@@ -47,7 +41,6 @@ namespace runtime {
const char *const kExpV4F32SymbolName = "__xla_cpu_runtime_ExpV4F32";
const char *const kLogV4F32SymbolName = "__xla_cpu_runtime_LogV4F32";
-const char *const kTanhV4F32SymbolName = "__xla_cpu_runtime_TanhV4F32";
} // namespace runtime
} // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
index 7ab9a52d00..591682a943 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
@@ -48,9 +48,6 @@ xla::cpu::runtime::V4F32 __xla_cpu_runtime_ExpV4F32(xla::cpu::runtime::V4F32 x)
xla::cpu::runtime::V4F32 __xla_cpu_runtime_LogV4F32(xla::cpu::runtime::V4F32 x)
TF_ATTRIBUTE_WEAK;
-
-xla::cpu::runtime::V4F32 __xla_cpu_runtime_TanhV4F32(xla::cpu::runtime::V4F32 x)
- TF_ATTRIBUTE_WEAK;
}
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_SSE4_1_H_
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
new file mode 100644
index 0000000000..77e4425aa2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -0,0 +1,140 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h"
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+namespace cpu {
+namespace runtime {
+
+const char* const kTanhV4F32SymbolName = "__xla_cpu_runtime_TanhV4F32";
+const char* const kTanhV8F32SymbolName = "__xla_cpu_runtime_TanhV8F32";
+
+namespace {
+llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
+ llvm::StringRef function_name,
+ int vector_width) {
+ llvm::Function* vector_tanh_function = module->getFunction(function_name);
+ if (vector_tanh_function == nullptr) {
+ // If the function declaration is not present in the module, there can't be
+ // any calls to resolve. Don't emit the function in this case.
+ return nullptr;
+ }
+
+ llvm::LLVMContext* context = &module->getContext();
+ llvm::Type* float_type = llvm::Type::getFloatTy(*context);
+ llvm::VectorType* vector_type =
+ llvm::VectorType::get(float_type, vector_width);
+
+ llvm::Function* min_intrinsic = llvm::Intrinsic::getDeclaration(
+ module, llvm::Intrinsic::minnum, vector_type);
+ llvm::Function* max_intrinsic = llvm::Intrinsic::getDeclaration(
+ module, llvm::Intrinsic::maxnum, vector_type);
+
+ llvm::BasicBlock* vector_tanh_body =
+ llvm::BasicBlock::Create(*context, "body", vector_tanh_function);
+
+ llvm::IRBuilder<> ir_builder(vector_tanh_body);
+
+ llvm::FastMathFlags fast_math_flags;
+ fast_math_flags.setUnsafeAlgebra();
+ ir_builder.setFastMathFlags(fast_math_flags);
+
+ llvm::Value* input = &*vector_tanh_function->arg_begin();
+ CHECK_EQ(input->getType(), vector_type);
+
+ // This implements the same rational interpolant as implemented in Eigen3.
+ llvm::Value* input_clamped = ir_builder.CreateCall(
+ min_intrinsic,
+ {ir_builder.CreateCall(max_intrinsic,
+ {input, llvm::ConstantFP::get(vector_type, -9.0)}),
+ llvm::ConstantFP::get(vector_type, 9.0)});
+
+ std::array<float, 7> numerator_coeffs(
+ {{-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
+ 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
+ 4.89352455891786e-03f}});
+
+ std::array<float, 4> denominator_coeffs(
+ {{1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
+ 4.89352518554385e-03f}});
+
+ llvm::Value* input_squared =
+ ir_builder.CreateFMul(input_clamped, input_clamped);
+ llvm::Value* numerator =
+ llvm::ConstantFP::get(vector_type, numerator_coeffs[0]);
+ for (int i = 1; i < numerator_coeffs.size(); i++) {
+ numerator = ir_builder.CreateFAdd(
+ ir_builder.CreateFMul(input_squared, numerator),
+ llvm::ConstantFP::get(vector_type, numerator_coeffs[i]));
+ }
+ numerator = ir_builder.CreateFMul(input_clamped, numerator);
+
+ llvm::Value* denominator =
+ llvm::ConstantFP::get(vector_type, denominator_coeffs[0]);
+ for (int i = 1; i < denominator_coeffs.size(); i++) {
+ denominator = ir_builder.CreateFAdd(
+ ir_builder.CreateFMul(input_squared, denominator),
+ llvm::ConstantFP::get(vector_type, denominator_coeffs[i]));
+ }
+
+ llvm::Value* result = ir_builder.CreateFDiv(numerator, denominator);
+ ir_builder.CreateRet(result);
+
+ DCHECK(!llvm::verifyFunction(*vector_tanh_function));
+ return vector_tanh_function;
+}
+} // namespace
+
+void RewriteIRRuntimeFunctions(llvm::Module* module) {
+ auto* tanh_v4f32 = EmitVectorF32TanhIfNeeded(module, kTanhV4F32SymbolName,
+ /*vector_width=*/4);
+ auto* tanh_v8f32 = EmitVectorF32TanhIfNeeded(module, kTanhV8F32SymbolName,
+ /*vector_width=*/8);
+
+ // Gather all the call sites, force inline them and then delete the vector
+ // function bodies.
+
+ std::vector<llvm::CallInst*> calls_to_inline;
+ for (auto* function : {tanh_v4f32, tanh_v8f32}) {
+ if (function != nullptr) {
+ for (auto* user : function->users()) {
+ calls_to_inline.push_back(llvm::cast<llvm::CallInst>(user));
+ }
+ }
+ }
+
+ for (auto* call_to_inline : calls_to_inline) {
+ llvm::InlineFunctionInfo inline_function_info;
+ CHECK(llvm::InlineFunction(call_to_inline, inline_function_info));
+ }
+
+ for (auto* function : {tanh_v4f32, tanh_v8f32}) {
+ if (function != nullptr) {
+ function->eraseFromParent();
+ }
+ }
+}
+
+} // namespace runtime
+} // namespace cpu
+} // namespace xla
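
The IR emitted above computes a rational approximation of tanh: clamp the input to [-9, 9], evaluate the odd numerator and even denominator polynomials in x^2 by Horner's rule, then divide. For reference, the same computation as a plain scalar C++ sketch, using the coefficients from the patch; the function name TanhRationalApprox is illustrative and not part of this change.

#include <algorithm>
#include <array>
#include <cstddef>

// Scalar rendering of the rational interpolant built by
// EmitVectorF32TanhIfNeeded, with the same coefficients and clamp.
float TanhRationalApprox(float x) {
  x = std::min(9.0f, std::max(-9.0f, x));  // tanh saturates beyond |x| = 9.
  static const std::array<float, 7> kNumerator = {
      {-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
       5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
       4.89352455891786e-03f}};
  static const std::array<float, 4> kDenominator = {
      {1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
       4.89352518554385e-03f}};
  const float x2 = x * x;
  float p = kNumerator[0];
  for (std::size_t i = 1; i < kNumerator.size(); ++i) {
    p = p * x2 + kNumerator[i];  // Horner step; mirrors the FMul/FAdd chain.
  }
  p *= x;  // The numerator is an odd polynomial in x.
  float q = kDenominator[0];
  for (std::size_t i = 1; i < kDenominator.size(); ++i) {
    q = q * x2 + kDenominator[i];
  }
  return p / q;
}

This is the same rational interpolant Eigen uses for its packet tanh, which is why the deleted __xla_cpu_runtime_TanhV4F32/TanhV8F32 wrappers around Eigen::internal::ptanh can be replaced by this IR without changing results beyond the tolerance relaxed in the test further below.
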
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
new file mode 100644
index 0000000000..4a0c9d8946
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTIME_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTIME_H_
+
+#include "llvm/IR/Module.h"
+
+namespace xla {
+namespace cpu {
+namespace runtime {
+
+extern const char* const kTanhV4F32SymbolName;
+extern const char* const kTanhV8F32SymbolName;
+
+// The following CPU runtime functions have LLVM-IR only implementations:
+//
+// - __xla_cpu_runtime_TanhV4F32
+// - __xla_cpu_runtime_TanhV8F32
+//
+// |RewriteIRRuntimeFunctions| rewrites calls to these functions into generic
+// LLVM IR.
+
+void RewriteIRRuntimeFunctions(llvm::Module* module);
+
+} // namespace runtime
+} // namespace cpu
+} // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTIME_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index b46ac67096..9eb714ed87 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -93,10 +93,8 @@ class JITSymbolTable {
ADD_JIT_SYMBOL_TO_TABLE(ReleaseOutfeedBufferAfterPopulation);
ADD_JIT_SYMBOL_TO_TABLE(ExpV8F32);
ADD_JIT_SYMBOL_TO_TABLE(LogV8F32);
- ADD_JIT_SYMBOL_TO_TABLE(TanhV8F32);
ADD_JIT_SYMBOL_TO_TABLE(ExpV4F32);
ADD_JIT_SYMBOL_TO_TABLE(LogV4F32);
- ADD_JIT_SYMBOL_TO_TABLE(TanhV4F32);
ADD_JIT_SYMBOL_TO_TABLE(EigenConvF32);
ADD_JIT_SYMBOL_TO_TABLE(EigenMatMulF32);
ADD_JIT_SYMBOL_TO_TABLE(EigenMatMulF64);
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 821418afd0..7dba4e52f0 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -1559,6 +1559,50 @@ TEST_F(ArrayElementwiseOpTest, TanhF32s) {
error_spec_);
}
+TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
+ // This is like the test ArrayElementwiseOpTest.TanhF32s above, except that
+ // the input tensor is large enough to exercise the vectorized tanh
+ // implementation.
+ ComputationBuilder builder(client_, TestName());
+ auto input_literal = Literal::CreateR2<float>(
+ {{1.02, -0.32, 0.85, 0.90, 1.23, -0.91, -0.49, 0.80},
+ {-0.67, 0.16, -0.07, 0.39, -0.41, 0.04, 1.36, 1.25},
+ {0.41, 0.65, -1.08, 0.32, -1.45, -0.77, -1.09, 0.91},
+ {-1.03, -0.30, -1.11, -1.17, 1.50, -0.85, 0.04, 1.02},
+ {0.34, -0.61, 0.41, 0.07, -0.02, 1.42, -0.62, 0.81},
+ {0.08, 0.81, -0.30, 1.17, -0.65, -0.44, 0.92, 1.26},
+ {-1.29, 1.35, 0.08, -1.24, -0.92, 0.49, 1.17, -0.45},
+ {-1.31, -1.44, -0.13, -1.31, -0.79, 1.41, 1.21, 1.05}});
+ auto input_data =
+ client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+
+ auto input = builder.Parameter(0, input_literal->shape(), "input");
+ builder.Tanh(input);
+
+ ComputeAndCompareR2<float>(
+ &builder,
+ {{0.77009583, -0.30665702, 0.69070244, 0.71401149, 0.84400684,
+ -0.71985596, -0.45764771, 0.66664988},
+ {-0.58278900, 0.16050975, -0.06770509, 0.36843640, -0.38476998,
+ 0.04018109, 0.87562293, 0.84788644},
+ {0.38603750, 0.57294142, -0.79140943, 0.31032649, -0.89590985,
+ -0.64770776, -0.79625875, 0.72234446},
+ {-0.77389336, -0.28871772, -0.80428445, -0.82541436, 0.90456349,
+ -0.68856895, 0.03877772, 0.76877952},
+ {0.32561871, -0.54546672, 0.39072621, 0.07273290, -0.01924866,
+ 0.88924897, -0.55283129, 0.67183107},
+ {0.08006320, 0.66944766, -0.29068485, 0.82573754, -0.57170743,
+ -0.41581789, 0.72739530, 0.85025692},
+ {-0.85931867, 0.87357593, 0.07782833, -0.84597743, -0.72748238,
+ 0.45396307, 0.82449573, -0.42462519},
+ {-0.86363792, -0.89368379, -0.12621804, -0.86445558, -0.65565848,
+ 0.88789743, 0.83566397, 0.78287679}},
+ {input_data.get()},
+ // The error spec is unusually high here to account for the fact that we
+ // use a rational interpolant to approximate tanh.
+ ErrorSpec(0.004, 0.004));
+}
+
TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
// a ------ (add) --------- (add)
// / /
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 1f947092d1..8678929497 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -12170,7 +12170,7 @@ op {
}
}
}
- summary: "Computes the determinant of one ore more square matrices."
+ summary: "Computes the determinant of one or more square matrices."
description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a tensor containing the determinants\nfor all input submatrices `[..., :, :]`."
}
op {
diff --git a/tensorflow/docs_src/deploy/index.md b/tensorflow/docs_src/deploy/index.md
index 644115b8ff..b935ebd637 100644
--- a/tensorflow/docs_src/deploy/index.md
+++ b/tensorflow/docs_src/deploy/index.md
@@ -5,11 +5,6 @@ the following documents:
* @{$distributed$Distributed TensorFlow}, which explains how to create
a cluster of TensorFlow servers.
- * @{$tfserve$TensorFlow Serving}, which describes TensorFlow Serving--an
- open-source serving system for machine learning models. This document
- provides a short introduction to TensorFlow Serving; the bulk of the
- documentation about TensorFlow Serving is in a
- [separate website](https://tensorflow.github.io/serving/serving_basic).
* @{$hadoop$How to run TensorFlow on Hadoop}, which has a highly
self-explanatory title.
diff --git a/tensorflow/docs_src/deploy/leftnav_files b/tensorflow/docs_src/deploy/leftnav_files
index 6f5821cfd5..f8f8d578e6 100644
--- a/tensorflow/docs_src/deploy/leftnav_files
+++ b/tensorflow/docs_src/deploy/leftnav_files
@@ -1,4 +1,3 @@
index.md
distributed.md
-tfserve.md
hadoop.md
diff --git a/tensorflow/docs_src/deploy/tfserve.md b/tensorflow/docs_src/deploy/tfserve.md
deleted file mode 100644
index 8b11b6ad2a..0000000000
--- a/tensorflow/docs_src/deploy/tfserve.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# TensorFlow Serving
-
-## Introduction
-
-TensorFlow Serving is a flexible, high-performance serving system for machine
-learning models, designed for production environments. TensorFlow Serving
-makes it easy to deploy new algorithms and experiments, while keeping the same
-server architecture and APIs.
-
-## Basic Serving Tutorial
-
-See the [basic tutorial](https://tensorflow.github.io/serving/serving_basic)
-on the TensorFlow Serving site to learn how to export a trained TensorFlow
-model and build a server to serve the exported model.
-
-## Advanced Serving Tutorial
-
-See the
-[advanced tutorial](https://tensorflow.github.io/serving/serving_advanced)
-on the TensorFlow Serving site to learn how to build a server that
-dynamically discovers and serves new versions of a trained TensorFlow
-model.
-
-## Serving Inception Model Tutorial
-
-See the
-[serving inception tutorial](https://tensorflow.github.io/serving/serving_inception)
-on the TensorFlow Serving site to learn how to serve the inception model with
-TensorFlow Serving and Kubernetes.
-
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index 7f0b9b8db9..2a58c4647d 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -9,7 +9,7 @@ reading_data.md
embedding.md
debugger.md
supervisor.md
-saved_model_cli.md
+saved_model.md
meta_graph.md
version_compat.md
faq.md
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index e169efe19c..9262143ad8 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -440,10 +440,10 @@ does not specify one.
### Serving the exported model locally
For local deployment, you can serve your model using
-@{$deploy/tfserve$Tensorflow Serving}, an open-source project that loads a
+[TensorFlow Serving](http://github.com/tensorflow/serving), an open-source project that loads a
SavedModel and exposes it as a [gRPC](http://www.grpc.io/) service.
-First, [install TensorFlow Serving](https://tensorflow.github.io/serving/setup#prerequisites).
+First, [install TensorFlow Serving](http://github.com/tensorflow/serving).
Then build and run the local model server, substituting `$export_dir_base` with
the path to the SavedModel you exported above:
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index b6d8b0db67..1ae27298ec 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -21772,7 +21772,7 @@ func Sin(scope *Scope, x tf.Output) (y tf.Output) {
return op.Output(0)
}
-// Computes the determinant of one ore more square matrices.
+// Computes the determinant of one or more square matrices.
//
// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
// form square matrices. The output is a tensor containing the determinants
diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh
index 302ed96c12..6b4d5d7032 100644
--- a/tensorflow/java/maven/run_inside_container.sh
+++ b/tensorflow/java/maven/run_inside_container.sh
@@ -32,7 +32,7 @@ clean() {
# (though if run inside a clean docker container, there won't be any dirty
# artifacts lying around)
mvn -q clean
- rm -rf libtensorflow_jni/src libtensorflow_jni/target libtensorflow/src libtensorflow/target
+ rm -rf libtensorflow_jni/src libtensorflow_jni/target libtensorflow/src libtensorflow/target tensorflow-android/target
}
update_version_in_pom() {
@@ -52,6 +52,17 @@ download_libtensorflow() {
cd "${DIR}"
}
+# Fetch the android aar artifact from the CI build system, and update
+# its associated pom file.
+update_tensorflow_android() {
+ TARGET_DIR="${DIR}/tensorflow-android/target"
+ mkdir -p "${TARGET_DIR}"
+ python "${DIR}/tensorflow-android/update.py" \
+ --version "${TF_VERSION}" \
+ --template "${DIR}/tensorflow-android/pom-android.xml.template" \
+ --dir "${TARGET_DIR}"
+}
+
download_libtensorflow_jni() {
NATIVE_DIR="${DIR}/libtensorflow_jni/src/main/resources/org/tensorflow/native"
mkdir -p "${NATIVE_DIR}"
@@ -126,6 +137,29 @@ generate_java_protos() {
rm -rf "${DIR}/proto/tmp"
}
+# If successfully built, try to deploy.
+# If successfully deployed, clean.
+# If deployment fails, debug with
+# ./release.sh ${TF_VERSION} ${SETTINGS_XML} bash
+# To get a shell to poke around the maven artifacts with.
+deploy_artifacts() {
+ # This deploys the non-android pieces
+ mvn deploy
+
+ # Sign and deploy the previously downloaded aar file as a single
+ # maven artifact.
+ if [[ "${IS_SNAPSHOT}" == "true" ]]; then
+ REPO="https://oss.sonatype.org/content/repositories/snapshots"
+ else
+ REPO="https://oss.sonatype.org/service/local/staging/deploy/maven2/"
+ fi
+ mvn gpg:sign-and-deploy-file -Dfile="${DIR}/tensorflow-android/target/tensorflow.aar" -DpomFile="${DIR}/tensorflow-android/target/pom-android.xml" -Durl=${REPO} -DrepositoryId=ossrh
+
+ # Clean up when everything works
+ clean
+}
+
+
if [ -z "${TF_VERSION}" ]
then
echo "Must set the TF_VERSION environment variable"
@@ -144,15 +178,12 @@ clean
update_version_in_pom
download_libtensorflow
download_libtensorflow_jni
+update_tensorflow_android
generate_java_protos
# Build the release artifacts
mvn verify
-# If successfully built, try to deploy.
-# If successfully deployed, clean.
-# If deployment fails, debug with
-# ./release.sh ${TF_VERSION} ${SETTINGS_XML} bash
-# To get a shell to poke around the maven artifacts with.
-mvn deploy && clean
+# Push artifacts to repository
+deploy_artifacts
set +ex
if [[ "${IS_SNAPSHOT}" == "false" ]]; then
diff --git a/tensorflow/java/maven/tensorflow-android/pom-android.xml.template b/tensorflow/java/maven/tensorflow-android/pom-android.xml.template
new file mode 100644
index 0000000000..5cbd0c898d
--- /dev/null
+++ b/tensorflow/java/maven/tensorflow-android/pom-android.xml.template
@@ -0,0 +1,29 @@
+<project
+ xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>org.tensorflow</groupId>
+ <artifactId>tensorflow-android</artifactId>
+ <version>${version}</version>
+ <packaging>aar</packaging>
+
+ <name>TensorFlow AAR for Android Inference Library and Java API</name>
+ <url>https://github.com/tensorflow/tensorflow/</url>
+ <parent>
+ <groupId>org.tensorflow</groupId>
+ <artifactId>parentpom</artifactId>
+ <version>${version}</version>
+ <relativePath>../</relativePath>
+ </parent>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <project.build.number>${build_number}</project.build.number>
+ <project.build.commitid>${build_commit_id}</project.build.commitid>
+ <project.build.type>${build_type}</project.build.type>
+ <project.build.url>${build_url}</project.build.url>
+ </properties>
+
+</project>
diff --git a/tensorflow/java/maven/tensorflow-android/update.py b/tensorflow/java/maven/tensorflow-android/update.py
new file mode 100644
index 0000000000..bd97a08948
--- /dev/null
+++ b/tensorflow/java/maven/tensorflow-android/update.py
@@ -0,0 +1,139 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fetch android artifacts and update pom properties."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import json
+import string
+import sys
+import urllib2
+
+
+def get_args():
+ """Parse command line args."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--version', required=True, help='Version for the artifact.')
+ parser.add_argument(
+ '--dir',
+ required=True,
+ help='Directory where the pom and aar artifact will be written.')
+ parser.add_argument(
+ '--template', required=True, help='Path to pom template file.')
+ return parser.parse_args()
+
+
+def get_json(url):
+ """Load the contents of the URL as a json object."""
+ return json.load(urllib2.urlopen(url))
+
+
+def get_commit_id(build_info):
+ """Fetch the git commit id from the build info json object."""
+ actions = build_info.get('actions')
+ build_data = next(
+ (a for a in actions
+ if a.get('_class') == 'hudson.plugins.git.util.BuildData'), None)
+ if not build_data:
+ raise ValueError('Missing BuildData: %s' % build_info)
+ revision_info = build_data.get('lastBuiltRevision')
+ if not revision_info:
+ raise ValueError('Missing lastBuiltRevision: %s' % build_info)
+ return revision_info.get('SHA1')
+
+
+def get_aar_url(build_info):
+ """Given the json build info, find the URL to the tensorflow.aar artifact."""
+ base_url = build_info.get('url')
+ if not base_url:
+ raise ValueError('Missing url: %s' % build_info)
+ build_class = build_info.get('_class')
+ if (build_class == 'hudson.model.FreeStyleBuild' or
+ build_class == 'hudson.matrix.MatrixRun'):
+ aar_info = next(
+ (a for a in build_info.get('artifacts')
+ if a.get('fileName') == 'tensorflow.aar'), None)
+ if not aar_info:
+ raise ValueError('Missing aar artifact: %s' % build_info)
+ return '%s/artifact/%s' % (base_url, aar_info.get('relativePath'))
+
+ raise ValueError('Unknown build_type %s' % build_info)
+
+
+def read_template(path):
+ with open(path) as f:
+ return string.Template(f.read())
+
+
+def main():
+ args = get_args()
+
+ # Artifacts are downloaded from the ci build. A SNAPSHOT release is
+ # associated with artifacts from the last successful nightly build. Otherwise,
+ # it comes from the officially blessed release artifacts.
+ if args.version.endswith('SNAPSHOT'):
+ info_url = ('https://ci.tensorflow.org/view/Nightly/job/nightly-android'
+ '/lastSuccessfulBuild/api/json')
+ aar_url = None
+ build_type = 'nightly-android'
+ else:
+ release_prefix = 'https://storage.googleapis.com/tensorflow/libtensorflow'
+ info_url = '%s/android_buildinfo-%s.json' % (release_prefix, args.version)
+ aar_url = '%s/tensorflow-%s.aar' % (release_prefix, args.version)
+ build_type = 'release-matrix-android'
+
+ # Retrieve build information
+ build_info = get_json(info_url)
+
+ # Check all required build info is present
+ if build_info.get('result') != 'SUCCESS':
+ raise ValueError('Invalid json: %s' % build_info)
+ build_url = build_info.get('url')
+ if not build_url:
+ raise ValueError('Missing url: %s' % build_info)
+ build_number = build_info.get('number')
+ if not build_number:
+ raise ValueError('Missing build number: %s' % build_info)
+ build_commit_id = get_commit_id(build_info)
+ if not build_commit_id:
+ raise ValueError('Missing commit id: %s' % build_info)
+
+ # Write the pom file updated with build attributes.
+ template = read_template(args.template)
+ with open('%s/pom-android.xml' % args.dir, 'w') as f:
+ f.write(
+ template.substitute({
+ 'build_commit_id': build_commit_id,
+ 'build_number': build_number,
+ 'build_type': build_type,
+ 'build_url': build_url,
+ 'version': args.version
+ }))
+
+ # Retrieve the aar location if needed.
+ if not aar_url:
+ aar_url = get_aar_url(build_info)
+
+ # And download the aar to the desired location.
+ with open('%s/tensorflow.aar' % args.dir, 'wb') as f:
+ aar = urllib2.urlopen(aar_url)
+ f.write(aar.read())
+
+
+if __name__ == '__main__':
+ sys.exit(main())