14 files changed, 66 insertions, 30 deletions
diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h
index 213e0bac6c..71aa057cd3 100644
--- a/tensorflow/compiler/xla/array.h
+++ b/tensorflow/compiler/xla/array.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <initializer_list>
 #include <iterator>
 #include <memory>
+#include <numeric>
 #include <random>
 #include <type_traits>
 #include <vector>
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index daad09364c..7293b35c0f 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -770,7 +770,7 @@ class ComputationBuilder {
   // The operand must represent a constant value, which in this case
   // means that it must not statically depend on any parameter of the
   // computation that is being built other then the ones specified on the
-  // paramtere list. The parameters in the list will be indexed by their
+  // parameter list. The parameters in the list will be indexed by their
   // parameter id property so the number of parameters specified should be at
   // least as many as the largest used parameter index.
   //
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index 3278fd5f06..8388574716 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -339,7 +339,7 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
            ShapeUtil::MakeShape(F32, {42})}),
       "param0"));
 
-  // The return value of the computation is the zero-th elemnt of the nested
+  // The return value of the computation is the zero-th element of the nested
   // tuple. This element is itself a tuple.
   auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       ShapeUtil::GetSubshape(param->shape(), {0}), param, 0));
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
index acfada8540..74ae6d00c9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
@@ -38,14 +38,16 @@ typedef float V8F32AVX __attribute__((__vector_size__(32)));
 
 extern "C" {
 
+#ifdef __AVX__
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
 xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_ExpV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V8F32AVX x);
 
 xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_LogV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V8F32AVX x);
+#endif
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_AVX_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
index 75cb16b273..645a43858f 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
@@ -49,14 +49,16 @@ struct V4F32NEON;
 
 extern "C" {
 
+#ifdef __ARM_NEON__
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
 xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON(
-    xla::cpu::runtime::V4F32NEON x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32NEON x);
 
 xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON(
-    xla::cpu::runtime::V4F32NEON x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32NEON x);
+#endif  // __ARM_NEON__
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_NEON_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
index 96587d10d2..80ca4243a2 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
@@ -39,14 +39,16 @@ typedef float V4F32SSE __attribute__((__vector_size__(16)));
 
 extern "C" {
 
+#ifdef __SSE4_1__
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
 xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_ExpV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32SSE x);
 
 xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_LogV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32SSE x);
+#endif
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_SSE4_1_H_
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index 81c29e4726..0f71258ff0 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -65,13 +65,13 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
       llvm::ConstantFP::get(vector_type, 9.0), &ir_builder);
 
   std::array<float, 7> numerator_coeffs(
-      {{-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
-        5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
-        4.89352455891786e-03f}});
+      {-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
+       5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
+       4.89352455891786e-03f});
 
   std::array<float, 4> denominator_coeffs(
-      {{1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
-        4.89352518554385e-03f}});
+      {1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
+       4.89352518554385e-03f});
 
   llvm::Value* input_squared =
       ir_builder.CreateFMul(input_clamped, input_clamped);
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index cda2783307..c942cd6bf1 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -102,9 +102,21 @@ llvm::StringRef GetHostCpuName() {
 
 CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() {
   CompilerFunctor::VectorIntrinsics intrinsics;
-  intrinsics.sse_intrinsics = (&__xla_cpu_runtime_ExpV4F32SSE != nullptr);
-  intrinsics.avx_intrinsics = (&__xla_cpu_runtime_ExpV8F32AVX != nullptr);
-  intrinsics.neon_intrinsics = (&__xla_cpu_runtime_ExpV4F32NEON != nullptr);
+#ifdef __SSE4_1__
+  intrinsics.sse_intrinsics = true;
+#else
+  intrinsics.sse_intrinsics = false;
+#endif
+#ifdef __AVX__
+  intrinsics.avx_intrinsics = true;
+#else
+  intrinsics.avx_intrinsics = false;
+#endif
+#ifdef __ARM_NEON__
+  intrinsics.neon_intrinsics = true;
+#else
+  intrinsics.neon_intrinsics = false;
+#endif
   return intrinsics;
 }
 
@@ -201,12 +213,18 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64);
+#ifdef __ARM_NEON__
   REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32NEON);
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE);
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32NEON);
+#endif
+#ifdef __SSE4_1__
+  REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32SSE);
+#endif
+#ifdef __AVX__
+  REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV8F32AVX);
+#endif
   REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseOutfeedBufferAfterPopulation);
@@ -275,7 +293,11 @@ bool RegisterKnownJITSymbols() {
   REGISTER_LIBM_SYMBOL(scalbln, double (*)(double, long));
   REGISTER_LIBM_SYMBOL(scalbn, double (*)(double, int));
   REGISTER_LIBM_SYMBOL(sin, double (*)(double));
+#ifdef __APPLE__
+  REGISTER_LIBM_SYMBOL(__sincos, void (*)(double, double*, double*));
+#else
   REGISTER_LIBM_SYMBOL(sincos, void (*)(double, double*, double*));
+#endif
   REGISTER_LIBM_SYMBOL(sinh, double (*)(double));
   REGISTER_LIBM_SYMBOL(sqrt, double (*)(double));
   REGISTER_LIBM_SYMBOL(tan, double (*)(double));
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
index ccdd171759..ab94d7d543 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
@@ -44,7 +44,7 @@ namespace {
 //
 //            Parameter
 //               |
-//   Const  GetTupleElemet
+//   Const  GetTupleElement
 //      \   /
 //       Add (root)
 //
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index a03ad2f37c..88a8698d16 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -264,7 +264,7 @@ class LazyBestFitHeap : public HeapAlgorithm {
   enum { kLazyAllocOffset = -1 };
 
   struct OrderChunkByIncreasingSize {
-    bool operator()(const Chunk& a, const Chunk& b) {
+    bool operator()(const Chunk& a, const Chunk& b) const {
       if (a.size != b.size) return a.size < b.size;
       return a.offset < b.offset;
     }
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 0e7ab00713..b9f76531f3 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -1318,19 +1318,16 @@ string SaveGraph(const string& graph,
       file_extension = ".pbtxt";
       break;
   }
-  string path = JoinPath(
-      dest_path, StrCat("hlo_graph_", output_num++, ".XXXXXX", file_extension));
+  string path = JoinPath(dest_path, StrCat("hlo_graph_", output_num++, "."));
   auto status = Status::OK();
-  int fd = mkstemps(&path[0], file_extension.length());
-  if (fd < 0) {
+  auto env = tensorflow::Env::Default();
+  if (!env->CreateUniqueFileName(&path, file_extension)) {
     status =
         Status(tensorflow::error::Code::UNKNOWN,
                StrCat("Failed to create temporary file to dump HLO graph: ",
                       strerror(errno)));
   } else {
-    status =
-        tensorflow::WriteStringToFile(tensorflow::Env::Default(), path, graph);
-    close(fd);
+    status = tensorflow::WriteStringToFile(env, path, graph);
   }
   if (!status.ok()) {
     LOG(WARNING) << "Saving HLO graph failed: " << status;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index 5f6f9810c3..23d2d4e87d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -111,8 +111,15 @@ void KernelSupportLibrary::EmitAndCallOutlinedKernel(
     ir_builder->SetInsertPoint(return_inst);
 
     std::vector<llvm::Value*> arg_values;
-    std::transform(function->arg_begin(), function->arg_end(),
-                   std::back_inserter(arg_values), std::addressof<llvm::Value>);
+    /*
+     * clang on OSX doesn't like std::transform or a range-based for loop here.
+     * See https://github.com/tensorflow/tensorflow/issues/15196
+     */
+    for (llvm::Function::arg_iterator arg = function->arg_begin(),
+                                      arg_e = function->arg_end();
+         arg != arg_e; ++arg) {
+      arg_values.push_back(arg);
+    }
     if (null_arg_idx != -1) {
       arg_values.insert(arg_values.begin() + null_arg_idx, nullptr);
     }
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index bf8d190150..d752619bd6 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -238,7 +238,7 @@ class ShapeTree {
   //           (or compatible).
   //   index : the index of the element in the shape. See ShapeUtil::GetSubshape
   //           for definition of index.
-  //   data : The data value at this elemnt.
+  //   data : The data value at this element.
   template <typename Fn>
   void ForEachElement(const Fn& func) const;
 
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index b722095d1f..277cc5ec86 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -239,11 +239,14 @@ std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
 
 // Override of the above that works around compile failures with gcc 7.1.1.
 // For details see https://github.com/tensorflow/tensorflow/issues/10843
+// Hide this workaround from MSVC as it causes an ambiguous-overload error.
+#ifndef _MSC_VER
 template <typename T>
 std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
                        const std::vector<T>& input) {
   return Permute<std::vector, T>(permutation, input);
 }
+#endif
 
 // Inverts a permutation, i.e., output_permutation[input_permutation[i]] = i.
 std::vector<int64> InversePermutation(