Merge changes from github.

Change: 148954491
author: Andrew Harp <andrewharp@google.com> 2017-03-01 17:59:22 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-03-01 18:08:24 -0800
commit: 3e975ea978bac4d861bb09328b06f3c316212611 (patch)
tree: 79bac044c9723df8443495eb962c2dd98a2ed421 /third_party
parent: 8043a27ed77f59bb68409070f2bfa01df0e04b89 (diff)
5 files changed, 165 insertions, 41 deletions
diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD
index dde8e6cdb7..557c2885eb 100644
--- a/third_party/curl.BUILD
+++ b/third_party/curl.BUILD
@@ -204,13 +204,13 @@ cc_library(
         "lib/wildcard.h",
         "lib/x509asn1.h",
     ] + select({
-        ":darwin": [
+        "@//tensorflow:darwin": [
             "lib/vtls/darwinssl.c",
         ],
-        ":ios": [
+        "@//tensorflow:ios": [
             "lib/vtls/darwinssl.c",
         ],
-        ":windows": [
+        "@//tensorflow:windows": [
             "lib/asyn-thread.c",
             "lib/inet_ntop.c",
             "lib/system_win32.c",
@@ -231,7 +231,7 @@ cc_library(
         "include/curl/typecheck-gcc.h",
     ],
     copts = select({
-        ":windows": [
+        "@//tensorflow:windows": [
             "/I%prefix%/curl/lib",
             "/DHAVE_CONFIG_H",
             "/DCURL_DISABLE_FTP",
@@ -255,10 +255,10 @@ cc_library(
             "-Wno-string-plus-int",
         ],
     }) + select({
-        ":darwin": [
+        "@//tensorflow:darwin": [
             "-fno-constant-cfstrings",
         ],
-        ":windows": [
+        "@//tensorflow:windows": [
             # See curl.h for discussion of write size and Windows
             "/DCURL_MAX_WRITE_SIZE=16384",
         ],
@@ -268,17 +268,17 @@ cc_library(
     }),
     includes = ["include"],
     linkopts = select({
-        ":android": [
+        "@//tensorflow:android": [
             "-pie",
         ],
-        ":darwin": [
+        "@//tensorflow:darwin": [
             "-Wl,-framework",
             "-Wl,CoreFoundation",
             "-Wl,-framework",
             "-Wl,Security",
         ],
-        ":ios": [],
-        ":windows": [
+        "@//tensorflow:ios": [],
+        "@//tensorflow:windows": [
             "ws2_32.lib",
         ],
         "//conditions:default": [
@@ -289,8 +289,8 @@ cc_library(
     deps = [
         "@zlib_archive//:zlib",
     ] + select({
-        ":ios": [],
-        ":windows": [],
+        "@//tensorflow:ios": [],
+        "@//tensorflow:windows": [],
         "//conditions:default": [
             "@boringssl//:ssl",
         ],
@@ -386,7 +386,7 @@ cc_binary(
         "src/tool_xattr.h",
     ],
     copts = select({
-        ":windows": [
+        "@//tensorflow:windows": [
             "/I%prefix%/curl/lib",
             "/DHAVE_CONFIG_H",
             "/DCURL_DISABLE_LIBCURL_OPTION",
@@ -657,23 +657,3 @@ genrule(
         "EOF",
     ]),
 )
-
-config_setting(
-    name = "ios",
-    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
-)
-
-config_setting(
-    name = "darwin",
-    values = {"cpu": "darwin"},
-)
-
-config_setting(
-    name = "windows",
-    values = {"cpu": "x64_windows_msvc"},
-)
-
-config_setting(
-    name = "android",
-    values = {"crosstool_top": "//external:android/crosstool"},
-)
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index 98deb1742e..078be83e0d 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -11,6 +11,13 @@ typedef struct Packet32q8i {
   Packet32q8i(__m256i val) : val(val) {}
 } Packet32q8i;
 
+typedef struct Packet16q16i {
+  __m256i val;
+  operator __m256i() const { return val; }
+  Packet16q16i();
+  Packet16q16i(__m256i val) : val(val) {}
+} Packet16q16i;
+
 typedef struct Packet32q8u {
   __m256i val;
   operator __m256i() const { return val; }
@@ -32,6 +39,13 @@ typedef struct Packet16q8u {
   Packet16q8u(__m128i val) : val(val) {}
 } Packet16q8u;
 
+typedef struct Packet8q16i {
+  __m128i val;
+  operator __m128i() const { return val; }
+  Packet8q16i();
+  Packet8q16i(__m128i val) : val(val) {}
+} Packet8q16i;
+
 typedef struct Packet8q32i {
   __m256i val;
   operator __m256i() const { return val; }
@@ -92,6 +106,28 @@ struct packet_traits<QUInt8> : default_packet_traits {
   };
 };
 template <>
+struct packet_traits<QInt16> : default_packet_traits {
+  typedef Packet16q16i type;
+  typedef Packet8q16i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+  };
+  enum {
+    HasAdd = 0,
+    HasSub = 0,
+    HasMul = 0,
+    HasNegate = 0,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 0,
+    HasSetLinear = 0
+  };
+};
+template <>
 struct packet_traits<QInt32> : default_packet_traits {
   typedef Packet8q32i type;
   typedef Packet4q32i half;
@@ -122,6 +158,12 @@ struct unpacket_traits<Packet32q8i> {
   enum { size = 32, alignment=Aligned32 };
 };
 template <>
+struct unpacket_traits<Packet16q16i> {
+  typedef QInt16 type;
+  typedef Packet8q16i half;
+  enum { size = 16, alignment=Aligned32 };
+};
+template <>
 struct unpacket_traits<Packet32q8u> {
   typedef QUInt8 type;
   typedef Packet16q8u half;
@@ -146,6 +188,11 @@ EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
       reinterpret_cast<const __m256i*>(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
+      reinterpret_cast<const __m256i*>(from));
+}
+template <>
 EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
       reinterpret_cast<const __m256i*>(from));
@@ -163,6 +210,11 @@ EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
       reinterpret_cast<const __m256i*>(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
+      reinterpret_cast<const __m256i*>(from));
+}
+template <>
 EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
       reinterpret_cast<const __m256i*>(from));
@@ -180,6 +232,11 @@ EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
       reinterpret_cast<__m256i*>(to), from.val);
 }
 template <>
+EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
+      reinterpret_cast<__m256i*>(to), from.val);
+}
+template <>
 EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
       reinterpret_cast<__m256i*>(to), from.val);
@@ -192,6 +249,11 @@ EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) {
                                                from.val);
 }
 template <>
+EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
+                                               from.val);
+}
+template <>
 EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
                                                from.val);
@@ -208,6 +270,10 @@ EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) {
   return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
 }
 template <>
+EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) {
+  return _mm256_extract_epi16(a.val, 0);
+}
+template <>
 EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) {
   return static_cast<uint8_t>(_mm256_extract_epi8(a.val, 0));
 }
@@ -237,6 +303,10 @@ EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a,
   return _mm256_add_epi32(a.val, b.val);
 }
 template <>
+EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
+  return _mm256_set1_epi16(from.value);
+}
+template <>
 EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a,
                                                   const Packet8q32i& b) {
   return _mm256_sub_epi32(a.val, b.val);
@@ -265,6 +335,17 @@ EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
 }
 
 template <>
+EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
+                                                    const Packet16q16i& b) {
+  return _mm256_min_epi16(a.val, b.val);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
+                                                    const Packet16q16i& b) {
+  return _mm256_max_epi16(a.val, b.val);
+}
+
+template <>
 EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
                                                   const Packet32q8u& b) {
   return _mm256_min_epu8(a.val, b.val);
@@ -305,6 +386,23 @@ EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) {
 }
 
 template <>
+EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) {
+  __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1));
+  tmp =
+      _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
+  return std::min(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1));
+}
+template <>
+EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) {
+  __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1));
+  tmp =
+      _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
+  return std::max(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1));
+}
+
+template <>
 EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) {
   __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1));
   tmp =
diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD
index ddaf29a086..7e95ebd355 100644
--- a/third_party/mkl/BUILD
+++ b/third_party/mkl/BUILD
@@ -8,12 +8,17 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+)
+
 cc_library(
     name = "intel_binary_blob",
-    srcs = [
-        "libiomp5.so",
+    srcs = if_mkl([
         "libmklml_intel.so",
-    ],
+        "libiomp5.so",
+    ]),
     includes = ["."],
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/nccl.BUILD b/third_party/nccl.BUILD
index bb460a05e0..06b9b8ff68 100644
--- a/third_party/nccl.BUILD
+++ b/third_party/nccl.BUILD
@@ -43,6 +43,24 @@ cc_library(
         "-Iexternal/nccl_archive/src",
         "-O3",
     ] + cuda_default_copts(),
+    linkopts = select({
+        "@%ws%//tensorflow:android": [
+            "-pie",
+        ],
+        "@%ws%//tensorflow:darwin": [
+            "-Wl,-framework",
+            "-Wl,CoreFoundation",
+            "-Wl,-framework",
+            "-Wl,Security",
+        ],
+        "@%ws%//tensorflow:ios": [],
+        "@%ws%//tensorflow:windows": [
+            "ws2_32.lib",
+        ],
+        "//conditions:default": [
+            "-lrt",
+        ],
+    }),
     visibility = ["//visibility:public"],
     deps = ["@local_config_cuda//cuda:cuda_headers"],
 )
diff --git a/third_party/sycl/crosstool/computecpp.tpl b/third_party/sycl/crosstool/computecpp.tpl
index a5e6b9fe93..66dd9aea7b 100755
--- a/third_party/sycl/crosstool/computecpp.tpl
+++ b/third_party/sycl/crosstool/computecpp.tpl
@@ -26,9 +26,7 @@ def main():
 
   if(output_file_index == 1):
     # we are linking
-    return subprocess.call([CPU_CXX_COMPILER] + compiler_flags)
-
-  compiler_flags = compiler_flags + ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DEIGEN_USE_SYCL=1']
+    return subprocess.call([CPU_CXX_COMPILER] + compiler_flags + ['-Wl,--no-undefined'])
 
   # find what we compile
   compiling_cpp = 0
@@ -38,6 +36,28 @@ def main():
       if(compited_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP', '.C', '.cxx'))):
           compiling_cpp = 1;
 
+  compiler_flags = compiler_flags + ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DEIGEN_USE_SYCL=1', '-DTENSORFLOW_USE_SYCL', '-DEIGEN_HAS_C99_MATH']
+
+  if(compiling_cpp == 1):
+      # create a blacklist of folders that will be skipped when compiling with ComputeCpp
+      _skip = ["external", "llvm", ".cu.cc"]
+      # if compiling external project skip computecpp
+      if any(_folder in _skip for _folder in output_file_name):
+        return subprocess.call([CPU_CXX_COMPILER] + compiler_flags)
+
+  if(compiling_cpp == 1):
+      # this is an optimisation that will check if compiled file has to be compiled with ComputeCpp
+
+      _tmp_flags = [flag for flag in compiler_flags if not flag.startswith(('-o', output_file_name))]
+      # create the preprocessed output of the file
+      _cmd = " ".join([CPU_CXX_COMPILER] + _tmp_flags + ["-E"])
+      # check if it has ".parallel_for" in it
+      _cmd += " | grep \".parallel_for\" > /dev/null"
+      ps = subprocess.call(_cmd, shell=True)
+      # if not, call the host CXX compiler instead
+      if(ps != 0):
+          return subprocess.call([CPU_CXX_COMPILER] + compiler_flags)
+
   if(compiling_cpp == 1):
       filename, file_extension = os.path.splitext(output_file_name)
       bc_out = filename + '.sycl'
@@ -52,9 +72,12 @@ def main():
           # dont want that in case of compiling with computecpp first
           host_compiler_flags = [flag for flag in compiler_flags
                                     if not flag.startswith(('-MF', '-MD',))
-                                    if not '.d' in flag]
+                                    if not '.d' in flag
+                                ]
+
+          host_compiler_flags[host_compiler_flags.index('-c')] = "--include"
 
-          host_compiler_flags = ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '--include', bc_out] + host_compiler_flags
+          host_compiler_flags = ['-xc++', '-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-c', bc_out] + host_compiler_flags
           x = subprocess.call([CPU_CXX_COMPILER] + host_compiler_flags)
       return x
   else:
author	Andrew Harp <andrewharp@google.com>	2017-03-01 17:59:22 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-03-01 18:08:24 -0800
commit	3e975ea978bac4d861bb09328b06f3c316212611 (patch)
tree	79bac044c9723df8443495eb962c2dd98a2ed421 /third_party
parent	8043a27ed77f59bb68409070f2bfa01df0e04b89 (diff)