Add a cuda_clang build configuration that allows clang to be used as the CUDA compiler.

Change: 151705528
author: A. Unique TensorFlower <gardener@tensorflow.org> 2017-03-30 07:38:55 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-03-30 08:54:57 -0700
commit: 8d393ea2fab0ea88ecd11e36d89f186cbc884dbe (patch)
tree: cf7696a2e7afb85a1283502f589cfeecc8538417 /third_party/nccl
parent: 6e5f92ffc743c9b1765ffe4b79aac29a7059d464 (diff)
3 files changed, 151 insertions, 0 deletions
diff --git a/third_party/nccl/BUILD b/third_party/nccl/BUILD
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/third_party/nccl/BUILD
diff --git a/third_party/nccl/fix_clang_compilation.patch b/third_party/nccl/fix_clang_compilation.patch
new file mode 100644
index 0000000000..e8d2a7dc9f
--- /dev/null
+++ b/third_party/nccl/fix_clang_compilation.patch
@@ -0,0 +1,85 @@
+From 8241cd7b6ed1425eeb88fd380090575978e358f4 Mon Sep 17 00:00:00 2001
+From: Ilya Biryukov <ibiryukov@google.com>
+Date: Thu, 16 Mar 2017 12:01:11 +0100
+Subject: [PATCH 1/1] Fix compilation error when compiling with 'clang -x
+ cuda'.
+
+Functions vFetch and vStore are not found by ADL with clang,
+so they need to be declared before usage in ReduceCopy.
+---
+ src/common_kernel.h | 52 ++++++++++++++++++++++++++--------------------------
+ 1 file changed, 26 insertions(+), 26 deletions(-)
+
+diff --git a/src/common_kernel.h b/src/common_kernel.h
+index 28fbc85..cc71f8a 100644
+--- a/src/common_kernel.h
++++ b/src/common_kernel.h
+@@ -30,6 +30,32 @@
+ #define BAR(type, barid, nthreads) \
+     BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
+ 
++template<typename T> inline __device__
++T vFetch(const volatile T* ptr) {
++  return *ptr;
++}
++
++#ifdef CUDA_HAS_HALF
++template<> inline __device__
++half vFetch<half>(const volatile half* ptr) {
++  half r;
++  r.x = ptr->x;
++  return r;
++}
++#endif
++
++template<typename T> inline __device__
++void vStore(volatile T* ptr, const T val) {
++  *ptr = val;
++}
++
++#ifdef CUDA_HAS_HALF
++template<> inline __device__
++void vStore<half>(volatile half* ptr, const half val) {
++  ptr->x = val.x;
++}
++#endif
++
+ __device__ unsigned int spinct;
+ 
+ // Spin wait until func evaluates to true
+@@ -225,32 +251,6 @@ __device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
+   return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
+ }
+ 
+-template<typename T> inline __device__
+-T vFetch(const volatile T* ptr) {
+-  return *ptr;
+-}
+-
+-#ifdef CUDA_HAS_HALF
+-template<> inline __device__
+-half vFetch<half>(const volatile half* ptr) {
+-  half r;
+-  r.x = ptr->x;
+-  return r;
+-}
+-#endif
+-
+-template<typename T> inline __device__
+-void vStore(volatile T* ptr, const T val) {
+-  *ptr = val;
+-}
+-
+-#ifdef CUDA_HAS_HALF
+-template<> inline __device__
+-void vStore<half>(volatile half* ptr, const half val) {
+-  ptr->x = val.x;
+-}
+-#endif
+-
+ // Assumptions:
+ // - there is exactly 1 block
+ // - THREADS is the number of producer threads
+-- 
+2.12.0.367.g23dc2f6d3c-goog
+
diff --git a/third_party/nccl/nccl.BUILD b/third_party/nccl/nccl.BUILD
new file mode 100644
index 0000000000..06b9b8ff68
--- /dev/null
+++ b/third_party/nccl/nccl.BUILD
@@ -0,0 +1,66 @@
+# NVIDIA nccl
+# A package of optimized primitives for collective multi-GPU communication.
+
+licenses(["notice"])  # BSD
+
+exports_files(["LICENSE.txt"])
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda")
+
+SRCS = [
+    "src/all_gather.cu",
+    "src/all_reduce.cu",
+    "src/broadcast.cu",
+    "src/core.cu",
+    "src/libwrap.cu",
+    "src/reduce.cu",
+    "src/reduce_scatter.cu",
+]
+
+# Copy .cu to .cu.cc so they can be in srcs of cc_library.
+[
+    genrule(
+        name = "gen_" + src,
+        srcs = [src],
+        outs = [src + ".cc"],
+        cmd = "cp $(location " + src + ") $(location " + src + ".cc)",
+    )
+    for src in SRCS
+]
+
+SRCS_CU_CC = [src + ".cc" for src in SRCS]
+
+cc_library(
+    name = "nccl",
+    srcs = if_cuda(SRCS_CU_CC + glob(["src/*.h"])),
+    hdrs = if_cuda(["src/nccl.h"]),
+    copts = [
+        "-DCUDA_MAJOR=0",
+        "-DCUDA_MINOR=0",
+        "-DNCCL_MAJOR=0",
+        "-DNCCL_MINOR=0",
+        "-DNCCL_PATCH=0",
+        "-Iexternal/nccl_archive/src",
+        "-O3",
+    ] + cuda_default_copts(),
+    linkopts = select({
+        "@%ws%//tensorflow:android": [
+            "-pie",
+        ],
+        "@%ws%//tensorflow:darwin": [
+            "-Wl,-framework",
+            "-Wl,CoreFoundation",
+            "-Wl,-framework",
+            "-Wl,Security",
+        ],
+        "@%ws%//tensorflow:ios": [],
+        "@%ws%//tensorflow:windows": [
+            "ws2_32.lib",
+        ],
+        "//conditions:default": [
+            "-lrt",
+        ],
+    }),
+    visibility = ["//visibility:public"],
+    deps = ["@local_config_cuda//cuda:cuda_headers"],
+)
author	A. Unique TensorFlower <gardener@tensorflow.org>	2017-03-30 07:38:55 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-03-30 08:54:57 -0700
commit	8d393ea2fab0ea88ecd11e36d89f186cbc884dbe (patch)
tree	cf7696a2e7afb85a1283502f589cfeecc8538417 /third_party/nccl
parent	6e5f92ffc743c9b1765ffe4b79aac29a7059d464 (diff)