aboutsummaryrefslogtreecommitdiffhomepage
path: root/third_party/nccl
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2017-03-30 07:38:55 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-03-30 08:54:57 -0700
commit8d393ea2fab0ea88ecd11e36d89f186cbc884dbe (patch)
treecf7696a2e7afb85a1283502f589cfeecc8538417 /third_party/nccl
parent6e5f92ffc743c9b1765ffe4b79aac29a7059d464 (diff)
Add cuda_clang build configuration that allows using clang as a CUDA compiler.
Change: 151705528
Diffstat (limited to 'third_party/nccl')
-rw-r--r--third_party/nccl/BUILD0
-rw-r--r--third_party/nccl/fix_clang_compilation.patch85
-rw-r--r--third_party/nccl/nccl.BUILD66
3 files changed, 151 insertions, 0 deletions
diff --git a/third_party/nccl/BUILD b/third_party/nccl/BUILD
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/third_party/nccl/BUILD
diff --git a/third_party/nccl/fix_clang_compilation.patch b/third_party/nccl/fix_clang_compilation.patch
new file mode 100644
index 0000000000..e8d2a7dc9f
--- /dev/null
+++ b/third_party/nccl/fix_clang_compilation.patch
@@ -0,0 +1,85 @@
+From 8241cd7b6ed1425eeb88fd380090575978e358f4 Mon Sep 17 00:00:00 2001
+From: Ilya Biryukov <ibiryukov@google.com>
+Date: Thu, 16 Mar 2017 12:01:11 +0100
+Subject: [PATCH 1/1] Fix compilation error when compiling with 'clang -x
+ cuda'.
+
+Functions vFetch and vStore are not found by ADL with clang,
+so they need to be declared before usage in ReduceCopy.
+---
+ src/common_kernel.h | 52 ++++++++++++++++++++++++++--------------------------
+ 1 file changed, 26 insertions(+), 26 deletions(-)
+
+diff --git a/src/common_kernel.h b/src/common_kernel.h
+index 28fbc85..cc71f8a 100644
+--- a/src/common_kernel.h
++++ b/src/common_kernel.h
+@@ -30,6 +30,32 @@
+ #define BAR(type, barid, nthreads) \
+ BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
+
++template<typename T> inline __device__  // Generic device-side load through a volatile pointer; placed here so it is declared before its use in ReduceCopy.
++T vFetch(const volatile T* ptr) {  // ptr: volatile-qualified source element; the value read is returned as a T.
++ return *ptr;  // Single dereference of the volatile pointer.
++}
++
++#ifdef CUDA_HAS_HALF  // Specialization is compiled only when CUDA_HAS_HALF is defined.
++template<> inline __device__  // vFetch specialization for half.
++half vFetch<half>(const volatile half* ptr) {
++ half r;
++ r.x = ptr->x;  // Reads only the x member; NOTE(review): presumably because half cannot be copied directly from a volatile object - confirm.
++ return r;
++}
++#endif  // CUDA_HAS_HALF
++
++template<typename T> inline __device__  // Generic device-side store through a volatile pointer; placed here so it is declared before its use in ReduceCopy.
++void vStore(volatile T* ptr, const T val) {  // ptr: volatile-qualified destination element; val: value to write.
++ *ptr = val;  // Single assignment through the volatile pointer.
++}
++
++#ifdef CUDA_HAS_HALF  // Specialization is compiled only when CUDA_HAS_HALF is defined.
++template<> inline __device__  // vStore specialization for half.
++void vStore<half>(volatile half* ptr, const half val) {
++ ptr->x = val.x;  // Writes only the x member, mirroring the member-wise read in the half vFetch specialization.
++}
++#endif  // CUDA_HAS_HALF
++
+ __device__ unsigned int spinct;
+
+ // Spin wait until func evaluates to true
+@@ -225,32 +251,6 @@ __device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
+ return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
+ }
+
+-template<typename T> inline __device__
+-T vFetch(const volatile T* ptr) {
+- return *ptr;
+-}
+-
+-#ifdef CUDA_HAS_HALF
+-template<> inline __device__
+-half vFetch<half>(const volatile half* ptr) {
+- half r;
+- r.x = ptr->x;
+- return r;
+-}
+-#endif
+-
+-template<typename T> inline __device__
+-void vStore(volatile T* ptr, const T val) {
+- *ptr = val;
+-}
+-
+-#ifdef CUDA_HAS_HALF
+-template<> inline __device__
+-void vStore<half>(volatile half* ptr, const half val) {
+- ptr->x = val.x;
+-}
+-#endif
+-
+ // Assumptions:
+ // - there is exactly 1 block
+ // - THREADS is the number of producer threads
+--
+2.12.0.367.g23dc2f6d3c-goog
+
diff --git a/third_party/nccl/nccl.BUILD b/third_party/nccl/nccl.BUILD
new file mode 100644
index 0000000000..06b9b8ff68
--- /dev/null
+++ b/third_party/nccl/nccl.BUILD
@@ -0,0 +1,66 @@
+# NVIDIA nccl
+# A package of optimized primitives for collective multi-GPU communication.
+
+licenses(["notice"]) # BSD
+
+exports_files(["LICENSE.txt"])  # Lets other packages reference LICENSE.txt as a label.
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda")  # NOTE(review): the Bazel BUILD style guide lists load() statements ahead of other statements - consider moving this up.
+
+SRCS = [  # CUDA source files, as package-relative paths; each one is copied to a .cc file and compiled into the nccl target below.
+ "src/all_gather.cu",
+ "src/all_reduce.cu",
+ "src/broadcast.cu",
+ "src/core.cu",
+ "src/libwrap.cu",
+ "src/reduce.cu",
+ "src/reduce_scatter.cu",
+]
+
+# Copy each .cu file to a sibling .cu.cc file so it can be listed in srcs of cc_library.
+[
+ genrule(
+ name = "gen_" + src,  # One rule per source, e.g. gen_src/core.cu.
+ srcs = [src],
+ outs = [src + ".cc"],  # e.g. src/core.cu.cc
+ cmd = "cp $(location " + src + ") $(location " + src + ".cc)",  # Plain copy; file contents are not modified.
+ )
+ for src in SRCS
+]
+
+SRCS_CU_CC = [src + ".cc" for src in SRCS]  # The .cu.cc names produced by the copy genrules, in SRCS order.
+
+cc_library(  # The nccl library target; its sources and headers are passed through if_cuda().
+ name = "nccl",
+ srcs = if_cuda(SRCS_CU_CC + glob(["src/*.h"])),  # Copied .cu.cc files plus every header directly under src/.
+ hdrs = if_cuda(["src/nccl.h"]),  # The only header listed in hdrs.
+ copts = [
+ "-DCUDA_MAJOR=0",  # NOTE(review): all five version macros are defined as 0 - presumably placeholders; confirm how the NCCL sources use them.
+ "-DCUDA_MINOR=0",
+ "-DNCCL_MAJOR=0",
+ "-DNCCL_MINOR=0",
+ "-DNCCL_PATCH=0",
+ "-Iexternal/nccl_archive/src",  # Assumes the external repository is named nccl_archive - TODO confirm against the workspace definition.
+ "-O3",
+ ] + cuda_default_copts(),
+ linkopts = select({  # Linker flags chosen per platform configuration.
+ "@%ws%//tensorflow:android": [  # NOTE(review): %ws% looks like a placeholder substituted before this file is used - confirm.
+ "-pie",
+ ],
+ "@%ws%//tensorflow:darwin": [
+ "-Wl,-framework",
+ "-Wl,CoreFoundation",
+ "-Wl,-framework",
+ "-Wl,Security",
+ ],
+ "@%ws%//tensorflow:ios": [],
+ "@%ws%//tensorflow:windows": [
+ "ws2_32.lib",
+ ],
+ "//conditions:default": [  # Used when none of the conditions above match.
+ "-lrt",
+ ],
+ }),
+ visibility = ["//visibility:public"],
+ deps = ["@local_config_cuda//cuda:cuda_headers"],
+)