From 8d393ea2fab0ea88ecd11e36d89f186cbc884dbe Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 30 Mar 2017 07:38:55 -0800 Subject: Add cuda_clang build configuration that allows to use clang as a CUDA compiler. Change: 151705528 --- third_party/nccl/BUILD | 0 third_party/nccl/fix_clang_compilation.patch | 85 ++++++++++++++++++++++++++++ third_party/nccl/nccl.BUILD | 66 +++++++++++++++++++++ 3 files changed, 151 insertions(+) create mode 100644 third_party/nccl/BUILD create mode 100644 third_party/nccl/fix_clang_compilation.patch create mode 100644 third_party/nccl/nccl.BUILD (limited to 'third_party/nccl') diff --git a/third_party/nccl/BUILD b/third_party/nccl/BUILD new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/nccl/fix_clang_compilation.patch b/third_party/nccl/fix_clang_compilation.patch new file mode 100644 index 0000000000..e8d2a7dc9f --- /dev/null +++ b/third_party/nccl/fix_clang_compilation.patch @@ -0,0 +1,85 @@ +From 8241cd7b6ed1425eeb88fd380090575978e358f4 Mon Sep 17 00:00:00 2001 +From: Ilya Biryukov +Date: Thu, 16 Mar 2017 12:01:11 +0100 +Subject: [PATCH 1/1] Fix compilation error when compiling with 'clang -x + cuda'. + +Functions vFetch and vStore are not found by ADL with clang, +so they need to be declared before usage in ReduceCopy. 
+--- + src/common_kernel.h | 52 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 26 insertions(+), 26 deletions(-) + +diff --git a/src/common_kernel.h b/src/common_kernel.h +index 28fbc85..cc71f8a 100644 +--- a/src/common_kernel.h ++++ b/src/common_kernel.h +@@ -30,6 +30,32 @@ + #define BAR(type, barid, nthreads) \ + BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE)) + ++template<typename T> inline __device__ ++T vFetch(const volatile T* ptr) { ++ return *ptr; ++} ++ ++#ifdef CUDA_HAS_HALF ++template<> inline __device__ ++half vFetch(const volatile half* ptr) { ++ half r; ++ r.x = ptr->x; ++ return r; ++} ++#endif ++ ++template<typename T> inline __device__ ++void vStore(volatile T* ptr, const T val) { ++ *ptr = val; ++} ++ ++#ifdef CUDA_HAS_HALF ++template<> inline __device__ ++void vStore(volatile half* ptr, const half val) { ++ ptr->x = val.x; ++} ++#endif ++ + __device__ unsigned int spinct; + + // Spin wait until func evaluates to true +@@ -225,32 +251,6 @@ __device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) { + return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align)); + } + +-template<typename T> inline __device__ +-T vFetch(const volatile T* ptr) { +- return *ptr; +-} +- +-#ifdef CUDA_HAS_HALF +-template<> inline __device__ +-half vFetch(const volatile half* ptr) { +- half r; +- r.x = ptr->x; +- return r; +-} +-#endif +- +-template<typename T> inline __device__ +-void vStore(volatile T* ptr, const T val) { +- *ptr = val; +-} +- +-#ifdef CUDA_HAS_HALF +-template<> inline __device__ +-void vStore(volatile half* ptr, const half val) { +- ptr->x = val.x; +-} +-#endif +- + // Assumptions: + // - there is exactly 1 block + // - THREADS is the number of producer threads +-- +2.12.0.367.g23dc2f6d3c-goog + diff --git a/third_party/nccl/nccl.BUILD b/third_party/nccl/nccl.BUILD new file mode 100644 index 0000000000..06b9b8ff68 --- /dev/null +++ b/third_party/nccl/nccl.BUILD @@ -0,0 +1,66 @@ +# NVIDIA nccl +# A package of optimized primitives for collective multi-GPU communication. 
+ +licenses(["notice"]) # BSD + +exports_files(["LICENSE.txt"]) + +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda") + +SRCS = [ + "src/all_gather.cu", + "src/all_reduce.cu", + "src/broadcast.cu", + "src/core.cu", + "src/libwrap.cu", + "src/reduce.cu", + "src/reduce_scatter.cu", +] + +# Copy .cu to .cu.cc so they can be in srcs of cc_library. +[ + genrule( + name = "gen_" + src, + srcs = [src], + outs = [src + ".cc"], + cmd = "cp $(location " + src + ") $(location " + src + ".cc)", + ) + for src in SRCS +] + +SRCS_CU_CC = [src + ".cc" for src in SRCS] + +cc_library( + name = "nccl", + srcs = if_cuda(SRCS_CU_CC + glob(["src/*.h"])), + hdrs = if_cuda(["src/nccl.h"]), + copts = [ + "-DCUDA_MAJOR=0", + "-DCUDA_MINOR=0", + "-DNCCL_MAJOR=0", + "-DNCCL_MINOR=0", + "-DNCCL_PATCH=0", + "-Iexternal/nccl_archive/src", + "-O3", + ] + cuda_default_copts(), + linkopts = select({ + "@%ws%//tensorflow:android": [ + "-pie", + ], + "@%ws%//tensorflow:darwin": [ + "-Wl,-framework", + "-Wl,CoreFoundation", + "-Wl,-framework", + "-Wl,Security", + ], + "@%ws%//tensorflow:ios": [], + "@%ws%//tensorflow:windows": [ + "ws2_32.lib", + ], + "//conditions:default": [ + "-lrt", + ], + }), + visibility = ["//visibility:public"], + deps = ["@local_config_cuda//cuda:cuda_headers"], +) -- cgit v1.2.3