1 files changed, 85 insertions, 0 deletions
diff --git a/third_party/nccl/fix_clang_compilation.patch b/third_party/nccl/fix_clang_compilation.patch
new file mode 100644
index 0000000000..e8d2a7dc9f
--- /dev/null
+++ b/third_party/nccl/fix_clang_compilation.patch
@@ -0,0 +1,85 @@
+From 8241cd7b6ed1425eeb88fd380090575978e358f4 Mon Sep 17 00:00:00 2001
+From: Ilya Biryukov <ibiryukov@google.com>
+Date: Thu, 16 Mar 2017 12:01:11 +0100
+Subject: [PATCH 1/1] Fix compilation error when compiling with 'clang -x
+ cuda'.
+
+Functions vFetch and vStore are not found by ADL with clang,
+so they need to be declared before usage in ReduceCopy.
+---
+ src/common_kernel.h | 52 ++++++++++++++++++++++++++--------------------------
+ 1 file changed, 26 insertions(+), 26 deletions(-)
+
+diff --git a/src/common_kernel.h b/src/common_kernel.h
+index 28fbc85..cc71f8a 100644
+--- a/src/common_kernel.h
++++ b/src/common_kernel.h
+@@ -30,6 +30,32 @@
+ #define BAR(type, barid, nthreads) \
+     BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
+ 
++template<typename T> inline __device__
++T vFetch(const volatile T* ptr) {
++  return *ptr;
++}
++
++#ifdef CUDA_HAS_HALF
++template<> inline __device__
++half vFetch<half>(const volatile half* ptr) {
++  half r;
++  r.x = ptr->x;
++  return r;
++}
++#endif
++
++template<typename T> inline __device__
++void vStore(volatile T* ptr, const T val) {
++  *ptr = val;
++}
++
++#ifdef CUDA_HAS_HALF
++template<> inline __device__
++void vStore<half>(volatile half* ptr, const half val) {
++  ptr->x = val.x;
++}
++#endif
++
+ __device__ unsigned int spinct;
+ 
+ // Spin wait until func evaluates to true
+@@ -225,32 +251,6 @@ __device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
+   return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
+ }
+ 
+-template<typename T> inline __device__
+-T vFetch(const volatile T* ptr) {
+-  return *ptr;
+-}
+-
+-#ifdef CUDA_HAS_HALF
+-template<> inline __device__
+-half vFetch<half>(const volatile half* ptr) {
+-  half r;
+-  r.x = ptr->x;
+-  return r;
+-}
+-#endif
+-
+-template<typename T> inline __device__
+-void vStore(volatile T* ptr, const T val) {
+-  *ptr = val;
+-}
+-
+-#ifdef CUDA_HAS_HALF
+-template<> inline __device__
+-void vStore<half>(volatile half* ptr, const half val) {
+-  ptr->x = val.x;
+-}
+-#endif
+-
+ // Assumptions:
+ // - there is exactly 1 block
+ // - THREADS is the number of producer threads
+-- 
+2.12.0.367.g23dc2f6d3c-goog
+