From 9ea39ce13c453127844cff474730af119e889cd1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 23 Oct 2015 09:15:34 -0700
Subject: Refined the #ifdef __CUDACC__ guard to ensure that trying to compile
 gpu code with a non cuda compiler results in a linking error instead of
 bogus code.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorDeviceType.h     | 11 ++-
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h       | 78 ++++++++++++----------
 3 files changed, 54 insertions(+), 37 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 17f10c07b..7b2485fb7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -106,7 +106,7 @@ template <typename ExpressionType> class TensorDevice
 class TensorDevice<ExpressionType, GpuDevice>
 {
   public:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index 2ff7d471d..300ee8ac0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -287,6 +287,7 @@ class StreamInterface {
   virtual void deallocate(void* buffer) const = 0;
 };
 
+#if defined(__CUDACC__)
 static cudaDeviceProp* m_deviceProperties;
 static bool m_devicePropInitialized = false;
 
@@ -362,7 +363,7 @@ class CudaStreamDevice : public StreamInterface {
   const cudaStream_t* stream_;
   int device_;
 };
-
+#endif  // __CUDACC__
 
 struct GpuDevice {
   // The StreamInterface is not owned: the caller is
@@ -450,7 +451,7 @@ struct GpuDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#ifndef __CUDA_ARCH__
+#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
     cudaError_t err = cudaStreamSynchronize(stream_->stream());
     assert(err == cudaSuccess);
 #else
@@ -477,8 +478,12 @@ struct GpuDevice {
   // This function checks if the CUDA runtime recorded an error for the
   // underlying stream device.
   inline bool ok() const {
+#ifdef __CUDACC__
     cudaError_t error = cudaStreamQuery(stream_->stream());
     return (error == cudaSuccess) || (error == cudaErrorNotReady);
+#else
+    return false;
+#endif
   }
 
  private:
@@ -493,10 +498,12 @@ struct GpuDevice {
 
 // FIXME: Should be device and kernel specific.
+#ifdef __CUDACC__
 static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
   cudaError_t status = cudaDeviceSetSharedMemConfig(config);
   assert(status == cudaSuccess);
 }
+#endif
 
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index b2800aefb..95fc9fec6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -149,7 +149,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, false> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, true> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+#if defined(__CUDACC__)
+
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
@@ -193,48 +210,41 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
   }
 }
 
-
-template <typename Expression>
-class TensorExecutor<Expression, GpuDevice, false>
+/*static*/
+template <typename Expression>
+inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
 {
-  public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const GpuDevice& device)
+  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+  if (needs_assign)
   {
-    TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
-      const int block_size = device.maxCudaThreadsPerBlock();
-      const Index size = array_prod(evaluator.dimensions());
-      LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
-    }
-    evaluator.cleanup();
+    const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
+    const int block_size = device.maxCudaThreadsPerBlock();
+    const Index size = array_prod(evaluator.dimensions());
+    LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
   }
-};
+  evaluator.cleanup();
+}
+
+/*static*/
 template <typename Expression>
-class TensorExecutor<Expression, GpuDevice, true>
+inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
 {
-  public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const GpuDevice& device)
+  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+  if (needs_assign)
   {
-    TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
-      const int block_size = device.maxCudaThreadsPerBlock();
-      const Index size = array_prod(evaluator.dimensions());
-      LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
-    }
-    evaluator.cleanup();
+    const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
+    const int block_size = device.maxCudaThreadsPerBlock();
+    const Index size = array_prod(evaluator.dimensions());
+    LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
   }
-};
+  evaluator.cleanup();
+}
 
-#endif
+#endif // __CUDACC__
+#endif // EIGEN_USE_GPU
 
 } // end namespace internal
-- 
cgit v1.2.3
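
Note on the pattern used above: the patch declares the GpuDevice specializations of TensorExecutor whenever EIGEN_USE_GPU is set, but only defines their run() methods (and the kernels they launch) when __CUDACC__ is defined. The stand-alone sketch below illustrates the same declare-everywhere / define-only-under-nvcc idea with hypothetical names (gpu_square and square_kernel are illustrative, not Eigen APIs): a host-only translation unit that reaches the GPU path still compiles, but the build fails at link time with an undefined reference instead of silently picking up a bogus implementation.

// gpu_square.h -- hypothetical example, not part of Eigen.
// The declaration is visible to every compiler, so host-only code that calls
// gpu_square() type-checks; it simply cannot link unless the definition in
// gpu_square.cu was compiled by nvcc.
#ifndef GPU_SQUARE_H
#define GPU_SQUARE_H

void gpu_square(float* data, int n);  // declared unconditionally

#endif  // GPU_SQUARE_H

// gpu_square.cu -- compiled with nvcc only. Leaving this file out of the
// build (or feeding it to a plain C++ compiler) produces an
// "undefined reference to gpu_square" link error rather than a silent no-op.
#include <cuda_runtime.h>
#include "gpu_square.h"

__global__ void square_kernel(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= data[i];
}

void gpu_square(float* data, int n) {
  float* d = NULL;
  cudaMalloc(&d, n * sizeof(float));
  cudaMemcpy(d, data, n * sizeof(float), cudaMemcpyHostToDevice);
  square_kernel<<<(n + 255) / 256, 256>>>(d, n);
  cudaMemcpy(data, d, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d);
}

The guards added in TensorDeviceType.h follow the same reasoning: members that merely query state, such as ok(), fall back to a harmless stub when no CUDA compiler is available, while anything that must call into the CUDA runtime is either compiled by nvcc or not emitted at all.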