path: root/unsupported
author     Gael Guennebaud <g.gael@free.fr>    2017-07-17 01:02:51 +0200
committer  Gael Guennebaud <g.gael@free.fr>    2017-07-17 01:02:51 +0200
commit     bbd97b4095ff9cbe9898d68b3ab7bdff8125f3fb (patch)
tree       2e51268d6fbffc5e1d95b937358a7b7af1232f0f /unsupported
parent     f0b154a4b09914a9f11f5801220785f525217b9e (diff)
Add an EIGEN_NO_CUDA option, and introduce EIGEN_CUDACC and EIGEN_CUDA_ARCH aliases
Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h                4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h                    2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h                     12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h                  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h                      2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h                       4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h                         12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h                         2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h                           2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h                         2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h                      6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h                  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h                           4
-rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateArray.h                           2
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h             4
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h   2
16 files changed, 40 insertions, 40 deletions
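Note: the definitions of the new macros themselves live in Eigen's core headers, so they are not part of this diffstat (which is limited to 'unsupported'). As a rough, assumed sketch of the scheme the renamed guards below rely on, the aliases presumably reduce to something like:

#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
  // Compiling with a CUDA-capable compiler and CUDA support was not
  // explicitly disabled by the user.
  #define EIGEN_CUDACC __CUDACC__
#endif

#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
  // Compiling device code; keep the numeric compute-capability value
  // (e.g. 350) so comparisons such as EIGEN_CUDA_ARCH >= 300 still work.
  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
#endif

With EIGEN_NO_CUDA defined, neither alias is set, so every #if defined(EIGEN_CUDACC) / #if defined(EIGEN_CUDA_ARCH) branch in the hunks below compiles as plain host code even under nvcc.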
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index c04b784a4..428b18499 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -12,7 +12,7 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
namespace Eigen {
@@ -1382,5 +1382,5 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
} // end namespace Eigen
-#endif // EIGEN_USE_GPU and __CUDACC__
+#endif // EIGEN_USE_GPU and EIGEN_CUDACC
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 6fa51fd64..84d5be173 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -553,7 +553,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
template <int StaticKernelSize>
struct GetKernelSize {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index be8d69386..ded7129da 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -211,7 +211,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+#ifndef EIGEN_CUDA_ARCH
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
@@ -239,7 +239,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-#ifndef __CUDA_ARCH__
+#ifndef EIGEN_CUDA_ARCH
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
@@ -265,7 +265,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH)
cudaError_t err = cudaStreamSynchronize(stream_->stream());
if (err != cudaSuccess) {
std::cerr << "Error detected in CUDA stream: "
@@ -304,7 +304,7 @@ struct GpuDevice {
// This function checks if the CUDA runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
cudaError_t error = cudaStreamQuery(stream_->stream());
return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
@@ -323,9 +323,9 @@ struct GpuDevice {
// FIXME: Should be device and kernel specific.
-#ifdef __CUDACC__
+#ifdef EIGEN_CUDACC
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
-#ifndef __CUDA_ARCH__
+#ifndef EIGEN_CUDA_ARCH
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status)
assert(status == cudaSuccess);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index ccaaa6cb2..341889e88 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -35,7 +35,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
-#ifndef __CUDA_ARCH__
+#ifndef EIGEN_CUDA_ARCH
// Running on the host CPU
return 1;
#else
@@ -45,7 +45,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
// Running on the host CPU
return l1CacheSize();
#else
@@ -55,7 +55,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
// Running single threaded on the host CPU
return l3CacheSize();
#else
@@ -65,13 +65,13 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+#ifndef EIGEN_CUDA_ARCH
// Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU
return 1;
#else
// Running on a CUDA device
- return __CUDA_ARCH__ / 100;
+ return EIGEN_CUDA_ARCH / 100;
#endif
}
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index fcf330b10..2264be391 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -131,7 +131,7 @@ T loadConstant(const T* address) {
return *address;
}
// Use the texture cache on CUDA devices whenever possible
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float loadConstant(const float* address) {
return __ldg(address);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f01d77c0a..0ffe68ab3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
};
-#if defined(__CUDACC__)
+#if defined(EIGEN_CUDACC)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
@@ -264,7 +264,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
evaluator.cleanup();
}
-#endif // __CUDACC__
+#endif // EIGEN_CUDACC
#endif // EIGEN_USE_GPU
// SYCL Executor policy
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index ef1c9c42c..fb6454623 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -35,7 +35,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
{
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
return __clz(val);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::clz(val);
@@ -53,7 +53,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
{
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
return __clzll(val);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::clz(val);
@@ -90,7 +90,7 @@ namespace {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
return __umulhi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
@@ -101,7 +101,7 @@ namespace {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
return __umul64hi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
@@ -124,7 +124,7 @@ namespace {
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
@@ -203,7 +203,7 @@ class TensorIntDivisor<int32_t, true> {
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
return (__umulhi(magic, n) >> shift);
#elif defined(__SYCL_DEVICE_ONLY__)
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index f92e39d69..c9e61f359 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -27,7 +27,7 @@
*/
// SFINAE requires variadic templates
-#ifndef __CUDACC__
+#ifndef EIGEN_CUDACC
#if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 77c9c6c6e..5431eb740 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
};
// For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
template <>
struct PacketType<half, GpuDevice> {
typedef half2 type;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index f108c349f..230915db2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -16,7 +16,7 @@ namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_CUDA_ARCH
// We don't support 3d kernels since we currently only use 1 and
// 2d kernels.
assert(threadIdx.z == 0);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 69079805d..da0ffe728 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -334,7 +334,7 @@ struct OuterReducer {
};
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
@@ -694,7 +694,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
#ifdef EIGEN_USE_THREADS
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
@@ -781,7 +781,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
Op m_reducer;
// For full reductions
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 24a55a3d5..974eb7deb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -14,7 +14,7 @@ namespace Eigen {
namespace internal {
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
// Full reducers for GPU, don't vectorize for now
// Reducer function that enables multiple cuda thread to safely accumulate at the same
@@ -23,7 +23,7 @@ namespace internal {
// updated the content of the output address it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if __CUDA_ARCH__ >= 300
+#if EIGEN_CUDA_ARCH >= 300
if (sizeof(T) == 4)
{
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -102,7 +102,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if __CUDA_ARCH__ >= 300
+#if EIGEN_CUDA_ARCH >= 300
atomicAdd(output, accum);
#else // __CUDA_ARCH__ >= 300
assert(0 && "Shouldn't be called on unsupported device");
@@ -124,7 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output, unsigned int* semaphore) {
-#if __CUDA_ARCH__ >= 300
+#if EIGEN_CUDA_ARCH >= 300
// Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
if (gridDim.x == 1) {
@@ -372,7 +372,7 @@ template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
-#if __CUDA_ARCH__ >= 300
+#if EIGEN_CUDA_ARCH >= 300
typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 2a85ed840..1f545ef1a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -242,7 +242,7 @@ struct ScanLauncher {
}
};
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
@@ -281,7 +281,7 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
}
};
-#endif // EIGEN_USE_GPU && __CUDACC__
+#endif // EIGEN_USE_GPU && EIGEN_CUDACC
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 573ca435a..96b3a8261 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -15,7 +15,7 @@
// The array class is only available starting with cxx11. Emulate our own here
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY)
namespace Eigen {
template <typename T, size_t n> class array {
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 369ad97b4..5d1b8fcc2 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -121,7 +121,7 @@ template <>
struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy;
return ::lgammaf_r(x, &dummy);
#else
@@ -134,7 +134,7 @@ template <>
struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy;
return ::lgamma_r(x, &dummy);
#else
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
index ec4fa8448..e0e3a8be6 100644
--- a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
@@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plgamma<float4>(const float4& a)