Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
8 files changed, 159 insertions, 111 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
deleted file mode 100644
index 6d4b3ea0d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_Tensor_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index e3880d2e0..3c8710255 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+    log1p() const {
+      return unaryExpr(internal::scalar_log1p_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
     abs() const {
       return unaryExpr(internal::scalar_abs_op<Scalar>());
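The new log1p() accessor follows the same lazy coefficient-wise pattern as abs() above, so it composes with any other tensor expression. A minimal host-side sketch (tensor shape and values are illustrative only):

  #include <unsupported/Eigen/CXX11/Tensor>

  Eigen::Tensor<float, 2> t(2, 3);
  t.setConstant(0.5f);
  // log1p(x) = log(1 + x), evaluated coefficient-wise and lazily
  Eigen::Tensor<float, 2> u = t.log1p();  // each coefficient becomes std::log1p(0.5f)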
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 56d9c2025..20b29e5fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -25,8 +25,9 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
 struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
-  typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
-                                                  typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
+                               typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
+
   typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
                                         typename traits<RhsXprType>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -75,8 +76,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
 {
   public:
   typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
-  typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
-                                                  typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+                                         typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index a76c8ca35..d66e45d50 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -91,21 +91,21 @@ class TensorOpCost {
   }
 
   // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index b2b4bcf62..834ce07df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -239,7 +239,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
-      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device)
+      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
   { }
 
   typedef typename XprType::Index Index;
@@ -256,13 +256,13 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
-    return m_functor(index);
+    return m_wrapper(m_functor, index);
   }
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    return m_functor.template packetOp<Index, PacketReturnType>(index);
+    return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -282,6 +282,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
  private:
   const NullaryOp m_functor;
   TensorEvaluator<ArgType, Device> m_argImpl;
+  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
 };
 
 
@@ -612,7 +613,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
 
   /// required by sycl in order to extract the accessor
   const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
   /// required by sycl in order to extract the accessor
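Making cwiseMin()/cwiseMax() const and value-returning is what lets the TensorSelectOp evaluator above combine the costs of its three sub-evaluators without mutating any of them. A small illustrative sketch of the new semantics (constructor order: bytes_loaded, bytes_stored, compute_cycles):

  Eigen::TensorOpCost a(/*bytes_loaded=*/16, /*bytes_stored=*/8, /*compute_cycles=*/10);
  Eigen::TensorOpCost b(/*bytes_loaded=*/32, /*bytes_stored=*/4, /*compute_cycles=*/2);
  Eigen::TensorOpCost lo = a.cwiseMin(b);  // (16, 4, 2); a and b are left untouched
  Eigen::TensorOpCost hi = a.cwiseMax(b);  // (32, 8, 10)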
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index a8e48fced..fc75dbb5c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,7 +25,7 @@ struct scalar_mod_op {
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
 
 
 /** \internal
@@ -38,7 +38,7 @@ struct scalar_mod2_op {
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
 
 template <typename Scalar>
 struct scalar_fmod_op {
@@ -188,6 +188,32 @@ struct reducer_traits<MeanReducer<T>, Device> {
 };
 
 
+template <typename T, bool IsMax = true, bool IsInteger = true>
+struct MinMaxBottomValue {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return Eigen::NumTraits<T>::lowest();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, true, false> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return -Eigen::NumTraits<T>::infinity();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, true> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return Eigen::NumTraits<T>::highest();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, false> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return Eigen::NumTraits<T>::infinity();
+  }
+};
+
+
 template <typename T> struct MaxReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -200,9 +226,8 @@ template <typename T> struct MaxReducer
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmax<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return Eigen::NumTraits<T>::lowest();
+    return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -242,9 +267,8 @@ template <typename T> struct MinReducer
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmin<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return Eigen::NumTraits<T>::highest();
+    return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
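MinMaxBottomValue selects the identity element for min/max reductions: plus or minus infinity for floating-point types, and the finite extremes for integer types, which have no infinity. What the four specializations yield, sketched as host-side checks (assuming the trait is reachable as Eigen::internal::MinMaxBottomValue):

  #include <limits>
  #include <cassert>
  using Eigen::internal::MinMaxBottomValue;

  // Max reductions over floats start from -inf, min reductions from +inf.
  assert((MinMaxBottomValue<float, true,  false>::bottom_value()
          == -std::numeric_limits<float>::infinity()));
  assert((MinMaxBottomValue<float, false, false>::bottom_value()
          ==  std::numeric_limits<float>::infinity()));
  // Integers fall back to lowest()/highest().
  assert((MinMaxBottomValue<int, true,  true>::bottom_value()
          == std::numeric_limits<int>::min()));
  assert((MinMaxBottomValue<int, false, true>::bottom_value()
          == std::numeric_limits<int>::max()));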
@@ -454,12 +478,11 @@ template <typename T> class UniformRandomGenerator {
     m_deterministic = other.m_deterministic;
   }
 
-  template<typename Index>
-  T operator()(Index) const {
+  T operator()() const {
     return random<T>();
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
     for (int i = 0; i < packetSize; ++i) {
@@ -484,23 +507,22 @@ template <> class UniformRandomGenerator<float> {
   }
   UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
     m_generator = new std::mt19937();
-    m_generator->seed(other(0) * UINT_MAX);
+    m_generator->seed(other() * UINT_MAX);
     m_deterministic = other.m_deterministic;
   }
   ~UniformRandomGenerator() {
     delete m_generator;
   }
 
-  template<typename Index>
-  float operator()(Index) const {
+  float operator()() const {
     return m_distribution(*m_generator);
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index i) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX float values[packetSize];
     for (int k = 0; k < packetSize; ++k) {
-      values[k] = this->operator()(i);
+      values[k] = this->operator()();
     }
     return internal::pload<PacketType>(values);
   }
@@ -525,23 +547,22 @@ template <> class UniformRandomGenerator<double> {
   }
   UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
     m_generator = new std::mt19937();
-    m_generator->seed(other(0) * UINT_MAX);
+    m_generator->seed(other() * UINT_MAX);
     m_deterministic = other.m_deterministic;
   }
   ~UniformRandomGenerator() {
     delete m_generator;
   }
 
-  template<typename Index>
-  double operator()(Index) const {
+  double operator()() const {
     return m_distribution(*m_generator);
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index i) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX double values[packetSize];
     for (int k = 0; k < packetSize; ++k) {
-      values[k] = this->operator()(i);
+      values[k] = this->operator()();
     }
     return internal::pload<PacketType>(values);
   }
@@ -578,12 +599,11 @@ template <> class UniformRandomGenerator<float> {
     curand_init(seed, tid, 0, &m_state);
   }
 
-  template<typename Index>
-  __device__ float operator()(Index) const {
+  __device__ float operator()() const {
     return curand_uniform(&m_state);
   }
-  template<typename Index, typename PacketType>
-  __device__ float4 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ float4 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_uniform4(&m_state);
   }
@@ -608,12 +628,11 @@ template <> class UniformRandomGenerator<double> {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ double operator()(Index) const {
+  __device__ double operator()() const {
     return curand_uniform_double(&m_state);
  }
-  template<typename Index, typename PacketType>
-  __device__ double2 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ double2 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_uniform2_double(&m_state);
   }
@@ -638,8 +657,7 @@ template <> class UniformRandomGenerator<std::complex<float> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<float> operator()(Index) const {
+  __device__ std::complex<float> operator()() const {
     float4 vals = curand_uniform4(&m_state);
     return std::complex<float>(vals.x, vals.y);
   }
@@ -664,8 +682,7 @@ template <> class UniformRandomGenerator<std::complex<double> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<double> operator()(Index) const {
+  __device__ std::complex<double> operator()() const {
     double2 vals = curand_uniform2_double(&m_state);
     return std::complex<double>(vals.x, vals.y);
   }
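The generators thus lose their unused Index parameter: a draw no longer pretends to depend on the coefficient position, and the nullary_wrapper introduced in TensorEvaluator.h is what now mediates between the indexed evaluator interface and these index-free functors. New call shape on the host side (the boolean constructor flag is an assumption about the surrounding API, not something this patch adds):

  Eigen::internal::UniformRandomGenerator<float> gen(/*deterministic=*/true);
  float a = gen();   // no Index argument anymore
  float b = gen();   // each call simply advances the underlying engine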
@@ -701,17 +718,16 @@ template <typename T> class NormalRandomGenerator {
   }
   NormalRandomGenerator(const NormalRandomGenerator& other)
      : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
-    m_generator->seed(other(0) * UINT_MAX);
+    m_generator->seed(other() * UINT_MAX);
   }
   ~NormalRandomGenerator() {
     delete m_generator;
   }
-  template<typename Index>
-  T operator()(Index) const {
+  T operator()() const {
     return m_distribution(*m_generator);
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
     for (int i = 0; i < packetSize; ++i) {
@@ -749,12 +765,11 @@ template <> class NormalRandomGenerator<float> {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ float operator()(Index) const {
+  __device__ float operator()() const {
     return curand_normal(&m_state);
   }
-  template<typename Index, typename PacketType>
-  __device__ float4 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ float4 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_normal4(&m_state);
   }
@@ -779,12 +794,11 @@ template <> class NormalRandomGenerator<double> {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ double operator()(Index) const {
+  __device__ double operator()() const {
     return curand_normal_double(&m_state);
   }
-  template<typename Index, typename PacketType>
-  __device__ double2 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ double2 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_normal2_double(&m_state);
   }
@@ -809,8 +823,7 @@ template <> class NormalRandomGenerator<std::complex<float> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<float> operator()(Index) const {
+  __device__ std::complex<float> operator()() const {
     float4 vals = curand_normal4(&m_state);
     return std::complex<float>(vals.x, vals.y);
   }
@@ -835,8 +848,7 @@ template <> class NormalRandomGenerator<std::complex<double> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<double> operator()(Index) const {
+  __device__ std::complex<double> operator()() const {
     double2 vals = curand_normal2_double(&m_state);
     return std::complex<double>(vals.x, vals.y);
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 9df697e4c..a87777b22 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -505,9 +505,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
         (reducing_inner_dims || ReducingInnerMostDims)) {
       const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-      if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
-        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-        m_result = data;
+      if (!data) {
+        if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
+          data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          m_result = data;
+        }
+        else {
+          return true;
+        }
       }
       Op reducer(m_reducer);
       if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
@@ -533,9 +538,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
         preserving_inner_dims) {
       const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
       const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-      if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
-        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-        m_result = data;
+      if (!data) {
+        if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
+          data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          m_result = data;
+        }
+        else {
+          return true;
+        }
       }
       Op reducer(m_reducer);
       if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
@@ -556,6 +566,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
     m_impl.cleanup();
     if (m_result) {
       m_device.deallocate(m_result);
+      m_result = NULL;
     }
   }
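Nulling m_result after deallocation makes a repeated cleanup() harmless instead of a double free. The fix is the standard guard idiom, shown standalone below (illustrative sketch, not Eigen code):

  struct Holder {
    float* buf = nullptr;
    void cleanup() {
      if (buf) {
        delete[] buf;
        buf = nullptr;  // a second cleanup() now takes the false branch
      }
    }
  };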
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 5e512490c..65638b6a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,11 +67,21 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
 #endif
 }
 
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
 
 #ifdef EIGEN_HAS_CUDA_FP16
 template <template <typename T> class R>
 __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
-#if __CUDA_ARCH__ >= 300
   unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
   unsigned int newval = oldval;
   reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
@@ -87,9 +97,6 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
       return;
     }
   }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
 }
 #endif
 
@@ -130,7 +137,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
     unsigned int block = atomicCAS(semaphore, 0u, 1u);
     if (block == 0) {
       // We're the first block to run, initialize the output value
-      atomicExch(output, reducer.initialize());
+      atomicExchCustom(output, reducer.initialize());
       __threadfence();
       atomicExch(semaphore, 2u);
     }
@@ -263,17 +270,22 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
 #endif
 
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct FullReductionLauncher {
   static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
-    assert(false && "Should only be called on floats and half floats");
+    assert(false && "Should only be called on doubles, floats and half floats");
   }
 };
 
-template <typename Self, typename Op, bool PacketAccess>
-struct FullReductionLauncher<Self, Op, float, PacketAccess> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
     typedef typename Self::CoeffReturnType Scalar;
     const int block_size = 256;
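atomicExchCustom exists because CUDA provides no double overload of atomicExch, so the specialization exchanges the value through its 64-bit integer bit pattern. The same reinterpretation trick is the canonical way to build other missing double atomics, e.g. the well-known CAS-loop atomicAdd from the CUDA programming guide (illustrative device code, not part of this patch):

  __device__ double atomicAddDouble(double* address, double val) {
    unsigned long long int* address_as_ull =
        reinterpret_cast<unsigned long long int*>(address);
    unsigned long long int old = *address_as_ull, assumed;
    do {
      assumed = old;
      // Add on the double view, compare-and-swap on the 64-bit integer view.
      old = atomicCAS(address_as_ull, assumed,
                      __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);  // retry if another thread updated the slot
    return __longlong_as_double(old);
  }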
@@ -330,20 +342,22 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
 
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats and half floats.
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      internal::is_same<typename Self::CoeffReturnType, float>::value;
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value);
 #endif
 
   template <typename OutputType>
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
     // Don't crash when we're called with an input tensor of size 0.
     if (num_coeffs == 0) {
@@ -360,6 +374,7 @@ template <int NumPerThread, typename Self,
 __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                      typename Self::CoeffReturnType* output) {
 #if __CUDA_ARCH__ >= 300
+  typedef typename Self::CoeffReturnType Type;
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
   eigen_assert(gridDim.y == 1);
@@ -389,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
     const Index col_block = i % input_col_blocks;
     const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
 
-    float reduced_val = reducer.initialize();
+    Type reduced_val = reducer.initialize();
 
     for (Index j = 0; j < NumPerThread; j += unroll_times) {
       const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
       if (last_col >= num_coeffs_to_reduce) {
         for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
-          const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+          const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
           reducer.reduce(val, &reduced_val);
         }
         break;
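Switching the launchers from one explicit specialization per scalar type to a trailing SFINAE parameter means a single specialization now covers both float and double. The pattern in isolation (standalone sketch using std::enable_if where Eigen uses its internal equivalent; the names are illustrative):

  #include <type_traits>

  template <typename T, typename Enabled = void>
  struct Launcher {
    static bool run() { return false; }  // fallback for unsupported types
  };

  template <typename T>
  struct Launcher<T, typename std::enable_if<
      std::is_same<T, float>::value || std::is_same<T, double>::value>::type> {
    static bool run() { return true; }   // one body serves float and double
  };

  // Launcher<double>::run() -> true, Launcher<int>::run() -> false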
@@ -521,17 +536,23 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 #endif
 
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct InnerReductionLauncher {
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce floats and half floats on a gpu device");
+    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
     return true;
   }
 };
 
-template <typename Self, typename Op, bool PacketAccess>
-struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
@@ -549,7 +570,7 @@ struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
       const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
       const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                          num_blocks, 1024, 0, device, reducer.initialize(),
                          num_preserved_vals, output);
     }
@@ -616,15 +637,17 @@ struct InnerReducer<Self, Op, GpuDevice> {
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      internal::is_same<typename Self::CoeffReturnType, float>::value;
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value);
 #endif
 
   template <typename OutputType>
   static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
     // Don't crash when we're called with an input tensor of size 0.
     if (num_coeffs == 0) {
@@ -675,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> {
   // so reduce the scope of the optimized version of the code to the simple case
   // of floats.
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      internal::is_same<typename Self::CoeffReturnType, float>::value;
-
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value);
   template <typename Device, typename OutputType>
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce floats on a gpu device");
+    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
     return true;
   }
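Taken together, full, inner and outer GPU reductions now have an optimized path for double as well as float. A usage sketch (requires building with nvcc and EIGEN_USE_GPU; d_in and d_out are assumed preallocated device buffers):

  #define EIGEN_USE_GPU
  #include <unsupported/Eigen/CXX11/Tensor>

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  Eigen::TensorMap<Eigen::Tensor<double, 2> > in(d_in, 128, 1024);
  Eigen::TensorMap<Eigen::Tensor<double, 1> > out(d_out, 128);
  Eigen::array<int, 1> dims;
  dims[0] = 1;                             // sum over dimension 1
  out.device(gpu_device) = in.sum(dims);   // double reductions now take the optimized kernels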