diff options
author | 2016-06-14 15:33:47 +0200 | |
---|---|---|
committer | 2016-06-14 15:33:47 +0200 | |
commit | 76236cdea402e7236249ab19bd5d8d6ceac1346d (patch) | |
tree | 3b979d88426a94114e8af03877cc47de09862b59 /unsupported/Eigen | |
parent | 1004c4df99a3e4a019f05b83badb06f4e2df5ee6 (diff) | |
parent | 4c61f00838202889045ec9e5ad0d60b79f00fec5 (diff) |
merge
Diffstat (limited to 'unsupported/Eigen')
9 files changed, 160 insertions, 57 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index a60a17049..ee16cde9b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // across k dimension. const TensorOpCost cost = contractionCost(m, n, bm, bn, bk, shard_by_col, false); - Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( + int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( static_cast<double>(n) * m, cost, this->m_device.numThreads()); // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost @@ -301,7 +301,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT class Context { public: Context(const Device& device, int num_threads, LhsMapper& lhs, - RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm, + RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, Index gn, Index nm0, Index nn0, bool shard_by_col, bool parallel_pack) @@ -309,13 +309,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT lhs_(lhs), rhs_(rhs), buffer_(buffer), - output_(buffer, m), + output_(buffer, tm), num_threads_(num_threads), shard_by_col_(shard_by_col), parallel_pack_(parallel_pack), - m_(m), - n_(n), - k_(k), + m_(tm), + n_(tn), + k_(tk), bm_(bm), bn_(bn), bk_(bk), diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index d31b0ad38..c770d024f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { // Build a thread pool device on top the an existing pool of threads. struct ThreadPoolDevice { // The ownership of the thread pool remains with the caller. - ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { } + ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return internal::aligned_malloc(num_bytes); @@ -130,7 +130,7 @@ struct ThreadPoolDevice { ::memset(buffer, c, n); } - EIGEN_STRONG_INLINE size_t numThreads() const { + EIGEN_STRONG_INLINE int numThreads() const { return num_threads_; } @@ -182,7 +182,7 @@ struct ThreadPoolDevice { std::function<void(Index, Index)> f) const { typedef TensorCostModel<ThreadPoolDevice> CostModel; if (n <= 1 || numThreads() == 1 || - CostModel::numThreads(n, cost, numThreads()) == 1) { + CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) { f(0, n); return; } @@ -242,7 +242,7 @@ struct ThreadPoolDevice { // Recursively divide size into halves until we reach block_size. // Division code rounds mid to block_size, so we are guaranteed to get // block_count leaves that do actual computations. - Barrier barrier(block_count); + Barrier barrier(static_cast<unsigned int>(block_count)); std::function<void(Index, Index)> handleRange; handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { if (last - first <= block_size) { @@ -268,7 +268,7 @@ struct ThreadPoolDevice { private: ThreadPoolInterface* pool_; - size_t num_threads_; + int num_threads_; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 4e873011e..a48cb1daa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -426,6 +426,20 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, m_arg3Impl(op.arg3Expression(), device) { EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind, + typename internal::traits<Arg2Type>::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind, + typename internal::traits<Arg3Type>::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index, + typename internal::traits<Arg2Type>::Index>::value), + STORAGE_INDEX_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index, + typename internal::traits<Arg3Type>::Index>::value), + STORAGE_INDEX_MUST_MATCH) + eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 9509f8002..5f2e329f2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -227,22 +227,6 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT TernaryOp(typename Arg1XprType::Scalar, typename Arg2XprType::Scalar, typename Arg3XprType::Scalar)>::type Scalar; - EIGEN_STATIC_ASSERT( - (internal::is_same<typename traits<Arg1XprType>::StorageKind, - typename traits<Arg2XprType>::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT( - (internal::is_same<typename traits<Arg1XprType>::StorageKind, - typename traits<Arg3XprType>::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT( - (internal::is_same<typename traits<Arg1XprType>::Index, - typename traits<Arg2XprType>::Index>::value), - STORAGE_INDEX_MUST_MATCH) - EIGEN_STATIC_ASSERT( - (internal::is_same<typename traits<Arg1XprType>::Index, - typename traits<Arg3XprType>::Index>::value), - STORAGE_INDEX_MUST_MATCH) typedef traits<Arg1XprType> XprTraits; typedef typename traits<Arg1XprType>::StorageKind StorageKind; typedef typename traits<Arg1XprType>::Index Index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 3dd32e9d1..a8e48fced 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > { }; +template<typename Reducer, typename Device> +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + // Standard reduction functors template <typename T> struct SumReducer { @@ -119,6 +127,15 @@ template <typename T> struct SumReducer } }; +template <typename T, typename Device> +struct reducer_traits<SumReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasAdd + }; +}; + + template <typename T> struct MeanReducer { static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger; @@ -162,6 +179,15 @@ template <typename T> struct MeanReducer DenseIndex packetCount_; }; +template <typename T, typename Device> +struct reducer_traits<MeanReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasAdd + }; +}; + + template <typename T> struct MaxReducer { static const bool PacketAccess = packet_traits<T>::HasMax; @@ -195,6 +221,15 @@ template <typename T> struct MaxReducer } }; +template <typename T, typename Device> +struct reducer_traits<MaxReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasMax + }; +}; + + template <typename T> struct MinReducer { static const bool PacketAccess = packet_traits<T>::HasMin; @@ -228,6 +263,14 @@ template <typename T> struct MinReducer } }; +template <typename T, typename Device> +struct reducer_traits<MinReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasMin + }; +}; + template <typename T> struct ProdReducer { @@ -263,6 +306,14 @@ template <typename T> struct ProdReducer } }; +template <typename T, typename Device> +struct reducer_traits<ProdReducer<T>, Device> { + enum { + Cost = NumTraits<T>::MulCost, + PacketAccess = PacketType<T, Device>::HasMul + }; +}; + struct AndReducer { @@ -280,6 +331,15 @@ struct AndReducer } }; +template <typename Device> +struct reducer_traits<AndReducer, Device> { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + struct OrReducer { static const bool PacketAccess = false; static const bool IsStateful = false; @@ -295,6 +355,15 @@ struct OrReducer { } }; +template <typename Device> +struct reducer_traits<OrReducer, Device> { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + // Argmin/Argmax reducers template <typename T> struct ArgMaxTupleReducer { @@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer } }; +template <typename T, typename Device> +struct reducer_traits<ArgMaxTupleReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = false + }; +}; + + template <typename T> struct ArgMinTupleReducer { static const bool PacketAccess = false; @@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer } }; +template <typename T, typename Device> +struct reducer_traits<ArgMinTupleReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = false + }; +}; + // Random number generation namespace { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b1645d56f..fdb5ee6b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -47,22 +47,39 @@ template <> struct max_n_1<0> { // Default packet types template <typename Scalar, typename Device> -struct PacketType { +struct PacketType : internal::packet_traits<Scalar> { typedef typename internal::packet_traits<Scalar>::type type; - enum { size = internal::unpacket_traits<type>::size }; }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) template <> -struct PacketType<float, GpuDevice> { - typedef float4 type; - static const int size = 4; -}; -template <> -struct PacketType<double, GpuDevice> { - typedef double2 type; +struct PacketType<half, GpuDevice> { + typedef half2 type; static const int size = 2; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 52cfc2824..d34f1e328 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); } - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } protected: TensorEvaluator<ArgType, Device> m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 0d1a098b7..d9bbcd858 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -130,15 +130,17 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num if (block == 0) { // We're the first block to run, initialize the output value atomicExch(output, reducer.initialize()); - unsigned int old = atomicExch(semaphore, 2u); - assert(old == 1u); + __threadfence(); + atomicExch(semaphore, 2u); } else { + // Wait for the first block to initialize the output value. // Use atomicCAS here to ensure that the reads aren't cached - unsigned int val = atomicCAS(semaphore, 2u, 2u); - while (val < 2u) { + unsigned int val; + do { val = atomicCAS(semaphore, 2u, 2u); } + while (val < 2u); } } } @@ -166,12 +168,8 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num } if (gridDim.x > 1 && threadIdx.x == 0) { - unsigned int ticket = atomicInc(semaphore, UINT_MAX); - assert(ticket >= 2u); - if (ticket == gridDim.x + 1) { - // We're the last block, reset the semaphore - *semaphore = 0; - } + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); } } @@ -330,10 +328,10 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> { // Unfortunately nvidia doesn't support well exotic types such as complex, // so reduce the scope of the optimized version of the code to the simple case // of floats and half floats. - #ifdef EIGEN_HAS_CUDA_FP16 +#ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || - (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess)); + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && internal::is_same<typename Self::CoeffReturnType, float>::value; @@ -348,7 +346,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> { return; } - FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs); + FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs); } }; @@ -610,7 +608,7 @@ struct InnerReducer<Self, Op, GpuDevice> { #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || - (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess)); + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && internal::is_same<typename Self::CoeffReturnType, float>::value; @@ -629,7 +627,7 @@ struct InnerReducer<Self, Op, GpuDevice> { return true; } - return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 031dbf6f2..5207f6a8d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -81,7 +81,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { typedef typename XprType::Index Index; static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; @@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { m_output(NULL) { // Accumulating a scalar isn't supported. - EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(m_axis >= 0 && m_axis < NumDims); // Compute stride of scan axis @@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_dimensions; + return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { @@ -136,7 +136,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { return true; } } - + template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { return internal::ploadt<PacketReturnType, LoadMode>(m_output + index); @@ -152,6 +152,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { return m_output[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { if (m_output != NULL) { m_device.deallocate(m_output); |