diff options
author | Ville Kallioniemi <ville.kallioniemi@gmail.com> | 2016-02-01 19:32:31 -0700 |
---|---|---|
committer | Ville Kallioniemi <ville.kallioniemi@gmail.com> | 2016-02-01 19:32:31 -0700 |
commit | f0fdefa96fdbdafe0daaa47a2dd54b9e77cf9716 (patch) | |
tree | e175666422c11be478eaaf68c934e1fc913fec6f /unsupported/Eigen/CXX11/src | |
parent | 02db1228ed9ca3728ae0685a5e1602fe7299ae50 (diff) | |
parent | 64ce78c2ec52aa2fd2e408c7c4160b06e8fc1a03 (diff) |
Rebase to latest.
Diffstat (limited to 'unsupported/Eigen/CXX11/src')
14 files changed, 84 insertions, 40 deletions
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index 456b34d0b..89aeb03e7 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -25,6 +25,16 @@ template <typename T, size_t n> class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { return values[0]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& front() const { return values[0]; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { return values[n-1]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static std::size_t size() { return n; } @@ -123,13 +133,33 @@ template <typename T> class array<T, 0> { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[] (size_t) { eigen_assert(false && "Can't index a zero size array"); - return *static_cast<T*>(NULL); + return dummy; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t) const { eigen_assert(false && "Can't index a zero size array"); - return *static_cast<const T*>(NULL); + return dummy; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& front() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; } static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } @@ -142,6 +172,9 @@ template <typename T> class array<T, 0> { eigen_assert(l.size() == 0); } #endif + + private: + T dummy; }; namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 392acf302..cca716d6f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -129,6 +129,12 @@ class TensorBase<Derived, ReadOnlyAccessors> } EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> + digamma() const { + return unaryExpr(internal::scalar_digamma_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> erf() const { return unaryExpr(internal::scalar_erf_op<Scalar>()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 624e814e2..1adb68894 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -378,7 +378,7 @@ struct TensorContractionEvaluatorBase } template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - void evalGemv(Scalar* buffer) const { + EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { const Index rows = m_i_size; const Index cols = m_k_size; @@ -516,7 +516,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT Base(op, device) { } template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - void evalProduct(Scalar* buffer) const { + EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { if (this->m_j_size == 1) { this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); return; @@ -582,10 +582,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT OutputMapper output(buffer, m); - typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType; - // Sizes of the blocks to load in cache. See the Goto paper for details. - BlockingType blocking(m, n, k, 1, true); + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1); const Index kc = blocking.kc(); const Index mc = numext::mini(m, blocking.mc()); const Index nc = numext::mini(n, blocking.nc()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 78ed5038f..3d3f6904f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -28,7 +28,7 @@ class TensorContractionBlocking { typedef typename LhsMapper::Scalar LhsScalar; typedef typename RhsMapper::Scalar RhsScalar; - TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : kc_(k), mc_(m), nc_(n) { if (ShardingType == ShardByCol) { @@ -41,9 +41,9 @@ class TensorContractionBlocking { } } - EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } - EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } - EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } private: Index kc_; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 9b6d18090..63c8ae126 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -426,15 +426,16 @@ class TensorContractionSubMapper { }; -template<typename Scalar, typename Index, int side, +template<typename Scalar_, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t, int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionInputMapper - : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { + : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: + typedef Scalar_ Scalar; typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base; typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; typedef SubMapper VectorMapper; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 576bea295..51a3b9490 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -176,10 +176,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // compute block sizes (which depend on number of threads) const Index num_threads = this->m_device.numThreads(); - Index mc = m; - Index nc = n; - Index kc = k; - internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads); + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads); + Index mc = blocking.mc(); + Index nc = blocking.nc(); + Index kc = blocking.kc(); eigen_assert(mc <= m); eigen_assert(nc <= n); eigen_assert(kc <= k); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 367a152a0..67c797802 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -21,7 +21,7 @@ namespace Eigen { */ namespace internal { -template <typename Index, typename InputDims, size_t NumKernelDims, int Layout> +template <typename Index, typename InputDims, int NumKernelDims, int Layout> class IndexMapper { public: IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, @@ -123,7 +123,7 @@ class IndexMapper { } inputIndex += p * m_inputStrides[NumKernelDims]; } else { - int limit = 0; + std::ptrdiff_t limit = 0; if (NumKernelDims < NumDims) { limit = NumDims - NumKernelDims - 1; } @@ -147,7 +147,7 @@ class IndexMapper { } outputIndex += p * m_outputStrides[NumKernelDims]; } else { - int limit = 0; + std::ptrdiff_t limit = 0; if (NumKernelDims < NumDims) { limit = NumDims - NumKernelDims - 1; } @@ -206,7 +206,7 @@ class IndexMapper { } private: - static const size_t NumDims = internal::array_size<InputDims>::value; + static const int NumDims = internal::array_size<InputDims>::value; array<Index, NumDims> m_inputStrides; array<Index, NumDims> m_outputStrides; array<Index, NumDims> m_cudaInputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 5abdc489b..e684ab8f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface { struct GpuDevice { // The StreamInterface is not owned: the caller is // responsible for its initialization and eventual destruction. - explicit GpuDevice(const StreamInterface* stream) : stream_(stream) { + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { eigen_assert(stream); } - // TODO(bsteiner): This is an internal API, we should not expose it. EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return stream_->stream(); @@ -246,6 +248,10 @@ struct GpuDevice { #endif } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + // This function checks if the CUDA runtime recorded an error for the // underlying stream device. inline bool ok() const { @@ -259,7 +265,7 @@ struct GpuDevice { private: const StreamInterface* stream_; - + int max_blocks_; }; #ifndef __CUDA_ARCH__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index e7daf7304..bd83d5de8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -136,7 +136,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> } template<int LoadMode> - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt<Packet, LoadMode>(m_buffer + index); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d2ab70f2b..df15c6204 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run( if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; + const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); const Index size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash if we're called with tensors of size 0. const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); @@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; + const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); const Index size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash if we're called with tensors of size 0. const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c9b0b2f28..58b864787 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -106,7 +106,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_impl.evalSubExprsIfNeeded(NULL); const Index numValues = m_impl.dimensions().TotalSize(); m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); // Should initialize the memory in case we're dealing with non POD types. @@ -119,7 +118,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> EvalTo evalToTmp(m_buffer, m_op); const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value; internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device); - m_impl.cleanup(); return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 09ee0c2c6..22aea5ea4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -345,8 +345,8 @@ template <typename Self, typename Op, typename Device> struct InnerReducer { static const bool HasOptimizedImplementation = false; - static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - assert(false && "Not implemented"); + EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); } }; @@ -355,8 +355,8 @@ template <typename Self, typename Op, typename Device> struct OuterReducer { static const bool HasOptimizedImplementation = false; - static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - assert(false && "Not implemented"); + EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); } }; @@ -463,7 +463,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; } } else { - m_outputStrides[NumOutputDims - 1] = 1; + m_outputStrides.back() = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; } @@ -479,7 +479,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> input_strides[i] = input_strides[i-1] * input_dims[i-1]; } } else { - input_strides[NumInputDims - 1] = 1; + input_strides.back() = 1; for (int i = NumInputDims - 2; i >= 0; --i) { input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 98631fc7f..ed933b6ac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -41,7 +41,10 @@ class TensorStorage<T, FixedDimensions, Options_> private: static const std::size_t Size = FixedDimensions::total_size; - EIGEN_ALIGN_MAX T m_data[Size]; + // Allocate an array of size at least one to prevent compiler warnings. + static const std::size_t MinSize = max_n_1<Size>::size; + EIGEN_ALIGN_MAX T m_data[MinSize]; + FixedDimensions m_dimensions; public: @@ -105,7 +108,6 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions) { - eigen_assert(size >= 1); const Index currentSz = internal::array_prod(m_dimensions); if(size != currentSz) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 19352eb5e..0d34f7ee6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -34,11 +34,11 @@ struct TensorUInt128 LOW low; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(int x) : high(0), low(x) { + TensorUInt128(int32_t x) : high(0), low(x) { eigen_assert(x >= 0); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(unsigned int x) : high(0), low(x) { } + TensorUInt128(uint32_t x) : high(0), low(x) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(long x) : high(0), low(x) { eigen_assert(x >= 0); |