diff options
author | Ville Kallioniemi <ville.kallioniemi@gmail.com> | 2016-02-01 19:32:31 -0700 |
---|---|---|
committer | Ville Kallioniemi <ville.kallioniemi@gmail.com> | 2016-02-01 19:32:31 -0700 |
commit | f0fdefa96fdbdafe0daaa47a2dd54b9e77cf9716 (patch) | |
tree | e175666422c11be478eaaf68c934e1fc913fec6f /unsupported | |
parent | 02db1228ed9ca3728ae0685a5e1602fe7299ae50 (diff) | |
parent | 64ce78c2ec52aa2fd2e408c7c4160b06e8fc1a03 (diff) |
Rebase to latest.
Diffstat (limited to 'unsupported')
25 files changed, 403 insertions, 195 deletions
diff --git a/unsupported/Eigen/AlignedVector3 b/unsupported/Eigen/AlignedVector3 index f5c40a189..135eec572 100644 --- a/unsupported/Eigen/AlignedVector3 +++ b/unsupported/Eigen/AlignedVector3 @@ -188,7 +188,7 @@ template<typename _Scalar> class AlignedVector3 } template<typename Derived> - inline bool isApprox(const MatrixBase<Derived>& other, RealScalar eps=NumTraits<Scalar>::dummy_precision()) const + inline bool isApprox(const MatrixBase<Derived>& other, const RealScalar& eps=NumTraits<Scalar>::dummy_precision()) const { return m_coeffs.template head<3>().isApprox(other,eps); } diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index 456b34d0b..89aeb03e7 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -25,6 +25,16 @@ template <typename T, size_t n> class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { return values[0]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& front() const { return values[0]; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { return values[n-1]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static std::size_t size() { return n; } @@ -123,13 +133,33 @@ template <typename T> class array<T, 0> { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[] (size_t) { eigen_assert(false && "Can't index a zero size array"); - return *static_cast<T*>(NULL); + return dummy; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t) const { eigen_assert(false && "Can't index a zero size array"); - return *static_cast<const T*>(NULL); + return dummy; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& front() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; } static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } @@ -142,6 +172,9 @@ template <typename T> class array<T, 0> { eigen_assert(l.size() == 0); } #endif + + private: + T dummy; }; namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 392acf302..cca716d6f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -129,6 +129,12 @@ class TensorBase<Derived, ReadOnlyAccessors> } EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> + digamma() const { + return unaryExpr(internal::scalar_digamma_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> erf() const { return unaryExpr(internal::scalar_erf_op<Scalar>()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 624e814e2..1adb68894 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -378,7 +378,7 @@ struct TensorContractionEvaluatorBase } template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - void evalGemv(Scalar* buffer) const { + EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { const Index rows = m_i_size; const Index cols = m_k_size; @@ -516,7 +516,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT Base(op, device) { } template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - void evalProduct(Scalar* buffer) const { + EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { if (this->m_j_size == 1) { this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); return; @@ -582,10 +582,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT OutputMapper output(buffer, m); - typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType; - // Sizes of the blocks to load in cache. See the Goto paper for details. - BlockingType blocking(m, n, k, 1, true); + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1); const Index kc = blocking.kc(); const Index mc = numext::mini(m, blocking.mc()); const Index nc = numext::mini(n, blocking.nc()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 78ed5038f..3d3f6904f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -28,7 +28,7 @@ class TensorContractionBlocking { typedef typename LhsMapper::Scalar LhsScalar; typedef typename RhsMapper::Scalar RhsScalar; - TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : kc_(k), mc_(m), nc_(n) { if (ShardingType == ShardByCol) { @@ -41,9 +41,9 @@ class TensorContractionBlocking { } } - EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } - EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } - EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } private: Index kc_; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 9b6d18090..63c8ae126 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -426,15 +426,16 @@ class TensorContractionSubMapper { }; -template<typename Scalar, typename Index, int side, +template<typename Scalar_, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t, int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionInputMapper - : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { + : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: + typedef Scalar_ Scalar; typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base; typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; typedef SubMapper VectorMapper; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 576bea295..51a3b9490 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -176,10 +176,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT // compute block sizes (which depend on number of threads) const Index num_threads = this->m_device.numThreads(); - Index mc = m; - Index nc = n; - Index kc = k; - internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads); + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads); + Index mc = blocking.mc(); + Index nc = blocking.nc(); + Index kc = blocking.kc(); eigen_assert(mc <= m); eigen_assert(nc <= n); eigen_assert(kc <= k); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 367a152a0..67c797802 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -21,7 +21,7 @@ namespace Eigen { */ namespace internal { -template <typename Index, typename InputDims, size_t NumKernelDims, int Layout> +template <typename Index, typename InputDims, int NumKernelDims, int Layout> class IndexMapper { public: IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, @@ -123,7 +123,7 @@ class IndexMapper { } inputIndex += p * m_inputStrides[NumKernelDims]; } else { - int limit = 0; + std::ptrdiff_t limit = 0; if (NumKernelDims < NumDims) { limit = NumDims - NumKernelDims - 1; } @@ -147,7 +147,7 @@ class IndexMapper { } outputIndex += p * m_outputStrides[NumKernelDims]; } else { - int limit = 0; + std::ptrdiff_t limit = 0; if (NumKernelDims < NumDims) { limit = NumDims - NumKernelDims - 1; } @@ -206,7 +206,7 @@ class IndexMapper { } private: - static const size_t NumDims = internal::array_size<InputDims>::value; + static const int NumDims = internal::array_size<InputDims>::value; array<Index, NumDims> m_inputStrides; array<Index, NumDims> m_outputStrides; array<Index, NumDims> m_cudaInputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 5abdc489b..e684ab8f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface { struct GpuDevice { // The StreamInterface is not owned: the caller is // responsible for its initialization and eventual destruction. - explicit GpuDevice(const StreamInterface* stream) : stream_(stream) { + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { eigen_assert(stream); } - // TODO(bsteiner): This is an internal API, we should not expose it. EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return stream_->stream(); @@ -246,6 +248,10 @@ struct GpuDevice { #endif } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + // This function checks if the CUDA runtime recorded an error for the // underlying stream device. inline bool ok() const { @@ -259,7 +265,7 @@ struct GpuDevice { private: const StreamInterface* stream_; - + int max_blocks_; }; #ifndef __CUDA_ARCH__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index e7daf7304..bd83d5de8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -136,7 +136,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> } template<int LoadMode> - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt<Packet, LoadMode>(m_buffer + index); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d2ab70f2b..df15c6204 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run( if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; + const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); const Index size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash if we're called with tensors of size 0. const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); @@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; + const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); const Index size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash if we're called with tensors of size 0. const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c9b0b2f28..58b864787 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -106,7 +106,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_impl.evalSubExprsIfNeeded(NULL); const Index numValues = m_impl.dimensions().TotalSize(); m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); // Should initialize the memory in case we're dealing with non POD types. @@ -119,7 +118,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> EvalTo evalToTmp(m_buffer, m_op); const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value; internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device); - m_impl.cleanup(); return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 09ee0c2c6..22aea5ea4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -345,8 +345,8 @@ template <typename Self, typename Op, typename Device> struct InnerReducer { static const bool HasOptimizedImplementation = false; - static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - assert(false && "Not implemented"); + EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); } }; @@ -355,8 +355,8 @@ template <typename Self, typename Op, typename Device> struct OuterReducer { static const bool HasOptimizedImplementation = false; - static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - assert(false && "Not implemented"); + EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); } }; @@ -463,7 +463,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; } } else { - m_outputStrides[NumOutputDims - 1] = 1; + m_outputStrides.back() = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; } @@ -479,7 +479,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> input_strides[i] = input_strides[i-1] * input_dims[i-1]; } } else { - input_strides[NumInputDims - 1] = 1; + input_strides.back() = 1; for (int i = NumInputDims - 2; i >= 0; --i) { input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 98631fc7f..ed933b6ac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -41,7 +41,10 @@ class TensorStorage<T, FixedDimensions, Options_> private: static const std::size_t Size = FixedDimensions::total_size; - EIGEN_ALIGN_MAX T m_data[Size]; + // Allocate an array of size at least one to prevent compiler warnings. + static const std::size_t MinSize = max_n_1<Size>::size; + EIGEN_ALIGN_MAX T m_data[MinSize]; + FixedDimensions m_dimensions; public: @@ -105,7 +108,6 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions) { - eigen_assert(size >= 1); const Index currentSz = internal::array_prod(m_dimensions); if(size != currentSz) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 19352eb5e..0d34f7ee6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -34,11 +34,11 @@ struct TensorUInt128 LOW low; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(int x) : high(0), low(x) { + TensorUInt128(int32_t x) : high(0), low(x) { eigen_assert(x >= 0); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - TensorUInt128(unsigned int x) : high(0), low(x) { } + TensorUInt128(uint32_t x) : high(0), low(x) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(long x) : high(0), low(x) { eigen_assert(x >= 0); diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h index d3c245fa9..8e6a5aaed 100644 --- a/unsupported/Eigen/src/Splines/SplineFitting.h +++ b/unsupported/Eigen/src/Splines/SplineFitting.h @@ -167,7 +167,7 @@ namespace Eigen derivativeKnots.data(), derivativeKnots.data() + derivativeKnots.size(), temporaryKnots.data()); - // Number of control points (one for each point and derivative) plus spline order. + // Number of knots (one for each point and derivative) plus spline order. DenseIndex numKnots = numParameters + numDerivatives + degree + 1; knots.resize(numKnots); diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 97257b183..eed724bcf 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -147,13 +147,27 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_sugar "-std=c++0x") ei_add_test(cxx11_tensor_fft "-std=c++0x") ei_add_test(cxx11_tensor_ifft "-std=c++0x") + ei_add_test(cxx11_tensor_empty "-std=c++0x") - # These tests needs nvcc -# ei_add_test(cxx11_tensor_device "-std=c++0x") -# ei_add_test(cxx11_tensor_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_reduction_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_random_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_argmax_cuda "-std=c++0x") +endif() +# These tests needs nvcc +find_package(CUDA 7.0) +if(CUDA_FOUND) + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) + endif() + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30") + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + + ei_add_test(cxx11_tensor_device) + ei_add_test(cxx11_tensor_cuda) + ei_add_test(cxx11_tensor_contract_cuda) + ei_add_test(cxx11_tensor_reduction_cuda) + ei_add_test(cxx11_tensor_random_cuda) + ei_add_test(cxx11_tensor_argmax_cuda) + + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cpp b/unsupported/test/cxx11_tensor_argmax_cuda.cu index d37490d15..45311d4f7 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cpp +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -56,6 +56,10 @@ void test_cuda_simple_argmax() VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); + + cudaFree(d_in); + cudaFree(d_out_max); + cudaFree(d_out_min); } template <int DataLayout> @@ -141,6 +145,9 @@ void test_cuda_argmax_dim() // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } + + cudaFree(d_in); + cudaFree(d_out); } } @@ -227,15 +234,18 @@ void test_cuda_argmin_dim() // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } + + cudaFree(d_in); + cudaFree(d_out); } } void test_cxx11_tensor_cuda() { - CALL_SUBTEST(test_cuda_simple_argmax<RowMajor>()); - CALL_SUBTEST(test_cuda_simple_argmax<ColMajor>()); - CALL_SUBTEST(test_cuda_argmax_dim<RowMajor>()); - CALL_SUBTEST(test_cuda_argmax_dim<ColMajor>()); - CALL_SUBTEST(test_cuda_argmin_dim<RowMajor>()); - CALL_SUBTEST(test_cuda_argmin_dim<ColMajor>()); + CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>()); + CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>()); + CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>()); + CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>()); + CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>()); + CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>()); } diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cu index 035a093e6..6d1ef07f9 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cpp +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -22,16 +22,16 @@ using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; template<int DataLayout> -static void test_cuda_contraction(int m_size, int k_size, int n_size) +void test_cuda_contraction(int m_size, int k_size, int n_size) { - cout<<"Calling with ("<<m_size<<","<<k_size<<","<<n_size<<")"<<std::endl; + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size)); - Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size)); - Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size)); - Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size)); + Tensor<float, 2, DataLayout> t_left(m_size, k_size); + Tensor<float, 2, DataLayout> t_right(k_size, n_size); + Tensor<float, 2, DataLayout> t_result(m_size, n_size); + Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size); Eigen::array<DimPair, 1> dims(DimPair(1, 0)); t_left.setRandom(); @@ -67,12 +67,16 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) t_result = t_left.contract(t_right, dims); cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); - for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { - if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) { - cout << "mismatch detected at index " << i << ": " << t_result.data()[i] - << " vs " << t_result_gpu.data()[i] << endl; - assert(false); + for (size_t i = 0; i < t_result.size(); i++) { + if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { + continue; } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); } cudaFree((void*)d_t_left); @@ -80,41 +84,69 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) cudaFree((void*)d_t_result); } - -void test_cxx11_tensor_cuda() -{ - cout<<"Calling contraction tests"<<std::endl; - CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128)); +template<int DataLayout> +void test_cuda_contraction_m() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128)); + test_cuda_contraction<ColMajor>(k, 128, 128); + test_cuda_contraction<RowMajor>(k, 128, 128); } +} + +template<int DataLayout> +void test_cuda_contraction_k() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k)); + test_cuda_contraction<ColMajor>(128, k, 128); + test_cuda_contraction<RowMajor>(128, k, 128); } +} + +template<int DataLayout> +void test_cuda_contraction_n() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128)); + test_cuda_contraction<ColMajor>(128, 128, k); + test_cuda_contraction<RowMajor>(128, 128, k); } +} - int m_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; - int n_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; - - int k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; - for (int i = 0; i <15; i++) - for (int j = 0; j < 15; j++) +template<int DataLayout> +void test_cuda_contraction_sizes() { + int m_sizes[] = { 31, 39, 63, 64, 65, + 127, 129, 255, 257 , 511, + 512, 513, 1023, 1024, 1025}; + + int n_sizes[] = { 31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025}; + + int k_sizes[] = { 31, 39, 63, 64, 65, + 95, 96, 127, 129, 255, + 257, 511, 512, 513, 1023, + 1024, 1025}; + + for (int i = 0; i < 15; i++) { + for (int j = 0; j < 15; j++) { for (int k = 0; k < 17; k++) { - CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k])); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k])); + test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); } + } + } +} + +void test_cxx11_tensor_cuda() +{ + CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128)); + + CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>()); + CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>()); + + CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>()); + CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>()); + + CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>()); + CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>()); + + CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>()); + CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cu index 49e1894ab..60f9314a5 100644 --- a/unsupported/test/cxx11_tensor_cuda.cpp +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -63,6 +63,10 @@ void test_cuda_elementwise_small() { out(Eigen::array<int, 1>(i)), in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i))); } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); } void test_cuda_elementwise() @@ -113,6 +117,11 @@ void test_cuda_elementwise() } } } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); } void test_cuda_reduction() @@ -131,8 +140,7 @@ void test_cuda_reduction() cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); @@ -159,10 +167,13 @@ void test_cuda_reduction() VERIFY_IS_APPROX(out(i,j), expected); } } + + cudaFree(d_in1); + cudaFree(d_out); } template<int DataLayout> -static void test_cuda_contraction() +void test_cuda_contraction() { // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on @@ -189,8 +200,7 @@ static void test_cuda_contraction() cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); @@ -214,14 +224,18 @@ static void test_cuda_contraction() for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl; + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); } } + + cudaFree(d_t_left); + cudaFree(d_t_right); + cudaFree(d_t_result); } template<int DataLayout> -static void test_cuda_convolution_1d() +void test_cuda_convolution_1d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 1, DataLayout> kernel(4); @@ -243,8 +257,7 @@ static void test_cuda_convolution_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); @@ -269,9 +282,13 @@ static void test_cuda_convolution_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } -static void test_cuda_convolution_inner_dim_col_major_1d() +void test_cuda_convolution_inner_dim_col_major_1d() { Tensor<float, 4, ColMajor> input(74,9,11,7); Tensor<float, 1, ColMajor> kernel(4); @@ -293,8 +310,7 @@ static void test_cuda_convolution_inner_dim_col_major_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); @@ -319,9 +335,13 @@ static void test_cuda_convolution_inner_dim_col_major_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } -static void test_cuda_convolution_inner_dim_row_major_1d() +void test_cuda_convolution_inner_dim_row_major_1d() { Tensor<float, 4, RowMajor> input(7,9,11,74); Tensor<float, 1, RowMajor> kernel(4); @@ -343,8 +363,7 @@ static void test_cuda_convolution_inner_dim_row_major_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); @@ -369,10 +388,14 @@ static void test_cuda_convolution_inner_dim_row_major_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } template<int DataLayout> -static void test_cuda_convolution_2d() +void test_cuda_convolution_2d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 2, DataLayout> kernel(3,4); @@ -394,8 +417,7 @@ static void test_cuda_convolution_2d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); @@ -430,10 +452,14 @@ static void test_cuda_convolution_2d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } template<int DataLayout> -static void test_cuda_convolution_3d() +void test_cuda_convolution_3d() { Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17)); Tensor<float, 3, DataLayout> kernel(3,4,2); @@ -455,8 +481,7 @@ static void test_cuda_convolution_3d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); @@ -505,6 +530,10 @@ static void test_cuda_convolution_3d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } @@ -542,6 +571,9 @@ void test_cuda_lgamma(const Scalar stddev) VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); } } + + cudaFree(d_in); + cudaFree(d_out); } template <typename Scalar> @@ -578,6 +610,9 @@ void test_cuda_erf(const Scalar stddev) VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); } } + + cudaFree(d_in); + cudaFree(d_out); } template <typename Scalar> @@ -614,51 +649,50 @@ void test_cuda_erfc(const Scalar stddev) VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); } } + + cudaFree(d_in); + cudaFree(d_out); } void test_cxx11_tensor_cuda() { - CALL_SUBTEST(test_cuda_elementwise_small()); - CALL_SUBTEST(test_cuda_elementwise()); - CALL_SUBTEST(test_cuda_reduction()); - CALL_SUBTEST(test_cuda_contraction<ColMajor>()); - CALL_SUBTEST(test_cuda_contraction<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>()); - CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d()); - CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>()); - CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>()); - CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>()); - CALL_SUBTEST(test_cuda_lgamma<float>(1.0f)); - CALL_SUBTEST(test_cuda_lgamma<float>(100.0f)); - CALL_SUBTEST(test_cuda_lgamma<float>(0.01f)); - CALL_SUBTEST(test_cuda_lgamma<float>(0.001f)); - CALL_SUBTEST(test_cuda_erf<float>(1.0f)); - CALL_SUBTEST(test_cuda_erf<float>(100.0f)); - CALL_SUBTEST(test_cuda_erf<float>(0.01f)); - CALL_SUBTEST(test_cuda_erf<float>(0.001f)); - CALL_SUBTEST(test_cuda_erfc<float>(1.0f)); + CALL_SUBTEST_1(test_cuda_elementwise_small()); + CALL_SUBTEST_1(test_cuda_elementwise()); + CALL_SUBTEST_1(test_cuda_reduction()); + CALL_SUBTEST_2(test_cuda_contraction<ColMajor>()); + CALL_SUBTEST_2(test_cuda_contraction<RowMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>()); + CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f)); + CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f)); + CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f)); + CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f)); + CALL_SUBTEST_4(test_cuda_erf<float>(1.0f)); + CALL_SUBTEST_4(test_cuda_erf<float>(100.0f)); + CALL_SUBTEST_4(test_cuda_erf<float>(0.01f)); + CALL_SUBTEST_4(test_cuda_erf<float>(0.001f)); + CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f)); // CALL_SUBTEST(test_cuda_erfc<float>(100.0f)); - CALL_SUBTEST(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST(test_cuda_erfc<float>(0.01f)); - CALL_SUBTEST(test_cuda_erfc<float>(0.001f)); - CALL_SUBTEST(test_cuda_tanh<double>(1.0)); - CALL_SUBTEST(test_cuda_tanh<double>(100.0)); - CALL_SUBTEST(test_cuda_tanh<double>(0.01)); - CALL_SUBTEST(test_cuda_tanh<double>(0.001)); - CALL_SUBTEST(test_cuda_lgamma<double>(1.0)); - CALL_SUBTEST(test_cuda_lgamma<double>(100.0)); - CALL_SUBTEST(test_cuda_lgamma<double>(0.01)); - CALL_SUBTEST(test_cuda_lgamma<double>(0.001)); - CALL_SUBTEST(test_cuda_erf<double>(1.0)); - CALL_SUBTEST(test_cuda_erf<double>(100.0)); - CALL_SUBTEST(test_cuda_erf<double>(0.01)); - CALL_SUBTEST(test_cuda_erf<double>(0.001)); - CALL_SUBTEST(test_cuda_erfc<double>(1.0)); + CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f)); + CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f)); + CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0)); + CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0)); + CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01)); + CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001)); + CALL_SUBTEST_4(test_cuda_erf<double>(1.0)); + CALL_SUBTEST_4(test_cuda_erf<double>(100.0)); + CALL_SUBTEST_4(test_cuda_erf<double>(0.01)); + CALL_SUBTEST_4(test_cuda_erf<double>(0.001)); + CALL_SUBTEST_4(test_cuda_erfc<double>(1.0)); // CALL_SUBTEST(test_cuda_erfc<double>(100.0)); - CALL_SUBTEST(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST(test_cuda_erfc<double>(0.01)); - CALL_SUBTEST(test_cuda_erfc<double>(0.001)); + CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST_4(test_cuda_erfc<double>(0.01)); + CALL_SUBTEST_4(test_cuda_erfc<double>(0.001)); } diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cu index ed5dd7505..cbe9e6449 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cu @@ -109,19 +109,19 @@ struct GPUContext { // The actual expression to evaluate template <typename Context> -static void test_contextual_eval(Context* context) +void test_contextual_eval(Context* context) { context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } template <typename Context> -static void test_forced_contextual_eval(Context* context) +void test_forced_contextual_eval(Context* context) { context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); } template <typename Context> -static void test_compound_assignment(Context* context) +void test_compound_assignment(Context* context) { context->out().device(context->device()) = context->in1().constant(2.718f); context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; @@ -129,7 +129,7 @@ static void test_compound_assignment(Context* context) template <typename Context> -static void test_contraction(Context* context) +void test_contraction(Context* context) { Eigen::array<std::pair<int, int>, 2> dims; dims[0] = std::make_pair(1, 1); @@ -145,7 +145,7 @@ static void test_contraction(Context* context) template <typename Context> -static void test_1d_convolution(Context* context) +void test_1d_convolution(Context* context) { Eigen::DSizes<int, 3> indices(0,0,0); Eigen::DSizes<int, 3> sizes(40,49,70); @@ -155,7 +155,7 @@ static void test_1d_convolution(Context* context) } template <typename Context> -static void test_2d_convolution(Context* context) +void test_2d_convolution(Context* context) { Eigen::DSizes<int, 3> indices(0,0,0); Eigen::DSizes<int, 3> sizes(40,49,69); @@ -165,7 +165,7 @@ static void test_2d_convolution(Context* context) } template <typename Context> -static void test_3d_convolution(Context* context) +void test_3d_convolution(Context* context) { Eigen::DSizes<int, 3> indices(0,0,0); Eigen::DSizes<int, 3> sizes(39,49,69); @@ -175,7 +175,7 @@ static void test_3d_convolution(Context* context) } -static void test_cpu() { +void test_cpu() { Eigen::Tensor<float, 3> in1(40,50,70); Eigen::Tensor<float, 3> in2(40,50,70); Eigen::Tensor<float, 3> out(40,50,70); @@ -267,7 +267,7 @@ static void test_cpu() { } } -static void test_gpu() { +void test_gpu() { Eigen::Tensor<float, 3> in1(40,50,70); Eigen::Tensor<float, 3> in2(40,50,70); Eigen::Tensor<float, 3> out(40,50,70); @@ -383,6 +383,6 @@ static void test_gpu() { void test_cxx11_tensor_device() { - CALL_SUBTEST(test_cpu()); - CALL_SUBTEST(test_gpu()); + CALL_SUBTEST_1(test_cpu()); + CALL_SUBTEST_2(test_gpu()); } diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp new file mode 100644 index 000000000..9130fff35 --- /dev/null +++ b/unsupported/test/cxx11_tensor_empty.cpp @@ -0,0 +1,40 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + + +static void test_empty_tensor() +{ + Tensor<float, 2> source; + Tensor<float, 2> tgt1 = source; + Tensor<float, 2> tgt2(source); + Tensor<float, 2> tgt3; + tgt3 = tgt1; + tgt3 = tgt2; +} + +static void test_empty_fixed_size_tensor() +{ + TensorFixedSize<float, Sizes<0>> source; + TensorFixedSize<float, Sizes<0>> tgt1 = source; + TensorFixedSize<float, Sizes<0>> tgt2(source); + TensorFixedSize<float, Sizes<0>> tgt3; + tgt3 = tgt1; + tgt3 = tgt2; +} + + +void test_cxx11_tensor_empty() +{ + CALL_SUBTEST(test_empty_tensor()); + CALL_SUBTEST(test_empty_fixed_size_tensor()); +} diff --git a/unsupported/test/cxx11_tensor_random_cuda.cpp b/unsupported/test/cxx11_tensor_random_cuda.cu index 5d091de15..5d091de15 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cpp +++ b/unsupported/test/cxx11_tensor_random_cuda.cu diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cpp b/unsupported/test/cxx11_tensor_reduction_cuda.cu index 9e06eb126..cad0c08e0 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cpp +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -48,9 +48,12 @@ static void test_full_reductions() { // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); } void test_cxx11_tensor_reduction_cuda() { - CALL_SUBTEST(test_full_reductions<ColMajor>()); - CALL_SUBTEST(test_full_reductions<RowMajor>()); + CALL_SUBTEST_1(test_full_reductions<ColMajor>()); + CALL_SUBTEST_2(test_full_reductions<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e28cf55e2..e46197464 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -17,7 +17,7 @@ using Eigen::Tensor; -static void test_multithread_elementwise() +void test_multithread_elementwise() { Tensor<float, 3> in1(2,3,7); Tensor<float, 3> in2(2,3,7); @@ -40,7 +40,7 @@ static void test_multithread_elementwise() } -static void test_multithread_compound_assignment() +void test_multithread_compound_assignment() { Tensor<float, 3> in1(2,3,7); Tensor<float, 3> in2(2,3,7); @@ -64,7 +64,7 @@ static void test_multithread_compound_assignment() } template<int DataLayout> -static void test_multithread_contraction() +void test_multithread_contraction() { Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31); Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10); @@ -91,15 +91,20 @@ static void test_multithread_contraction() for (ptrdiff_t i = 0; i < t_result.size(); i++) { VERIFY(&t_result.data()[i] != &m_result.data()[i]); - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; - assert(false); + if (fabs(t_result(i) - m_result(i)) < 1e-4) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) { + continue; } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << m_result(i) << std::endl; + assert(false); } } template<int DataLayout> -static void test_contraction_corner_cases() +void test_contraction_corner_cases() { Tensor<float, 2, DataLayout> t_left(32, 500); Tensor<float, 2, DataLayout> t_right(32, 28*28); @@ -186,7 +191,7 @@ static void test_contraction_corner_cases() } template<int DataLayout> -static void test_multithread_contraction_agrees_with_singlethread() { +void test_multithread_contraction_agrees_with_singlethread() { int contract_size = internal::random<int>(1, 5000); Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80), @@ -229,7 +234,7 @@ static void test_multithread_contraction_agrees_with_singlethread() { template<int DataLayout> -static void test_multithreaded_reductions() { +void test_multithreaded_reductions() { const int num_threads = internal::random<int>(3, 11); ThreadPool thread_pool(num_threads); Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads); @@ -239,19 +244,19 @@ static void test_multithreaded_reductions() { Tensor<float, 2, DataLayout> t1(num_rows, num_cols); t1.setRandom(); - Tensor<float, 1, DataLayout> full_redux(1); + Tensor<float, 0, DataLayout> full_redux; full_redux = t1.sum(); - Tensor<float, 1, DataLayout> full_redux_tp(1); + Tensor<float, 0, DataLayout> full_redux_tp; full_redux_tp.device(thread_pool_device) = t1.sum(); // Check that the single threaded and the multi threaded reductions return // the same result. - VERIFY_IS_APPROX(full_redux(0), full_redux_tp(0)); + VERIFY_IS_APPROX(full_redux(), full_redux_tp()); } -static void test_memcpy() { +void test_memcpy() { for (int i = 0; i < 5; ++i) { const int num_threads = internal::random<int>(3, 11); @@ -270,7 +275,7 @@ static void test_memcpy() { } -static void test_multithread_random() +void test_multithread_random() { Eigen::ThreadPool tp(2); Eigen::ThreadPoolDevice device(&tp, 2); @@ -278,26 +283,52 @@ static void test_multithread_random() t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>(); } +template<int DataLayout> +void test_multithread_shuffle() +{ + Tensor<float, 4, DataLayout> tensor(17,5,7,11); + tensor.setRandom(); + + const int num_threads = internal::random<int>(2, 11); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor<float, 4, DataLayout> shuffle(7,5,11,17); + array<ptrdiff_t, 4> shuffles = {{2,1,3,0}}; + shuffle.device(device) = tensor.shuffle(shuffles); + + for (int i = 0; i < 17; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i)); + } + } + } + } +} + void test_cxx11_tensor_thread_pool() { - CALL_SUBTEST(test_multithread_elementwise()); - CALL_SUBTEST(test_multithread_compound_assignment()); + CALL_SUBTEST_1(test_multithread_elementwise()); + CALL_SUBTEST_1(test_multithread_compound_assignment()); - CALL_SUBTEST(test_multithread_contraction<ColMajor>()); - CALL_SUBTEST(test_multithread_contraction<RowMajor>()); + CALL_SUBTEST_2(test_multithread_contraction<ColMajor>()); + CALL_SUBTEST_2(test_multithread_contraction<RowMajor>()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<ColMajor>()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<RowMajor>()); + CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>()); + CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>()); // Exercise various cases that have been problematic in the past. - CALL_SUBTEST(test_contraction_corner_cases<ColMajor>()); - CALL_SUBTEST(test_contraction_corner_cases<RowMajor>()); - - CALL_SUBTEST(test_multithreaded_reductions<ColMajor>()); - CALL_SUBTEST(test_multithreaded_reductions<RowMajor>()); + CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>()); + CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>()); - CALL_SUBTEST(test_memcpy()); + CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>()); + CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>()); - CALL_SUBTEST(test_multithread_random()); + CALL_SUBTEST_6(test_memcpy()); + CALL_SUBTEST_6(test_multithread_random()); + CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>()); + CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>()); } |