From 99d75235a9567865d2c070a2840d54c8a5ad0f43 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 13 Oct 2014 17:02:09 -0700
Subject: Misc improvements and cleanups

---
 Eigen/src/Core/GenericPacketMath.h                 |  15 +-
 unsupported/Eigen/CXX11/Tensor                     |   4 +
 .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h   |   5 +
 .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h   | 101 ++++++++-
 unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h  |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h    |   8 +-
 .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h    |   8 +-
 .../Eigen/CXX11/src/Tensor/TensorConvolution.h     |  12 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h  |  35 ++++
 .../Eigen/CXX11/src/Tensor/TensorDeviceType.h      |  73 ++++---
 .../Eigen/CXX11/src/Tensor/TensorDimensions.h      |   2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h  |   2 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h       |  20 +-
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h        |  36 +++-
 .../Eigen/CXX11/src/Tensor/TensorFixedSize.h       |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h  |  26 ++-
 unsupported/Eigen/CXX11/src/Tensor/TensorMap.h     |  22 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h |   4 +-
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h       |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h |   9 +-
 .../Eigen/CXX11/src/Tensor/TensorStriding.h        |  61 ++++--
 unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h  |  32 +--
 unsupported/test/CMakeLists.txt                    |   1 +
 unsupported/test/cxx11_tensor_assign.cpp           |  35 +++-
 unsupported/test/cxx11_tensor_convolution.cpp      |  70 +++++++
 unsupported/test/cxx11_tensor_device.cpp           |  27 +++
 unsupported/test/cxx11_tensor_morphing.cpp         |   5 +-
 unsupported/test/cxx11_tensor_of_complex.cpp       |  64 ++++++
 unsupported/test/cxx11_tensor_thread_pool.cpp      | 232 ++++++++++++++++++++-
 29 files changed, 780 insertions(+), 141 deletions(-)
 create mode 100644 unsupported/test/cxx11_tensor_of_complex.cpp

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index e6fea5bba..3ef3475c7 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -359,7 +359,7 @@ pmadd(const Packet& a,
 /** \internal \returns a packet version of \a *from.
   * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
 template<typename Packet, int LoadMode>
-inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
 {
   if(LoadMode == Aligned)
     return pload<Packet>(from);
@@ -370,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
 /** \internal copy the packet \a from to \a *to.
   * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
 template<typename Scalar, typename Packet, int StoreMode>
-inline void pstoret(Scalar* to, const Packet& from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
 {
   if(LoadMode == Aligned)
     pstore(to, from);
@@ -378,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from)
     pstoreu(to, from);
 }
 
+/** \internal \returns a packet version of \a *from.
+  * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
+  * hardware if available to speed up the loading of data that won't be modified
+  * by the current computation.
+  */
+template<typename Packet, int LoadMode>
+inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+{
+  return ploadt<Packet, LoadMode>(from);
+}
+
 /** \internal default implementation of palign() allowing partial specialization */
 template<int Offset,typename PacketType>
 struct palign_impl
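For context on how the new entry point is meant to be called: `ploadt_ro` is a drop-in replacement for `ploadt` that a backend may specialize to route const data through a read-only memory path (for example `__ldg` on recent CUDA hardware, which the TensorEvaluator change later in this patch uses). A minimal host-side usage sketch, assuming the patched headers; `sum_lanes` is a hypothetical helper, not part of the patch:

```cpp
#include <Eigen/Core>

// Read one packet's worth of floats with the strongest guarantee the call
// site can prove. ploadt_ro<Packet, Mode> falls back to ploadt, and hence to
// pload/pstoreu semantics, on targets without a read-only load path.
float sum_lanes(const float* data, bool is_aligned) {
  using namespace Eigen;
  typedef internal::packet_traits<float>::type Packet;  // e.g. __m128 on SSE
  Packet p = is_aligned ? internal::ploadt_ro<Packet, Aligned>(data)
                        : internal::ploadt_ro<Packet, Unaligned>(data);
  return internal::predux(p);  // horizontal sum of the packet lanes
}
```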
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 0dac95e45..2137f4276 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -30,6 +30,10 @@
 #include
 #include
 
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
+
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 #include <cuda_runtime.h>
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
index 227522ecb..e30eb6ad8 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
@@ -66,6 +66,11 @@ template<std::size_t I, class T, std::size_t N> constexpr inline T& array_
 template<std::size_t I, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
 template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
 
+template<std::size_t I, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
+
+
 #undef STD_GET_ARR_HACK
 
 template<class T> struct array_size;
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
index 4c6b95773..e45d0a3b1 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
@@ -48,7 +48,8 @@ template <typename T, size_t n> class array {
     values[2] = v3;
   }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4) {
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
+                            const T& v4) {
     EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
     values[0] = v1;
     values[1] = v2;
@@ -56,7 +57,8 @@ template <typename T, size_t n> class array {
     values[3] = v4;
   }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5) {
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5) {
     EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
     values[0] = v1;
     values[1] = v2;
@@ -64,6 +66,43 @@ template <typename T, size_t n> class array {
     values[3] = v4;
     values[4] = v5;
   }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6) {
+    EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6, const T& v7) {
+    EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(
+      const T& v1, const T& v2, const T& v3, const T& v4,
+      const T& v5, const T& v6, const T& v7, const T& v8) {
+    EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+    values[7] = v8;
+  }
 
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
   array(std::initializer_list<T> l) {
@@ -93,9 +132,11 @@ template<typename T, typename Tail=empty_list> struct type_list {
 struct null_type { };
 
-template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type, typename T4 = null_type, typename T5 = null_type>
+template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type, typename T4 = null_type, typename T5 = null_type, typename T6 = null_type, typename T7 = null_type, typename T8 = null_type>
 struct make_type_list {
-  typedef typename make_type_list<T2, T3, T4, T5>::type tailresult;
+  typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult;
 
   typedef type_list<T1, tailresult> type;
 };
@@ -150,6 +191,23 @@ template <typename T, T V> struct gen_numeric_list_repeated<T, V, 5> {
   typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
 };
 
+template <typename T, T V> struct gen_numeric_list_repeated<T, V, 6> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template <typename T, T V> struct gen_numeric_list_repeated<T, V, 7> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V> >::type type;
+};
+
+template <typename T, T V> struct gen_numeric_list_repeated<T, V, 8> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V> >::type type;
+};
+
 template <std::size_t index, class NList> struct get;
@@ -174,6 +232,7 @@ template <> struct arg_prod<empty_list> {
   static const int value = 1;
 };
 
+
 template<int n, typename t> array<t, n> repeat(t v) {
   array<t, n> array;
   array.fill(v);
   return array;
 }
@@ -190,6 +249,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_l
   return get<Index, type_list<Head, Tail> >::value;
 }
 
+template <typename NList>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) {
+  return arg_prod<NList>::value;
+};
+
 template<typename t, std::size_t n>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
   t prod = 1;
@@ -201,6 +265,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) {
   return 0;
 }
 
+template <typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+  eigen_assert(a.size() > 0);
+  t prod = 1;
+  for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
+  return prod;
+}
+
 template<std::size_t I, class T, std::size_t N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
   return a[I];
@@ -210,12 +282,31 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
   return a[I];
 }
 
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) {
+  return a[I];
+}
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) {
+  return a[I];
+}
 
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const array<T,N> > {
+  static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const array<T,N>& > {
+  static const size_t value = N;
+};
 template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<array<T,N> > {
   static const size_t value = N;
 };
-
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<array<T,N>& > {
+  static const size_t value = N;
+};
 
 struct sum_op {
   template<class A, class B> static inline bool run(A a, B b) { return a + b; }
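The `std::vector` overloads above let runtime-sized dimension lists flow through the same helpers as fixed-size arrays. A standalone restatement of the new `array_prod` overload, showing the computation it performs (plain C++, no Eigen headers required; the `shape` values are arbitrary):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Mirrors the array_prod(const std::vector<t>&) overload added above: the
// product of all entries, i.e. the total coefficient count of a tensor
// whose shape is only known at runtime.
template <typename t>
t array_prod(const std::vector<t>& a) {
  assert(a.size() > 0);
  t prod = 1;
  for (size_t i = 0; i < a.size(); ++i) prod *= a[i];
  return prod;
}

int main() {
  std::vector<int> shape;  // e.g. the dimensions of a rank-3 tensor
  shape.push_back(3);
  shape.push_back(5);
  shape.push_back(7);
  assert(array_prod(shape) == 105);  // 3 * 5 * 7 coefficients in total
  return 0;
}
```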
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 3bfe80c9e..e973c00d3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -131,8 +131,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
-    static const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
-    static const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
+    const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
+    const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
     m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
   }
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 27c10f64f..6018ecc66 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -30,6 +30,12 @@ class TensorBase
     typedef Scalar CoeffReturnType;
     typedef typename internal::packet_traits<Scalar>::type PacketReturnType;
 
+    // Dimensions
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return derived().dimensions()[n]; }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(derived().dimensions()); }
+
     // Nullary operators
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived>
@@ -187,7 +193,7 @@ class TensorBase
     }
 
     // Contractions.
-    typedef std::pair<Index, Index> DimensionPair;
+    typedef Eigen::IndexPair<Index> DimensionPair;
 
     template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorContractionOp
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 3b2a9c8b9..0e55d4de1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -48,7 +48,7 @@ struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
 
 template<typename Broadcast, typename XprType>
-class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, WriteAccessors>
+class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
@@ -91,7 +91,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
   };
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_impl(op.expression(), device)
   {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -141,7 +141,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
@@ -161,7 +161,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D
     if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
       return m_impl.template packet<Unaligned>(inputIndex);
     } else {
-      EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
       values[0] = m_impl.coeff(inputIndex);
       for (int i = 1; i < packetSize; ++i) {
         values[i] = coeff(originalIndex+i);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 4a5fd9c79..34bdd5309 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -872,11 +872,19 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
     eigen_assert(index < m_dimensions.TotalSize());
     return m_buf[index];
   }
 
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
+  {
+    eigen_assert(m_buf);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+  }
+
  private:
   // No assignment (copies are needed by the kernels)
  TensorEvaluator& operator = (const TensorEvaluator&);
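The `dimension()` and `size()` accessors added to TensorBase in the hunk above are usable on any concrete tensor; a quick usage sketch (the sizes are arbitrary):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  Eigen::Tensor<float, 3> t(2, 3, 7);

  // dimension(n) reads one extent, size() is the product of all of them
  // (internally internal::array_prod over the dimensions object).
  assert(t.dimension(0) == 2);
  assert(t.dimension(2) == 7);
  assert(t.size() == 2 * 3 * 7);
  return 0;
}
```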
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 75519c9f5..649bdb308 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -38,6 +38,18 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+      Sum sum(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+      Assign assign(m_expression, sum);
+      static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
+      internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const DeviceType& m_device;
     ExpressionType& m_expression;
@@ -58,6 +70,18 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice>
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+      Sum sum(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+      Assign assign(m_expression, sum);
+      static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
+      internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const ThreadPoolDevice& m_device;
     ExpressionType& m_expression;
@@ -79,6 +103,17 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+      Sum sum(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+      Assign assign(m_expression, sum);
+      internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const GpuDevice& m_device;
     ExpressionType m_expression;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index fad342eab..5a6ff70e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -37,23 +37,41 @@ struct DefaultDevice {
 // Multiple cpu cores
 // We should really use a thread pool here but first we need to find a portable thread pool library.
 #ifdef EIGEN_USE_THREADS
+
+typedef std::future<void> Future;
+
 struct ThreadPoolDevice {
-  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { }
-  size_t numThreads() const { return num_threads_; }
+  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
   }
+
   EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     internal::aligned_free(buffer);
   }
+
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
     ::memcpy(dst, src, n);
   }
+
   EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
     ::memset(buffer, c, n);
   }
+
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    return num_threads_;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const {
+    return std::async(std::launch::async, f, args...);
+  }
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const {
+    std::async(std::launch::async, f, args...);
+  }
+
  private:
   // todo: NUMA, ...
size_t num_threads_; @@ -63,41 +81,34 @@ struct ThreadPoolDevice { // GPU offloading #ifdef EIGEN_USE_GPU -static int m_numMultiProcessors = 0; -static int m_maxThreadsPerBlock = 0; -static int m_maxThreadsPerMultiProcessor = 0; +static cudaDeviceProp m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess); + m_devicePropInitialized = true; + } +} static inline int getNumCudaMultiProcessors() { - if (m_numMultiProcessors == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - m_numMultiProcessors = deviceProp.multiProcessorCount; - } - return m_numMultiProcessors; + initializeDeviceProp(); + return m_deviceProperties.multiProcessorCount; } static inline int maxCudaThreadsPerBlock() { - if (m_maxThreadsPerBlock == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_numMultiProcessors = deviceProp.multiProcessorCount; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - } - return m_maxThreadsPerBlock; + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerBlock; } static inline int maxCudaThreadsPerMultiProcessor() { - if (m_maxThreadsPerBlock == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_numMultiProcessors = deviceProp.multiProcessorCount; - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - } - return m_maxThreadsPerMultiProcessor; + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerMultiProcessor; +} +static inline int sharedMemPerBlock() { + initializeDeviceProp(); + return m_deviceProperties.sharedMemPerBlock; } + struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. 
  GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); }
@@ -141,8 +152,8 @@ struct GpuDevice {
 #endif
   }
 
-  EIGEN_STRONG_INLINE size_t numThreads() const {
-    // Fixme:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+    // FIXME
     return 32;
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 732c6b344..2dd8e274b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -29,7 +29,7 @@ namespace Eigen {
   * \sa Tensor
   */
 
-// Can't use std::pairs on cuda devices
+// Can't use std::pair on cuda devices
 template <typename Index> struct IndexPair {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { }
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 587cbd5ca..ce9d73578 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -116,7 +116,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
     m_buffer[i] = m_impl.coeff(i);
   }
-  EIGEN_STRONG_INLINE void evalPacket(Index i) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
     internal::pstoret<Scalar, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 0f969036c..e324ba8d2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -65,13 +65,13 @@ struct TensorEvaluator
     return m_data[index];
   }
 
-  template<int LoadMode> EIGEN_STRONG_INLINE
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketReturnType packet(Index index) const
   {
     return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
   }
 
-  template <int StoreMode> EIGEN_STRONG_INLINE
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const Packet& x)
   {
     return internal::pstoret<Scalar, Packet, StoreMode>(m_data + index, x);
@@ -113,13 +113,17 @@ struct TensorEvaluator
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     eigen_assert(m_data);
+#ifdef __CUDA_ARCH__
+    return __ldg(m_data+index);
+#else
     return m_data[index];
+#endif
   }
 
-  template<int LoadMode> EIGEN_STRONG_INLINE
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketReturnType packet(Index index) const
   {
-    return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
+    return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
   }
 
   const Scalar* data() const { return m_data; }
@@ -166,7 +170,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   }
 
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return m_functor.packetOp(index);
   }
@@ -219,7 +223,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
   }
 
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
   }
@@ -278,7 +282,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
   }
 
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
   }
@@ -340,7 +344,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
     return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
   }
   template<int LoadMode>
-  PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
   {
     static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
     internal::Selector<PacketSize> select;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 10f5a5ee7..01fa04c64 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -10,10 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
 
-#ifdef EIGEN_USE_THREADS
-#include <future>
-#endif
-
 namespace Eigen {
 
 /** \class TensorExecutor
@@ -62,7 +58,7 @@ class TensorExecutor<Expression, DefaultDevice, true>
       const Index size = array_prod(evaluator.dimensions());
       static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-      const int VectorizedSize = (size / PacketSize) * PacketSize;
+      const Index VectorizedSize = (size / PacketSize) * PacketSize;
 
       for (Index i = 0; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
@@ -131,10 +127,10 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
       const Index numblocks = size / blocksize;
 
       Index i = 0;
-      std::vector<std::future<void> > results;
+      std::vector<Future> results;
       results.reserve(numblocks);
       for (int i = 0; i < numblocks; ++i) {
-        results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
+        results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
      }
 
      for (int i = 0; i < numblocks; ++i) {
@@ -154,11 +150,31 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <typename Evaluator>
-__global__ void EigenMetaKernel(Evaluator eval, unsigned int size) {
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel(Evaluator eval, unsigned int size) {
+
   const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
   const int step_size = blockDim.x * gridDim.x;
-  for (int i = first_index; i < size; i += step_size) {
-    eval.evalScalar(i);
+
+  if (!Evaluator::PacketAccess || !Evaluator::IsAligned) {
+    // Use the scalar path
+    for (int i = first_index; i < size; i += step_size) {
+      eval.evalScalar(i);
+    }
+  }
+  else {
+    // Use the vector path
+    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const int vectorized_step_size = step_size * PacketSize;
+    const int vectorized_size = (size / PacketSize) * PacketSize;
+    int i = first_index * PacketSize;
+    for ( ; i < vectorized_size; i += vectorized_step_size) {
+      eval.evalPacket(i);
+    }
+    for ( ; i < size; i += step_size) {
+      eval.evalScalar(i);
+    }
   }
 }
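Taken together, the device `operator+=` and the executor's `device.enqueue()` path give the following user-visible behavior. A sketch assuming the patched headers; the tensor sizes and core count are arbitrary:

```cpp
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> a(30, 40, 50), b(30, 40, 50), out(30, 40, 50);
  a.setRandom();
  b.setRandom();

  // The threaded executor splits the flattened index range into one block
  // per core and evaluates each block via ThreadPoolDevice::enqueue.
  Eigen::ThreadPoolDevice device(/*num_cores=*/4);
  out.device(device) = a + b * 0.5f;  // regular device assignment
  out.device(device) += a;            // compound assignment added above
  return 0;
}
```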
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 4d7f9e1fd..a753c5a48 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -17,7 +17,7 @@ namespace Eigen {
   *
   * \brief The fixed sized version of the tensor class.
   *
-  * The fixes sized equivalent of
+  * The fixed sized equivalent of
   * Eigen::Tensor<float, 3> t(3, 5, 7);
   * is
   * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
@@ -41,7 +41,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> >
     enum {
       IsAligned = bool(EIGEN_ALIGN),
-      PacketAccess = true,
+      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
     };
 
     typedef Dimensions_ Dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index cf97031be..2714117ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -31,30 +31,34 @@ namespace internal {
 template <typename T>
 struct TensorIntDivisor {
  public:
-  TensorIntDivisor() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
     multiplier = 0;
     shift1 = 0;
     shift2 = 0;
   }
 
   // Must have 1 <= divider <= 2^31-1
-  TensorIntDivisor(const T divider) {
-    static const int N = 32;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
+    const int N = 32;
     eigen_assert(divider > 0);
     eigen_assert(divider <= (1<<(N-1)) - 1);
 
     // fast ln2
+#ifndef __CUDA_ARCH__
     const int leading_zeros = __builtin_clz(divider);
-    const int l = N - (leading_zeros+1);
-
-    multiplier = (static_cast<uint64_t>(1) << (N+l)) / divider - (static_cast<uint64_t>(1) << N) + 1;
-    shift1 = (std::min)(1, l);
-    shift2 = (std::max)(0, l-1);
+#else
+    const int leading_zeros = __clz(divider);
+#endif
+    const int log_div = N - (leading_zeros+1);
+
+    multiplier = (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
+    shift1 = log_div > 1 ? 1 : log_div;
+    shift2 = log_div > 1 ? log_div-1 : 0;
   }
 
   // Must have 0 <= numerator <= 2^32-1
-  T divide(const T numerator) const {
-    static const int N = 32;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
+    const int N = 32;
     eigen_assert(numerator >= 0);
     eigen_assert(numerator <= (1ull<<N) - 1);
 
 template <typename T>
-static T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
   return divisor.divide(numerator);
 }
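A usage sketch for the divisor class, whose constructor and `divide()` are now callable from device code as well (`EIGEN_DEVICE_FUNC` plus `__clz` on CUDA). This is an internal helper, so the API shown here is a snapshot, not a stability promise:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  // Build the multiply/shift representation of "divide by 7" once, then
  // reuse it; the evaluators use this to avoid per-coefficient divisions
  // when unflattening linear indices.
  Eigen::internal::TensorIntDivisor<int> divide_by_7(7);
  for (int n = 0; n < 1000; ++n) {
    assert(n / divide_by_7 == n / 7);  // operator/ overload from this patch
  }
  return 0;
}
```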
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 04849dd9f..2c0d2cd0f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -42,26 +42,25 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
     static const int Options = Options_;
 
-    static const std::size_t NumIndices = PlainObjectType::NumIndices;
+    static const Index NumIndices = PlainObjectType::NumIndices;
     typedef typename PlainObjectType::Dimensions Dimensions;
 
     enum {
-      IsAligned = bool(EIGEN_ALIGN) && ((int(Options_)&Aligned)==Aligned),
-      PacketAccess = true,
+      IsAligned = ((int(Options_)&Aligned)==Aligned),
+      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
     };
 
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array<Index, NumIndices>({{firstDimension, otherDimensions...}})) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
-      EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array<Index, NumIndices>(firstDimension)) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
-      EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #endif
 
@@ -176,12 +175,13 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
     {
-      static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      const std::size_t NumDims = sizeof...(otherIndices) + 1;
       if (PlainObjectType::Options&RowMajor) {
-        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
         return m_data[index];
       } else {
-        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
         return m_data[index];
       }
     }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 7da89458f..8da6e0f26 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -144,7 +144,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
@@ -206,7 +206,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
     for (int i = 0; i < packetSize; ++i) {
       values[i] = coeff(index+i);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index f7e7fc107..7e0063626 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -97,7 +97,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned = true,
+    IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
   };
 
@@ -194,7 +194,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned = true,
+    IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
   };
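The Options parameter now threaded through `TensorMap` and `traits<TensorMap<...>>` is what lets a map declare its pointer aligned again, re-enabling aligned packet loads for it. A sketch, assuming `EIGEN_ALIGN_DEFAULT` provides sufficient alignment for the packet type on the target:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  // Map a raw buffer as a 2x3x7 tensor. The second template argument is the
  // Options parameter; Eigen::Aligned promises the pointer is suitably
  // aligned, which sets AlignedBit in the map's traits (see hunk above).
  EIGEN_ALIGN_DEFAULT float buffer[2 * 3 * 7];
  Eigen::TensorMap<Eigen::Tensor<float, 3>, Eigen::Aligned> map(buffer, 2, 3, 7);
  map(1, 2, 3) = 42.0f;
  assert(map(1, 2, 3) == 42.0f);
  return 0;
}
```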
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 0c4f8a3d6..aaec39756 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -30,11 +30,11 @@ namespace Eigen {
   *
   * \sa Tensor
   */
-template class TensorStorage;
+template class TensorStorage;
 
 
 // Pure fixed-size storage
-template
+template
 class TensorStorage
 {
  private:
@@ -62,7 +62,7 @@ class TensorStorage
 
 
 // pure-dynamic, but without specification of all dimensions explicitly
-template
+template
 class TensorStorage
   : public TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, Dynamic, NumIndices_>::type>
 {
@@ -79,7 +79,7 @@ class TensorStorage
 };
 
 // pure dynamic
-template
+template
 class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, Dynamic, NumIndices_>::type>
 {
   T *m_data;
@@ -140,6 +140,7 @@ class TensorStorage
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -48,7 +48,7 @@ struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type>
 
 template<typename Strides, typename XprType>
-class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType>, WriteAccessors>
+class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
 {
   public:
   typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
@@ -97,7 +97,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
-    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -109,28 +109,23 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     }
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
-    for (int i = 0; i < NumDims; ++i) {
-      if (i > 0) {
-        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
-        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
-      } else {
-        m_inputStrides[0] = 1;
-        m_outputStrides[0] = 1;
-      }
-    }
-    for (int i = 0; i < NumDims; ++i) {
-      m_inputStrides[i] *= op.strides()[i];
+    m_outputStrides[0] = 1;
+    m_inputStrides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+      m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+      m_inputStrides[i-1] *= op.strides()[i-1];
     }
+    m_inputStrides[NumDims-1] *= op.strides()[NumDims-1];
   }
 
-  // typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketReturnType PacketReturnType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -150,16 +145,44 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     return m_impl.coeff(inputIndex);
   }
 
-  /*template<int LoadMode>
+  template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    return m_impl.template packet<LoadMode>(index);
-  }*/
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + packetSize - 1};
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx0 = indices[0] / m_outputStrides[i];
+      const Index idx1 = indices[1] / m_outputStrides[i];
+      inputIndices[0] += idx0 * m_inputStrides[i];
+      inputIndices[1] += idx1 * m_inputStrides[i];
+      indices[0] -= idx0 * m_outputStrides[i];
+      indices[1] -= idx1 * m_outputStrides[i];
+    }
+    inputIndices[0] += indices[0] * m_inputStrides[0];
+    inputIndices[1] += indices[1] * m_inputStrides[0];
+    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+      return rslt;
+    }
+    else {
+      EIGEN_ALIGN_DEFAULT typename internal::remove_const<Scalar>::type values[packetSize];
+      values[0] = m_impl.coeff(inputIndices[0]);
+      values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+      for (int i = 1; i < packetSize-1; ++i) {
+        values[i] = coeff(index+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
 
   Scalar* data() const { return NULL; }
 
  protected:
-  // Strides m_strides;
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
   array<Index, NumDims> m_inputStrides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 40f805741..5940a8cf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -70,14 +70,18 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_> >
 };
 
 
-template<typename PlainObjectType>
-struct traits<TensorMap<PlainObjectType> >
+template<typename PlainObjectType, int Options_>
+struct traits<TensorMap<PlainObjectType, Options_> >
   : public traits<PlainObjectType>
 {
   typedef traits<PlainObjectType> BaseTraits;
   typedef typename BaseTraits::Scalar Scalar;
   typedef typename BaseTraits::StorageKind StorageKind;
   typedef typename BaseTraits::Index Index;
+  enum {
+    Options = Options_,
+    Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+  };
 };
 
 
@@ -105,16 +109,16 @@ struct eval<TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense>
   typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
 };
 
-template<typename PlainObjectType>
-struct eval<TensorMap<PlainObjectType>, Eigen::Dense>
+template<typename PlainObjectType, int Options>
+struct eval<TensorMap<PlainObjectType, Options>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
-template<typename PlainObjectType>
-struct eval<const TensorMap<PlainObjectType>, Eigen::Dense>
+template<typename PlainObjectType, int Options>
+struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
 template
@@ -141,16 +145,16 @@ struct nested<TensorFixedSize<Scalar_, Dimensions, Options>, 1, typename e
   typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
 };
 
-template <typename PlainObjectType>
-struct nested<TensorMap<PlainObjectType>, 1, typename eval<TensorMap<PlainObjectType> >::type>
+template <typename PlainObjectType, int Options>
+struct nested<TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
-template <typename PlainObjectType>
-struct nested<const TensorMap<PlainObjectType>, 1, typename eval<const TensorMap<PlainObjectType> >::type>
+template <typename PlainObjectType, int Options>
+struct nested<const TensorMap<PlainObjectType, Options>, 1, typename eval<const TensorMap<PlainObjectType, Options> >::type>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
 }  // end namespace internal
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index d6c435947..a7ef2b402 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -110,6 +110,7 @@ if(EIGEN_TEST_CXX11)
 #  ei_add_test(cxx11_tensor_fixed_size "-std=c++0x")
   ei_add_test(cxx11_tensor_const "-std=c++0x")
   ei_add_test(cxx11_tensor_of_const_values "-std=c++0x")
+  ei_add_test(cxx11_tensor_of_complex "-std=c++0x")
   ei_add_test(cxx11_tensor_of_strings "-std=c++0x")
   ei_add_test(cxx11_tensor_intdiv "-std=c++0x")
   ei_add_test(cxx11_tensor_lvalue "-std=c++0x")
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
index f2b126413..0ac3f9bf9 100644
--- a/unsupported/test/cxx11_tensor_assign.cpp
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -253,6 +253,39 @@ static void test_auto_resize()
 }
 
 
+static void test_compound_assign()
+{
+  Tensor<int, 1> start_tensor(10);
+  Tensor<int, 1> offset_tensor(10);
+  start_tensor.setRandom();
+  offset_tensor.setRandom();
+
+  Tensor<int, 1> tensor = start_tensor;
+  tensor += offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor -= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor *= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor /= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i));
+  }
+}
+
+
 void test_cxx11_tensor_assign()
 {
   CALL_SUBTEST(test_1d());
@@ -260,5 +293,5 @@ void test_cxx11_tensor_assign()
   CALL_SUBTEST(test_3d());
   CALL_SUBTEST(test_same_type());
   CALL_SUBTEST(test_auto_resize());
-
+  CALL_SUBTEST(test_compound_assign());
 }
diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp
index bafe73edd..4672db463 100644
--- a/unsupported/test/cxx11_tensor_convolution.cpp
+++ b/unsupported/test/cxx11_tensor_convolution.cpp
@@ -64,8 +64,78 @@ static void test_expr()
 }
 
 
+static void test_modes() {
+  Tensor<float, 1> input(3);
+  Tensor<float, 1> kernel(3);
+  input(0) = 1.0f;
+  input(1) = 2.0f;
+  input(2) = 3.0f;
+  kernel(0) = 0.5f;
+  kernel(1) = 1.0f;
+  kernel(2) = 0.0f;
+
+  const Eigen::array<ptrdiff_t, 1> dims{{0}};
+  Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding;
+
+  // Emulate VALID mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(0, 0);
+  Tensor<float, 1> valid(1);
+  valid = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(valid.dimension(0), 1);
+  VERIFY_IS_APPROX(valid(0), 2.5f);
+
+  // Emulate SAME mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(1, 1);
+  Tensor<float, 1> same(3);
+  same = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(same.dimension(0), 3);
+  VERIFY_IS_APPROX(same(0), 1.0f);
+  VERIFY_IS_APPROX(same(1), 2.5f);
+  VERIFY_IS_APPROX(same(2), 4.0f);
+
+  // Emulate FULL mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(2, 2);
+  Tensor<float, 1> full(5);
+  full = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(full.dimension(0), 5);
+  VERIFY_IS_APPROX(full(0), 0.0f);
+  VERIFY_IS_APPROX(full(1), 1.0f);
+  VERIFY_IS_APPROX(full(2), 2.5f);
+  VERIFY_IS_APPROX(full(3), 4.0f);
+  VERIFY_IS_APPROX(full(4), 1.5f);
+}
+
+
+static void test_strides() {
+  Tensor<float, 1> input(13);
+  Tensor<float, 1> kernel(3);
+  input.setRandom();
+  kernel.setRandom();
+
+  const Eigen::array<ptrdiff_t, 1> dims{{0}};
+  const Eigen::array<ptrdiff_t, 1> stride_of_3{{3}};
+  const Eigen::array<ptrdiff_t, 1> stride_of_2{{2}};
+
+  Tensor<float, 1> result;
+  result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2);
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
+                               input(6)*kernel(2)));
+  VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
+                               input(12)*kernel(2)));
+}
+
+
+
+
 void test_cxx11_tensor_convolution()
 {
   CALL_SUBTEST(test_evals());
   CALL_SUBTEST(test_expr());
+  CALL_SUBTEST(test_modes());
+  CALL_SUBTEST(test_strides());
 }
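For reference, the VALID/SAME/FULL expectations in `test_modes` above follow from zero-padding the input before a size-3 correlation. A plain-C++ model that reproduces the exact numbers the test asserts (no Eigen required):

```cpp
#include <cassert>
#include <vector>

// Model of the padded 1-D convolution exercised by test_modes: pad the input
// with zeros on both sides, then slide the kernel over every full window.
std::vector<float> convolve(const std::vector<float>& in,
                            const std::vector<float>& kernel,
                            int pad_left, int pad_right) {
  std::vector<float> padded(pad_left, 0.0f);
  padded.insert(padded.end(), in.begin(), in.end());
  padded.insert(padded.end(), pad_right, 0.0f);

  std::vector<float> out(padded.size() - kernel.size() + 1, 0.0f);
  for (size_t i = 0; i < out.size(); ++i)
    for (size_t j = 0; j < kernel.size(); ++j)
      out[i] += padded[i + j] * kernel[j];
  return out;
}

int main() {
  const std::vector<float> input = {1.0f, 2.0f, 3.0f};
  const std::vector<float> kernel = {0.5f, 1.0f, 0.0f};

  // VALID: no padding; SAME: (1,1); FULL: (2,2) -- as in the test above.
  assert(convolve(input, kernel, 0, 0) == std::vector<float>({2.5f}));
  assert(convolve(input, kernel, 1, 1) ==
         std::vector<float>({1.0f, 2.5f, 4.0f}));
  assert(convolve(input, kernel, 2, 2) ==
         std::vector<float>({0.0f, 1.0f, 2.5f, 4.0f, 1.5f}));
  return 0;
}
```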
diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp
index f331cb481..26465ee11 100644
--- a/unsupported/test/cxx11_tensor_device.cpp
+++ b/unsupported/test/cxx11_tensor_device.cpp
@@ -123,6 +123,14 @@ static void test_forced_contextual_eval(Context* context)
   context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
 }
 
+template <typename Context>
+static void test_compound_assignment(Context* context)
+{
+  context->out().device(context->device()) = context->in1().constant(2.718f);
+  context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
+}
+
+
 template <typename Context>
 static void test_contraction(Context* context)
 {
@@ -197,6 +205,15 @@ static void test_cpu() {
     }
   }
 
+  test_compound_assignment(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
   test_contraction(&context);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 40; ++j) {
@@ -299,6 +316,16 @@ static void test_gpu() {
     }
   }
 
+  test_compound_assignment(&context);
+  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
   test_contraction(&context);
   assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
   for (int i = 0; i < 40; ++i) {
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index 2a6a97856..fd1b1fa32 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -12,6 +12,7 @@
 #include <Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
+using Eigen::IndexPair;
 
 static void test_simple_reshape()
 {
@@ -52,7 +53,7 @@ static void test_reshape_in_expr() {
   TensorMap<Tensor<float, 5>> tensor2(m2.data(), 3,5,7,11,13);
   Tensor<float, 2>::Dimensions newDims1{{2,3*5*7*11}};
   Tensor<float, 2>::Dimensions newDims2{{3*5*7*11,13}};
-  array<Tensor<float, 1>::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}};
+  Eigen::array<IndexPair<DenseIndex>, 1> contract_along{{IndexPair<DenseIndex>(1, 0)}};
   Tensor<float, 2> tensor3(2,13);
   tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along);
@@ -125,7 +126,7 @@ static void test_slice_in_expr() {
   TensorMap<Tensor<float, 2>> tensor1(m1.data(), 7, 7);
   TensorMap<Tensor<float, 2>> tensor2(m2.data(), 3, 3);
   Tensor<float, 2> tensor3(3,1);
-  array<Tensor<float, 1>::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}};
+  array<IndexPair<DenseIndex>, 1> contract_along{{IndexPair<DenseIndex>(1, 0)}};
 
   Eigen::DSizes<ptrdiff_t, 2> indices1(1,2);
   Eigen::DSizes<ptrdiff_t, 2> sizes1(3,3);
diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp
new file mode 100644
index 000000000..b5044b962
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+
+static void test_additions()
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<float>, 1> data2(3);
+  for (int i = 0; i < 3; ++i) {
+    data1(i) = std::complex<float>(i, -i);
+    data2(i) = std::complex<float>(i, 7 * i);
+  }
+
+  Tensor<std::complex<float>, 1> sum = data1 + data2;
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_EQUAL(sum(i), std::complex<float>(2*i, 6*i));
+  }
+}
+
+
+static void test_contractions()
+{
+  Tensor<std::complex<float>, 4> t_left(30, 50, 8, 31);
+  Tensor<std::complex<float>, 5> t_right(8, 31, 7, 20, 10);
+  Tensor<std::complex<float>, 5> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  typedef Map<Matrix<std::complex<float>, Dynamic, Dynamic>> MapXcf;
+  MapXcf m_left(t_left.data(), 1500, 248);
+  MapXcf m_right(t_right.data(), 248, 1400);
+  Matrix<std::complex<float>, Dynamic, Dynamic> m_result(1500, 1400);
+
+  // This contraction should be equivalent to a regular matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+  for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+
+void test_cxx11_tensor_of_complex()
+{
+  CALL_SUBTEST(test_additions());
+  CALL_SUBTEST(test_contractions());
+}
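The complex contraction test above maps a rank-4-by-rank-5 contraction onto a matrix product. The same mechanics in miniature, checked coefficient by coefficient; a hedged sketch using the `DimensionPair` typedef this patch redefines to `Eigen::IndexPair`:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>
#include <cmath>

int main() {
  Eigen::Tensor<float, 2> a(2, 3), b(3, 4);
  a.setRandom();
  b.setRandom();

  // Contract a's dimension 1 with b's dimension 0: a plain matrix product.
  typedef Eigen::Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
  Eigen::Tensor<float, 2> result = a.contract(b, dims);

  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 4; ++j) {
      float expected = 0.0f;
      for (int k = 0; k < 3; ++k) expected += a(i, k) * b(k, j);
      assert(std::fabs(result(i, j) - expected) < 1e-5f);
    }
  }
  return 0;
}
```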
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index e02d8e4be..f0de61f8b 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -9,22 +9,23 @@
 
 #define EIGEN_USE_THREADS
 
-
+#include <iostream>
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
+
 using Eigen::Tensor;
 
-void test_cxx11_tensor_thread_pool()
+static void test_multithread_elementwise()
 {
-  Eigen::Tensor<float, 3> in1(2,3,7);
-  Eigen::Tensor<float, 3> in2(2,3,7);
-  Eigen::Tensor<float, 3> out(2,3,7);
+  Tensor<float, 3> in1(2,3,7);
+  Tensor<float, 3> in2(2,3,7);
+  Tensor<float, 3> out(2,3,7);
 
   in1.setRandom();
   in2.setRandom();
 
-  Eigen::ThreadPoolDevice thread_pool_device(3);
+  Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(3, 11));
   out.device(thread_pool_device) = in1 + in2 * 3.14f;
 
   for (int i = 0; i < 2; ++i) {
@@ -35,3 +36,222 @@ void test_cxx11_tensor_thread_pool()
     }
   }
 }
+
+
+static void test_multithread_compound_assignment()
+{
+  Tensor<float, 3> in1(2,3,7);
+  Tensor<float, 3> in2(2,3,7);
+  Tensor<float, 3> out(2,3,7);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(3, 11));
+  out.device(thread_pool_device) = in1;
+  out.device(thread_pool_device) += in2 * 3.14f;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+      }
+    }
+  }
+}
+
+
+static void test_multithread_contraction()
+{
+  Tensor<float, 4> t_left(30, 50, 37, 31);
+  Tensor<float, 5> t_right(37, 31, 70, 2, 10);
+  Tensor<float, 5> t_result(30, 50, 70, 2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // this contraction should be equivalent to a single matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+
+  typedef Map<MatrixXf> MapXf;
+  MapXf m_left(t_left.data(), 1500, 1147);
+  MapXf m_right(t_right.data(), 1147, 1400);
+  MatrixXf m_result(1500, 1400);
+
+  Eigen::ThreadPoolDevice thread_pool_device(4);
+
+  // compute results by separate methods
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+}
+
+
+static void test_contraction_corner_cases()
+{
+  Tensor<float, 2> t_left(32, 500);
+  Tensor<float, 2> t_right(32, 28*28);
+  Tensor<float, 2> t_result(500, 28*28);
+
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result = t_result.constant(NAN);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};
+
+  typedef Map<MatrixXf> MapXf;
+  MapXf m_left(t_left.data(), 32, 500);
+  MapXf m_right(t_right.data(), 32, 28*28);
+  MatrixXf m_result(500, 28*28);
+
+  Eigen::ThreadPoolDevice thread_pool_device(12);
+
+  // compute results by separate methods
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  m_result = m_left.transpose() * m_right;
+
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!isnan(t_result.data()[i]));
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  t_left.resize(32, 1);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_result.resize (1, 28*28);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!isnan(t_result.data()[i]));
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  t_left.resize(32, 500);
+  t_right.resize(32, 4);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize (500, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 500);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!isnan(t_result.data()[i]));
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+
+  t_left.resize(32, 1);
+  t_right.resize(32, 4);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize (1, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    assert(!isnan(t_result.data()[i]));
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
+      assert(false);
+    }
+  }
+}
+
+
+static void test_multithread_contraction_agrees_with_singlethread() {
+  int contract_size = internal::random<int>(1, 5000);
+
+  Tensor<float, 3> left(internal::random<int>(1, 80),
+                        contract_size,
+                        internal::random<int>(1, 100));
+
+  Tensor<float, 4> right(internal::random<int>(1, 25),
+                         internal::random<int>(1, 37),
+                         contract_size,
+                         internal::random<int>(1, 51));
+
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+  Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(2, 11));
+
+  Tensor<float, 5> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 5> tp_result(st_result.dimensions());
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(internal::dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+    // if both of the values are very small, then do nothing (because the test will fail
+    // due to numerical precision issues when values are small)
+    if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) {
+      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+    }
+  }
+}
+
+
+static void test_memcpy() {
+
+  for (int i = 0; i < 5; ++i) {
+    const int num_threads = internal::random<int>(3, 11);
+    Eigen::ThreadPoolDevice thread_pool_device(num_threads);
+
+    const int size = internal::random<int>(13, 7632);
+    Tensor<float, 1> t1(size);
+    t1.setRandom();
+    std::vector<float> result(size);
+    thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float));
+    for (int i = 0; i < size; i++) {
+      VERIFY_IS_EQUAL(t1(i), result[i]);
+    }
+  }
+}
+
+
+void test_cxx11_tensor_thread_pool()
+{
+  CALL_SUBTEST(test_multithread_elementwise());
+  CALL_SUBTEST(test_multithread_compound_assignment());
+
+  CALL_SUBTEST(test_multithread_contraction());
+
+  CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread());
+
+  // Exercise various cases that have been problematic in the past.
+  CALL_SUBTEST(test_contraction_corner_cases());
+
+  CALL_SUBTEST(test_memcpy());
+}
-- 
cgit v1.2.3