From c0f2cb016e60b7dbde1d5946f42234a709a711f9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 28 Apr 2014 10:32:27 -0700 Subject: Extended support for Tensors: * Added ability to map a region of the memory to a tensor * Added basic support for unary and binary coefficient wise expressions, such as addition or square root * Provided an emulation layer to make it possible to compile the code with compilers (such as nvcc) that don't support cxx11. --- Eigen/src/Core/util/Macros.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index bfd6ba7de..3a928001e 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -121,6 +121,11 @@ #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support variadic templates? +#if __cplusplus > 199711L +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#endif + /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. * They currently include: -- cgit v1.2.3 From 29aebf96e62f4fb5e4b1f3fb475e299df2e7a02e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Jun 2014 20:18:44 -0700 Subject: Created the pblend packet primitive and implemented it using SSE and AVX instructions. --- Eigen/src/Core/GenericPacketMath.h | 14 ++++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 15 +++++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 8 ++++++- Eigen/src/Core/arch/SSE/PacketMath.h | 41 +++++++++++++++++++++++++++++++++--- test/packetmath.cpp | 16 ++++++++++++++ 5 files changed, 90 insertions(+), 4 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 98313c68f..0869dd49f 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -54,6 +54,7 @@ struct default_packet_traits HasMax = 1, HasConj = 1, HasSetLinear = 1, + HasBlend = 0, HasDiv = 0, HasSqrt = 0, @@ -429,6 +430,19 @@ ptranspose(PacketBlock& /*kernel*/) { // Nothing to do in the scalar case, i.e. a 1x1 matrix. } +/*************************************************************************** + * Selector, i.e. vector of N boolean values used to select (i.e. blend) + * words from 2 packets. +***************************************************************************/ +template struct Selector { + bool select[N]; +}; + +template EIGEN_DEVICE_FUNC inline Packet +pblend(const Selector::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { + return ifPacket.select[0] ? 
thenPacket : elsePacket; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 8b8307d75..688ff91e4 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -59,6 +59,7 @@ template<> struct packet_traits : default_packet_traits HasLog = 0, HasExp = 0, HasSqrt = 0 + HasBlend = 1, }; }; template<> struct packet_traits : default_packet_traits @@ -73,6 +74,7 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasExp = 0 + HasBlend = 1, }; }; @@ -557,6 +559,19 @@ ptranspose(PacketBlock& kernel) { kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); } +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 758183c18..0bc03cf9e 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -44,7 +44,8 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0 + HasSetLinear = 0, + HasBlend = 1 }; }; #endif @@ -472,6 +473,11 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 6912f3bc3..1124b24df 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -108,7 +108,8 @@ template<> struct packet_traits : default_packet_traits HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -123,7 +124,8 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; #endif @@ -135,7 +137,9 @@ template<> struct packet_traits : default_packet_traits // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size=4, + + HasBlend = 1 }; }; @@ -809,6 +813,37 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + const __m128i zero = 
_mm_setzero_si128(); + const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128i false_mask = _mm_cmpeq_epi32(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); +#else + return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + const __m128 zero = _mm_setzero_ps(); + const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128 false_mask = _mm_cmpeq_ps(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_ps(thenPacket, elsePacket, false_mask); +#else + return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + const __m128d zero = _mm_setzero_pd(); + const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); + __m128d false_mask = _mm_cmpeq_pd(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_pd(thenPacket, elsePacket, false_mask); +#else + return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); +#endif +} + } // end namespace internal } // end namespace Eigen diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9dab07522..663ab886d 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -261,6 +261,22 @@ template void packetmath() VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose"); } } + + if (internal::packet_traits::HasBlend) { + Packet thenPacket = internal::pload(data1); + Packet elsePacket = internal::pload(data2); + EIGEN_ALIGN_DEFAULT internal::Selector selector; + for (int i = 0; i < PacketSize; ++i) { + selector.select[i] = i; + } + + Packet blend = internal::pblend(selector, thenPacket, elsePacket); + EIGEN_ALIGN_DEFAULT Scalar result[size]; + internal::pstore(result, blend); + for (int i = 0; i < PacketSize; ++i) { + VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue)); + } + } } template void packetmath_real() -- cgit v1.2.3 From 8c8ae2d8193809744f5952713287639817e2b442 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 7 Jun 2014 11:24:38 -0700 Subject: Fixed a typo --- Eigen/src/Core/arch/AVX/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 688ff91e4..74d3746d9 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -58,8 +58,8 @@ template<> struct packet_traits : default_packet_traits HasCos = 0, HasLog = 0, HasExp = 0, - HasSqrt = 0 - HasBlend = 1, + HasSqrt = 0, + HasBlend = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -73,8 +73,8 @@ template<> struct packet_traits : default_packet_traits HasHalfPacket = 1, HasDiv = 1, - HasExp = 0 - HasBlend = 1, + HasExp = 0, + HasBlend = 1 }; }; -- cgit v1.2.3 From 5cc23199be743d0d1be85d709eb366e67e87a262 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Oct 2014 10:30:44 -0700 Subject: More tests to validate the const-correctness of the tensor code. 
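For reference, a minimal sketch of the usage pattern these tests exercise: mapping existing memory as a tensor of const scalars and consuming it on the right-hand side of an assignment. This sketch is illustrative only, not part of the patch; it assumes the unsupported CXX11 Tensor module is on the include path (the include line and the variable names below are made up for the example).

    #include <Eigen/CXX11/Tensor>

    void read_through_const_map() {
      Eigen::Tensor<float, 3> source(2, 3, 7);
      source.setRandom();

      // View the same buffer as a tensor of *const* floats: it can be read and
      // used in expressions, but not written through the map.
      Eigen::TensorMap<Eigen::Tensor<const float, 3> > view(source.data(), 2, 3, 7);

      Eigen::Tensor<float, 3> copy(2, 3, 7);
      copy = view;  // assignment from a const-qualified tensor expression

      // copy(i, j, k) now holds the same value as source(i, j, k) for all indices.
    }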
--- Eigen/src/Core/GenericPacketMath.h | 2 ++ Eigen/src/Core/util/XprHelper.h | 8 +++++++ unsupported/test/CMakeLists.txt | 3 ++- unsupported/test/cxx11_tensor_const.cpp | 39 +++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 unsupported/test/cxx11_tensor_const.cpp (limited to 'Eigen') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 6ec29d0fd..e6fea5bba 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -95,6 +95,8 @@ template struct packet_traits : default_packet_traits }; }; +template struct packet_traits : packet_traits { }; + /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet padd(const Packet& a, diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 7c77b2263..67ca49754 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -415,6 +415,14 @@ template struct promote_storage_type { typedef A ret; }; +template struct promote_storage_type +{ + typedef A ret; +}; +template struct promote_storage_type +{ + typedef A ret; +}; /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type. * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e83d8b54e..a47c7bc74 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -106,7 +106,8 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") -# ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") + ei_add_test(cxx11_tensor_const "-std=c++0x") + ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp new file mode 100644 index 000000000..0ffb02afd --- /dev/null +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -0,0 +1,39 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include +using Eigen::Tensor; + + + + +static void test_simple_assign() +{ + Tensor random(2,3,7); + random.setRandom(); + + TensorMap > constant(random.data(), 2, 3, 7); + Tensor result(2,3,7); + result = constant; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL((result(i,j,k)), random(i,j,k)); + } + } + } +} + +void test_cxx11_tensor_const() +{ + CALL_SUBTEST(test_simple_assign()); +} -- cgit v1.2.3 From b7271dffb5b1ceeee4c8bd99402ff89dcce58d74 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Oct 2014 16:51:57 -0700 Subject: Generalized the gebp apis --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 423 +++++++++++---------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 80 ++-- .../Core/products/GeneralMatrixMatrixTriangular.h | 54 +-- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 51 ++- Eigen/src/Core/products/TriangularMatrixMatrix.h | 65 ++-- Eigen/src/Core/products/TriangularSolverMatrix.h | 49 ++- Eigen/src/Core/util/BlasUtil.h | 106 +++++- unsupported/test/CMakeLists.txt | 2 +- 8 files changed, 473 insertions(+), 357 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 7da52c2e8..090c8f4e6 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -667,7 +667,7 @@ protected: * |real |cplx | no vectorization yet, would require to pack A with duplication * |cplx |real | easy vectorization */ -template +template struct gebp_kernel { typedef gebp_traits Traits; @@ -676,14 +676,15 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; - + typedef gebp_traits SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; - + + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, @@ -693,14 +694,16 @@ struct gebp_kernel }; EIGEN_DONT_INLINE - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, + void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; -template +template EIGEN_DONT_INLINE -void gebp_kernel - ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, +void gebp_kernel + ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; @@ -743,15 +746,15 @@ void gebp_kernel traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - 
internal::prefetch(r0); - internal::prefetch(r1); - internal::prefetch(r2); - internal::prefetch(r3); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(0); + r1.prefetch(0); + r2.prefetch(0); + r3.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -813,48 +816,48 @@ void gebp_kernel ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r0+2*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r0+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r1+0*Traits::ResPacketSize); - R1 = ploadu(r1+1*Traits::ResPacketSize); - R2 = ploadu(r1+2*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r1.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); - pstoreu(r1+0*Traits::ResPacketSize, R0); - pstoreu(r1+1*Traits::ResPacketSize, R1); - pstoreu(r1+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r2+1*Traits::ResPacketSize); - R2 = ploadu(r2+2*Traits::ResPacketSize); + r1.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r2.loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r2+1*Traits::ResPacketSize, R1); - pstoreu(r2+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r3+0*Traits::ResPacketSize); - R1 = ploadu(r3+1*Traits::ResPacketSize); - R2 = ploadu(r3+2*Traits::ResPacketSize); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r2.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r3.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); - pstoreu(r3+0*Traits::ResPacketSize, R0); - pstoreu(r3+1*Traits::ResPacketSize, R1); - pstoreu(r3+2*Traits::ResPacketSize, R2); + r3.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(2 * Traits::ResPacketSize, R2); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 traits.initAcc(C4); traits.initAcc(C8); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -912,19 +916,19 @@ void gebp_kernel ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - R0 = 
ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r0+2*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); - traits.acc(C8 , alphav, R2); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r0+2*Traits::ResPacketSize, R2); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); } } } - + //---------- Process 2 * LhsProgress rows at once ---------- if(mr>=2*Traits::LhsProgress) { @@ -946,15 +950,15 @@ void gebp_kernel traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0+prefetch_res_offset); - internal::prefetch(r1+prefetch_res_offset); - internal::prefetch(r2+prefetch_res_offset); - internal::prefetch(r3+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -978,7 +982,7 @@ void gebp_kernel traits.madd(A1, B2, C6, B2); \ traits.madd(A0, B3, C3, T0); \ traits.madd(A1, B3, C7, B3) - + internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1002,37 +1006,37 @@ void gebp_kernel blA += 2*Traits::LhsProgress; } #undef EIGEN_GEBGP_ONESTEP - + ResPacket R0, R1, R2, R3; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r1+0*Traits::ResPacketSize); - R3 = ploadu(r1+1*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(0 * Traits::ResPacketSize); + R3 = r1.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); traits.acc(C5, alphav, R3); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r1+0*Traits::ResPacketSize, R2); - pstoreu(r1+1*Traits::ResPacketSize, R3); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r2+1*Traits::ResPacketSize); - R2 = ploadu(r3+0*Traits::ResPacketSize); - R3 = ploadu(r3+1*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(0 * Traits::ResPacketSize, R2); + r1.storePacket(1 * Traits::ResPacketSize, R3); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(0 * Traits::ResPacketSize); + R3 = r3.loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); traits.acc(C7, alphav, R3); - pstoreu(r2+0*Traits::ResPacketSize, R0); - 
pstoreu(r2+1*Traits::ResPacketSize, R1); - pstoreu(r3+0*Traits::ResPacketSize, R2); - pstoreu(r3+1*Traits::ResPacketSize, R3); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(0 * Traits::ResPacketSize, R2); + r3.storePacket(1 * Traits::ResPacketSize, R3); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 traits.initAcc(C0); traits.initAcc(C4); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - internal::prefetch(r0+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -1089,12 +1093,12 @@ void gebp_kernel ResPacket R0, R1; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); } } } @@ -1120,15 +1124,15 @@ void gebp_kernel traits.initAcc(C2); traits.initAcc(C3); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0+prefetch_res_offset); - internal::prefetch(r1+prefetch_res_offset); - internal::prefetch(r2+prefetch_res_offset); - internal::prefetch(r3+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -1171,25 +1175,25 @@ void gebp_kernel blA += 1*LhsProgress; } #undef EIGEN_GEBGP_ONESTEP - + ResPacket R0, R1; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r1+0*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C1, alphav, R1); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r1+0*Traits::ResPacketSize, R1); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r3+0*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(0 * Traits::ResPacketSize, R1); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(0 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C3, alphav, R1); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r3+0*Traits::ResPacketSize, R1); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(0 * Traits::ResPacketSize, R1); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 AccPacket C0; traits.initAcc(C0); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + LinearMapper r0 = res.getLinearMapper(i, j2); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -1241,9 +1245,9 @@ void gebp_kernel #undef EIGEN_GEBGP_ONESTEP ResPacket R0; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0+0*Traits::ResPacketSize); 
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); - pstoreu(r0+0*Traits::ResPacketSize, R0); + r0.storePacket(0 * Traits::ResPacketSize, R0); } } } @@ -1259,7 +1263,7 @@ void gebp_kernel const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - + if( (SwappedTraits::LhsProgress % 4)==0 ) { // NOTE The following piece of code wont work for 512 bit registers @@ -1268,32 +1272,32 @@ void gebp_kernel straits.initAcc(C1); straits.initAcc(C2); straits.initAcc(C3); - + const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4); const Index endk = (depth/spk)*spk; const Index endk4 = (depth/(spk*4))*(spk*4); - + Index k=0; for(; k { SLhsPacket A0; SRhsPacket B_0; - + straits.loadLhsUnaligned(blB, A0); straits.loadRhsQuad(blA, B_0); straits.madd(A0,B_0,C0,B_0); - + blB += SwappedTraits::LhsProgress; blA += spk; } @@ -1317,10 +1321,10 @@ void gebp_kernel typedef typename conditional::half,SLhsPacket>::type SLhsPacketHalf; typedef typename conditional::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional::half,SAccPacket>::type SAccPacketHalf; - - SResPacketHalf R = pgather(&res[j2*resStride + i], resStride); + + SResPacketHalf R = res.template gatherPacket(i, j2); SResPacketHalf alphav = pset1(alpha); - + if(depth-endk>0) { // We have to handle the last row of the rhs which corresponds to a half-packet @@ -1336,14 +1340,14 @@ void gebp_kernel { straits.acc(predux4(C0), alphav, R); } - pscatter(&res[j2*resStride + i], R, resStride); + res.scatterPacket(i, j2, R); } else { - SResPacket R = pgather(&res[j2*resStride + i], resStride); + SResPacket R = res.template gatherPacket(i, j2); SResPacket alphav = pset1(alpha); straits.acc(C0, alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); + res.scatterPacket(i, j2, R); } } else // scalar path @@ -1355,25 +1359,25 @@ void gebp_kernel { LhsScalar A0; RhsScalar B_0, B_1; - + A0 = blA[k]; - + B_0 = blB[0]; B_1 = blB[1]; MADD(cj,A0,B_0,C0, B_0); MADD(cj,A0,B_1,C1, B_1); - + B_0 = blB[2]; B_1 = blB[3]; MADD(cj,A0,B_0,C2, B_0); MADD(cj,A0,B_1,C3, B_1); - + blB += 4; } - res[(j2+0)*resStride + i] += alpha*C0; - res[(j2+1)*resStride + i] += alpha*C1; - res[(j2+2)*resStride + i] += alpha*C2; - res[(j2+3)*resStride + i] += alpha*C3; + res(i, j2 + 0) += alpha * C0; + res(i, j2 + 1) += alpha * C1; + res(i, j2 + 2) += alpha * C2; + res(i, j2 + 3) += alpha * C3; } } } @@ -1394,7 +1398,7 @@ void gebp_kernel RhsScalar B_0 = blB[k]; MADD(cj, A0, B_0, C0, B_0); } - res[(j2+0)*resStride + i] += alpha*C0; + res(i, j2) += alpha * C0; } } } @@ -1417,15 +1421,16 @@ void gebp_kernel // // 32 33 34 35 ... // 36 36 38 39 ... 
-template -struct gemm_pack_lhs +template +struct gemm_pack_lhs { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; @@ -1436,30 +1441,29 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; - + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 : Pack2>1 ? (rows/Pack2)*Pack2 : 0; - + Index i=0; - + // Pack 3 packets if(Pack1>=3*PacketSize) { for(; i(&lhs(i+0*PacketSize, k)); - B = ploadu(&lhs(i+1*PacketSize, k)); - C = ploadu(&lhs(i+2*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); + C = lhs.loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1473,12 +1477,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(&lhs(i+0*PacketSize, k)); - B = ploadu(&lhs(i+1*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1491,11 +1495,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(&lhs(i+0*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } @@ -1508,11 +1512,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs -struct gemm_pack_lhs +template +struct gemm_pack_lhs { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; @@ -1543,13 +1548,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 
0; - + // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - + int pack = Pack1; Index i = 0; while(pack>0) @@ -1569,7 +1573,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); ptranspose(kernel); for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } @@ -1594,15 +1598,15 @@ EIGEN_DONT_INLINE void gemm_pack_lhs -struct gemm_pack_rhs +template +struct gemm_pack_rhs { typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); EIGEN_UNUSED_VARIABLE(stride); @@ -1685,27 +1690,27 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) { for(Index j2=packet_cols8; j2 kernel; - kernel.packet[0] = ploadu(&b0[k]); - kernel.packet[1] = ploadu(&b1[k]); - kernel.packet[2] = ploadu(&b2[k]); - kernel.packet[3] = ploadu(&b3[k]); + kernel.packet[0] = dm0.loadPacket(k); + kernel.packet[1] = dm1.loadPacket(k); + kernel.packet[2] = dm2.loadPacket(k); + kernel.packet[3] = dm3.loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); @@ -1716,10 +1721,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs -struct gemm_pack_rhs +template +struct gemm_pack_rhs { typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); EIGEN_UNUSED_VARIABLE(stride); @@ -1762,7 +1768,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; Index count = 0; - + // if(nr>=8) // { // for(Index j2=0; j2(&rhs[k*rhsStride + j2]); + Packet A = rhs.loadPacket(k, j2); pstoreu(blockB+count, cj.pconj(A)); count += PacketSize; } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - blockB[count+2] = cj(b0[2]); - blockB[count+3] = cj(b0[3]); + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); count += 4; } } @@ -1825,10 +1831,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs::ReturnType ResScal static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, const RhsScalar* _rhs, Index rhsStride, - ResScalar* res, Index resStride, + ResScalar* _res, Index resStride, ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) { - const_blas_data_mapper lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gebp_kernel gebp; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; #ifdef EIGEN_HAS_OPENMP if(info) @@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth, // In order to reduce the chance that a thread has to wait for the other, // let's start by packing B'. - pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc); + pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc); // Pack A_k to A' in a parallel fashion: // each thread packs the sub block A_k,i to A'_i where i is the thread id. @@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth, // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} info[tid].users += threads; - - pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length); + + pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); // Notify the other threads that the part A'_i is ready to go. info[tid].sync = k; @@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth, // At this point we have to make sure that A'_i has been updated by the thread i, // we use testAndSetOrdered to mimic a volatile access. // However, no need to wait for the B' part which has been updated by the current thread! 
- if(shift>0) - while(info[i].sync!=k) {} - gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); + if (shift>0) { + while(info[i].sync!=k) { + } + } + + gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); } // Then keep going as usual with the remaining B' @@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth, const Index actual_nc = (std::min)(j+nc,cols)-j; // pack B_k,j to B' - pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc); + pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc); // C_j += A' * B' - gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha); + gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha); } // Release all the sub blocks A'_i of A' for the current thread, @@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth, ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); // For each horizontal panel of the rhs, and corresponding panel of the lhs... - for(Index k2=0; k2 Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) - // Note that this panel will be read as many times as the number of blocks in the rhs's - // horizontal panel which is, in practice, a very low number. - pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows); + const Index actual_mc = (std::min)(i2+mc,rows)-i2; - // For each kc x nc block of the rhs's horizontal panel... - for(Index j2=0; j2 Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) + // Note that this panel will be read as many times as the number of blocks in the rhs's + // horizontal panel which is, in practice, a very low number. + pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc); + + // For each kc x nc block of the rhs's horizontal panel... 
+ for(Index j2=0; j2m_nc; computeProductBlockingSizes(this->m_kc, this->m_mc, n); } - + m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 225b994d1..daa8a1d8a 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -58,13 +58,17 @@ struct general_matrix_matrix_triangular_product::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha) { - const_blas_data_mapper lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = depth; // cache block size along the K direction Index mc = size; // cache block size along the M direction Index nc = size; // cache block size along the N direction @@ -75,10 +79,10 @@ struct general_matrix_matrix_triangular_product pack_lhs; - gemm_pack_rhs pack_rhs; - gebp_kernel gebp; + + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; tribb_kernel sybb; for(Index k2=0; k2 processed with gebp or skipped // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, + (std::min)(size,i2), alpha, -1, -1, 0, 0); + - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); + sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc, + actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0); } } } @@ -129,13 +134,16 @@ struct tribb_kernel { typedef gebp_traits Traits; typedef typename Traits::ResScalar ResScalar; - + enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - gebp_kernel gebp_kernel; + typedef blas_data_mapper ResMapper; + ResMapper res(_res, resStride); + gebp_kernel gebp_kernel; + Matrix buffer; // let's process the block per panel of actual_mc x BlockSize, @@ -146,7 +154,7 @@ struct tribb_kernel const RhsScalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, -1, -1, 0, 0); // selfadjoint 
micro block @@ -154,12 +162,12 @@ struct tribb_kernel Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1 lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper LhsTransposeMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + LhsTransposeMapper lhs_transpose(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction @@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; + gebp_kernel gebp_kernel; symm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_rhs pack_rhs; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2 transposed packed copy @@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix() - (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs() + (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -414,15 +420,18 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix lhs(_lhs,lhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + ResMapper res(_res,resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction @@ -432,8 +441,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2& blocking) { // strip zeros @@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? 
k2>0 : k2 skip it @@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() - (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs() + (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, + actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix& blocking) { // strip zeros @@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gemm_pack_rhs pack_rhs_panel; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gemm_pack_rhs pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k20; @@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix& blocking) { Index cols = otherSize; - const_blas_data_mapper tri(_tri,triStride); - blas_data_mapper other(_other,otherStride); + + typedef const_blas_data_mapper TriMapper; + typedef blas_data_mapper OtherMapper; + TriMapper tri(_tri, triStride); + OtherMapper other(_other, otherStride); typedef gebp_traits Traits; + enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower @@ -71,9 +75,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; - gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache // coherence when accessing the rhs elements @@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; - pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), + gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { - pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); + pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? 
k2 : k2-kc), actual_kc, actual_mc); - gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); + gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); } } } @@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix& blocking) { Index rows = otherSize; - const_blas_data_mapper rhs(_tri,triStride); - blas_data_mapper lhs(_other,otherStride); + + typedef blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + LhsMapper lhs(_other, otherStride); + RhsMapper rhs(_tri, triStride); typedef gebp_traits Traits; enum { @@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; - gebp_kernel gebp_kernel; - gemm_pack_rhs pack_rhs; - gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gebp_kernel gebp_kernel; + gemm_pack_rhs pack_rhs; + gemm_pack_rhs pack_rhs_panel; + gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k20) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); + if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs); // triangular packing (we only pack the panels off the diagonal, // neglecting the blocks overlapping the diagonal @@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), triStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); } @@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { - gebp_kernel(&lhs(i2,absolute_j2), otherStride, + gebp_kernel(lhs.getSubMapper(i2,absolute_j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, Scalar(-1), @@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) - gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, + gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0); } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 0d8e2705a..25a62d528 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -18,13 +18,13 @@ namespace Eigen { namespace internal { // forward declarations -template +template struct gebp_kernel; -template +template struct gemm_pack_rhs; -template +template struct gemm_pack_lhs; template< @@ -117,32 +117,96 @@ template struct get_factor::R static EIGEN_STRONG_INLINE typename NumTraits::Real run(const Scalar& x) { return numext::real(x); } }; + +template +class MatrixLinearMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + EIGEN_ALWAYS_INLINE MatrixLinearMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return ploadt(m_data + i); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return ploadt(m_data + i); + } + + EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + pstoret(m_data + i, p); + } + + protected: + Scalar *m_data; +}; + // Lightweight helper class to access matrix coefficients. 
-// Yes, this is somehow redundant with Map<>, but this version is much much lighter, -// and so I hope better compilation performance (time and code quality). -template -class blas_data_mapper -{ +template +class blas_data_mapper { public: - blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j) - { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef MatrixLinearMapper LinearMapper; + + EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + + EIGEN_ALWAYS_INLINE blas_data_mapper + getSubMapper(Index i, Index j) const { + return blas_data_mapper(&operator()(i, j), m_stride); + } + + EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j)); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); + } + + template + EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const { + pscatter(&operator()(i, j), p, m_stride); + } + + template + EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather(&operator()(i, j), m_stride); + } + protected: - Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; }; // lightweight helper class to access matrix coefficients (const version) template -class const_blas_data_mapper -{ +class const_blas_data_mapper : public blas_data_mapper { public: - const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const - { return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; } - protected: - const Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper(data, stride) {} + + EIGEN_ALWAYS_INLINE const_blas_data_mapper getSubMapper(Index i, Index j) const { + return const_blas_data_mapper(&(this->operator()(i, j)), this->m_stride); + } }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 5d8913dd8..75423f516 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -48,7 +48,7 @@ if(MPFR_FOUND) include_directories(${MPFR_INCLUDES} ./mpreal) ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ") set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) - ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) +# ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ") endif() -- cgit v1.2.3 From 95a430a2ca8489a85d0a12ffa66d260011c11745 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:45:19 -0700 Subject: Vector primitives for CUDA --- Eigen/Core | 5 + Eigen/src/Core/arch/CUDA/MathFunctions.h | 75 +++++++++ Eigen/src/Core/arch/CUDA/PacketMath.h | 260 +++++++++++++++++++++++++++++++ 3 files changed, 340 insertions(+) create mode 100644 Eigen/src/Core/arch/CUDA/MathFunctions.h create mode 100644 Eigen/src/Core/arch/CUDA/PacketMath.h (limited to 'Eigen') diff --git a/Eigen/Core b/Eigen/Core index 776b7faf3..537ac16b2 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -294,6 +294,11 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/Complex.h" #endif +#if defined EIGEN_VECTORIZE_CUDA + #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/MathFunctions.h" +#endif + #include "src/Core/arch/Default/Settings.h" #include "src/Core/functors/BinaryFunctors.h" diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h new file mode 100644 index 000000000..e7305c01e --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -0,0 +1,75 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H +#define EIGEN_MATH_FUNCTIONS_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> EIGEN_STRONG_INLINE +float4 plog(const float4& a) +{ + return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 plog(const double2& a) +{ + return make_double2(log(a.x), log(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 pexp(const float4& a) +{ + return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 pexp(const double2& a) +{ + return make_double2(exp(a.x), exp(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 psqrt(const float4& a) +{ + return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 psqrt(const double2& a) +{ + return make_double2(sqrt(a.x), sqrt(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 prsqrt(const float4& a) +{ + return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 prsqrt(const double2& a) +{ + return make_double2(rsqrt(a.x), rsqrt(a.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h new file mode 100644 index 000000000..5b0abe2e6 --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -0,0 +1,260 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_CUDA_H +#define EIGEN_PACKET_MATH_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
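// Illustrative usage sketch, not part of this patch: once the float4/double2
// primitives guarded below are available, device code can drive them through
// the generic packet API. This grid-stride kernel scales an array four floats
// at a time; it assumes nvcc, a 16-byte aligned "data" pointer and a "size"
// that is a multiple of 4. The kernel name is hypothetical.
__global__ void packet_scale_kernel(float* data, int size, float factor)
{
  const float4 f = Eigen::internal::pset1<float4>(factor);
  const int stride = blockDim.x * gridDim.x * 4;
  for (int i = (blockIdx.x * blockDim.x + threadIdx.x) * 4; i < size; i += stride) {
    const float4 p = Eigen::internal::pload<float4>(data + i);      // aligned packet load
    Eigen::internal::pstore(data + i, Eigen::internal::pmul(p, f)); // data[i..i+3] *= factor
  }
}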
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + + +template<> struct packet_traits : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + +template<> struct packet_traits : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits { typedef float type; enum {size=4}; typedef float4 half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { + return make_double2(from, from); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { + return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +#ifdef __CUDA_ARCH__ +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { + return __ldg((const float4*)from); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { + return __ldg((const double2*)from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { + return make_double2(__ldg(from+0), __ldg(from+1)); +} +#endif + +template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, int stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, int stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + 
+ tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_CUDA_H -- cgit v1.2.3 From bbce6fa65d8a196f05e0428d014e0e3865e202f3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:55:35 -0700 Subject: define EIGEN_VECTORIZE_CUDA when compiling with nvcc --- Eigen/Core | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Eigen') diff --git a/Eigen/Core b/Eigen/Core index 537ac16b2..acdeca5f4 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -178,6 +178,11 @@ #endif #endif +#if defined __CUDACC__ + #define EIGEN_VECTORIZE_CUDA + #include +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif -- cgit v1.2.3 From 99d75235a9567865d2c070a2840d54c8a5ad0f43 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 13 Oct 2014 17:02:09 -0700 Subject: Misc improvements and cleanups --- Eigen/src/Core/GenericPacketMath.h | 15 +- unsupported/Eigen/CXX11/Tensor | 4 + .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 5 + .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 101 ++++++++- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 12 +- unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 35 ++++ .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 73 ++++--- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 20 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 36 +++- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 26 ++- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 22 +- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 9 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 61 ++++-- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 32 +-- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_assign.cpp | 35 +++- unsupported/test/cxx11_tensor_convolution.cpp | 70 +++++++ unsupported/test/cxx11_tensor_device.cpp | 27 +++ unsupported/test/cxx11_tensor_morphing.cpp | 5 +- unsupported/test/cxx11_tensor_of_complex.cpp | 64 ++++++ unsupported/test/cxx11_tensor_thread_pool.cpp | 232 ++++++++++++++++++++- 29 files changed, 780 insertions(+), 141 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_of_complex.cpp (limited to 'Eigen') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index e6fea5bba..3ef3475c7 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -359,7 +359,7 @@ pmadd(const Packet& a, /** \internal \returns a packet version of \a *from. 
* If LoadMode equals #Aligned, \a from must be 16 bytes aligned */ template -inline Packet ploadt(const typename unpacket_traits::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits::type* from) { if(LoadMode == Aligned) return pload(from); @@ -370,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits::type* from) /** \internal copy the packet \a from to \a *to. * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */ template -inline void pstoret(Scalar* to, const Packet& from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from) { if(LoadMode == Aligned) pstore(to, from); @@ -378,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from) pstoreu(to, from); } +/** \internal \returns a packet version of \a *from. + * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the + * hardware if available to speedup the loading of data that won't be modified + * by the current computation. + */ +template +inline Packet ploadt_ro(const typename unpacket_traits::type* from) +{ + return ploadt(from); +} + /** \internal default implementation of palign() allowing partial specialization */ template struct palign_impl diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 0dac95e45..2137f4276 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -30,6 +30,10 @@ #include #include +#ifdef EIGEN_USE_THREADS +#include +#endif + #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) #include #endif diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 227522ecb..e30eb6ad8 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -66,6 +66,11 @@ template constexpr inline T& array_ template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } +template constexpr inline T& array_get(std::vector& a) { return a[I]; } +template constexpr inline T&& array_get(std::vector&& a) { return a[I]; } +template constexpr inline T const& array_get(std::vector const& a) { return a[I]; } + + #undef STD_GET_ARR_HACK template struct array_size; diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 4c6b95773..e45d0a3b1 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -48,7 +48,8 @@ template class array { values[2] = v3; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4) { + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, + const T& v4) { EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) values[0] = v1; values[1] = v2; @@ -56,7 +57,8 @@ template class array { values[3] = v4; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5) { + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5) { EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) values[0] = v1; values[1] = v2; @@ -64,6 +66,43 @@ template class array { values[3] = v4; values[4] = v5; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, 
const T& v4, + const T& v5, const T& v6) { + EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7) { + EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array( + const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7, const T& v8) { + EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES array(std::initializer_list l) { @@ -93,9 +132,11 @@ template struct type_list { struct null_type { }; -template +template struct make_type_list { - typedef typename make_type_list::type tailresult; + typedef typename make_type_list::type tailresult; typedef type_list type; }; @@ -150,6 +191,23 @@ template struct gen_numeric_list_repeated { typedef typename make_type_list, type2val, type2val, type2val, type2val >::type type; }; +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val, + type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val, + type2val, type2val >::type type; +}; + template struct get; @@ -174,6 +232,7 @@ template <> struct arg_prod { static const int value = 1; }; + template array repeat(t v) { array array; @@ -190,6 +249,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_l return get >::value; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) { + return arg_prod::value; +}; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; @@ -201,6 +265,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& /*a*/) { return 0; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { return a[I]; @@ -210,12 +282,31 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { return a[I]; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector& a) { + return a[I]; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector& a) { + return a[I]; +} +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; template struct array_size; template struct array_size > { static const size_t value = N; }; - +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; struct sum_op { template static inline bool run(A a, B b) { return a + 
b; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 3bfe80c9e..e973c00d3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -131,8 +131,8 @@ struct TensorEvaluator, Device> m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 27c10f64f..6018ecc66 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -30,6 +30,12 @@ class TensorBase typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; + // Dimensions + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return derived().dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(derived().dimensions()); } + // Nullary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> @@ -187,7 +193,7 @@ class TensorBase } // Contractions. - typedef std::pair DimensionPair; + typedef Eigen::IndexPair DimensionPair; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorContractionOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 3b2a9c8b9..0e55d4de1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -48,7 +48,7 @@ struct nested, 1, typename eval -class TensorBroadcastingOp : public TensorBase, WriteAccessors> +class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -91,7 +91,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, }; -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -141,7 +141,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -161,7 +161,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { return m_impl.template packet(inputIndex); } else { - EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type 
values[packetSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < packetSize; ++i) { values[i] = coeff(originalIndex+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 4a5fd9c79..34bdd5309 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -872,11 +872,19 @@ struct TensorEvaluator + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf+index); + } + private: // No assignment (copies are needed by the kernels) TensorEvaluator& operator = (const TensorEvaluator&); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 75519c9f5..649bdb308 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -38,6 +38,18 @@ template class TensorDevice { return *this; } + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); + return *this; + } + protected: const DeviceType& m_device; ExpressionType& m_expression; @@ -58,6 +70,18 @@ template class TensorDevice + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); + return *this; + } + protected: const ThreadPoolDevice& m_device; ExpressionType& m_expression; @@ -79,6 +103,17 @@ template class TensorDevice return *this; } + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + protected: const GpuDevice& m_device; ExpressionType m_expression; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index fad342eab..5a6ff70e9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -37,23 +37,41 @@ struct DefaultDevice { // Multiple cpu cores // We should really use a thread pool here but first we need to find a portable thread pool library. 
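// Illustrative usage sketch, not part of this patch: how user code drives an
// expression through a device object, including the TensorDevice::operator+=
// overloads added earlier in this commit. Assumes the CXX11 Tensor module is
// included, EIGEN_USE_THREADS is defined, and both tensors have identical
// dimensions; the function name and thread count are hypothetical.
inline void example_threaded_accumulate(Eigen::Tensor<float, 3>& acc,
                                        const Eigen::Tensor<float, 3>& delta)
{
  Eigen::ThreadPoolDevice device(4);   // evaluate using 4 threads
  acc.device(device) = delta;          // plain device-side assignment
  acc.device(device) += delta * 0.5f;  // builds a coefficient-wise sum and runs it
                                       // through the multi-threaded TensorExecutor
}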
#ifdef EIGEN_USE_THREADS + +typedef std::future Future; + struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } - size_t numThreads() const { return num_threads_; } + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return internal::aligned_malloc(num_bytes); } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { ::memcpy(dst, src, n); } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + EIGEN_STRONG_INLINE size_t numThreads() const { + return num_threads_; + } + + template + EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const { + return std::async(std::launch::async, f, args...); + } + template + EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const { + std::async(std::launch::async, f, args...); + } + private: // todo: NUMA, ... size_t num_threads_; @@ -63,41 +81,34 @@ struct ThreadPoolDevice { // GPU offloading #ifdef EIGEN_USE_GPU -static int m_numMultiProcessors = 0; -static int m_maxThreadsPerBlock = 0; -static int m_maxThreadsPerMultiProcessor = 0; +static cudaDeviceProp m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess); + m_devicePropInitialized = true; + } +} static inline int getNumCudaMultiProcessors() { - if (m_numMultiProcessors == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - m_numMultiProcessors = deviceProp.multiProcessorCount; - } - return m_numMultiProcessors; + initializeDeviceProp(); + return m_deviceProperties.multiProcessorCount; } static inline int maxCudaThreadsPerBlock() { - if (m_maxThreadsPerBlock == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_numMultiProcessors = deviceProp.multiProcessorCount; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - } - return m_maxThreadsPerBlock; + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerBlock; } static inline int maxCudaThreadsPerMultiProcessor() { - if (m_maxThreadsPerBlock == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_numMultiProcessors = deviceProp.multiProcessorCount; - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - } - return m_maxThreadsPerMultiProcessor; + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerMultiProcessor; +} +static inline int sharedMemPerBlock() { + initializeDeviceProp(); + return m_deviceProperties.sharedMemPerBlock; } + struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. 
GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } @@ -141,8 +152,8 @@ struct GpuDevice { #endif } - EIGEN_STRONG_INLINE size_t numThreads() const { - // Fixme: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME return 32; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 732c6b344..2dd8e274b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -29,7 +29,7 @@ namespace Eigen { * \sa Tensor */ -// Can't use std::pairs on cuda devices +// Can't use std::pair on cuda devices template struct IndexPair { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 587cbd5ca..ce9d73578 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -116,7 +116,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { m_buffer[i] = m_impl.coeff(i); } - EIGEN_STRONG_INLINE void evalPacket(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 0f969036c..e324ba8d2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -65,13 +65,13 @@ struct TensorEvaluator return m_data[index]; } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_data + index); } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const Packet& x) { return internal::pstoret(m_data + index, x); @@ -113,13 +113,17 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data); +#ifdef __CUDA_ARCH__ + return __ldg(m_data+index); +#else return m_data[index]; +#endif } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_data + index); + return internal::ploadt_ro(m_data + index); } const Scalar* data() const { return m_data; } @@ -166,7 +170,7 @@ struct TensorEvaluator, Device> } template - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(index); } @@ -219,7 +223,7 @@ struct TensorEvaluator, Device> } template - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet(index)); } @@ -278,7 +282,7 @@ struct TensorEvaluator - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); } @@ -340,7 +344,7 @@ struct 
TensorEvaluator return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); } template - PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { static const int PacketSize = internal::unpacket_traits::size; internal::Selector select; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 10f5a5ee7..01fa04c64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H -#ifdef EIGEN_USE_THREADS -#include -#endif - namespace Eigen { /** \class TensorExecutor @@ -62,7 +58,7 @@ class TensorExecutor { const Index size = array_prod(evaluator.dimensions()); static const int PacketSize = unpacket_traits::PacketReturnType>::size; - const int VectorizedSize = (size / PacketSize) * PacketSize; + const Index VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); @@ -131,10 +127,10 @@ class TensorExecutor const Index numblocks = size / blocksize; Index i = 0; - std::vector > results; + std::vector results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); + results.push_back(device.enqueue(&EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); } for (int i = 0; i < numblocks; ++i) { @@ -154,11 +150,31 @@ class TensorExecutor // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template -__global__ void EigenMetaKernel(Evaluator eval, unsigned int size) { +__global__ void +__launch_bounds__(1024) +EigenMetaKernel(Evaluator eval, unsigned int size) { + const int first_index = blockIdx.x * blockDim.x + threadIdx.x; const int step_size = blockDim.x * gridDim.x; - for (int i = first_index; i < size; i += step_size) { - eval.evalScalar(i); + + if (!Evaluator::PacketAccess || !Evaluator::IsAligned) { + // Use the scalar path + for (int i = first_index; i < size; i += step_size) { + eval.evalScalar(i); + } + } + else { + // Use the vector path + const int PacketSize = unpacket_traits::size; + const int vectorized_step_size = step_size * PacketSize; + const int vectorized_size = (size / PacketSize) * PacketSize; + int i = first_index * PacketSize; + for ( ; i < vectorized_size; i += vectorized_step_size) { + eval.evalPacket(i); + } + for ( ; i < size; i += step_size) { + eval.evalScalar(i); + } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 4d7f9e1fd..a753c5a48 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief The fixed sized version of the tensor class. 
* - * The fixes sized equivalent of + * The fixed sized equivalent of * Eigen::Tensor t(3, 5, 7); * is * Eigen::TensorFixedSize> t; @@ -41,7 +41,7 @@ class TensorFixedSize : public TensorBase::size > 1), }; typedef Dimensions_ Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index cf97031be..2714117ab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -31,30 +31,34 @@ namespace internal { template struct TensorIntDivisor { public: - TensorIntDivisor() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { multiplier = 0; shift1 = 0; shift2 = 0; } // Must have 1 <= divider <= 2^31-1 - TensorIntDivisor(const T divider) { - static const int N = 32; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = 32; eigen_assert(divider > 0); eigen_assert(divider <= (1<<(N-1)) - 1); // fast ln2 +#ifndef __CUDA_ARCH__ const int leading_zeros = __builtin_clz(divider); - const int l = N - (leading_zeros+1); - - multiplier = (static_cast(1) << (N+l)) / divider - (static_cast(1) << N) + 1; - shift1 = (std::min)(1, l); - shift2 = (std::max)(0, l-1); +#else + const int leading_zeros = __clz(divider); +#endif + const int log_div = N - (leading_zeros+1); + + multiplier = (static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1; + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; } // Must have 0 <= numerator <= 2^32-1 - T divide(const T numerator) const { - static const int N = 32; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + const int N = 32; eigen_assert(numerator >= 0); eigen_assert(numerator <= (1ull< -static T operator / (const T& numerator, const TensorIntDivisor& divisor) { +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { return divisor.divide(numerator); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 04849dd9f..2c0d2cd0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -42,26 +42,25 @@ template class TensorMap : public Tensor static const int Options = Options_; - static const std::size_t NumIndices = PlainObjectType::NumIndices; + static const Index NumIndices = PlainObjectType::NumIndices; typedef typename PlainObjectType::Dimensions Dimensions; - enum { - IsAligned = bool(EIGEN_ALIGN) && ((int(Options_)&Aligned)==Aligned), - PacketAccess = true, + IsAligned = ((int(Options_)&Aligned)==Aligned), + PacketAccess = (internal::packet_traits::size > 1), }; #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
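// (The assert below is relaxed in this commit to also accept NumIndices == Dynamic.)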
- EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array(firstDimension)) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif @@ -176,12 +175,13 @@ template class TensorMap : public Tensor template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 1; if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 7da89458f..8da6e0f26 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -144,7 +144,7 @@ struct TensorEvaluator, Device template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -206,7 +206,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; for (int i = 0; i < packetSize; ++i) { values[i] = coeff(index+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index f7e7fc107..7e0063626 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -97,7 +97,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; enum { - IsAligned = true, + IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), }; @@ -194,7 +194,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; enum { - IsAligned = true, + IsAligned = false, PacketAccess = 
(internal::packet_traits::size > 1), }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 0c4f8a3d6..aaec39756 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -30,11 +30,11 @@ namespace Eigen { * * \sa Tensor */ -template class TensorStorage; +template class TensorStorage; // Pure fixed-size storage -template +template class TensorStorage { private: @@ -62,7 +62,7 @@ class TensorStorage // pure-dynamic, but without specification of all dimensions explicitly -template +template class TensorStorage : public TensorStorage::type> { @@ -79,7 +79,7 @@ class TensorStorage }; // pure dynamic -template +template class TensorStorage::type> { T *m_data; @@ -140,6 +140,7 @@ class TensorStorage, 1, typename eval -class TensorStridingOp : public TensorBase, WriteAccessors> +class TensorStridingOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -97,7 +97,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -109,28 +109,23 @@ struct TensorEvaluator, Device> } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } else { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - } - } - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] *= op.strides()[i]; + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; } - // typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -150,16 +145,44 @@ struct TensorEvaluator, Device> return m_impl.coeff(inputIndex); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + 
inputIndices[1] += indices[1] * m_inputStrides[0]; + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } Scalar* data() const { return NULL; } protected: - // Strides m_strides; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 40f805741..5940a8cf1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -70,14 +70,18 @@ struct traits > }; -template -struct traits > +template +struct traits > : public traits { typedef traits BaseTraits; typedef typename BaseTraits::Scalar Scalar; typedef typename BaseTraits::StorageKind StorageKind; typedef typename BaseTraits::Index Index; + enum { + Options = Options_, + Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + }; }; @@ -105,16 +109,16 @@ struct eval, Eigen::Dense> typedef const TensorFixedSize& type; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; template @@ -141,16 +145,16 @@ struct nested, 1, typename e typedef const TensorFixedSize& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; } // end namespace internal diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d6c435947..a7ef2b402 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -110,6 +110,7 @@ if(EIGEN_TEST_CXX11) # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") ei_add_test(cxx11_tensor_const "-std=c++0x") ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") + ei_add_test(cxx11_tensor_of_complex "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index f2b126413..0ac3f9bf9 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -253,6 +253,39 @@ static void test_auto_resize() } +static void test_compound_assign() +{ + Tensor start_tensor(10); + Tensor offset_tensor(10); + start_tensor.setRandom(); + offset_tensor.setRandom(); + + Tensor tensor = start_tensor; + tensor += offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i)); + } + + tensor = start_tensor; + tensor -= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i)); + } + + tensor 
= start_tensor; + tensor *= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i)); + } + + tensor = start_tensor; + tensor /= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i)); + } +} + + void test_cxx11_tensor_assign() { CALL_SUBTEST(test_1d()); @@ -260,5 +293,5 @@ void test_cxx11_tensor_assign() CALL_SUBTEST(test_3d()); CALL_SUBTEST(test_same_type()); CALL_SUBTEST(test_auto_resize()); - + CALL_SUBTEST(test_compound_assign()); } diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp index bafe73edd..4672db463 100644 --- a/unsupported/test/cxx11_tensor_convolution.cpp +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -64,8 +64,78 @@ static void test_expr() } +static void test_modes() { + Tensor input(3); + Tensor kernel(3); + input(0) = 1.0f; + input(1) = 2.0f; + input(2) = 3.0f; + kernel(0) = 0.5f; + kernel(1) = 1.0f; + kernel(2) = 0.0f; + + const Eigen::array dims{{0}}; + Eigen::array, 1> padding; + + // Emulate VALID mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(0, 0); + Tensor valid(1); + valid = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(valid.dimension(0), 1); + VERIFY_IS_APPROX(valid(0), 2.5f); + + // Emulate SAME mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(1, 1); + Tensor same(3); + same = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(same.dimension(0), 3); + VERIFY_IS_APPROX(same(0), 1.0f); + VERIFY_IS_APPROX(same(1), 2.5f); + VERIFY_IS_APPROX(same(2), 4.0f); + + // Emulate FULL mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
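// (FULL pads with kernel_size - 1 = 2 zeros on each side, so the result has
//  input_size + kernel_size - 1 = 5 coefficients.)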
+ padding[0] = std::make_pair(2, 2); + Tensor full(5); + full = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(full.dimension(0), 5); + VERIFY_IS_APPROX(full(0), 0.0f); + VERIFY_IS_APPROX(full(1), 1.0f); + VERIFY_IS_APPROX(full(2), 2.5f); + VERIFY_IS_APPROX(full(3), 4.0f); + VERIFY_IS_APPROX(full(4), 1.5f); +} + + +static void test_strides() { + Tensor input(13); + Tensor kernel(3); + input.setRandom(); + kernel.setRandom(); + + const Eigen::array dims{{0}}; + const Eigen::array stride_of_3{{3}}; + const Eigen::array stride_of_2{{2}}; + + Tensor result; + result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + + input(6)*kernel(2))); + VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + + input(12)*kernel(2))); +} + + + + void test_cxx11_tensor_convolution() { CALL_SUBTEST(test_evals()); CALL_SUBTEST(test_expr()); + CALL_SUBTEST(test_modes()); + CALL_SUBTEST(test_strides()); } diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index f331cb481..26465ee11 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -123,6 +123,14 @@ static void test_forced_contextual_eval(Context* context) context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); } +template +static void test_compound_assignment(Context* context) +{ + context->out().device(context->device()) = context->in1().constant(2.718f); + context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; +} + + template static void test_contraction(Context* context) { @@ -197,6 +205,15 @@ static void test_cpu() { } } + test_compound_assignment(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + } + } + } + test_contraction(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { @@ -299,6 +316,16 @@ static void test_gpu() { } } + test_compound_assignment(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + } + } + } + test_contraction(&context); assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); for (int i = 0; i < 40; ++i) { diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index 2a6a97856..fd1b1fa32 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -12,6 +12,7 @@ #include using Eigen::Tensor; +using Eigen::IndexPair; static void test_simple_reshape() { @@ -52,7 +53,7 @@ static void test_reshape_in_expr() { TensorMap> tensor2(m2.data(), 3,5,7,11,13); Tensor::Dimensions newDims1{{2,3*5*7*11}}; Tensor::Dimensions newDims2{{3*5*7*11,13}}; - array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; + Eigen::array, 1> contract_along{{IndexPair(1, 0)}}; Tensor tensor3(2,13); tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); @@ -125,7 +126,7 @@ static void 
test_slice_in_expr() { TensorMap> tensor1(m1.data(), 7, 7); TensorMap> tensor2(m2.data(), 3, 3); Tensor tensor3(3,1); - array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; + array, 1> contract_along{{IndexPair(1, 0)}}; Eigen::DSizes indices1(1,2); Eigen::DSizes sizes1(3,3); diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp new file mode 100644 index 000000000..b5044b962 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -0,0 +1,64 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::TensorMap; + + + +static void test_additions() +{ + Tensor, 1> data1(3); + Tensor, 1> data2(3); + for (int i = 0; i < 3; ++i) { + data1(i) = std::complex(i, -i); + data2(i) = std::complex(i, 7 * i); + } + + Tensor, 1> sum = data1 + data2; + for (int i = 0; i < 3; ++i) { + VERIFY_IS_EQUAL(sum(i), std::complex(2*i, 6*i)); + } +} + + +static void test_contractions() +{ + Tensor, 4> t_left(30, 50, 8, 31); + Tensor, 5> t_right(8, 31, 7, 20, 10); + Tensor, 5> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map, Dynamic, Dynamic>> MapXcf; + MapXcf m_left(t_left.data(), 1500, 248); + MapXcf m_right(t_right.data(), 248, 1400); + Matrix, Dynamic, Dynamic> m_result(1500, 1400); + + // This contraction should be equivalent to a regular matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + +void test_cxx11_tensor_of_complex() +{ + CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_contractions()); +} diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e02d8e4be..f0de61f8b 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -9,22 +9,23 @@ #define EIGEN_USE_THREADS - +#include #include "main.h" #include + using Eigen::Tensor; -void test_cxx11_tensor_thread_pool() +static void test_multithread_elementwise() { - Eigen::Tensor in1(2,3,7); - Eigen::Tensor in2(2,3,7); - Eigen::Tensor out(2,3,7); + Tensor in1(2,3,7); + Tensor in2(2,3,7); + Tensor out(2,3,7); in1.setRandom(); in2.setRandom(); - Eigen::ThreadPoolDevice thread_pool_device(3); + Eigen::ThreadPoolDevice thread_pool_device(internal::random(3, 11)); out.device(thread_pool_device) = in1 + in2 * 3.14f; for (int i = 0; i < 2; ++i) { @@ -35,3 +36,222 @@ void test_cxx11_tensor_thread_pool() } } } + + +static void test_multithread_compound_assignment() +{ + Tensor in1(2,3,7); + Tensor in2(2,3,7); + Tensor out(2,3,7); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random(3, 11)); + out.device(thread_pool_device) = in1; + out.device(thread_pool_device) += in2 * 3.14f; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + } + } + } +} + + +static void 
test_multithread_contraction() +{ + Tensor t_left(30, 50, 37, 31); + Tensor t_right(37, 31, 70, 2, 10); + Tensor t_result(30, 50, 70, 2, 10); + + t_left.setRandom(); + t_right.setRandom(); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + + + typedef Map MapXf; + MapXf m_left(t_left.data(), 1500, 1147); + MapXf m_right(t_right.data(), 1147, 1400); + MatrixXf m_result(1500, 1400); + + Eigen::ThreadPoolDevice thread_pool_device(4); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + + +static void test_contraction_corner_cases() +{ + Tensor t_left(32, 500); + Tensor t_right(32, 28*28); + Tensor t_result(500, 28*28); + + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result = t_result.constant(NAN); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims{{DimPair(0, 0)}}; + + typedef Map MapXf; + MapXf m_left(t_left.data(), 32, 500); + MapXf m_right(t_right.data(), 32, 28*28); + MatrixXf m_result(500, 28*28); + + Eigen::ThreadPoolDevice thread_pool_device(12); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left.transpose() * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_result.resize (1, 28*28); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 500); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (500, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 500); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + 
t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (1, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + + +static void test_multithread_contraction_agrees_with_singlethread() { + int contract_size = internal::random(1, 5000); + + Tensor left(internal::random(1, 80), + contract_size, + internal::random(1, 100)); + + Tensor right(internal::random(1, 25), + internal::random(1, 37), + contract_size, + internal::random(1, 51)); + + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(1, 2)}}); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random(2, 11)); + + Tensor st_result; + st_result = left.contract(right, dims); + + Tensor tp_result(st_result.dimensions()); + tp_result.device(thread_pool_device) = left.contract(right, dims); + + VERIFY(internal::dimensions_match(st_result.dimensions(), tp_result.dimensions())); + for (ptrdiff_t i = 0; i < st_result.size(); i++) { + // if both of the values are very small, then do nothing (because the test will fail + // due to numerical precision issues when values are small) + if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) { + VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); + } + } +} + + +static void test_memcpy() { + + for (int i = 0; i < 5; ++i) { + const int num_threads = internal::random(3, 11); + Eigen::ThreadPoolDevice thread_pool_device(num_threads); + + const int size = internal::random(13, 7632); + Tensor t1(size); + t1.setRandom(); + std::vector result(size); + thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float)); + for (int i = 0; i < size; i++) { + VERIFY_IS_EQUAL(t1(i), result[i]); + } + } +} + + +void test_cxx11_tensor_thread_pool() +{ + CALL_SUBTEST(test_multithread_elementwise()); + CALL_SUBTEST(test_multithread_compound_assignment()); + + CALL_SUBTEST(test_multithread_contraction()); + + CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); + + // Exercise various cases that have been problematic in the past. 
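Outside the test harness, the ThreadPoolDevice interface exercised above boils down to the sketch below (illustrative only; the thread count and tensor sizes are arbitrary, and it assumes EIGEN_USE_THREADS and the includes used at the top of this file).

static void thread_pool_device_sketch()
{
  Eigen::ThreadPoolDevice device(4);               // 4 worker threads

  Eigen::Tensor<float, 1> in(128), out(128);
  in.setRandom();

  // Evaluate an element-wise expression on the thread pool.
  out.device(device) = in * 2.0f + in.constant(1.0f);

  // The device also exposes a raw memcpy entry point, as used by test_memcpy() above.
  std::vector<float> copy(128);
  device.memcpy(copy.data(), in.data(), 128 * sizeof(float));
}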
+ CALL_SUBTEST(test_contraction_corner_cases()); + + CALL_SUBTEST(test_memcpy()); +} -- cgit v1.2.3 From bfdd9f3ac95d9a2b41e6f2ec1f7434331125b9e1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 15 Oct 2014 15:32:59 -0700 Subject: Made the blocking computation aware of the l3 cache Also optimized the blocking parameters to take into account the number of threads used for a computation --- Eigen/src/Core/SolveTriangular.h | 2 +- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 124 ++++++++++++++------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 16 +-- .../Core/products/GeneralMatrixMatrixTriangular.h | 2 +- Eigen/src/Core/products/Parallelizer.h | 4 +- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 6 +- Eigen/src/Core/products/TriangularMatrixMatrix.h | 2 +- Eigen/src/Core/products/TriangularSolverMatrix.h | 4 +- blas/level3_impl.h | 12 +- test/product_large.cpp | 7 +- unsupported/Eigen/CXX11/Tensor | 2 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 13 +-- 13 files changed, 117 insertions(+), 79 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index ef17f288e..e158e3162 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -96,7 +96,7 @@ struct triangular_solver_selector typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType; - BlockingType blocking(rhs.rows(), rhs.cols(), size); + BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 090c8f4e6..b91786037 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff } /** \internal */ -inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { - static std::ptrdiff_t m_l1CacheSize = 0; - static std::ptrdiff_t m_l2CacheSize = 0; - if(m_l2CacheSize==0) + static bool m_cache_sizes_initialized = false; + static std::ptrdiff_t m_l1CacheSize = 32*1024; + static std::ptrdiff_t m_l2CacheSize = 256*1024; + static std::ptrdiff_t m_l3CacheSize = 2*1024*1024; + + if(!m_cache_sizes_initialized) { - m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024); - m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024); + int l1CacheSize, l2CacheSize, l3CacheSize; + queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); + m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024); + m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024); + m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024); + m_cache_sizes_initialized = true; } - + if(action==SetAction) { // set the cpu cache size and cache all block sizes from a global cache size in byte eigen_internal_assert(l1!=0 && l2!=0); m_l1CacheSize = *l1; m_l2CacheSize = *l2; + m_l3CacheSize = *l3; } else if(action==GetAction) { eigen_internal_assert(l1!=0 && l2!=0); *l1 = m_l1CacheSize; *l2 = m_l2CacheSize; + *l3 = m_l3CacheSize; } else { @@ -70,10 +79,11 @@ inline void 
manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ +#define CEIL(a, b) ((a)+(b)-1)/(b) + template -void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - EIGEN_UNUSED_VARIABLE(n); // Explanations: // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed @@ -81,43 +91,71 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) // at the register level. For vectorization purpose, these small vertical panels are unpacked, // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to // stay in L1 cache. - std::ptrdiff_t l1, l2; - - typedef gebp_traits Traits; - enum { - kdiv = KcFactor * 2 * Traits::nr - * Traits::RhsProgress * sizeof(RhsScalar), - mr = gebp_traits::mr, - mr_mask = (0xffffffff/mr)*mr - }; + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); + + if (num_threads > 1) { + typedef gebp_traits Traits; + typedef typename Traits::ResScalar ResScalar; + enum { + kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + ksub = Traits::mr * Traits::nr * sizeof(ResScalar), + k_mask = (0xffffffff/8)*8, + + mr = Traits::mr, + mr_mask = (0xffffffff/mr)*mr, + + nr = Traits::nr, + nr_mask = (0xffffffff/nr)*nr + }; + SizeType k_cache = (l1-ksub)/kdiv; + if (k_cache < k) { + k = k_cache & k_mask; + eigen_assert(k > 0); + } - manage_caching_sizes(GetAction, &l1, &l2); + SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + SizeType n_per_thread = CEIL(n, num_threads); + if (n_cache <= n_per_thread) { + // Don't exceed the capacity of the l2 cache. + eigen_assert(n_cache >= static_cast(nr)); + n = n_cache & nr_mask; + eigen_assert(n > 0); + } else { + n = (std::min)(n, (n_per_thread + nr - 1) & nr_mask); + } -// k = std::min(k, l1/kdiv); -// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; -// if(_m l2) { + // l3 is shared between all cores, so we'll give each thread its own chunk of l3. + SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + SizeType m_per_thread = CEIL(m, num_threads); + if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { + m = m_cache & mr_mask; + eigen_assert(m > 0); + } else { + m = (std::min)(m, (m_per_thread + mr - 1) & mr_mask); + } + } + } + else { + // In unit tests we do not want to use extra large matrices, + // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS -// k = std::min(k,240); -// n = std::min(n,3840/sizeof(RhsScalar)); -// m = std::min(m,3840/sizeof(RhsScalar)); - - k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min(n,3840/sizeof(RhsScalar)); - m = std::min(m,3840/sizeof(RhsScalar)); + k = std::min(k,sizeof(LhsScalar)<=4 ? 
360 : 240); + n = std::min(n,3840/sizeof(RhsScalar)); + m = std::min(m,3840/sizeof(RhsScalar)); #else - k = std::min(k,24); - n = std::min(n,384/sizeof(RhsScalar)); - m = std::min(m,384/sizeof(RhsScalar)); + k = std::min(k,24); + n = std::min(n,384/sizeof(RhsScalar)); + m = std::min(m,384/sizeof(RhsScalar)); #endif + } } template -inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - computeProductBlockingSizes(k, m, n); + computeProductBlockingSizes(k, m, n, num_threads); } #ifdef EIGEN_HAS_FUSE_CJMADD @@ -1846,8 +1884,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhsm_mc = ActualRows; this->m_nc = ActualCols; @@ -331,21 +331,21 @@ class gemm_blocking_spacem_mc = Transpose ? cols : rows; this->m_nc = Transpose ? rows : cols; this->m_kc = depth; - if(full_rows) + if(l3_blocking) { - DenseIndex m = this->m_mc; - computeProductBlockingSizes(this->m_kc, m, this->m_nc); + computeProductBlockingSizes(this->m_kc, this->m_mc, this->m_nc, num_threads); } - else // full columns + else // no l3 blocking { + DenseIndex m = this->m_mc; DenseIndex n = this->m_nc; - computeProductBlockingSizes(this->m_kc, this->m_mc, n); + computeProductBlockingSizes(this->m_kc, m, n, num_threads); } m_sizeA = this->m_mc * this->m_kc; @@ -451,7 +451,7 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index daa8a1d8a..8de39f76f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -72,7 +72,7 @@ struct general_matrix_matrix_triangular_product(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // !!! 
mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 4079063eb..837e69415 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -49,8 +49,8 @@ inline void initParallel() { int nbt; internal::manage_multi_threading(GetAction, &nbt); - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); } /** \returns the max number of threads reserved for Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index d9e6084c3..21f8175d2 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -343,7 +343,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // kc must smaller than mc kc = (std::min)(kc,mc); @@ -432,10 +432,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 77aa3e5ee..4cbb79da0 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -412,7 +412,7 @@ struct TriangularProduct Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); - BlockingType blocking(stripedRows, stripedCols, stripedDepth); + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix0 ? 
l2/(4 * sizeof(Scalar) * otherStride) : 0; subcols = std::max((subcols/Traits::nr)*Traits::nr, Traits::nr); diff --git a/blas/level3_impl.h b/blas/level3_impl.h index a05872666..37a803ced 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal else matrix(c, *m, *n, *ldc) *= beta; } - internal::gemm_blocking_space blocking(*m,*n,*k,true); + internal::gemm_blocking_space blocking(*m,*n,*k,1,true); int code = OP(*opa) | (OP(*opb) << 2); func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0); @@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, a, *lda, b, *ldb, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*n, *m, a, *lda, b, *ldb, blocking); } @@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking); } return 1; @@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda(10000,20000); - std::ptrdiff_t l2 = internal::random(1000000,2000000); - setCpuCacheSizes(l1,l2); + std::ptrdiff_t l2 = internal::random(100000,200000); + std::ptrdiff_t l3 = internal::random(1000000,2000000); + setCpuCacheSizes(l1,l2,l3); VERIFY(l1==l1CacheSize()); VERIFY(l2==l2CacheSize()); std::ptrdiff_t k1 = internal::random(10,100)*16; std::ptrdiff_t m1 = internal::random(10,100)*16; std::ptrdiff_t n1 = internal::random(10,100)*16; // only makes sure it compiles fine - internal::computeProductBlockingSizes(k1,m1,n1); + internal::computeProductBlockingSizes(k1,m1,n1,1); } { diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7ec60044e..47447f446 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -55,7 +55,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" -//#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 1e6f276e0..cd992daab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -766,7 +766,7 @@ struct TensorEvaluator BlockingType; // Sizes of the blocks to load in cache. See the Goto paper for details. 
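Blocking parameters are now derived from all three cache levels and from the number of threads that will share them. A minimal sketch of the updated entry points, mirroring the product_large test above (the concrete sizes are arbitrary):

#include <Eigen/Core>

void cache_aware_blocking_sketch()
{
  // Caches are now described by an L1/L2/L3 triple.
  std::ptrdiff_t l1 = 32 * 1024, l2 = 256 * 1024, l3 = 4 * 1024 * 1024;
  Eigen::setCpuCacheSizes(l1, l2, l3);

  // Ask for blocking sizes for a float GEMM of the given dimensions; the last
  // argument is the number of threads the product will be split across, so
  // each thread gets its own slice of the shared L3 cache.
  std::ptrdiff_t k = 2048, m = 2048, n = 2048;
  Eigen::internal::computeProductBlockingSizes<float, float>(k, m, n, 1);
  // k, m and n now hold the panel depth and block dimensions to use.
}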
- BlockingType blocking(m, n, k, true); + BlockingType blocking(m, n, k, 1, true); const Index kc = blocking.kc(); const Index mc = (std::min)(m, blocking.mc()); const Index nc = (std::min)(n, blocking.nc()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index dc0513305..8e4c7c11d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -152,7 +152,7 @@ struct TensorEvaluator(kc, mc, nc/*, num_threads*/); + internal::computeProductBlockingSizes(kc, mc, nc, num_threads); eigen_assert(mc <= m); eigen_assert(nc <= n); eigen_assert(kc <= k); @@ -197,9 +197,10 @@ struct TensorEvaluator kernel_promises(num_kernel_promises, p); + std::vector kernel_promises(num_kernel_promises); + for (int i = 0; i < kernel_promises.size(); ++i) { + kernel_promises[i].set_value(); + } for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { const Index k_start = k_block_idx * kc; @@ -275,8 +276,7 @@ struct TensorEvaluator) Func; - this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); + this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); } } } @@ -338,7 +338,6 @@ struct TensorEvaluator Date: Thu, 30 Oct 2014 17:52:32 -0700 Subject: Added missing packet primitives for CUDA. --- Eigen/src/Core/arch/CUDA/PacketMath.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 5b0abe2e6..7b481d512 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -216,6 +216,21 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, c to[stride*1] = from.y; } +template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { + return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { + return make_double2(abs(a.x), abs(a.y)); +} + + template<> EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { double tmp = kernel.packet[0].y; -- cgit v1.2.3 From bc99c5f7db8d4d7e41e5e4358170e99a1bf9d364 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 30 Oct 2014 18:09:53 -0700 Subject: fixed some potential alignment issues. --- Eigen/src/Core/util/Macros.h | 4 +++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 8fdd7d898..001907a0b 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -297,7 +297,9 @@ namespace Eigen { * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. 
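The alignment fix in this commit comes down to one rule: internal::pstore performs an aligned store, so a stack scratch buffer handed to it must carry an explicit alignment attribute. A hedged sketch of the pattern (the packet type is a template parameter here for illustration):

template <typename Packet>
void store_to_scratch(const Packet& p)
{
  typedef typename Eigen::internal::unpacket_traits<Packet>::type Scalar;
  // EIGEN_ALIGN_DEFAULT requests the default packet alignment (and, with the
  // Macros.h hunk below, maps to __align__ when compiling with nvcc).
  EIGEN_ALIGN_DEFAULT Scalar values[Eigen::internal::unpacket_traits<Packet>::size];
  Eigen::internal::pstore(values, p);   // aligned store; pstoreu would be the unaligned variant
  // ... consume values[0 .. size-1] ...
}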
*/ -#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION) +#if (defined __CUDACC__) +#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) +#elif (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION) #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #elif (defined _MSC_VER) #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 3447592eb..33849ed3e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -459,7 +459,7 @@ struct TensorEvaluator, Device> this->m_impl.template writePacket(inputIndices[0], x); } else { - CoeffReturnType values[packetSize]; + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; internal::pstore(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; -- cgit v1.2.3 From 2dde63499c4ef836a0d9dfd443494d863ad62b16 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Oct 2014 16:33:51 -0700 Subject: Generalized the matrix vector product code. --- Eigen/src/Core/GeneralProduct.h | 32 +-- Eigen/src/Core/products/GeneralMatrixVector.h | 246 ++++++++++++----------- Eigen/src/Core/products/TriangularMatrixVector.h | 46 +++-- Eigen/src/Core/products/TriangularSolverVector.h | 24 ++- Eigen/src/Core/util/BlasUtil.h | 47 ++++- 5 files changed, 228 insertions(+), 167 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 7179eb124..9d3d5562c 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -11,7 +11,7 @@ #ifndef EIGEN_GENERAL_PRODUCT_H #define EIGEN_GENERAL_PRODUCT_H -namespace Eigen { +namespace Eigen { /** \class GeneralProduct * \ingroup Core_Module @@ -257,7 +257,7 @@ class GeneralProduct : public ProductBase, Lhs, Rhs> { template struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {}; - + public: EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) @@ -266,7 +266,7 @@ class GeneralProduct EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) } - + struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; @@ -277,12 +277,12 @@ class GeneralProduct dst.const_cast_derived() += m_scale * src; } }; - + template inline void evalTo(Dest& dest) const { internal::outer_product_selector_run(*this, dest, set(), IsRowMajor()); } - + template inline void addTo(Dest& dest) const { internal::outer_product_selector_run(*this, dest, add(), IsRowMajor()); @@ -436,12 +436,12 @@ template<> struct gemv_selector bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), evalToDest ? 
dest.data() : static_dest.data()); - + if(!evalToDest) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN @@ -457,11 +457,13 @@ template<> struct gemv_selector MappedDest(actualDestPtr, dest.size()) = dest; } + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; general_matrix_vector_product - ::run( + ::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhs.data(), actualRhs.innerStride(), + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhs.data(), actualRhs.innerStride()), actualDestPtr, 1, compatibleAlpha); @@ -516,11 +518,13 @@ template<> struct gemv_selector Map(actualRhsPtr, actualRhs.size()) = actualRhs; } + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; general_matrix_vector_product - ::run( + ::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhsPtr, 1, + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhsPtr, 1), dest.data(), dest.innerStride(), actualAlpha); } @@ -594,7 +598,7 @@ MatrixBase::operator*(const MatrixBase &other) const #ifdef EIGEN_DEBUG_PRODUCT internal::product_type::debug(); #endif - + return Product(derived(), other.derived()); } #else diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 340c51394..7dfa48bfb 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H #define EIGEN_GENERAL_MATRIX_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -48,17 +48,17 @@ namespace internal { * // we currently fall back to the NoneAligned case * * The same reasoning apply for the transposed case. - * + * * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow * compared to unaligned loads on a 4 byte boundary. 
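The kernels below no longer take raw pointer/stride pairs; operands are read through lightweight mapper objects (const_blas_data_mapper, plus the BlasVectorMapper/BlasLinearMapper helpers added to BlasUtil.h further down). A rough sketch of the mapper interface as these kernels use it (internal API; the element and index types are chosen for illustration):

float read_through_mapper(const float* data, Eigen::DenseIndex outer_stride,
                          Eigen::DenseIndex i, Eigen::DenseIndex j)
{
  typedef Eigen::internal::const_blas_data_mapper<float, Eigen::DenseIndex, Eigen::ColMajor> Mapper;
  Mapper lhs(data, outer_stride);   // pointer + outer stride, as built in gemv_selector above

  // Storage-order aware scalar access: column-major, so this is data[i + j*outer_stride].
  float a = lhs(i, j);

  // The vectorized paths view one column through a VectorMapper and load packets
  // from it with an explicit alignment hint, e.g.
  //   col.template load<SomePacket, Eigen::Aligned>(i);
  Mapper::VectorMapper col = lhs.getVectorMapper(0, j);
  return a + col(i);                // col(i) addresses the same element as lhs(i, j)
}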
* */ -template -struct general_matrix_vector_product +template +struct general_matrix_vector_product { -typedef typename scalar_product_traits::ReturnType ResScalar; + typedef typename scalar_product_traits::ReturnType ResScalar; enum { Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable @@ -78,17 +78,17 @@ typedef typename conditional::type ResPacket; EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { @@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&res[j]), \ padd( \ - padd(pcj.pmul(EIGEN_CAT(ploa , A0)(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ploa , A13)(&lhs1[j]), ptmp1)), \ - padd(pcj.pmul(EIGEN_CAT(ploa , A2)(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ploa , A13)(&lhs3[j]), ptmp3)) ))) + padd(pcj.pmul(lhs0.template load(j), ptmp0), \ + pcj.pmul(lhs1.template load(j), ptmp1)), \ + padd(pcj.pmul(lhs2.template load(j), ptmp2), \ + pcj.pmul(lhs3.template load(j), ptmp3)) ))) + + typedef typename LhsMapper::VectorMapper LhsScalars; conj_helper cj; conj_helper pcj; @@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size= cols) || LhsPacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -178,20 +182,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(alpha*rhs[i*rhsIncr]), - ptmp1 = pset1(alpha*rhs[(i+offset1)*rhsIncr]), - ptmp2 = pset1(alpha*rhs[(i+2)*rhsIncr]), - ptmp3 = pset1(alpha*rhs[(i+offset3)*rhsIncr]); + RhsPacket ptmp0 = pset1(alpha*rhs(i, 0)), + ptmp1 = pset1(alpha*rhs(i+offset1, 0)), + ptmp2 = pset1(alpha*rhs(i+2, 0)), + ptmp3 = pset1(alpha*rhs(i+offset3, 0)); // this helps a lot generating better binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), + lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); if (Vectorizable) { @@ -199,10 +203,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_productalignedStart) @@ -211,11 +215,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&lhs1[alignedStart-1]); - A02 = pload(&lhs2[alignedStart-2]); - A03 = pload(&lhs3[alignedStart-3]); + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); for (; j(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - A00 = pload(&lhs0[j]); - A10 = 
pload(&lhs0[j+LhsPacketSize]); + A00 = lhs0.template load(j); + A10 = lhs0.template load(j+LhsPacketSize); T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); T1 = pcj.pmadd(A10, ptmp0, pload(&res[j+ResPacketSize])); T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = pload(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = pload(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); T0 = pcj.pmadd(A03, ptmp3, T0); pstore(&res[j],T0); - A03 = pload(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); T1 = pcj.pmadd(A11, ptmp1, T1); T1 = pcj.pmadd(A12, ptmp2, T1); T1 = pcj.pmadd(A13, ptmp3, T1); @@ -254,12 +258,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(alpha*rhs[k*rhsIncr]); - const LhsScalar* lhs0 = lhs + k*lhsStride; + RhsPacket ptmp0 = pset1(alpha*rhs(k, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, k); if (Vectorizable) { /* explicit vectorization */ // process first unaligned result's coeffs for (Index j=0; j(alignedStart)) for (Index i = alignedStart;i(&lhs0[i]), ptmp0, pload(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load(i), ptmp0, pload(&res[i]))); else for (Index i = alignedStart;i(&lhs0[i]), ptmp0, pload(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load(i), ptmp0, pload(&res[i]))); } // process remaining scalars (or all if no explicit vectorization) for (Index i=alignedSize; i -struct general_matrix_vector_product +template +struct general_matrix_vector_product { typedef typename scalar_product_traits::ReturnType ResScalar; @@ -346,67 +350,69 @@ typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - + EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - EIGEN_UNUSED_VARIABLE(rhsIncr); - eigen_internal_assert(rhsIncr==1); - + eigen_internal_assert(rhs.stride()==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - RhsPacket b = pload(&rhs[j]); \ - ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) (&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) (&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)(&lhs3[j]), b, ptmp3); } + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ + RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); \ + ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ + ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ + ptmp3 = pcj.pmadd(lhs3.template load(j), b, ptmp3); } conj_helper cj; conj_helper pcj; + typedef typename LhsMapper::VectorMapper LhsScalars; + enum { AllAligned=0, EvenAligned=1, 
FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; const Index RhsPacketAlignedMask = RhsPacketSize-1; const Index LhsPacketAlignedMask = LhsPacketSize-1; -// const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; + const Index lhsStride = lhs.stride(); // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. - Index alignedStart = internal::first_aligned(rhs, depth); + Index alignedStart = rhs.firstAligned(depth); Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth); + const Index lhsAlignmentOffset = lhs.firstAligned(depth); + const Index rhsAlignmentOffset = rhs.firstAligned(rows); // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (lhsAlignmentOffset < 0) || (rhsAlignmentOffset < 0) ) { alignedSize = 0; alignedStart = 0; @@ -418,7 +424,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth= rows) || LhsPacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -447,8 +453,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_productalignedStart) @@ -481,11 +487,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&lhs1[alignedStart-1]); - A02 = pload(&lhs2[alignedStart-2]); - A03 = pload(&lhs3[alignedStart-3]); + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); for (; j(&rhs[j]); - A11 = pload(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); + A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - ptmp0 = pcj.pmadd(pload(&lhs0[j]), b, ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = pload(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = pload(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = 
pload(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - b = pload(&rhs[j+RhsPacketSize]); - ptmp0 = pcj.pmadd(pload(&lhs0[j+LhsPacketSize]), b, ptmp0); + b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); + ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } for (; j(tmp0); - const LhsScalar* lhs0 = lhs + i*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; jalignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned(alignedStart)) for (Index j = alignedStart;j(&lhs0[j]), pload(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); else for (Index j = alignedStart;j(&lhs0[j]), pload(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); tmp0 += predux(ptmp0); } // process remaining scalars // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); - + typedef Map, 0, InnerStride<> > RhsMap; const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); typename conj_expr_if::type cjRhs(rhs); @@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product > ResMap; ResMap res(_res,rows); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + for (Index pi=0; pi0) { Index s = IsLower ? pi+actualPanelWidth : 0; - general_matrix_vector_product::run( + general_matrix_vector_product::run( r, actualPanelWidth, - &lhs.coeffRef(s,pi), lhsStride, - &rhs.coeffRef(pi), rhsIncr, + LhsMapper(&lhs.coeffRef(s,pi), lhsStride), + RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr, alpha); } } if((!IsLower) && cols>size) { - general_matrix_vector_product::run( + general_matrix_vector_product::run( rows, cols-size, - &lhs.coeffRef(0,size), lhsStride, - &rhs.coeffRef(size), rhsIncr, + LhsMapper(&lhs.coeffRef(0,size), lhsStride), + RhsMapper(&rhs.coeffRef(size), rhsIncr), _res, resIncr, alpha); } } @@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product, 0, InnerStride<> > ResMap; ResMap res(_res,rows,InnerStride<>(resIncr)); - + + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + for (Index pi=0; pi0) { Index s = IsLower ? 
0 : pi + actualPanelWidth; - general_matrix_vector_product::run( + general_matrix_vector_product::run( actualPanelWidth, r, - &lhs.coeffRef(pi,s), lhsStride, - &rhs.coeffRef(s), rhsIncr, + LhsMapper(&lhs.coeffRef(pi,s), lhsStride), + RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr, alpha); } } if(IsLower && rows>diagSize) { - general_matrix_vector_product::run( + general_matrix_vector_product::run( rows-diagSize, cols, - &lhs.coeffRef(diagSize,0), lhsStride, - &rhs.coeffRef(0), rhsIncr, + LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride), + RhsMapper(&rhs.coeffRef(0), rhsIncr), &res.coeffRef(diagSize), resIncr, alpha); } } @@ -184,7 +190,7 @@ struct TriangularProduct template void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); - + internal::trmv_selector<(int(internal::traits::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha); } }; @@ -211,7 +217,7 @@ struct TriangularProduct namespace internal { // TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same. - + template<> struct trmv_selector { template @@ -247,7 +253,7 @@ template<> struct trmv_selector bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), @@ -267,7 +273,7 @@ template<> struct trmv_selector else MappedDest(actualDestPtr, dest.size()) = dest; } - + internal::triangular_matrix_vector_product struct trmv_selector #endif Map(actualRhsPtr, actualRhs.size()) = actualRhs; } - + internal::triangular_matrix_vector_product ::run(size, _lhs, lhsStride, rhs); } }; - + // forward and backward substitution, row-major, rhs is a vector template struct triangular_solve_vector @@ -37,6 +37,10 @@ struct triangular_solve_vector, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typename internal::conditional< Conjugate, const CwiseUnaryOp,LhsMap>, @@ -58,10 +62,10 @@ struct triangular_solve_vector::run( + general_matrix_vector_product::run( actualPanelWidth, r, - &lhs.coeffRef(startRow,startCol), lhsStride, - rhs + startCol, 1, + LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride), + RhsMapper(rhs + startCol, 1), rhs + startRow, 1, RhsScalar(-1)); } @@ -72,7 +76,7 @@ struct triangular_solve_vector0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - + if(!(Mode & UnitDiag)) rhs[i] /= cjLhs(i,i); } @@ -91,6 +95,8 @@ struct triangular_solve_vector, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; typename internal::conditional,LhsMap>, const LhsMap& @@ -122,10 +128,10 @@ struct triangular_solve_vector::run( + general_matrix_vector_product::run( r, actualPanelWidth, - &lhs.coeffRef(endBlock,startBlock), lhsStride, - rhs+startBlock, 1, + LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), + RhsMapper(rhs+startBlock, 1), rhs+endBlock, 1, RhsScalar(-1)); } } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 25a62d528..c4881b8da 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -34,7 +34,9 @@ 
template< int ResStorageOrder> struct general_matrix_matrix_product; -template +template struct general_matrix_vector_product; @@ -118,13 +120,35 @@ template struct get_factor::R }; +template +class BlasVectorMapper { + public: + EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_data[i]; + } + template + EIGEN_ALWAYS_INLINE Packet load(Index i) const { + return ploadt(m_data + i); + } + + template + bool aligned(Index i) const { + return (size_t(m_data+i)%sizeof(Packet))==0; + } + + protected: + Scalar* m_data; +}; + template -class MatrixLinearMapper { +class BlasLinearMapper { public: typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - EIGEN_ALWAYS_INLINE MatrixLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); @@ -157,7 +181,8 @@ class blas_data_mapper { typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - typedef MatrixLinearMapper LinearMapper; + typedef BlasLinearMapper LinearMapper; + typedef BlasVectorMapper VectorMapper; EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} @@ -170,6 +195,11 @@ class blas_data_mapper { return LinearMapper(&operator()(i, j)); } + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(&operator()(i, j)); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; @@ -193,6 +223,15 @@ class blas_data_mapper { return pgather(&operator()(i, j), m_stride); } + const Index stride() const { return m_stride; } + + Index firstAligned(Index size) const { + if (size_t(m_data)%sizeof(Scalar)) { + return -1; + } + return internal::first_aligned(m_data, size); + } + protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; -- cgit v1.2.3 From 509e4ddc02e0d70b8c1ee325f3b18624d4235c1e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 19 Nov 2014 10:34:11 -0800 Subject: Added reduction packet primitives for CUDA --- Eigen/src/Core/arch/CUDA/PacketMath.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 7b481d512..19749c832 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -223,6 +223,27 @@ template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { return a.x; } +template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { + return fmin(a.x, a.y); +} + template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); } -- cgit v1.2.3 From 
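The CUDA reductions added above mirror the horizontal operations already available on CPU packet types. A minimal device-side sketch (illustrative only; float4 is the CUDA packet type for float per the specializations in this file):

__device__ float horizontal_ops(const float4& v, float* mx, float* mn)
{
  using namespace Eigen::internal;
  *mx = predux_max(v);   // max of the four lanes
  *mn = predux_min(v);   // min of the four lanes
  return predux(v);      // sum of the four lanes
}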
80ed5bd90c245655ce0f892f6a679a0278ccbbab Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Dec 2014 12:49:30 +0100 Subject: Workaround various "returning reference to temporary" warnings. --- Eigen/SparseCore | 7 ------- Eigen/src/Core/CoreEvaluators.h | 4 +++- Eigen/src/Core/util/Constants.h | 3 +++ Eigen/src/SparseCore/CompressedStorage.h | 2 +- Eigen/src/SparseCore/SparseMatrix.h | 4 ++-- 5 files changed, 9 insertions(+), 11 deletions(-) (limited to 'Eigen') diff --git a/Eigen/SparseCore b/Eigen/SparseCore index b68c8fa8a..d5c0f6271 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -26,13 +26,6 @@ * This module depends on: Core. */ -namespace Eigen { - -/** The type used to identify a general sparse storage. */ -struct Sparse {}; - -} - #include "src/SparseCore/SparseUtil.h" #include "src/SparseCore/SparseMatrixBase.h" #include "src/SparseCore/SparseAssign.h" diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index a0dc72c4d..1c7123b85 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1221,7 +1221,9 @@ struct evaluator > typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; + // FIXME having to check whether ArgType is sparse here i not very nice. + typedef typename internal::conditional::value, + typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType; EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const { diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 5c7d70af6..9b40093f0 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -449,6 +449,9 @@ enum Action {GetAction, SetAction}; /** The type used to identify a dense storage. */ struct Dense {}; +/** The type used to identify a general sparse storage. */ +struct Sparse {}; + /** The type used to identify a permutation storage. 
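The signature changes in the hunks below address a concrete failure mode: accessors such as CompressedStorage::atInRange and SparseMatrix::coeff may have to return a default Scalar(0) that does not live anywhere in the matrix, so returning it by const reference hands the caller a reference to a temporary. A small sketch (assumes <Eigen/Sparse> is included):

double read_coeff(const Eigen::SparseMatrix<double>& m, int i, int j)
{
  // Fine after this change: coeff() returns by value, so an implicit zero is
  // simply a fresh 0.0.
  return m.coeff(i, j);
  // With the old "const Scalar&" return type, binding the result to a reference
  // could dangle whenever (i, j) addressed an implicit zero:
  //   const double& v = m.coeff(i, j);
}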
*/ struct PermutationStorage {}; diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index 2741f8292..99f741138 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -143,7 +143,7 @@ class CompressedStorage } /** Like at(), but the search is performed in the range [start,end) */ - inline const Scalar& atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const + inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar &defaultValue = Scalar(0)) const { if (start>=end) return defaultValue; diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 4c79c7dc3..93677c786 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -179,7 +179,7 @@ class SparseMatrix /** \returns the value of the matrix at position \a i, \a j * This function returns Scalar(0) if the element is an explicit \em zero */ - inline const Scalar& coeff(Index row, Index col) const + inline Scalar coeff(Index row, Index col) const { eigen_assert(row>=0 && row=0 && col > operator const SparseMatrixType&() const { return *m_matrix; } typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType; - CoeffReturnType coeff(Index row, Index col) const + Scalar coeff(Index row, Index col) const { return m_matrix->coeff(row,col); } Scalar& coeffRef(Index row, Index col) -- cgit v1.2.3 From 30c849669d6d29f6e19484478639ec5176c1826a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 14:45:04 +0100 Subject: Fix dynamic allocation in JacobiSVD (regression) --- Eigen/src/SVD/JacobiSVD.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 0f7e5b8fe..444187ae7 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -628,6 +628,7 @@ template class JacobiSVD internal::qr_preconditioner_impl m_qr_precond_morecols; internal::qr_preconditioner_impl m_qr_precond_morerows; + MatrixType m_scaledMatrix; }; template @@ -674,8 +675,9 @@ void JacobiSVD::allocate(Index rows, Index cols, u : 0); m_workMatrix.resize(m_diagSize, m_diagSize); - if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); - if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); + if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_cols!=m_cols) m_scaledMatrix.resize(rows,cols); } template @@ -698,7 +700,13 @@ JacobiSVD::compute(const MatrixType& matrix, unsig /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */ - if(!m_qr_precond_morecols.run(*this, matrix/scale) && !m_qr_precond_morerows.run(*this, matrix/scale)) + if(m_rows!=m_cols) + { + m_scaledMatrix = matrix / scale; + m_qr_precond_morecols.run(*this, m_scaledMatrix); + m_qr_precond_morerows.run(*this, m_scaledMatrix); + } + else { m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale; if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows); -- cgit v1.2.3 From 7f7a71206267014f33a175322789977047ced24f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 15:02:25 +0100 Subject: Optimize Simplicial Cholesky when NaturalOrdering is used. 
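The optimization below targets solves that request no fill-reducing permutation: with NaturalOrdering the input can be consumed (almost) as-is instead of being twisted and copied. A usage sketch, assuming <Eigen/SparseCholesky> is included, A is symmetric positive definite, and the third template argument of SimplicialLLT selects the ordering:

typedef Eigen::SparseMatrix<double> SpMat;

Eigen::VectorXd solve_natural_ordering(const SpMat& A, const Eigen::VectorXd& b)
{
  // Only the lower triangular part of A is referenced; NaturalOrdering skips the
  // AMD permutation, which is exactly the fast path introduced by this change.
  Eigen::SimplicialLLT<SpMat, Eigen::Lower, Eigen::NaturalOrdering<int> > llt(A);
  return llt.solve(b);
}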
--- Eigen/src/SparseCholesky/SimplicialCholesky.h | 35 ++++++++++++++++++--------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 918a34e13..1928670a3 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -607,22 +607,35 @@ void SimplicialCholeskyBase::ordering(const MatrixType& a, CholMatrixTy { eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); - // Note that amd compute the inverse permutation + // Note that ordering methods compute the inverse permutation + if(!internal::is_same >::value) { - CholMatrixType C; - C = a.template selfadjointView(); + { + CholMatrixType C; + C = a.template selfadjointView(); + + OrderingType ordering; + ordering(C,m_Pinv); + } + + if(m_Pinv.size()>0) m_P = m_Pinv.inverse(); + else m_P.resize(0); - OrderingType ordering; - ordering(C,m_Pinv); + ap.resize(size,size); + ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); } - - if(m_Pinv.size()>0) - m_P = m_Pinv.inverse(); else + { + m_Pinv.resize(0); m_P.resize(0); - - ap.resize(size,size); - ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + if(UpLo==Lower) + { + ap.resize(size,size); + ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + } + else + ap = a.template triangularView(); + } } } // end namespace Eigen -- cgit v1.2.3 From bea36925dbdd753c861d650623ae2692bb9de812 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 16:26:53 +0100 Subject: bug #876: implement a portable log1p function --- Eigen/src/Core/MathFunctions.h | 85 ++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 33 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 4c5fc1cae..72d6acfc1 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -14,7 +14,7 @@ namespace Eigen { // On WINCE, std::abs is defined for int only, so let's defined our own overloads: // This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too. 
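The point of a dedicated log1p is numerical: for |x| far below 1, evaluating log(1 + x) directly cancels almost all significant digits, which both the std::log1p branch and the x * log(x1p) / (x1p - 1) fallback introduced below avoid. A small sketch of the difference (the value of x is illustrative):

#include <cmath>
#include <Eigen/Core>

void log1p_motivation_sketch()
{
  double x = 1e-17;
  double naive  = std::log(1.0 + x);          // 1.0 + x rounds to 1.0, so this is 0.0
  double stable = Eigen::numext::log1p(x);    // ~1e-17, no cancellation
  (void)naive; (void)stable;
}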
-#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500 +#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500 long abs(long x) { return (labs(x)); } double abs(double x) { return (fabs(x)); } float abs(float x) { return (fabsf(x)); } @@ -360,50 +360,31 @@ inline NewType cast(const OldType& x) } /**************************************************************************** -* Implementation of atanh2 * +* Implementation of logp1 * ****************************************************************************/ template -struct atanh2_impl +struct log1p_impl { - static inline Scalar run(const Scalar& x, const Scalar& r) + static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - #if (__cplusplus >= 201103L) && !defined(__CYGWIN__) + // Let's be conservative and enable the default C++11 implementation only if we are sure it exists + #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC) using std::log1p; - return log1p(2 * x / (r - x)) / 2; + return log1p(x); #else - using std::abs; + typedef typename NumTraits::Real RealScalar; using std::log; - using std::sqrt; - Scalar z = x / r; - if (r == 0 || abs(z) > sqrt(NumTraits::epsilon())) - return log((r + x) / (r - x)) / 2; - else - return z + z*z*z / 3; + Scalar x1p = RealScalar(1) + x; + return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); #endif } }; -template -struct atanh2_impl > -{ - typedef std::complex Scalar; - static inline Scalar run(const Scalar& x, const Scalar& r) - { - using std::log; - using std::norm; - using std::sqrt; - Scalar z = x / r; - if (r == Scalar(0) || norm(z) > NumTraits::epsilon()) - return RealScalar(0.5) * log((r + x) / (r - x)); - else - return z + z*z*z / RealScalar(3); - } -}; - template -struct atanh2_retval +struct log1p_retval { typedef Scalar type; }; @@ -680,9 +661,9 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& template EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) +inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); + return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); } template @@ -727,6 +708,44 @@ inline int log2(int x) } // end namespace numext + +namespace internal { + +/**************************************************************************** +* Implementation of atanh2 * +****************************************************************************/ + +template +struct atanh2_impl +{ + static inline Scalar run(const Scalar& x, const Scalar& r) + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + typedef typename NumTraits::Real RealScalar; + return numext::log1p(RealScalar(2) * x / (r - x)) / RealScalar(2); + } +}; + +template +struct atanh2_retval +{ + typedef Scalar type; +}; + + +} // end namespace internal + +namespace numext { + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) +{ + return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); +} + +} // end namespace numext + namespace internal { /**************************************************************************** -- cgit v1.2.3 From 437191186165005e0b619c069239ac2913fd2c41 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 16:44:34 +0100 Subject: Remove useless and non 
standard numext::atanh2 function. --- Eigen/src/Core/MathFunctions.h | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 72d6acfc1..16ad2dc7e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -708,44 +708,6 @@ inline int log2(int x) } // end namespace numext - -namespace internal { - -/**************************************************************************** -* Implementation of atanh2 * -****************************************************************************/ - -template -struct atanh2_impl -{ - static inline Scalar run(const Scalar& x, const Scalar& r) - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - typedef typename NumTraits::Real RealScalar; - return numext::log1p(RealScalar(2) * x / (r - x)) / RealScalar(2); - } -}; - -template -struct atanh2_retval -{ - typedef Scalar type; -}; - - -} // end namespace internal - -namespace numext { - -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) -{ - return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); -} - -} // end namespace numext - namespace internal { /**************************************************************************** -- cgit v1.2.3 From a910a7466e143502439606572367849cb09ff5bf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 17:55:31 +0100 Subject: Fix inner iterator type --- Eigen/src/SparseCholesky/SimplicialCholesky_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 7aaf702be..b7fd62faa 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -126,7 +126,7 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& Index top = size; // stack for pattern is empty tags[k] = k; // mark node k as visited m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L - for(typename MatrixType::InnerIterator it(ap,k); it; ++it) + for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it) { Index i = it.index(); if(i <= k) -- cgit v1.2.3 From 41a20994cce4d7e2c49bbb958a43c9ed69473f7f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 17:56:33 +0100 Subject: In simplicial cholesky: avoid deep copy of the input matrix is this later can be used readily --- Eigen/src/SparseCholesky/SimplicialCholesky.h | 68 +++++++++++++++++++++------ 1 file changed, 53 insertions(+), 15 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 1928670a3..22325d7f4 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -17,6 +17,27 @@ enum SimplicialCholeskyMode { SimplicialCholeskyLDLT }; +namespace internal { + template + struct simplicial_cholesky_grab_input { + typedef CholMatrixType const * ConstCholMatrixPtr; + static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp) + { + tmp = input; + pmat = &tmp; + } + }; + + template + struct simplicial_cholesky_grab_input { + typedef MatrixType const * ConstMatrixPtr; + static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/) + { + pmat = &input; + } + }; +} // end namespace internal + /** \ingroup 
SparseCholesky_Module * \brief A direct sparse Cholesky factorizations * @@ -46,6 +67,7 @@ class SimplicialCholeskyBase : public SparseSolverBase typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::Index Index; typedef SparseMatrix CholMatrixType; + typedef CholMatrixType const * ConstCholMatrixPtr; typedef Matrix VectorType; public: @@ -169,10 +191,11 @@ class SimplicialCholeskyBase : public SparseSolverBase { eigen_assert(matrix.rows()==matrix.cols()); Index size = matrix.cols(); - CholMatrixType ap(size,size); - ordering(matrix, ap); - analyzePattern_preordered(ap, DoLDLT); - factorize_preordered(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(matrix, pmat, tmp); + analyzePattern_preordered(*pmat, DoLDLT); + factorize_preordered(*pmat); } template @@ -180,9 +203,21 @@ class SimplicialCholeskyBase : public SparseSolverBase { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); - factorize_preordered(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + + if(m_P.size()==0 && (UpLo&Upper)==Upper) + { + // If there is no ordering, try to directly use the input matrix without any copy + internal::simplicial_cholesky_grab_input::run(a, pmat, tmp); + } + else + { + tmp.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + pmat = &tmp; + } + + factorize_preordered(*pmat); } template @@ -192,13 +227,14 @@ class SimplicialCholeskyBase : public SparseSolverBase { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ordering(a, ap); - analyzePattern_preordered(ap,doLDLT); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(a, pmat, tmp); + analyzePattern_preordered(*pmat,doLDLT); } void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT); - void ordering(const MatrixType& a, CholMatrixType& ap); + void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap); /** keeps off-diagonal entries; drops diagonal entries */ struct keep_diag { @@ -603,10 +639,11 @@ public: }; template -void SimplicialCholeskyBase::ordering(const MatrixType& a, CholMatrixType& ap) +void SimplicialCholeskyBase::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap) { eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); + pmat = &ap; // Note that ordering methods compute the inverse permutation if(!internal::is_same >::value) { @@ -628,13 +665,14 @@ void SimplicialCholeskyBase::ordering(const MatrixType& a, CholMatrixTy { m_Pinv.resize(0); m_P.resize(0); - if(UpLo==Lower) + if(UpLo==Lower || MatrixType::IsRowMajor) { + // we have to transpose the lower part to to the upper one ap.resize(size,size); - ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + ap.template selfadjointView() = a.template selfadjointView(); } else - ap = a.template triangularView(); + internal::simplicial_cholesky_grab_input::run(a, pmat, ap); } } -- cgit v1.2.3 From 0efaff9b3b261daaa91baf8935ec7c4f5156a647 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 11 Dec 2014 16:15:20 +0100 Subject: Fix out-of-bounds write --- Eigen/src/SparseCore/AmbiVector.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 5c9c3101e..76ef25f7d 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ 
b/Eigen/src/SparseCore/AmbiVector.h @@ -69,7 +69,7 @@ class AmbiVector delete[] m_buffer; if (size<1000) { - Index allocSize = (size * sizeof(ListEl))/sizeof(Scalar); + Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar); m_allocatedElements = (allocSize*sizeof(Scalar))/sizeof(ListEl); m_buffer = new Scalar[allocSize]; } @@ -88,7 +88,7 @@ class AmbiVector Index copyElements = m_allocatedElements; m_allocatedElements = (std::min)(Index(m_allocatedElements*1.5),m_size); Index allocSize = m_allocatedElements * sizeof(ListEl); - allocSize = allocSize/sizeof(Scalar) + (allocSize%sizeof(Scalar)>0?1:0); + allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer; -- cgit v1.2.3 From 7dad5f797e8c270be5f32aee154f6660df2242f5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 16 Dec 2014 13:33:43 +0100 Subject: bug #821: workaround MSVC 2013 issue with using Base::Base::operator= --- Eigen/src/Core/MapBase.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 3c67edae5..5d51548cd 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -243,7 +243,11 @@ template class MapBase return derived(); } - using Base::Base::operator=; + // In theory MapBase should not make a using Base::operator=, + // and thus we should directly do: using Base::Base::operator=; + // However, this would confuse recent MSVC 2013 (bug 821), and since MapBase + // has operator= to make ICC 11 happy, we can also make MSVC 2013 happy as follow: + using Base::operator=; }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS -- cgit v1.2.3 From b8d9eaa19bd8501e8007c4c3633904643b44617e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 13 Dec 2014 22:16:39 +0100 Subject: Use true compile time "if" for Transform::makeAffine --- Eigen/src/Geometry/Transform.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 7ebde6803..d33fc24db 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -78,6 +78,8 @@ struct traits > }; }; +template struct transform_make_affine; + } // end namespace internal /** \geometry_module \ingroup Geometry_Module @@ -246,8 +248,7 @@ public: inline Transform() { check_template_params(); - if (int(Mode)==Affine) - makeAffine(); + internal::transform_make_affine<(int(Mode)==Affine) ? 
Affine : AffineCompact>::run(m_matrix); } inline Transform(const Transform& other) @@ -610,11 +611,7 @@ public: */ void makeAffine() { - if(int(Mode)!=int(AffineCompact)) - { - matrix().template block<1,Dim>(Dim,0).setZero(); - matrix().coeffRef(Dim,Dim) = Scalar(1); - } + internal::transform_make_affine::run(m_matrix); } /** \internal @@ -1102,6 +1099,24 @@ Transform::fromPositionOrientationScale(const MatrixBas namespace internal { +template +struct transform_make_affine +{ + template + static void run(MatrixType &mat) + { + static const int Dim = MatrixType::ColsAtCompileTime-1; + mat.template block<1,Dim>(Dim,0).setZero(); + mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1); + } +}; + +template<> +struct transform_make_affine +{ + template static void run(MatrixType &) { } +}; + // selector needed to avoid taking the inverse of a 3x4 matrix template struct projective_transform_inverse -- cgit v1.2.3 From 25c7d9164f45119fa20dc6af2fa451d278c5f285 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 18 Dec 2014 22:58:15 +0100 Subject: bug #920: fix MSVC 2015 compilation issues --- Eigen/src/Core/MapBase.h | 11 +++++------ Eigen/src/Core/util/Macros.h | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 5d51548cd..1589cbaae 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -172,6 +172,7 @@ template class MapBase template class MapBase : public MapBase { + typedef MapBase ReadOnlyMapBase; public: typedef MapBase Base; @@ -239,15 +240,13 @@ template class MapBase EIGEN_DEVICE_FUNC Derived& operator=(const MapBase& other) { - Base::Base::operator=(other); + ReadOnlyMapBase::Base::operator=(other); return derived(); } - // In theory MapBase should not make a using Base::operator=, - // and thus we should directly do: using Base::Base::operator=; - // However, this would confuse recent MSVC 2013 (bug 821), and since MapBase - // has operator= to make ICC 11 happy, we can also make MSVC 2013 happy as follow: - using Base::operator=; + // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base, + // see bugs 821 and 920. + using ReadOnlyMapBase::Base::operator=; }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index bc26043d7..687ba41dd 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -73,7 +73,7 @@ /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC) - #define EIGEN_COMP_MSVC_STRICT 1 + #define EIGEN_COMP_MSVC_STRICT _MSC_VER #else #define EIGEN_COMP_MSVC_STRICT 0 #endif @@ -592,7 +592,7 @@ namespace Eigen { // just an empty macro ! 
#define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT +#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) -- cgit v1.2.3 From f5f6e2c6f46a8999ee36ce0c7adc62098d8d93d2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 19 Dec 2014 14:41:59 +0100 Subject: bug #921: fix utilization of bitwise operation on enums in first_aligned --- Eigen/src/Core/util/Memory.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index a54ccaedc..bacf236fb 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -523,9 +523,8 @@ template inline void conditional_aligned_delete_auto(T * template inline Index first_aligned(const Scalar* array, Index size) { - enum { PacketSize = packet_traits::size, - PacketAlignedMask = PacketSize-1 - }; + static const Index PacketSize = packet_traits::size; + static const Index PacketAlignedMask = PacketSize-1; if(PacketSize==1) { -- cgit v1.2.3 From 9f98650d0a82d4757afb4503ce6f2b6f61763463 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 6 Jan 2015 09:29:13 -0800 Subject: Ensured that contractions that can be reduced to a matrix vector product work correctly even when the input coefficients aren't aligned. --- Eigen/src/Core/products/GeneralMatrixVector.h | 8 +++-- unsupported/test/cxx11_tensor_contraction.cpp | 48 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 7dfa48bfb..7df6a6b1a 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -140,10 +140,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4) { @@ -412,10 +413,13 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4) { diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 2b599d30d..17bd335f7 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -352,6 +352,52 @@ static void test_large_contraction() } +static void test_matrix_vector() +{ + Tensor t_left(30, 50); + Tensor t_right(50); + Tensor t_result(30); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 30, 50); + MapXf m_right(t_right.data(), 50, 1); + Eigen::Matrix m_result(30, 1); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims{{DimPair(1, 0)}}; + + // compute results by separate methods + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); + } +} + + +static void test_tensor_vector() +{ + Tensor t_left(7, 13, 17); + Tensor t_right(1, 7); + typedef typename Tensor::DimensionPair DimensionPair; + Eigen::array dim_pair01{{{0, 1}}}; + Tensor t_result = t_left.contract(t_right, dim_pair01); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 7, 13*17); + MapXf m_right(t_right.data(), 1, 7); + Eigen::Matrix m_result = m_left.transpose() * m_right.transpose(); + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result(i), 
m_result(i, 0)); + } +} + + void test_cxx11_tensor_contraction() { CALL_SUBTEST(test_evals()); @@ -364,4 +410,6 @@ void test_cxx11_tensor_contraction() CALL_SUBTEST(test_out_of_order_contraction()); CALL_SUBTEST(test_consistency()); CALL_SUBTEST(test_large_contraction()); + CALL_SUBTEST(test_matrix_vector()); + CALL_SUBTEST(test_tensor_vector()); } -- cgit v1.2.3 From 79f4a59ed9ac3fc1a3b6e4516c2b3e04cec5f522 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 7 Jan 2015 09:41:56 +0100 Subject: bug #907: fix compilation with ARM64 --- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6c5c669a1..586fa95e5 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -597,7 +597,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_low_f64(a) + vget_high_f64(a); } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { @@ -613,7 +613,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) } // Other reduction functions: // mul -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_low_f64(a) * vget_high_f64(a); } +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } // min template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } -- cgit v1.2.3 From 63974bcb88f23bd4768eb3232906f2b9f3c92fca Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 7 Jan 2015 09:44:25 +0100 Subject: Big 907: workaround some missing intrinsics in current NDK's gcc version (ARM64) --- Eigen/src/Core/arch/NEON/PacketMath.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 586fa95e5..d962e8adc 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -492,6 +492,21 @@ ptranspose(PacketBlock& kernel) { //---------- double ---------- #if EIGEN_ARCH_ARM64 +#if EIGEN_COMP_GNUC_STRICT && __ANDROID__ +// Bug 907: workaround missing declarations of the following two functions in the ADK +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_f64 (float64x2_t __a) +{ + return (uint64x2_t) __a; +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_f64_u64 (uint64x2_t __a) +{ + return (float64x2_t) __a; +} +#endif + typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; -- cgit v1.2.3 From 36f7c1337f2f73421325a9bdbe98dd53fd99951f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 13 Jan 2015 09:57:37 +0100 Subject: bug #907, ARM64: workaround vreinterpretq_u64_* not defined in xcode/clang --- Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index d962e8adc..29512e264 100644 --- 
a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -492,7 +492,7 @@ ptranspose(PacketBlock& kernel) { //---------- double ---------- #if EIGEN_ARCH_ARM64 -#if EIGEN_COMP_GNUC_STRICT && __ANDROID__ +#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) // Bug 907: workaround missing declarations of the following two functions in the ADK __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) vreinterpretq_u64_f64 (float64x2_t __a) -- cgit v1.2.3 From ae4644cc6827bb1b6d654ceed8b3a0c256b1d173 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 13 Jan 2015 10:03:00 +0100 Subject: bug #907, ARM64: workaround ICE in xcode/clang --- Eigen/src/Core/arch/NEON/PacketMath.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 29512e264..f83f8db0e 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -612,7 +612,12 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +// workaround ICE, see bug 907 +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } +#else template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +#endif template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { @@ -628,7 +633,11 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) } // Other reduction functions: // mul +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } +#else template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +#endif // min template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } -- cgit v1.2.3 From 279786e9875cc71c7bed1f78f8df1803be215904 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 13 Jan 2015 10:25:50 +0100 Subject: Fix missing evaluator in outer-product --- Eigen/src/Core/ProductEvaluators.h | 10 ++++++---- test/product_small.cpp | 10 ++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 3cebbbd12..488eee00c 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -211,24 +211,26 @@ template EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { typedef typename Dst::Index Index; + typename evaluator::type rhsEval(rhs); // FIXME make sure lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and rhs + // FIXME we should probably build an evaluator for dst const Index cols = dst.cols(); for (Index j=0; j EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { typedef typename Dst::Index Index; + typename evaluator::type lhsEval(lhs); // FIXME 
make sure rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and lhs + // FIXME we should probably build an evaluator for dst const Index rows = dst.rows(); for (Index i=0; i diff --git a/test/product_small.cpp b/test/product_small.cpp index 8b132abb6..091955a0f 100644 --- a/test/product_small.cpp +++ b/test/product_small.cpp @@ -9,6 +9,7 @@ #define EIGEN_NO_STATIC_ASSERT #include "product.h" +#include // regression test for bug 447 void product1x1() @@ -46,5 +47,14 @@ void test_product_small() Vector3f v = Vector3f::Random(); VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v); } + + { + // regression test for pull-request #93 + Eigen::Matrix A; A.setRandom(); + Eigen::Matrix B; B.setRandom(); + Eigen::Matrix C; C.setRandom(); + VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]); + VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C); + } #endif } -- cgit v1.2.3 From b9d314ae19b1c857adfe42b7eaecf2695428f0ed Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 17 Jan 2015 21:55:33 +0100 Subject: bug #329: fix typo --- Eigen/src/Eigenvalues/RealQZ.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index ae10ff91e..128ef9028 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -313,7 +313,7 @@ namespace Eigen { using std::abs; using std::sqrt; const Index dim=m_S.cols(); - if (abs(m_S.coeff(i+1,i)==Scalar(0))) + if (abs(m_S.coeff(i+1,i))==Scalar(0)) return; Index z = findSmallDiagEntry(i,i+1); if (z==i-1) -- cgit v1.2.3 From e1f1091fde660581d64b54ff1019bc494dbbca89 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 24 Jan 2015 10:32:49 +0100 Subject: Add support for dense ?= diagonal --- Eigen/src/Core/DiagonalMatrix.h | 6 ++++++ test/diagonalmatrices.cpp | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index e3dc71336..49b9b7925 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h @@ -326,6 +326,12 @@ struct Assignment dst.setZero(); dst.diagonal() = src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) + { dst.diagonal() += src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) + { dst.diagonal() -= src.diagonal(); } }; } // namespace internal diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp index 149f1db2f..0227ba577 100644 --- a/test/diagonalmatrices.cpp +++ b/test/diagonalmatrices.cpp @@ -84,6 +84,13 @@ template void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX(m1 * (rdm1 * s1), (m1 * rdm1) * s1); VERIFY_IS_APPROX(m1 * (s1 * rdm1), (m1 * rdm1) * s1); + + // Diagonal to dense + sq_m1.setRandom(); + sq_m2 = sq_m1; + VERIFY_IS_APPROX( (sq_m1 += (s1*v1).asDiagonal()), sq_m2 += (s1*v1).asDiagonal().toDenseMatrix() ); + VERIFY_IS_APPROX( (sq_m1 -= (s1*v1).asDiagonal()), sq_m2 -= (s1*v1).asDiagonal().toDenseMatrix() ); + VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() ); } void test_diagonalmatrices() -- cgit v1.2.3 From c6eb84aabcf102aaa3ba1c288e890984f4b49277 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Jan 2015 17:09:01 +0100 Subject: Enable vectorization of transposeInPlace for 
PacketSize x PacketSize matrices --- Eigen/src/Core/Transpose.h | 27 ++++++++++++++++++++++++--- test/adjoint.cpp | 22 ++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index a3b95256f..3bab6092c 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -217,18 +217,39 @@ MatrixBase::adjoint() const namespace internal { template + bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic, + bool MatchPacketSize = + (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits::size)) + && (internal::evaluator::Flags&PacketAccessBit) > struct inplace_transpose_selector; template -struct inplace_transpose_selector { // square matrix +struct inplace_transpose_selector { // square matrix static void run(MatrixType& m) { m.matrix().template triangularView().swap(m.matrix().transpose()); } }; +// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. template -struct inplace_transpose_selector { // non square matrix +struct inplace_transpose_selector { // PacketSize x PacketSize + static void run(MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename MatrixType::Index Index; + const Index PacketSize = internal::packet_traits::size; + const Index Alignment = internal::evaluator::Flags&AlignedBit ? Aligned : Unaligned; + PacketBlock A; + for (Index i=0; i(i,0); + internal::ptranspose(A); + for (Index i=0; i(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]); + } +}; + +template +struct inplace_transpose_selector { // non square matrix static void run(MatrixType& m) { if (m.rows()==m.cols()) m.matrix().template triangularView().swap(m.matrix().transpose()); diff --git a/test/adjoint.cpp b/test/adjoint.cpp index ea36f7841..3b2a53c91 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -64,6 +64,7 @@ template void adjoint(const MatrixType& m) typedef typename NumTraits::Real RealScalar; typedef Matrix VectorType; typedef Matrix SquareMatrixType; + const Index PacketSize = internal::packet_traits::size; Index rows = m.rows(); Index cols = m.cols(); @@ -108,6 +109,17 @@ template void adjoint(const MatrixType& m) VERIFY_IS_APPROX(m3,m1.transpose()); m3.transposeInPlace(); VERIFY_IS_APPROX(m3,m1); + + if(PacketSize(0,m3.rows()-PacketSize); + Index j = internal::random(0,m3.cols()-PacketSize); + m3.template block(i,j).transposeInPlace(); + VERIFY_IS_APPROX( (m3.template block(i,j)), (m1.template block(i,j).transpose()) ); + m3.template block(i,j).transposeInPlace(); + VERIFY_IS_APPROX(m3,m1); + } // check inplace adjoint m3 = m1; @@ -129,9 +141,19 @@ void test_adjoint() CALL_SUBTEST_1( adjoint(Matrix()) ); CALL_SUBTEST_2( adjoint(Matrix3d()) ); CALL_SUBTEST_3( adjoint(Matrix4f()) ); + CALL_SUBTEST_4( adjoint(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( adjoint(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( adjoint(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + + // Complement for 128 bits vectorization: + CALL_SUBTEST_8( adjoint(Matrix2d()) ); + CALL_SUBTEST_9( adjoint(Matrix()) ); + + // 256 bits vectorization: + CALL_SUBTEST_10( adjoint(Matrix()) ); + CALL_SUBTEST_11( 
adjoint(Matrix()) ); + CALL_SUBTEST_12( adjoint(Matrix()) ); } // test a large static matrix only once CALL_SUBTEST_7( adjoint(Matrix()) ); -- cgit v1.2.3 From a727a2c4edfa85303ad1ad406e65415187fbb770 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 28 Jan 2015 16:07:51 +0100 Subject: bug #933: RealSchur, do not consider the input matrix norm to check negligible sub-diag entries. This also makes this test consistent with the complex and self-adjoint cases. --- Eigen/src/Eigenvalues/RealSchur.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 10f5fb174..51e61ba38 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -234,7 +234,7 @@ template class RealSchur typedef Matrix Vector3s; Scalar computeNormOfT(); - Index findSmallSubdiagEntry(Index iu, const Scalar& norm); + Index findSmallSubdiagEntry(Index iu); void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift); void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo); void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector); @@ -286,7 +286,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa { while (iu >= 0) { - Index il = findSmallSubdiagEntry(iu, norm); + Index il = findSmallSubdiagEntry(iu); // Check for convergence if (il == iu) // One root found @@ -343,16 +343,14 @@ inline typename MatrixType::Scalar RealSchur::computeNormOfT() /** \internal Look for single small sub-diagonal element and returns its index */ template -inline typename MatrixType::Index RealSchur::findSmallSubdiagEntry(Index iu, const Scalar& norm) +inline typename MatrixType::Index RealSchur::findSmallSubdiagEntry(Index iu) { using std::abs; Index res = iu; while (res > 0) { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - if (s == 0.0) - s = norm; - if (abs(m_matT.coeff(res,res-1)) < NumTraits::epsilon() * s) + if (abs(m_matT.coeff(res,res-1)) <= NumTraits::epsilon() * s) break; res--; } @@ -457,9 +455,7 @@ inline void RealSchur::initFrancisQRStep(Index il, Index iu, const V const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2))); const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1))); if (abs(lhs) < NumTraits::epsilon() * rhs) - { break; - } } } -- cgit v1.2.3 From 9d82f7e30d53086d090ae13d69dffef771eb6263 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Jan 2015 17:24:40 +0100 Subject: Supernodes was disabled. 
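The default performance values set in initperfvalues() left panel_size at 1, so
the factorization was effectively working on single-column panels, which is what
disabled the supernodal updates; this restores the intended default of 16. The
constant is internal and there is nothing to change on the user side. For
reference, a minimal, illustrative SparseLU solve (the function name and setup
are only for the example):

    #include <Eigen/SparseLU>
    #include <Eigen/OrderingMethods>

    typedef Eigen::SparseMatrix<double> SpMat;

    // Illustrative sketch only: the panel and supernode sizes used internally
    // are tuning constants and are not set through this public interface.
    Eigen::VectorXd solve_with_sparselu(const SpMat& A, const Eigen::VectorXd& b)
    {
      Eigen::SparseLU<SpMat, Eigen::COLAMDOrdering<int> > lu;
      lu.analyzePattern(A); // symbolic step: fill-reducing ordering and structure
      lu.factorize(A);      // numerical factorization
      return lu.solve(b);
    }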
--- Eigen/src/SparseLU/SparseLU.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index d72d7f150..79b78da99 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -309,7 +309,7 @@ class SparseLU : public SparseSolverBase >, // Functions void initperfvalues() { - m_perfv.panel_size = 1; + m_perfv.panel_size = 16; m_perfv.relax = 1; m_perfv.maxsuper = 128; m_perfv.rowblk = 16; -- cgit v1.2.3 From f1092d2f736bbe541b3d8b5cca893f668f9c6b5f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Jan 2015 19:04:04 +0100 Subject: bug #941: fix accuracy issue in ColPivHouseholderQR, do not stop decomposition on a small pivot --- Eigen/src/QR/ColPivHouseholderQR.h | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index de77e8411..370cb69e3 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -476,20 +476,10 @@ ColPivHouseholderQR& ColPivHouseholderQR::compute(const // we store that back into our table: it can't hurt to correct our table. m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm; - // if the current biggest column is smaller than epsilon times the initial biggest column, - // terminate to avoid generating nan/inf values. - // Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so) - // repetitions of the unit test, with the result of solve() filled with large values of the order - // of 1/(size*epsilon). - if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) - { + // Track the number of meaningful pivots but do not stop the decomposition to make + // sure that the initial matrix is properly reproduced. See bug 941. + if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) m_nonzero_pivots = k; - m_hCoeffs.tail(size-k).setZero(); - m_qr.bottomRightCorner(rows-k,cols-k) - .template triangularView() - .setZero(); - break; - } // apply the transposition to the columns m_colsTranspositions.coeffRef(k) = biggest_col_index; @@ -518,7 +508,7 @@ ColPivHouseholderQR& ColPivHouseholderQR::compute(const } m_colsPermutation.setIdentity(PermIndexType(cols)); - for(PermIndexType k = 0; k < m_nonzero_pivots; ++k) + for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k) m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k))); m_det_pq = (number_of_transpositions%2) ? -1 : 1; @@ -574,13 +564,15 @@ struct Assignment >, interna } // end namespace internal -/** \returns the matrix Q as a sequence of householder transformations */ +/** \returns the matrix Q as a sequence of householder transformations. 
+ * You can extract the meaningful part only by using: + * \code qr.householderQ().setLength(qr.nonzeroPivots()) */ template typename ColPivHouseholderQR::HouseholderSequenceType ColPivHouseholderQR ::householderQ() const { eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots); + return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()); } #ifndef __CUDACC__ -- cgit v1.2.3 From 759bd92a85393617a56405ec0372e87416cfaebb Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:27:56 -0500 Subject: bug #935: Add asm comments in GEBP kernels to work around a bug in both GCC and Clang on ARM/NEON, whereby they spill registers, severely harming performance. The reason why the asm comments make a difference is that they prevent the compiler from reordering code across these boundaries, which has the effect of extending the lifetime of local variables and increasing register pressure on this register-tight code. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 160 ++++++++++++++-------- Eigen/src/Core/util/Macros.h | 8 +- 2 files changed, 108 insertions(+), 60 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 7b2ed6728..1b39642fb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -760,31 +760,36 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k for(Index k=0; k blB += pk*RhsProgress; blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1"); } // process remaining peeled loop @@ -963,21 +977,26 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*(2*Traits::LhsProgress); + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k for(Index k=0; k blB += pk*RhsProgress; blA += pk*2*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); } // process remaining peeled loop @@ -1137,16 +1165,21 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*1*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k for(Index k=0; k blB += pk*RhsProgress; blA += pk*1*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); } // process remaining peeled loop diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 687ba41dd..13f8fdd4e 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -160,6 +160,12 @@ #define EIGEN_ARCH_ARM64 0 #endif +#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64 + #define EIGEN_ARCH_ARM_OR_ARM64 1 +#else + #define EIGEN_ARCH_ARM_OR_ARM64 0 +#endif + /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -526,7 +532,7 @@ namespace Eigen { #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); #if !defined(EIGEN_ASM_COMMENT) - #if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64 + #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64) #define EIGEN_ASM_COMMENT(X) __asm__("#" X) #else #define EIGEN_ASM_COMMENT(X) -- cgit v1.2.3 From 
9f99f61e69a70e0a209d5f93e78a9257685cd70d Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:43:56 -0500 Subject: bug #936, patch 1/3: some cleanup and renaming for consistency. --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 4 ++-- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 18 +++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index fa02f57a1..27df5a025 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -22,8 +22,8 @@ namespace internal { #define EIGEN_HAS_FUSED_MADD 1 #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_FUSED_CJMADD +#define EIGEN_HAS_FUSED_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index f83f8db0e..5a6eb8c1d 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -24,8 +24,8 @@ namespace internal { #define EIGEN_HAS_FUSED_MADD 1 #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_FUSED_CJMADD +#define EIGEN_HAS_FUSED_CJMADD #endif // FIXME NEON has 16 quad registers, but since the current register allocator diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1b39642fb..ae2fd9006 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -120,8 +120,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) computeProductBlockingSizes(k, m, n); } -#ifdef EIGEN_HAS_FUSE_CJMADD - #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); +#ifdef EIGEN_HAS_FUSED_CJMADD + #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else // FIXME (a bit overkill maybe ?) @@ -146,8 +146,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) gebp_madd_selector::run(cj,a,b,c,t); } - #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); + #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); +// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif /* Vectorization logic @@ -1402,13 +1402,13 @@ void gebp_kernel B_0 = blB[0]; B_1 = blB[1]; - MADD(cj,A0,B_0,C0, B_0); - MADD(cj,A0,B_1,C1, B_1); + CJMADD(cj,A0,B_0,C0, B_0); + CJMADD(cj,A0,B_1,C1, B_1); B_0 = blB[2]; B_1 = blB[3]; - MADD(cj,A0,B_0,C2, B_0); - MADD(cj,A0,B_1,C3, B_1); + CJMADD(cj,A0,B_0,C2, B_0); + CJMADD(cj,A0,B_1,C3, B_1); blB += 4; } @@ -1434,7 +1434,7 @@ void gebp_kernel { LhsScalar A0 = blA[k]; RhsScalar B_0 = blB[k]; - MADD(cj, A0, B_0, C0, B_0); + CJMADD(cj, A0, B_0, C0, B_0); } res[(j2+0)*resStride + i] += alpha*C0; } -- cgit v1.2.3 From 340b8afb14bb06788570ba22ba4ccba674402f09 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 31 Jan 2015 14:15:57 -0500 Subject: bug #936, patch 1.5/3: rename _FUSED_ macros to _SINGLE_INSTRUCTION_, because this is what they are about. "Fused" means "no intermediate rounding between the mul and the add, only one rounding at the end". Instead, what we are concerned about here is whether a temporary register is needed, i.e. whether the MUL and ADD are separate instructions. 
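As a scalar illustration of that rounding distinction (illustrative only, not
part of this patch, using plain ISO C++):

    #include <cmath>

    // The fused form rounds once at the end; the separate multiply-then-add
    // rounds the product and then the sum, so the two results can differ in
    // the last bits (unless the compiler contracts a*b + c into an fma itself).
    double fused_vs_separate(double a, double b, double c)
    {
      double fused    = std::fma(a, b, c);
      double separate = a * b + c;
      return fused - separate; // may be nonzero
    }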
Concretely, on ARM NEON, a single-instruction mul-add is always available: VMLA. But a true fused mul-add is only available on VFPv4: VFMA. --- Eigen/src/Core/arch/AVX/PacketMath.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 8 ++++---- Eigen/src/Core/arch/NEON/PacketMath.h | 8 ++++---- Eigen/src/Core/arch/SSE/PacketMath.h | 4 ++-- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 12 ++++++------ 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e2376bd1f..1d8c674a6 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -23,8 +23,8 @@ namespace internal { #endif #ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif #endif diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 27df5a025..578b303a0 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -18,12 +18,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif -#ifndef EIGEN_HAS_FUSED_CJMADD -#define EIGEN_HAS_FUSED_CJMADD +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 5a6eb8c1d..9cfb9c358 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -20,12 +20,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif -#ifndef EIGEN_HAS_FUSED_CJMADD -#define EIGEN_HAS_FUSED_CJMADD +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // FIXME NEON has 16 quad registers, but since the current register allocator diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 28427c308..202aaa72f 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -23,8 +23,8 @@ namespace internal { #endif #ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif #endif diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index ae2fd9006..b5f06d831 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -120,7 +120,7 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) computeProductBlockingSizes(k, m, n); } -#ifdef EIGEN_HAS_FUSED_CJMADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else @@ -182,7 +182,7 @@ public: nr = 4, // register block size along the M direction (currently, this one cannot be modified) -#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) +#if 
defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -248,7 +248,7 @@ public: // let gcc allocate the register in which to store the result of the pmul // (in the case where there is no FMA) gcc fails to figure out how to avoid // spilling register. -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c = pmadd(a,b,c); #else @@ -290,7 +290,7 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, -#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -353,7 +353,7 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c.v = pmadd(a.v,b,c.v); #else @@ -637,7 +637,7 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c.v = pmadd(a,b.v,c.v); #else -- cgit v1.2.3 From 0f216136980503c3792a90e382b4d6bbdbb870c0 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:44:26 -0500 Subject: bug #936, patch 2/3: Remove EIGEN_VECTORIZE_FMA, was redundant with EIGEN_HAS_SINGLE_INSTRUCTION_MADD --- Eigen/Core | 4 +--- Eigen/src/Core/arch/AVX/PacketMath.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 2 +- Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- Eigen/src/Core/arch/SSE/PacketMath.h | 2 +- 5 files changed, 6 insertions(+), 8 deletions(-) (limited to 'Eigen') diff --git a/Eigen/Core b/Eigen/Core index dcb20bfd0..b5af63623 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -125,9 +125,7 @@ #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif - #ifdef __FMA__ - #define EIGEN_VECTORIZE_FMA - #endif + // include files // This extern "C" works around a MINGW-w64 compilation issue diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 1d8c674a6..485bac10b 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -22,9 +22,9 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 578b303a0..6b68fc7a5 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -19,7 +19,7 @@ namespace internal { #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9cfb9c358..71255ac85 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -21,7 +21,7 @@ namespace internal { #endif #ifndef 
EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 202aaa72f..3f6fb0254 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,7 +22,7 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif -- cgit v1.2.3 From 5ef95fabee5e9a9357c082cd32ae3b4affb2eff6 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:45:03 -0500 Subject: bug #936, patch 3/3: Properly detect FMA support on ARM (requires VFPv4) and use it instead of MLA when available, because it's both more accurate, and faster. --- Eigen/src/Core/arch/NEON/PacketMath.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 71255ac85..9afd86bec 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -177,8 +177,19 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co return pset1(0); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. +// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. +// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. +// MLA is not fused i.e. does 2 roundings. +// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): +// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +#endif + +// No FMA instruction for int, so use MLA unconditionally. template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } @@ -551,8 +562,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. See above comment about FMA for float. 
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } +#endif template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } -- cgit v1.2.3 From ebdf6a2dbbc66d0f6fb045a3c5b0023bc2e89851 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Feb 2015 22:32:34 +0100 Subject: SPQR: fix default threshold value --- Eigen/src/SPQRSupport/SuiteSparseQRSupport.h | 51 ++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 11 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 44f6a1acb..54a1b21b8 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -68,13 +68,13 @@ class SPQR : public SparseSolverBase > typedef Map > PermutationType; public: SPQR() - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); } explicit SPQR(const _MatrixType& matrix) - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); compute(matrix); @@ -99,10 +99,25 @@ class SPQR : public SparseSolverBase > if(m_isInitialized) SPQR_free(); MatrixType mat(matrix); + + /* Compute the default threshold as in MatLab, see: + * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing + * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 + */ + RealScalar pivotThreshold = m_tolerance; + if(m_useDefaultThreshold) + { + RealScalar max2Norm = 0.0; + for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm()); + if(max2Norm==RealScalar(0)) + max2Norm = RealScalar(1); + pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits::epsilon(); + } + cholmod_sparse A; A = viewAsCholmod(mat); Index col = matrix.cols(); - m_rank = SuiteSparseQR(m_ordering, m_tolerance, col, &A, + m_rank = SuiteSparseQR(m_ordering, pivotThreshold, col, &A, &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc); if (!m_cR) @@ -118,7 +133,7 @@ class SPQR : public SparseSolverBase > /** * Get the number of rows of the input matrix and the Q matrix */ - inline Index rows() const {return m_H->nrow; } + inline Index rows() const {return m_cR->nrow; } /** * Get the number of columns of the input matrix. 
@@ -130,16 +145,25 @@ class SPQR : public SparseSolverBase > { eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()"); eigen_assert(b.cols()==1 && "This method is for vectors only"); - + //Compute Q^T * b - typename Dest::PlainObject y; + typename Dest::PlainObject y, y2; y = matrixQ().transpose() * b; - // Solves with the triangular matrix R + + // Solves with the triangular matrix R Index rk = this->rank(); - y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView().solve(y.topRows(rk)); - y.bottomRows(cols()-rk).setZero(); + y2 = y; + y.resize((std::max)(cols(),Index(y.rows())),y.cols()); + y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView().solve(y2.topRows(rk)); + // Apply the column permutation - dest.topRows(cols()) = colsPermutation() * y.topRows(cols()); + // colsPermutation() performs a copy of the permutation, + // so let's apply it manually: + for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i); + for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero(); + +// y.bottomRows(y.rows()-rk).setZero(); +// dest = colsPermutation() * y.topRows(cols()); m_info = Success; } @@ -178,7 +202,11 @@ class SPQR : public SparseSolverBase > /// Set the fill-reducing ordering method to be used void setSPQROrdering(int ord) { m_ordering = ord;} /// Set the tolerance tol to treat columns with 2-norm < =tol as zero - void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; } + void setPivotThreshold(const RealScalar& tol) + { + m_useDefaultThreshold = false; + m_tolerance = tol; + } /** \returns a pointer to the SPQR workspace */ cholmod_common *cholmodCommon() const { return &m_cc; } @@ -210,6 +238,7 @@ class SPQR : public SparseSolverBase > mutable cholmod_dense *m_HTau; // The Householder coefficients mutable Index m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters + bool m_useDefaultThreshold; // Use default threshold template friend struct SPQR_QProduct; }; -- cgit v1.2.3 From b1eca55328436f9778254c6bbc8852910a0d3d2a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Feb 2015 23:46:05 +0100 Subject: Use Ref<> to ensure that both x and b in Ax=b are compatible with Umfpack/SuperLU expectations --- Eigen/src/SuperLUSupport/SuperLUSupport.h | 23 +++++++++++++++++++---- Eigen/src/UmfPackSupport/UmfPackSupport.h | 13 ++++++++++++- 2 files changed, 31 insertions(+), 5 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index ef73587a7..6de5b3dc5 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -627,8 +627,12 @@ void SuperLU::_solve_impl(const MatrixBase &b, MatrixBase m_sluFerr.resize(rhsCols); m_sluBerr.resize(rhsCols); - m_sluB = SluMatrix::Map(b.const_cast_derived()); - m_sluX = SluMatrix::Map(x.derived()); + + Ref > b_ref(b); + Ref > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -651,6 +655,10 @@ void SuperLU::_solve_impl(const MatrixBase &b, MatrixBase &m_sluFerr[0], &m_sluBerr[0], &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); + + if(&x.coeffRef(0) != x_ref.data()) + x = x_ref; + m_info = info==0 ? 
Success : NumericalIssue; } @@ -938,8 +946,12 @@ void SuperILU::_solve_impl(const MatrixBase &b, MatrixBase > b_ref(b); + Ref > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -962,6 +974,9 @@ void SuperILU::_solve_impl(const MatrixBase &b, MatrixBase::_solve_impl(const MatrixBase &b, MatrixBas eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); int errorCode; + Scalar* x_ptr = 0; + Matrix x_tmp; + if(x.innerStride()!=1) + { + x_tmp.resize(x.rows()); + x_ptr = x_tmp.data(); + } for (int j=0; j Date: Fri, 6 Feb 2015 02:51:59 -0800 Subject: Added the EIGEN_HAS_CONSTEXPR define Gate the tensor index list code based on the value of EIGEN_HAS_CONSTEXPR --- Eigen/src/Core/util/Macros.h | 6 ++++++ unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- unsupported/test/cxx11_tensor_index_list.cpp | 4 ++++ unsupported/test/cxx11_tensor_reduction.cpp | 8 ++++---- 5 files changed, 16 insertions(+), 6 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 001907a0b..40a28d4d6 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -133,6 +133,12 @@ #define EIGEN_HAS_VARIADIC_TEMPLATES 1 #endif +// Does the compiler support const expressions? +#if (defined(__plusplus) && __cplusplus >= 201402L) || \ + EIGEN_GNUC_AT_LEAST(4,9) +#define EIGEN_HAS_CONSTEXPR 1 +#endif + /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. * They currently include: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index c94ed977e..eed0a9f05 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -10,7 +10,7 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H #define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H -#if __cplusplus > 199711L +#ifdef EIGEN_HAS_CONSTEXPR namespace Eigen { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 83ba1df71..21416afe0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -53,7 +53,7 @@ struct preserve_inner_most_dims { static const bool value = false; }; -#if __cplusplus > 199711L +#ifdef EIGEN_HAS_CONSTEXPR template struct are_inner_most_dims{ static const bool value = indices_statically_known_to_increase()() && diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index d79a3ed45..c4d4f244f 100644 --- a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -11,6 +11,7 @@ #include +#ifdef EIGEN_HAS_CONSTEXPR static void test_static_index_list() { @@ -254,11 +255,14 @@ static void test_mixed_index_list() VERIFY_IS_APPROX(result3(0), expected); } +#endif void test_cxx11_tensor_index_list() { +#ifdef EIGEN_HAS_CONSTEXPR CALL_SUBTEST(test_static_index_list()); CALL_SUBTEST(test_type2index_list()); CALL_SUBTEST(test_dynamic_index_list()); CALL_SUBTEST(test_mixed_index_list()); +#endif } diff --git a/unsupported/test/cxx11_tensor_reduction.cpp 
b/unsupported/test/cxx11_tensor_reduction.cpp index 5c3184833..0269853a9 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -284,7 +284,7 @@ static void test_static_dims() { Tensor out(72, 97); in.setRandom(); -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; @@ -314,7 +314,7 @@ static void test_innermost_last_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 0; reduction_axis[1] = 1; @@ -345,7 +345,7 @@ static void test_innermost_first_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 2; reduction_axis[1] = 3; @@ -376,7 +376,7 @@ static void test_reduce_middle_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 2; -- cgit v1.2.3 From 74e460b9950503ef5a306337a136e1d37795deae Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Feb 2015 14:26:24 +0100 Subject: Fix symmetric product --- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 21f8175d2..860e233b9 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -374,7 +374,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix Date: Sat, 7 Feb 2015 22:00:46 +0100 Subject: Add a SparseCompressedBase class providing (un)compressed accessors (like data()/*Stride() for dense matrices), and a CompressedAccessBit flag (similar to DirectAccessBit for dense matrices). 
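For context, a minimal usage sketch of the new raw-buffer interface (illustration only, not part of this patch; it relies only on the accessors documented for CompressedAccessBit in the diff below):

    #include <Eigen/SparseCore>
    #include <iostream>
    #include <vector>

    int main()
    {
      typedef Eigen::SparseMatrix<double> SpMat;   // column-major, index type defaults to int
      typedef Eigen::Triplet<double> T;

      std::vector<T> triplets;
      triplets.push_back(T(0,0,1.0));
      triplets.push_back(T(1,2,2.0));
      triplets.push_back(T(2,1,3.0));

      SpMat A(3,3);
      A.setFromTriplets(triplets.begin(), triplets.end());
      A.makeCompressed();                // after this, innerNonZeroPtr() is the null pointer

      // Raw compressed-column buffers, as exposed through SparseCompressedBase:
      const double* values = A.valuePtr();
      const int*    inner  = A.innerIndexPtr();
      const int*    outer  = A.outerIndexPtr();

      for (int j = 0; j < A.outerSize(); ++j)
        for (int p = outer[j]; p < outer[j+1]; ++p)
          std::cout << "(" << inner[p] << "," << j << ") = " << values[p] << "\n";

      std::cout << "compressed: " << A.isCompressed()
                << "  nonZeros: " << A.nonZeros() << "\n";
      return 0;
    }

In compressed mode innerNonZeroPtr() returns the null pointer, which is exactly what the new isCompressed() helper in SparseCompressedBase tests.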
--- Eigen/SparseCore | 1 + Eigen/src/Core/util/Constants.h | 13 ++ Eigen/src/SparseCore/SparseCompressedBase.h | 199 ++++++++++++++++++++++++++++ Eigen/src/SparseCore/SparseMatrix.h | 131 +++--------------- Eigen/src/SparseCore/SparseUtil.h | 6 +- 5 files changed, 231 insertions(+), 119 deletions(-) create mode 100644 Eigen/src/SparseCore/SparseCompressedBase.h (limited to 'Eigen') diff --git a/Eigen/SparseCore b/Eigen/SparseCore index d5c0f6271..e2071519b 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -31,6 +31,7 @@ #include "src/SparseCore/SparseAssign.h" #include "src/SparseCore/CompressedStorage.h" #include "src/SparseCore/AmbiVector.h" +#include "src/SparseCore/SparseCompressedBase.h" #include "src/SparseCore/SparseMatrix.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 9b40093f0..d1855b50b 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -163,6 +163,19 @@ const unsigned int NestByRefBit = 0x100; * \sa \ref RowMajorBit, \ref TopicStorageOrders */ const unsigned int NoPreferredStorageOrderBit = 0x200; +/** \ingroup flags + * + * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format, + * that is, the expression provides: + * \code + inline const Scalar* valuePtr() const; + inline const Index* innerIndexPtr() const; + inline const Index* outerIndexPtr() const; + inline const Index* innerNonZeroPtr() const; + \endcode + */ +const unsigned int CompressedAccessBit = 0x400; + // list of flags that are inherited by default const unsigned int HereditaryBits = RowMajorBit diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h new file mode 100644 index 000000000..00658181e --- /dev/null +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -0,0 +1,199 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H +#define EIGEN_SPARSE_COMPRESSED_BASE_H + +namespace Eigen { + +template class SparseCompressedBase; + +namespace internal { + +template +struct traits > : traits +{}; + +} // end namespace internal + +template +class SparseCompressedBase + : public SparseMatrixBase +{ + public: + typedef SparseMatrixBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase) + using Base::operator=; + using Base::IsRowMajor; + + class InnerIterator; + class ReverseInnerIterator; + + /** \returns a const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline const Scalar* valuePtr() const { return derived().valuePtr(); } + /** \returns a non-const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline Scalar* valuePtr() { return derived().valuePtr(); } + + /** \returns a const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. 
+ * \sa valuePtr(), outerIndexPtr() */ + inline const Index* innerIndexPtr() const { return derived().innerIndexPtr(); } + /** \returns a non-const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), outerIndexPtr() */ + inline Index* innerIndexPtr() { return derived().innerIndexPtr(); } + + /** \returns a const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline const Index* outerIndexPtr() const { return derived().outerIndexPtr(); } + /** \returns a non-const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline Index* outerIndexPtr() { return derived().outerIndexPtr(); } + + /** \returns a const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline const Index* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); } + /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline Index* innerNonZeroPtr() { return derived().innerNonZeroPtr(); } + + /** \returns whether \c *this is in compressed form. */ + inline bool isCompressed() const { return innerNonZeroPtr()==0; } + +}; + +template +class SparseCompressedBase::InnerIterator +{ + public: + InnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_end = mat.outerIndexPtr()[outer+1]; + else + m_end = m_id + mat.innerNonZeroPtr()[outer]; + } + + inline InnerIterator& operator++() { m_id++; return *this; } + + inline const Scalar& value() const { return m_values[m_id]; } + inline Scalar& valueRef() { return const_cast(m_values[m_id]); } + + inline Index index() const { return m_indices[m_id]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? 
index() : m_outer; } + + inline operator bool() const { return (m_id < m_end); } + + protected: + const Scalar* m_values; + const Index* m_indices; + const Index m_outer; + Index m_id; + Index m_end; + private: + // If you get here, then you're not using the right InnerIterator type, e.g.: + // SparseMatrix A; + // SparseMatrix::InnerIterator it(A,0); + template InnerIterator(const SparseMatrixBase&,Index outer); +}; + +template +class SparseCompressedBase::ReverseInnerIterator +{ + public: + ReverseInnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_id = mat.outerIndexPtr()[outer+1]; + else + m_id = m_start + mat.innerNonZeroPtr()[outer]; + } + + inline ReverseInnerIterator& operator--() { --m_id; return *this; } + + inline const Scalar& value() const { return m_values[m_id-1]; } + inline Scalar& valueRef() { return const_cast(m_values[m_id-1]); } + + inline Index index() const { return m_indices[m_id-1]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? index() : m_outer; } + + inline operator bool() const { return (m_id > m_start); } + + protected: + const Scalar* m_values; + const Index* m_indices; + const Index m_outer; + Index m_id; + const Index m_start; +}; + +namespace internal { + +template +struct evaluator > + : evaluator_base +{ + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Index Index; + typedef typename Derived::InnerIterator InnerIterator; + typedef typename Derived::ReverseInnerIterator ReverseInnerIterator; + + enum { + CoeffReadCost = NumTraits::ReadCost, + Flags = Derived::Flags + }; + + evaluator() : m_matrix(0) {} + explicit evaluator(const Derived &mat) : m_matrix(&mat) {} + + operator Derived&() { return m_matrix->const_cast_derived(); } + operator const Derived&() const { return *m_matrix; } + + typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType; + Scalar coeff(Index row, Index col) const + { return m_matrix->coeff(row,col); } + + Scalar& coeffRef(Index row, Index col) + { + eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? col : row; + + Index start = m_matrix->outerIndexPtr()[outer]; + Index end = m_matrix->isCompressed() ? 
m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; + eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) + - m_matrix->innerIndexPtr(); + eigen_assert((pinnerIndexPtr()[p]==inner) && "written coefficient does not exist"); + return m_matrix->const_cast_derived().valuePtr()[p]; + } + + const Derived *m_matrix; +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_COMPRESSED_BASE_H diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 93677c786..74b4c6a9d 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -51,7 +51,7 @@ struct traits > ColsAtCompileTime = Dynamic, MaxRowsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic, - Flags = _Options | NestByRefBit | LvalueBit, + Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit, SupportedAccessPatterns = InnerRandomAccessPattern }; }; @@ -90,16 +90,20 @@ struct traits, DiagIndex> template class SparseMatrix - : public SparseMatrixBase > + : public SparseCompressedBase > { public: - EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) + typedef SparseCompressedBase Base; + using Base::isCompressed; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=) typedef MappedSparseMatrix Map; typedef Diagonal DiagonalReturnType; typedef Diagonal ConstDiagonalReturnType; + typedef typename Base::InnerIterator InnerIterator; + typedef typename Base::ReverseInnerIterator ReverseInnerIterator; using Base::IsRowMajor; @@ -123,9 +127,6 @@ class SparseMatrix public: - /** \returns whether \c *this is in compressed form. */ - inline bool isCompressed() const { return m_innerNonZeros==0; } - /** \returns the number of rows of the matrix */ inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } /** \returns the number of columns of the matrix */ @@ -241,9 +242,6 @@ class SparseMatrix public: - class InnerIterator; - class ReverseInnerIterator; - /** Removes all non zeros but keep allocated memory */ inline void setZero() { @@ -875,76 +873,6 @@ private: }; }; -template -class SparseMatrix::InnerIterator -{ - public: - InnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_end = mat.m_outerIndex[outer+1]; - else - m_end = m_id + mat.m_innerNonZeros[outer]; - } - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline const Scalar& value() const { return m_values[m_id]; } - inline Scalar& valueRef() { return const_cast(m_values[m_id]); } - - inline Index index() const { return m_indices[m_id]; } - inline Index outer() const { return m_outer; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id < m_end); } - - protected: - const Scalar* m_values; - const Index* m_indices; - const Index m_outer; - Index m_id; - Index m_end; - private: - // If you get here, then you're not using the right InnerIterator type, e.g.: - // SparseMatrix A; - // SparseMatrix::InnerIterator it(A,0); - template InnerIterator(const SparseMatrixBase&,Index outer); -}; - -template -class SparseMatrix::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_id = mat.m_outerIndex[outer+1]; - else - m_id = m_start + mat.m_innerNonZeros[outer]; - } - - inline ReverseInnerIterator& operator--() { --m_id; return *this; } - - inline const Scalar& value() const { return m_values[m_id-1]; } - inline Scalar& valueRef() { return const_cast(m_values[m_id-1]); } - - inline Index index() const { return m_indices[m_id-1]; } - inline Index outer() const { return m_outer; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id > m_start); } - - protected: - const Scalar* m_values; - const Index* m_indices; - const Index m_outer; - Index m_id; - const Index m_start; -}; namespace internal { @@ -1075,6 +1003,10 @@ EIGEN_DONT_INLINE SparseMatrix& SparseMatrix::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator::Flags & RowMajorBit); if (needToTranspose) { @@ -1277,45 +1209,12 @@ namespace internal { template struct evaluator > - : evaluator_base > + : evaluator > > { - typedef _Scalar Scalar; - typedef _Index Index; + typedef evaluator > > Base; typedef SparseMatrix<_Scalar,_Options,_Index> SparseMatrixType; - typedef typename SparseMatrixType::InnerIterator InnerIterator; - typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = SparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {} - - operator SparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const SparseMatrixType&() const { return *m_matrix; } - - typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType; - Scalar coeff(Index row, Index col) const - { return m_matrix->coeff(row,col); } - - Scalar& coeffRef(Index row, Index col) - { - eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); - - const Index outer = SparseMatrixType::IsRowMajor ? row : col; - const Index inner = SparseMatrixType::IsRowMajor ? col : row; - - Index start = m_matrix->outerIndexPtr()[outer]; - Index end = m_matrix->isCompressed() ? 
m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = m_matrix->data().searchLowerIndex(start,end-1,inner); - eigen_assert((pdata().index(p)==inner) && "written coefficient does not exist"); - return m_matrix->const_cast_derived().data().value(p); - } - - const SparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const SparseMatrixType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 8de227b88..ba4803646 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -43,8 +43,7 @@ EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) -#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \ - typedef BaseClass Base; \ +#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ typedef typename Eigen::internal::traits::Scalar Scalar; \ typedef typename Eigen::NumTraits::Real RealScalar; \ typedef typename Eigen::internal::nested::type Nested; \ @@ -59,7 +58,8 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) using Base::const_cast_derived; #define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ - _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase) + typedef Eigen::SparseMatrixBase Base; \ + _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) const int CoherentAccessPattern = 0x1; const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern; -- cgit v1.2.3 From 08081f829397de11651d8d779b9eed916ccc88d7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Feb 2015 22:02:14 +0100 Subject: Make SparseTranspose inherit SparseCompressBase when possible --- Eigen/src/SparseCore/SparseTranspose.h | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h index c3d2d1a16..37ce7b0d5 100644 --- a/Eigen/src/SparseCore/SparseTranspose.h +++ b/Eigen/src/SparseCore/SparseTranspose.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -12,13 +12,41 @@ namespace Eigen { +namespace internal { + template + class SparseTransposeImpl + : public SparseMatrixBase > + {}; + + template + class SparseTransposeImpl + : public SparseCompressedBase > + { + typedef SparseCompressedBase > Base; + public: + using Base::derived; + typedef typename Base::Scalar Scalar; + typedef typename Base::Index Index; + + inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); } + inline const Index* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); } + inline const Index* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); } + inline const Index* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); } + + inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); } + inline Index* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); } + inline Index* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); } + inline Index* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); } + }; +} + // Implement nonZeros() for transpose. I'm not sure that's the best approach for that. // Perhaps it should be implemented in Transpose<> itself. template class TransposeImpl - : public SparseMatrixBase > + : public internal::SparseTransposeImpl { protected: - typedef SparseMatrixBase > Base; + typedef internal::SparseTransposeImpl Base; public: inline typename MatrixType::Index nonZeros() const { return Base::derived().nestedExpression().nonZeros(); } }; -- cgit v1.2.3 From f3be317614b6b6709737d6ef7bea0ffe96c285d4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Feb 2015 22:03:25 +0100 Subject: Add a Map specialization. --- Eigen/SparseCore | 1 + Eigen/src/SparseCore/SparseMap.h | 211 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 Eigen/src/SparseCore/SparseMap.h (limited to 'Eigen') diff --git a/Eigen/SparseCore b/Eigen/SparseCore index e2071519b..91cdfd3f0 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -33,6 +33,7 @@ #include "src/SparseCore/AmbiVector.h" #include "src/SparseCore/SparseCompressedBase.h" #include "src/SparseCore/SparseMatrix.h" +#include "src/SparseCore/SparseMap.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" #include "src/SparseCore/SparseCwiseUnaryOp.h" diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h new file mode 100644 index 000000000..91e8f7480 --- /dev/null +++ b/Eigen/src/SparseCore/SparseMap.h @@ -0,0 +1,211 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_MAP_H +#define EIGEN_SPARSE_MAP_H + +namespace Eigen { + +template::has_write_access ? 
WriteAccessors : ReadOnlyAccessors +> class SparseMapBase; + +template +class SparseMapBase + : public SparseCompressedBase +{ + public: + typedef SparseCompressedBase Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::Index Index; + enum { IsRowMajor = Base::IsRowMajor }; + + protected: + + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar *, const Scalar *>::type ScalarPointer; + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Index *, const Index *>::type IndexPointer; + + Index m_outerSize; + Index m_innerSize; + Index m_nnz; + IndexPointer m_outerIndex; + IndexPointer m_innerIndices; + ScalarPointer m_values; + IndexPointer m_innerNonZeros; + + public: + + inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } + inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; } + inline Index innerSize() const { return m_innerSize; } + inline Index outerSize() const { return m_outerSize; } + + bool isCompressed() const { return m_innerNonZeros==0; } + + //---------------------------------------- + // direct access interface + inline const Scalar* valuePtr() const { return m_values; } + inline const Index* innerIndexPtr() const { return m_innerIndices; } + inline const Index* outerIndexPtr() const { return m_outerIndex; } + inline const Index* innerNonZeroPtr() const { return m_innerNonZeros; } + //---------------------------------------- + + inline Scalar coeff(Index row, Index col) const + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = m_outerIndex[outer]; + Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer]; + if (start==end) + return Scalar(0); + else if (end>0 && inner==m_innerIndices[end-1]) + return m_values[end-1]; + // ^^ optimization: let's first check if it is the last coefficient + // (very common in high level algorithms) + + const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); + const Index id = r-&m_innerIndices[0]; + return ((*r==inner) && (id +class SparseMapBase + : public SparseMapBase +{ + typedef MapBase ReadOnlyMapBase; + + public: + typedef SparseMapBase Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::Index Index; + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + //---------------------------------------- + // direct access interface + using Base::valuePtr; + using Base::innerIndexPtr; + using Base::outerIndexPtr; + using Base::innerNonZeroPtr; + inline Scalar* valuePtr() { return Base::m_values; } + inline Index* innerIndexPtr() { return Base::m_innerIndices; } + inline Index* outerIndexPtr() { return Base::m_outerIndex; } + inline Index* innerNonZeroPtr() { return Base::m_innerNonZeros; } + //---------------------------------------- + + inline Scalar& coeffRef(Index row, Index col) + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = Base::m_outerIndex[outer]; + Index end = Base::isCompressed() ? 
Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer]; + eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix"); + eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); + Index* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner); + const Index id = r - &Base::m_innerIndices[0]; + eigen_assert((*r==inner) && (id(Base::m_values)[id]; + } + + inline SparseMapBase(Index rows, Index cols, Index nnz, Index* outerIndexPtr, Index* innerIndexPtr, + Scalar* valuePtr, Index* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~SparseMapBase() {} +}; + +template +class Map, Options, StrideType> + : public SparseMapBase, Options, StrideType> > +{ + public: + typedef SparseMapBase > Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, Index* outerIndexPtr, + Index* innerIndexPtr, Scalar* valuePtr, Index* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +template +class Map, Options, StrideType> + : public SparseMapBase, Options, StrideType> > +{ + public: + typedef SparseMapBase > Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, const Index* outerIndexPtr, + const Index* innerIndexPtr, const Scalar* valuePtr, const Index* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +namespace internal { + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Map, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Map, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_MAP_H -- cgit v1.2.3 From f2ff8c091e02b4aab8c7568807c09e43f51d1156 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Feb 2015 22:04:18 +0100 Subject: Add a Ref specialization. 
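For context, a minimal sketch of the intended usage (illustration only, not part of this patch): with the new specialization, a function taking Ref<const SparseMatrix<double> > binds without copy to a SparseMatrix, a Map, or any other compatible compressed expression, and falls back to an internal temporary otherwise. The helper name sumOfValues is made up for the example:

    #include <Eigen/SparseCore>
    #include <iostream>

    typedef Eigen::SparseMatrix<double> SpMat;

    // Binds without copy to a SparseMatrix, a Map<SparseMatrix>, or any other
    // compressed expression with matching storage order; anything else is first
    // evaluated into a temporary held inside the Ref.
    double sumOfValues(const Eigen::Ref<const SpMat>& mat)
    {
      double s = 0;
      for (int k = 0; k < mat.outerSize(); ++k)
        for (Eigen::Ref<const SpMat>::InnerIterator it(mat, k); it; ++it)
          s += it.value();
      return s;
    }

    int main()
    {
      SpMat A(4,4);
      A.insert(0,0) = 1.0;
      A.insert(2,3) = 2.5;
      A.makeCompressed();

      std::cout << sumOfValues(A)       << "\n";   // no temporary
      std::cout << sumOfValues(2.0 * A) << "\n";   // one temporary
      return 0;
    }

This mirrors what the new sparse_ref unit test checks with VERIFY_EVALUATION_COUNT: zero temporaries when the argument is already a compressed matrix with matching storage order, one otherwise.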
--- Eigen/SparseCore | 1 + Eigen/src/SparseCore/SparseRef.h | 200 +++++++++++++++++++++++++++++++++++++++ test/CMakeLists.txt | 1 + test/sparse_ref.cpp | 96 +++++++++++++++++++ 4 files changed, 298 insertions(+) create mode 100644 Eigen/src/SparseCore/SparseRef.h create mode 100644 test/sparse_ref.cpp (limited to 'Eigen') diff --git a/Eigen/SparseCore b/Eigen/SparseCore index 91cdfd3f0..48ed967b8 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -36,6 +36,7 @@ #include "src/SparseCore/SparseMap.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" +#include "src/SparseCore/SparseRef.h" #include "src/SparseCore/SparseCwiseUnaryOp.h" #include "src/SparseCore/SparseCwiseBinaryOp.h" #include "src/SparseCore/SparseTranspose.h" diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h new file mode 100644 index 000000000..cfea6ce8a --- /dev/null +++ b/Eigen/src/SparseCore/SparseRef.h @@ -0,0 +1,200 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_REF_H +#define EIGEN_SPARSE_REF_H + +namespace Eigen { + +namespace internal { + +template class SparseRefBase; + +template +struct traits, _Options, _StrideType> > + : public traits > +{ + typedef SparseMatrix PlainObjectType; + enum { + Options = _Options, + Flags = traits >::Flags | CompressedAccessBit | NestByRefBit + }; + + template struct match { + enum { + StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch + }; + typedef typename internal::conditional::type type; + }; + +}; + +template +struct traits, _Options, _StrideType> > + : public traits, _Options, _StrideType> > +{ + enum { + Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit + }; +}; + +template +struct traits > : public traits {}; + +template class SparseRefBase +// : public MappedSparseMatrix + : public SparseMapBase +{ +// typedef typename internal::traits::PlainObjectType PlainObjectType; + +public: + + typedef SparseMapBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase) + + SparseRefBase() + : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0) + {} + + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(SparseRefBase) + +protected: + + + template + void construct(Expression& expr) + { + ::new (static_cast(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr()); + } +}; + +} // namespace internal + +template +class Ref, Options, StrideType > + : public internal::SparseRefBase, Options, StrideType > > +{ + typedef SparseMatrix PlainObjectType; + typedef internal::traits Traits; + template + inline Ref(const SparseMatrix& expr); + template + inline Ref(const MappedSparseMatrix& expr); + public: + + typedef internal::SparseRefBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + + #ifndef EIGEN_PARSED_BY_DOXYGEN + template + inline Ref(SparseMatrix& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match >::MatchAtCompileTime), 
STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template + inline Ref(MappedSparseMatrix& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template + inline Ref(const SparseCompressedBase& expr) + #else + template + inline Ref(SparseCompressedBase& expr) + #endif + { + EIGEN_STATIC_ASSERT(bool(internal::is_lvalue::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); + EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.const_cast_derived()); + } + + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref) + +}; + +// this is the const ref version +template +class Ref, Options, StrideType> + : public internal::SparseRefBase, Options, StrideType> > +{ + typedef SparseMatrix TPlainObjectType; + typedef internal::traits Traits; + public: + + typedef internal::SparseRefBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + template + inline Ref(const SparseMatrixBase& expr) + { + construct(expr.derived(), typename Traits::template match::type()); + } + + inline Ref(const Ref& other) : Base(other) { + // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy + } + + template + inline Ref(const RefBase& other) { + construct(other.derived(), typename Traits::template match::type()); + } + + protected: + + template + void construct(const Expression& expr,internal::true_type) + { + Base::construct(expr); + } + + template + void construct(const Expression& expr, internal::false_type) + { + m_object = expr; + Base::construct(m_object); + } + + protected: + TPlainObjectType m_object; +}; + + +namespace internal { + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Ref, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Ref, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_REF_H diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f57d8ce36..168749634 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -228,6 +228,7 @@ ei_add_test(stddeque) ei_add_test(sparse_basic) ei_add_test(sparse_vector) ei_add_test(sparse_product) +ei_add_test(sparse_ref) ei_add_test(sparse_solvers) ei_add_test(sparse_permutations) ei_add_test(simplicial_cholesky) diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp new file mode 100644 index 000000000..b261cecf3 --- /dev/null +++ b/test/sparse_ref.cpp @@ -0,0 +1,96 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 20015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +// This unit test cannot be easily written to work with EIGEN_DEFAULT_TO_ROW_MAJOR +#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR +#undef EIGEN_DEFAULT_TO_ROW_MAJOR +#endif + +static long int nb_temporaries; + +inline void on_temporary_creation() { + // here's a great place to set a breakpoint when debugging failures in this test! + nb_temporaries++; +} + +#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN { on_temporary_creation(); } + +#include "main.h" +#include + +#define VERIFY_EVALUATION_COUNT(XPR,N) {\ + nb_temporaries = 0; \ + XPR; \ + if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \ + VERIFY( (#XPR) && nb_temporaries==N ); \ + } + +template void check_const_correctness(const PlainObjectType&) +{ + // verify that ref-to-const don't have LvalueBit + typedef typename internal::add_const::type ConstPlainObjectType; + VERIFY( !(internal::traits >::Flags & LvalueBit) ); + VERIFY( !(internal::traits >::Flags & LvalueBit) ); + VERIFY( !(Ref::Flags & LvalueBit) ); + VERIFY( !(Ref::Flags & LvalueBit) ); +} + +template +EIGEN_DONT_INLINE void call_ref_1(Ref > a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); } + +template +EIGEN_DONT_INLINE void call_ref_2(const Ref >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); } + +void call_ref() +{ +// SparseVector > ca = VectorXcf::Random(10).sparseView(); +// SparseVector a = VectorXf::Random(10).sparseView(); + SparseMatrix A = MatrixXf::Random(10,10).sparseView(); + SparseMatrix B = MatrixXf::Random(10,10).sparseView(); + const SparseMatrix& Ac(A); + Block > Ab(A,0,1, 3,3); + const Block > Abc(A,0,1,3,3); + + + VERIFY_EVALUATION_COUNT( call_ref_1(A, A), 0); +// VERIFY_EVALUATION_COUNT( call_ref_1(Ac, Ac), 0); // does not compile on purpose + VERIFY_EVALUATION_COUNT( call_ref_2(A, A), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A.transpose(), A.transpose()), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(Ac,Ac), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A+A,2*Ac), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(B, B), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(B.transpose(), B.transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A*A, A*A), 1); + + Ref > Ar(A); + VERIFY_EVALUATION_COUNT( call_ref_1(Ar, Ar), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Ar, Ar), 0); + + Ref > Br(B); + VERIFY_EVALUATION_COUNT( call_ref_1(Br.transpose(), Br.transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Br, Br), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(Br.transpose(), Br.transpose()), 0); + + Ref > Arc(A); +// VERIFY_EVALUATION_COUNT( call_ref_1(Arc, Arc), 0); // does not compile on purpose + VERIFY_EVALUATION_COUNT( call_ref_2(Arc, Arc), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 1); // should be 0 + + VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)), 1); // should be 0 (allocate starts/nnz only) +} + +void test_sparse_ref() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( check_const_correctness(SparseMatrix()) ); + CALL_SUBTEST_1( check_const_correctness(SparseMatrix()) ); + CALL_SUBTEST_2( call_ref() ); + } +} -- cgit v1.2.3 From 3af29caae87b00ed8f002605573d227ad1b629a4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Feb 2015 10:23:45 +0100 Subject: Cleaning and add more unit tests for Ref and Map --- Eigen/src/SparseCore/MappedSparseMatrix.h | 172 +++--------------------------- Eigen/src/SparseCore/SparseMap.h | 34 +++++- Eigen/src/SparseCore/SparseRef.h | 12 +-- test/sparse_basic.cpp | 20 ++++ test/sparse_ref.cpp | 5 +- 5 files changed, 72 
insertions(+), 171 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCore/MappedSparseMatrix.h b/Eigen/src/SparseCore/MappedSparseMatrix.h index 2852c669a..533479fd0 100644 --- a/Eigen/src/SparseCore/MappedSparseMatrix.h +++ b/Eigen/src/SparseCore/MappedSparseMatrix.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008 Gael Guennebaud +// Copyright (C) 2008-2014 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,9 +10,10 @@ #ifndef EIGEN_MAPPED_SPARSEMATRIX_H #define EIGEN_MAPPED_SPARSEMATRIX_H -namespace Eigen { +namespace Eigen { -/** \class MappedSparseMatrix +/** \deprecated Use Map > + * \class MappedSparseMatrix * * \brief Sparse matrix * @@ -25,179 +26,38 @@ namespace internal { template struct traits > : traits > {}; -} +} // end namespace internal template class MappedSparseMatrix - : public SparseMatrixBase > + : public Map > { - public: - EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix) - enum { IsRowMajor = Base::IsRowMajor }; - - protected: - - Index m_outerSize; - Index m_innerSize; - Index m_nnz; - Index* m_outerIndex; - Index* m_innerIndices; - Scalar* m_values; + typedef Map > Base; public: - - inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } - inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; } - inline Index innerSize() const { return m_innerSize; } - inline Index outerSize() const { return m_outerSize; } - bool isCompressed() const { return true; } - - //---------------------------------------- - // direct access interface - inline const Scalar* valuePtr() const { return m_values; } - inline Scalar* valuePtr() { return m_values; } - - inline const Index* innerIndexPtr() const { return m_innerIndices; } - inline Index* innerIndexPtr() { return m_innerIndices; } - - inline const Index* outerIndexPtr() const { return m_outerIndex; } - inline Index* outerIndexPtr() { return m_outerIndex; } - //---------------------------------------- - - inline Scalar coeff(Index row, Index col) const - { - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? 
col : row; - - Index start = m_outerIndex[outer]; - Index end = m_outerIndex[outer+1]; - if (start==end) - return Scalar(0); - else if (end>0 && inner==m_innerIndices[end-1]) - return m_values[end-1]; - // ^^ optimization: let's first check if it is the last coefficient - // (very common in high level algorithms) + typedef typename Base::Index Index; + typedef typename Base::Scalar Scalar; - const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); - const Index id = r-&m_innerIndices[0]; - return ((*r==inner) && (id=start && "you probably called coeffRef on a non finalized matrix"); - eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); - Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner); - const Index id = r-&m_innerIndices[0]; - eigen_assert((*r==inner) && (id -class MappedSparseMatrix::InnerIterator -{ - public: - InnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(outer), - m_id(mat.outerIndexPtr()[outer]), - m_start(m_id), - m_end(mat.outerIndexPtr()[outer+1]) - {} - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id]; } - inline Scalar& valueRef() { return const_cast(m_matrix.valuePtr()[m_id]); } - - inline Index index() const { return m_matrix.innerIndexPtr()[m_id]; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const Index m_outer; - Index m_id; - const Index m_start; - const Index m_end; -}; - -template -class MappedSparseMatrix::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(outer), - m_id(mat.outerIndexPtr()[outer+1]), - m_start(mat.outerIndexPtr()[outer]), - m_end(m_id) - {} - - inline ReverseInnerIterator& operator--() { m_id--; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; } - inline Scalar& valueRef() { return const_cast(m_matrix.valuePtr()[m_id-1]); } - - inline Index index() const { return m_matrix.innerIndexPtr()[m_id-1]; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const Index m_outer; - Index m_id; - const Index m_start; - const Index m_end; -}; - namespace internal { template struct evaluator > - : evaluator_base > + : evaluator > > { - typedef MappedSparseMatrix<_Scalar,_Options,_Index> MappedSparseMatrixType; - typedef typename MappedSparseMatrixType::InnerIterator InnerIterator; - typedef typename MappedSparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = MappedSparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const MappedSparseMatrixType &mat) : m_matrix(&mat) {} - - operator MappedSparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const MappedSparseMatrixType&() const { return *m_matrix; } + typedef MappedSparseMatrix<_Scalar,_Options,_Index> XprType; + typedef evaluator > Base; - const MappedSparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index 91e8f7480..72dedb1ec 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -12,6 +12,32 @@ namespace Eigen { +namespace internal { + +template +struct traits, Options, StrideType> > + : public traits > +{ + typedef SparseMatrix PlainObjectType; + typedef traits TraitsBase; + enum { + Flags = TraitsBase::Flags & (~NestByRefBit) + }; +}; + +template +struct traits, Options, StrideType> > + : public traits > +{ + typedef SparseMatrix PlainObjectType; + typedef traits TraitsBase; + enum { + Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit)) + }; +}; + +} // end namespace internal + template::has_write_access ? 
WriteAccessors : ReadOnlyAccessors > class SparseMapBase; @@ -25,7 +51,7 @@ class SparseMapBase typedef typename Base::Scalar Scalar; typedef typename Base::Index Index; enum { IsRowMajor = Base::IsRowMajor }; - + using Base::operator=; protected: typedef typename internal::conditional< @@ -103,6 +129,8 @@ class SparseMapBase typedef typename Base::Scalar Scalar; typedef typename Base::Index Index; enum { IsRowMajor = Base::IsRowMajor }; + + using Base::operator=; public: @@ -147,7 +175,7 @@ class Map, Options, StrideType> : public SparseMapBase, Options, StrideType> > { public: - typedef SparseMapBase > Base; + typedef SparseMapBase Base; _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) enum { IsRowMajor = Base::IsRowMajor }; @@ -167,7 +195,7 @@ class Map, Options, StrideType : public SparseMapBase, Options, StrideType> > { public: - typedef SparseMapBase > Base; + typedef SparseMapBase Base; _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) enum { IsRowMajor = Base::IsRowMajor }; diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h index cfea6ce8a..2ca039323 100644 --- a/Eigen/src/SparseCore/SparseRef.h +++ b/Eigen/src/SparseCore/SparseRef.h @@ -23,7 +23,7 @@ struct traits, _Options, _Stride typedef SparseMatrix PlainObjectType; enum { Options = _Options, - Flags = traits >::Flags | CompressedAccessBit | NestByRefBit + Flags = traits >::Flags | CompressedAccessBit | NestByRefBit }; template struct match { @@ -41,7 +41,7 @@ struct traits, _Options, _ : public traits, _Options, _StrideType> > { enum { - Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit + Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit }; }; @@ -49,11 +49,8 @@ template struct traits > : public traits {}; template class SparseRefBase -// : public MappedSparseMatrix : public SparseMapBase { -// typedef typename internal::traits::PlainObjectType PlainObjectType; - public: typedef SparseMapBase Base; @@ -63,8 +60,6 @@ public: : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0) {} - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(SparseRefBase) - protected: @@ -119,9 +114,6 @@ class Ref, Options, StrideType > EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); Base::construct(expr.const_cast_derived()); } - - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref) - }; // this is the const ref version diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 097959c84..a19de296a 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -408,6 +408,26 @@ template void sparse_basic(const SparseMatrixType& re m.setFromTriplets(triplets.begin(), triplets.end()); VERIFY_IS_APPROX(m, refMat); } + + // test Map + { + DenseMatrix refMat2(rows, cols), refMat3(rows, cols); + SparseMatrixType m2(rows, cols), m3(rows, cols); + initSparse(density, refMat2, m2); + initSparse(density, refMat3, m3); + { + Map mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr()); + Map mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr()); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + } + { + MappedSparseMatrix mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr()); + MappedSparseMatrix mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), 
m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr()); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + } + } // test triangularView { diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp index b261cecf3..27700f827 100644 --- a/test/sparse_ref.cpp +++ b/test/sparse_ref.cpp @@ -69,8 +69,9 @@ void call_ref() VERIFY_EVALUATION_COUNT( call_ref_2(A*A, A*A), 1); Ref > Ar(A); - VERIFY_EVALUATION_COUNT( call_ref_1(Ar, Ar), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(Ar, Ar), 0); + VERIFY_IS_APPROX(Ar+Ar, A+A); + VERIFY_EVALUATION_COUNT( call_ref_1(Ar, A), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Ar, A), 0); Ref > Br(B); VERIFY_EVALUATION_COUNT( call_ref_1(Br.transpose(), Br.transpose()), 0); -- cgit v1.2.3 From d4ec48575e94ec469ef9fbf005a0a6e67571f528 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Feb 2015 11:14:36 +0100 Subject: Make Block inherit SparseCompressedBase in the case of an inner-panels and fix valuePtr() innerIndexPtr() --- Eigen/src/Core/Block.h | 2 +- Eigen/src/SparseCore/SparseBlock.h | 17 ++++++++++++----- test/sparse_ref.cpp | 8 +++++--- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 9cf9d5432..5f6307206 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -87,7 +87,7 @@ struct traits > : traits::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, - Flags = (traits::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit + Flags = (traits::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit // FIXME DirectAccessBit should not be handled by expressions }; }; diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 9e4da2057..6b8ade5aa 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -74,7 +74,7 @@ namespace internal { template class sparse_matrix_block_impl - : public SparseMatrixBase > + : public SparseCompressedBase > { typedef typename internal::remove_all::type _MatrixTypeNested; typedef Block BlockType; @@ -172,19 +172,24 @@ public: } inline const Scalar* valuePtr() const - { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.valuePtr(); } inline Scalar* valuePtr() - { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.const_cast_derived().valuePtr(); } inline const Index* innerIndexPtr() const - { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.innerIndexPtr(); } inline Index* innerIndexPtr() - { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.const_cast_derived().innerIndexPtr(); } inline const Index* outerIndexPtr() const { return m_matrix.outerIndexPtr() + m_outerStart; } inline Index* outerIndexPtr() { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; } + + inline const Index* innerNonZeroPtr() const + { return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); } + inline Index* innerNonZeroPtr() + { return isCompressed() ? 
0 : m_matrix.const_cast_derived().innerNonZeroPtr(); } Index nonZeros() const { @@ -196,6 +201,8 @@ public: else return Map >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum(); } + + bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; } const Scalar& lastCoeff() const { diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp index 27700f827..e7380ba21 100644 --- a/test/sparse_ref.cpp +++ b/test/sparse_ref.cpp @@ -51,8 +51,8 @@ void call_ref() { // SparseVector > ca = VectorXcf::Random(10).sparseView(); // SparseVector a = VectorXf::Random(10).sparseView(); - SparseMatrix A = MatrixXf::Random(10,10).sparseView(); - SparseMatrix B = MatrixXf::Random(10,10).sparseView(); + SparseMatrix A = MatrixXf::Random(10,10).sparseView(0.5,1); + SparseMatrix B = MatrixXf::Random(10,10).sparseView(0.5,1); const SparseMatrix& Ac(A); Block > Ab(A,0,1, 3,3); const Block > Abc(A,0,1,3,3); @@ -82,7 +82,9 @@ void call_ref() // VERIFY_EVALUATION_COUNT( call_ref_1(Arc, Arc), 0); // does not compile on purpose VERIFY_EVALUATION_COUNT( call_ref_2(Arc, Arc), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 1); // should be 0 + VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.col(2), A.col(2)), 0); VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)), 1); // should be 0 (allocate starts/nnz only) } -- cgit v1.2.3 From 87629cd6395433966977e868ec091c3fc754956c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Feb 2015 11:41:25 +0100 Subject: bug #897: makes iterative sparse solvers use a Ref instead of a SparseMatrix pointer. This fixes usage of iterative solvers with a Map. --- Eigen/src/IterativeLinearSolvers/BiCGSTAB.h | 2 +- .../src/IterativeLinearSolvers/ConjugateGradient.h | 2 +- .../IterativeLinearSolvers/IterativeSolverBase.h | 53 +++++++++++++++------- test/sparse_solver.h | 5 +- 4 files changed, 41 insertions(+), 21 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 224fe913f..a50680133 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -193,7 +193,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) + if(!internal::bicgstab(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index b5ef6d60f..3e024bda1 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -206,7 +206,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::conjugate_gradient(mp_matrix->template selfadjointView(), b.col(j), xj, + internal::conjugate_gradient(mp_matrix.template selfadjointView(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index f33c868bb..6f48075b4 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -37,7 +37,7 @@ public: /** Default constructor. 
*/ IterativeSolverBase() - : mp_matrix(0) + : m_dummy(0,0), mp_matrix(m_dummy) { init(); } @@ -52,10 +52,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - explicit IterativeSolverBase(const MatrixType& A) + template + explicit IterativeSolverBase(const SparseMatrixBase& A) { init(); - compute(A); + compute(A.derived()); } ~IterativeSolverBase() {} @@ -65,9 +66,11 @@ public: * Currently, this function mostly calls analyzePattern on the preconditioner. In the future * we might, for instance, implement column reordering for faster matrix vector products. */ - Derived& analyzePattern(const MatrixType& A) + template + Derived& analyzePattern(const SparseMatrixBase& A) { - m_preconditioner.analyzePattern(A); + grab(A); + m_preconditioner.analyzePattern(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_info = Success; @@ -83,11 +86,12 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - Derived& factorize(const MatrixType& A) + template + Derived& factorize(const SparseMatrixBase& A) { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); - mp_matrix = &A; - m_preconditioner.factorize(A); + grab(A); + m_preconditioner.factorize(mp_matrix); m_factorizationIsOk = true; m_info = Success; return derived(); @@ -103,10 +107,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - Derived& compute(const MatrixType& A) + template + Derived& compute(const SparseMatrixBase& A) { - mp_matrix = &A; - m_preconditioner.compute(A); + grab(A); + m_preconditioner.compute(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_factorizationIsOk = true; @@ -115,9 +120,9 @@ public: } /** \internal */ - Index rows() const { return mp_matrix ? mp_matrix->rows() : 0; } + Index rows() const { return mp_matrix.rows(); } /** \internal */ - Index cols() const { return mp_matrix ? mp_matrix->cols() : 0; } + Index cols() const { return mp_matrix.cols(); } /** \returns the tolerance threshold used by the stopping criteria */ RealScalar tolerance() const { return m_tolerance; } @@ -135,13 +140,18 @@ public: /** \returns a read-only reference to the preconditioner. */ const Preconditioner& preconditioner() const { return m_preconditioner; } - /** \returns the max number of iterations */ + /** \returns the max number of iterations. + * It is either the value setted by setMaxIterations or, by default, + * twice the number of columns of the matrix. + */ int maxIterations() const { - return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations; + return (m_maxIterations<0) ? 2*mp_matrix.cols() : m_maxIterations; } - /** Sets the max number of iterations */ + /** Sets the max number of iterations. + * Default is twice the number of columns of the matrix. 
+ */ Derived& setMaxIterations(int maxIters) { m_maxIterations = maxIters; @@ -210,7 +220,16 @@ protected: m_maxIterations = -1; m_tolerance = NumTraits::epsilon(); } - const MatrixType* mp_matrix; + + template + void grab(const SparseMatrixBase &A) + { + mp_matrix.~Ref(); + ::new (&mp_matrix) Ref(A); + } + + MatrixType m_dummy; + Ref mp_matrix; Preconditioner m_preconditioner; int m_maxIterations; diff --git a/test/sparse_solver.h b/test/sparse_solver.h index ee350d561..887ff02a9 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -32,7 +32,7 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, x = solver.solve(b); if (solver.info() != Success) { - std::cerr << "sparse solver testing: solving failed\n"; + std::cerr << "sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n"; return; } VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!"); @@ -75,7 +75,8 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, xm = solver.solve(bm); if (solver.info() != Success) { - std::cerr << "sparse solver testing: solving failed\n"; + std::cerr << "sparse solver testing: solving with a Map failed\n"; + exit(0); return; } VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!"); -- cgit v1.2.3 From c6e8caf0900ae303e9e7399bed00af705015ff17 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 10 Feb 2015 18:57:41 +0100 Subject: Allows Lower|Upper as a template argument of CG and MINRES: in this case the full matrix will be considered. --- Eigen/src/IterativeLinearSolvers/ConjugateGradient.h | 11 +++++++---- test/conjugate_gradient.cpp | 6 ++++-- test/sparse_solver.h | 5 ++++- unsupported/Eigen/src/IterativeSolvers/MINRES.h | 7 ++++++- unsupported/test/minres.cpp | 2 ++ 5 files changed, 23 insertions(+), 8 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 3e024bda1..4857dd9e9 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -113,8 +113,8 @@ struct traits > * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse. * * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. + * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower, + * Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower. * \tparam _Preconditioner the type of the preconditioner. 
Default is DiagonalPreconditioner * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() @@ -197,6 +197,10 @@ public: template void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef typename internal::conditional&, + SparseSelfAdjointView, UpLo> + >::type MatrixWrapperType; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -206,8 +210,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::conjugate_gradient(mp_matrix.template selfadjointView(), b.col(j), xj, - Base::m_preconditioner, m_iterations, m_error); + internal::conjugate_gradient(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } m_isInitialized = true; diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp index 869051b31..019cc4d64 100644 --- a/test/conjugate_gradient.cpp +++ b/test/conjugate_gradient.cpp @@ -12,13 +12,15 @@ template void test_conjugate_gradient_T() { - ConjugateGradient, Lower> cg_colmajor_lower_diag; - ConjugateGradient, Upper> cg_colmajor_upper_diag; + ConjugateGradient, Lower > cg_colmajor_lower_diag; + ConjugateGradient, Upper > cg_colmajor_upper_diag; + ConjugateGradient, Lower|Upper> cg_colmajor_loup_diag; ConjugateGradient, Lower, IdentityPreconditioner> cg_colmajor_lower_I; ConjugateGradient, Upper, IdentityPreconditioner> cg_colmajor_upper_I; CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_loup_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_I) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_I) ); } diff --git a/test/sparse_solver.h b/test/sparse_solver.h index 887ff02a9..6cf99e0b0 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -195,7 +195,10 @@ int generate_sparse_spd_problem(Solver& , typename Solver::MatrixType& A, typena dA = dM * dM.adjoint(); halfA.resize(size,size); - halfA.template selfadjointView().rankUpdate(M); + if(Solver::UpLo==(Lower|Upper)) + halfA = A; + else + halfA.template selfadjointView().rankUpdate(M); return size; } diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index a34902001..65cffc255 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -250,6 +250,11 @@ namespace Eigen { template void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef typename internal::conditional&, + SparseSelfAdjointView, UpLo> + >::type MatrixWrapperType; + m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -259,7 +264,7 @@ namespace Eigen { m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::minres(mp_matrix.template selfadjointView(), b.col(j), xj, + internal::minres(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp index 81b762c37..f6e526bbd 100644 --- a/unsupported/test/minres.cpp +++ b/unsupported/test/minres.cpp @@ -21,6 +21,7 @@ template void test_minres_T() // Diagonal preconditioner MINRES, Lower, DiagonalPreconditioner > minres_colmajor_lower_diag; MINRES, Upper, DiagonalPreconditioner > minres_colmajor_upper_diag; + MINRES, Upper, DiagonalPreconditioner > minres_colmajor_uplo_diag; // call tests for SPD matrix CALL_SUBTEST( 
check_sparse_spd_solving(minres_colmajor_lower_I) ); @@ -28,6 +29,7 @@ template void test_minres_T() CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_uplo_diag) ); // TO DO: symmetric semi-definite matrix // TO DO: symmetric indefinite matrix -- cgit v1.2.3 From c3f3580b8f7e4af89f4c7cdbe036ac1cac750128 Mon Sep 17 00:00:00 2001 From: Jan Blechta Date: Tue, 10 Feb 2015 14:24:39 +0100 Subject: Fix bug #733: step by step solving is not a good example for solveWithGuess --- Eigen/src/IterativeLinearSolvers/BiCGSTAB.h | 6 +----- Eigen/src/IterativeLinearSolvers/ConjugateGradient.h | 15 +-------------- unsupported/Eigen/src/IterativeSolvers/GMRES.h | 17 ++--------------- unsupported/Eigen/src/IterativeSolvers/MINRES.h | 15 +-------------- 4 files changed, 5 insertions(+), 48 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index a50680133..31a43cb56 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -139,11 +139,7 @@ struct traits > * \include BiCGSTAB_simple.cpp * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * \include BiCGSTAB_step_by_step.cpp - * Note that such a step by step execution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 4857dd9e9..1e819fc9f 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -137,20 +137,7 @@ struct traits > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * cg.setMaxIterations(1); - * int i = 0; - * do { - * x = cg.solveWithGuess(b,x); - * std::cout << i << " : " << cg.error() << std::endl; - * ++i; - * } while (cg.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h index 39610c074..30f82b52e 100644 --- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h @@ -250,21 +250,8 @@ struct traits > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. 
Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * solver.setMaxIterations(1); - * int i = 0; - * do { - * x = solver.solveWithGuess(b,x); - * std::cout << i << " : " << solver.error() << std::endl; - * ++i; - * } while (solver.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. - * + * One can control the start using the solveWithGuess() method. + * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, typename _Preconditioner> diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index 93a83e5b7..c4d969a72 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -189,20 +189,7 @@ namespace Eigen { * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * mr.setMaxIterations(1); - * int i = 0; - * do { - * x = mr.solveWithGuess(b,x); - * std::cout << i << " : " << mr.error() << std::endl; - * ++i; - * } while (mr.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ -- cgit v1.2.3 From cc5d7ff5238da45ef7416ec94f18227486ed9643 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 14:02:38 -0800 Subject: Added vectorized implementation of the exponential function for ARM/NEON --- Eigen/Core | 1 + Eigen/src/Core/arch/NEON/MathFunctions.h | 91 ++++++++++++++++++++++++++++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 Eigen/src/Core/arch/NEON/MathFunctions.h (limited to 'Eigen') diff --git a/Eigen/Core b/Eigen/Core index b5af63623..416e901ae 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -300,6 +300,7 @@ using std::ptrdiff_t; #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" + #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" #endif diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h new file mode 100644 index 000000000..6bb05bb92 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -0,0 +1,91 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_NEON_H +#define EIGEN_MATH_FUNCTIONS_NEON_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4f tmp, fx; + + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + + x = vminq_f32(x, p4f_exp_hi); + x = vmaxq_f32(x, p4f_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + Packet4ui mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, p4f_cephes_exp_C1); + Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); + z = vmulq_f32(x, x); + y = vaddq_f32(y, p4f_cephes_exp_p1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, p4f_1); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, p4i_0x7f); + mm = vshlq_n_s32(mm, 23); + Packet4f pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_NEON_H diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9afd86bec..559682cf7 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -88,7 +88,7 @@ template<> struct packet_traits : default_packet_traits HasSin = 0, HasCos = 0, HasLog = 0, - HasExp = 0, + HasExp = 1, HasSqrt = 0 }; }; -- cgit v1.2.3 From fe25f3b8e3ee86d808b5f8f75f789cee8dffd581 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 10 Feb 2015 23:11:35 +0100 Subject: FMA has been wrongly disabled --- Eigen/src/Core/arch/AVX/PacketMath.h | 2 +- Eigen/src/Core/arch/SSE/PacketMath.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 485bac10b..fc2e78b2d 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -133,7 +133,7 @@ template<> EIGEN_STRONG_INLINE 
Packet8i pdiv(const Packet8i& /*a*/, co return pset1(0); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3f6fb0254..63a859699 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -227,7 +227,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } #endif -- cgit v1.2.3 From f669f5656ab550010c5dd92ce2da7d3fab07babd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 14:29:47 -0800 Subject: Marked a few functions as EIGEN_DEVICE_FUNC to enable the use of tensors in cuda kernels. --- Eigen/src/Core/util/Memory.h | 48 +++++++++++----------- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 4 +- 4 files changed, 29 insertions(+), 29 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index bacf236fb..16f8cc1b0 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = *** Implementation of generic aligned realloc (when no realloc can be used)*** *****************************************************************************/ -void* aligned_malloc(std::size_t size); -void aligned_free(void *ptr); +EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size); +EIGEN_DEVICE_FUNC void aligned_free(void *ptr); /** \internal * \brief Reallocates aligned memory. 
@@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size) *****************************************************************************/ #ifdef EIGEN_NO_MALLOC -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)"); } #elif defined EIGEN_RUNTIME_NO_MALLOC -inline bool is_malloc_allowed_impl(bool update, bool new_value = false) +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false) { static bool value = true; if (update == 1) value = new_value; return value; } -inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } -inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } +EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } #else -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. */ -inline void* aligned_malloc(size_t size) +EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) { check_that_malloc_is_allowed(); @@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size) } /** \internal Frees memory allocated with aligned_malloc. */ -inline void aligned_free(void *ptr) +EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if !EIGEN_ALIGN std::free(ptr); @@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown. */ -template inline void* conditional_aligned_malloc(size_t size) +template EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) { return aligned_malloc(size); } -template<> inline void* conditional_aligned_malloc(size_t size) +template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) { check_that_malloc_is_allowed(); @@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc(size_t size) } /** \internal Frees memory allocated with conditional_aligned_malloc */ -template inline void conditional_aligned_free(void *ptr) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { aligned_free(ptr); } -template<> inline void conditional_aligned_free(void *ptr) +template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { std::free(ptr); } @@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc(void* ptr, size_t new /** \internal Destructs the elements of an array. * The \a size parameters tells on how many objects to call the destructor of T. 
*/ -template inline void destruct_elements_of_array(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size) { // always destruct an array starting from the end. if(ptr) @@ -351,7 +351,7 @@ template inline void destruct_elements_of_array(T *ptr, size_t size) /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template inline T* construct_elements_of_array(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size) { size_t i; EIGEN_TRY @@ -371,7 +371,7 @@ template inline T* construct_elements_of_array(T *ptr, size_t size) *****************************************************************************/ template -EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) { if(size > size_t(-1) / sizeof(T)) throw_std_bad_alloc(); @@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown. * The default constructor of T is called. */ -template inline T* aligned_new(size_t size) +template EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size) { check_size_for_overflow(size); T *result = reinterpret_cast(aligned_malloc(sizeof(T)*size)); @@ -396,7 +396,7 @@ template inline T* aligned_new(size_t size) } } -template inline T* conditional_aligned_new(size_t size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size) { check_size_for_overflow(size); T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); @@ -414,7 +414,7 @@ template inline T* conditional_aligned_new(size_t size) /** \internal Deletes objects constructed with aligned_new * The \a size parameters tells on how many objects to call the destructor of T. */ -template inline void aligned_delete(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size) { destruct_elements_of_array(ptr, size); aligned_free(ptr); @@ -423,13 +423,13 @@ template inline void aligned_delete(T *ptr, size_t size) /** \internal Deletes objects constructed with conditional_aligned_new * The \a size parameters tells on how many objects to call the destructor of T. */ -template inline void conditional_aligned_delete(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size) { destruct_elements_of_array(ptr, size); conditional_aligned_free(ptr); } -template inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) { check_size_for_overflow(new_size); check_size_for_overflow(old_size); @@ -452,7 +452,7 @@ template inline T* conditional_aligned_realloc_new(T* pt } -template inline T* conditional_aligned_new_auto(size_t size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size) { if(size==0) return 0; // short-cut. 
Also fixes Bug 884 @@ -495,7 +495,7 @@ template inline T* conditional_aligned_realloc_new_auto( return result; } -template inline void conditional_aligned_delete_auto(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size) { if(NumTraits::RequireInitialization) destruct_elements_of_array(ptr, size); diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 0e8a4b8d6..037219f23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -350,7 +350,7 @@ class Tensor : public TensorBase > } #endif - void resize(const array& dimensions) + EIGEN_DEVICE_FUNC void resize(const array& dimensions) { std::size_t i; Index size = Index(1); @@ -367,7 +367,7 @@ class Tensor : public TensorBase > #endif } - void resize(const DSizes& dimensions) { + EIGEN_DEVICE_FUNC void resize(const DSizes& dimensions) { array dims; for (std::size_t i = 0; i < NumIndices; ++i) { dims[i] = dimensions[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index d81197e6d..2ad52b2f9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -275,7 +275,7 @@ struct DSizes : array { } #endif - DSizes& operator = (const array& other) { + EIGEN_DEVICE_FUNC DSizes& operator = (const array& other) { *static_cast(this) = other; return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index dfe85602a..1b227e8c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -112,9 +112,9 @@ class TensorStorage& dimensions() const {return m_dimensions;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& dimensions() const {return m_dimensions;} - void resize(DenseIndex size, const array& nbDimensions) + EIGEN_DEVICE_FUNC void resize(DenseIndex size, const array& nbDimensions) { const DenseIndex currentSz = internal::array_prod(m_dimensions); if(size != currentSz) -- cgit v1.2.3 From 409547a0c83604b6dea70b8523674ac19e2af958 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 12 Feb 2015 21:04:31 +0100 Subject: update EIGEN_FAST_MATH documentation --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 13f8fdd4e..27167d3dc 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -385,7 +385,7 @@ /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. * They currently include: - * - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled. + * - single precision ArrayBase::sin() and ArrayBase::cos() when SSE vectorization is enabled. */ #ifndef EIGEN_FAST_MATH #define EIGEN_FAST_MATH 1 -- cgit v1.2.3
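
A minimal sketch of what the bug #897 change above enables, assuming this revision of Eigen: because IterativeSolverBase now stores a Ref<const SparseMatrix> and its analyzePattern()/factorize()/compute() members are templated on SparseMatrixBase, an iterative solver can be fed a Map of externally owned sparse buffers directly. The problem size, the identity matrix, and the main() wrapper below are placeholders chosen only to make the snippet self-contained; the Map constructor and the solver calls are the ones exercised by the diffs above.

    #include <Eigen/Sparse>
    #include <Eigen/IterativeLinearSolvers>
    using namespace Eigen;

    int main()
    {
      const int n = 100;
      SparseMatrix<double> A(n, n);
      A.setIdentity();                       // placeholder system, trivially solvable
      VectorXd b = VectorXd::Random(n);

      // View A's own buffers through a Map instead of handing over the SparseMatrix itself.
      Map<SparseMatrix<double> > Amap(A.rows(), A.cols(), A.nonZeros(),
                                      A.outerIndexPtr(), A.innerIndexPtr(),
                                      A.valuePtr(), A.innerNonZeroPtr());

      BiCGSTAB<SparseMatrix<double> > solver;
      solver.compute(Amap);                  // previously required a SparseMatrix; now any SparseMatrixBase
      VectorXd x = solver.solve(b);
      return 0;
    }

Since the solver now keeps only a reference to the matrix, the mapped storage (here, A) must stay alive and unchanged between compute() and any subsequent solve() calls.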
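
Likewise, a hedged sketch of the new Lower|Upper template argument for ConjugateGradient introduced above, together with the solveWithGuess() entry point that the revised documentation now points to instead of the step-by-step loop. The helper name, tolerance, and iteration count are illustration values, not part of the library.

    #include <Eigen/Sparse>
    #include <Eigen/IterativeLinearSolvers>
    using namespace Eigen;

    VectorXd solve_spd(const SparseMatrix<double>& A, const VectorXd& b, const VectorXd& guess)
    {
      // With Lower|Upper the solver uses the full matrix A as-is, instead of
      // reconstructing it from a single triangle through a selfadjoint view.
      ConjugateGradient<SparseMatrix<double>, Lower|Upper> cg;
      cg.setMaxIterations(200);             // arbitrary illustration values
      cg.setTolerance(1e-10);
      cg.compute(A);                        // A must store both triangles of the SPD matrix
      return cg.solveWithGuess(b, guess);   // seeds the iterations instead of starting from x = 0
    }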
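
Finally, a small sketch of what the NEON pexp kernel added above accelerates, assuming a build with EIGEN_VECTORIZE_NEON: since HasExp is switched to 1 for Packet4f, the coefficient-wise exponential of float arrays is evaluated four entries at a time with no change to user code. The function name and the max-shift overflow guard are illustrative only.

    #include <Eigen/Core>
    using namespace Eigen;

    ArrayXf stable_exp(const ArrayXf& x)
    {
      // .exp() dispatches to internal::pexp<Packet4f> when NEON vectorization is
      // enabled; shifting by the maximum coefficient is a common overflow guard.
      return (x - x.maxCoeff()).exp();
    }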