diff options
Diffstat (limited to 'Eigen/src/LU')
-rw-r--r-- | Eigen/src/LU/CMakeLists.txt | 8 | ||||
-rw-r--r-- | Eigen/src/LU/FullPivLU.h | 63 | ||||
-rw-r--r-- | Eigen/src/LU/InverseImpl.h | 6 | ||||
-rw-r--r-- | Eigen/src/LU/PartialPivLU.h | 77 | ||||
-rw-r--r-- | Eigen/src/LU/PartialPivLU_LAPACKE.h (renamed from Eigen/src/LU/PartialPivLU_MKL.h) | 26 | ||||
-rw-r--r-- | Eigen/src/LU/arch/CMakeLists.txt | 6 | ||||
-rw-r--r-- | Eigen/src/LU/arch/Inverse_SSE.h | 28 |
7 files changed, 128 insertions, 86 deletions
diff --git a/Eigen/src/LU/CMakeLists.txt b/Eigen/src/LU/CMakeLists.txt deleted file mode 100644 index e0d8d78c1..000000000 --- a/Eigen/src/LU/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -FILE(GLOB Eigen_LU_SRCS "*.h") - -INSTALL(FILES - ${Eigen_LU_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU COMPONENT Devel - ) - -ADD_SUBDIRECTORY(arch) diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 64b9eb7f1..03b6af706 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -52,6 +52,8 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> > * \include class_FullPivLU.cpp * Output: \verbinclude class_FullPivLU.out * + * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. + * * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse() */ template<typename _MatrixType> class FullPivLU @@ -97,6 +99,15 @@ template<typename _MatrixType> class FullPivLU template<typename InputType> explicit FullPivLU(const EigenBase<InputType>& matrix); + /** \brief Constructs a LU factorization from a given matrix + * + * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref. + * + * \sa FullPivLU(const EigenBase&) + */ + template<typename InputType> + explicit FullPivLU(EigenBase<InputType>& matrix); + /** Computes the LU decomposition of the given matrix. * * \param matrix the matrix of which to compute the LU decomposition. @@ -105,7 +116,11 @@ template<typename _MatrixType> class FullPivLU * \returns a reference to *this */ template<typename InputType> - FullPivLU& compute(const EigenBase<InputType>& matrix); + FullPivLU& compute(const EigenBase<InputType>& matrix) { + m_lu = matrix.derived(); + computeInPlace(); + return *this; + } /** \returns the LU decomposition matrix: the upper-triangular part is U, the * unit-lower-triangular part is L (at least for square matrices; in the non-square @@ -141,7 +156,7 @@ template<typename _MatrixType> class FullPivLU * * \sa permutationQ() */ - inline const PermutationPType& permutationP() const + EIGEN_DEVICE_FUNC inline const PermutationPType& permutationP() const { eigen_assert(m_isInitialized && "LU is not initialized."); return m_p; @@ -391,8 +406,8 @@ template<typename _MatrixType> class FullPivLU MatrixType reconstructedMatrix() const; - inline Index rows() const { return m_lu.rows(); } - inline Index cols() const { return m_lu.cols(); } + EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); } + EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> @@ -418,9 +433,10 @@ template<typename _MatrixType> class FullPivLU PermutationQType m_q; IntColVectorType m_rowsTranspositions; IntRowVectorType m_colsTranspositions; - Index m_det_pq, m_nonzero_pivots; + Index m_nonzero_pivots; RealScalar m_l1_norm; RealScalar m_maxpivot, m_prescribedThreshold; + signed char m_det_pq; bool m_isInitialized, m_usePrescribedThreshold; }; @@ -458,25 +474,28 @@ FullPivLU<MatrixType>::FullPivLU(const EigenBase<InputType>& matrix) template<typename MatrixType> template<typename InputType> -FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const EigenBase<InputType>& matrix) +FullPivLU<MatrixType>::FullPivLU(EigenBase<InputType>& matrix) + : m_lu(matrix.derived()), + m_p(matrix.rows()), + m_q(matrix.cols()), + m_rowsTranspositions(matrix.rows()), + m_colsTranspositions(matrix.cols()), + m_isInitialized(false), + m_usePrescribedThreshold(false) { - check_template_parameters(); - - // the permutations are stored as int indices, so just to be sure: - eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest()); - - m_lu = matrix.derived(); - m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); - computeInPlace(); - - m_isInitialized = true; - return *this; } template<typename MatrixType> void FullPivLU<MatrixType>::computeInPlace() { + check_template_parameters(); + + // the permutations are stored as int indices, so just to be sure: + eigen_assert(m_lu.rows()<=NumTraits<int>::highest() && m_lu.cols()<=NumTraits<int>::highest()); + + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + const Index size = m_lu.diagonalSize(); const Index rows = m_lu.rows(); const Index cols = m_lu.cols(); @@ -556,6 +575,8 @@ void FullPivLU<MatrixType>::computeInPlace() m_q.applyTranspositionOnTheRight(k, m_colsTranspositions.coeff(k)); m_det_pq = (number_of_transpositions%2) ? -1 : 1; + + m_isInitialized = true; } template<typename MatrixType> @@ -838,12 +859,12 @@ namespace internal { /***** Implementation of inverse() *****************************************************/ -template<typename DstXprType, typename MatrixType, typename Scalar> -struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar> +template<typename DstXprType, typename MatrixType> +struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivLU<MatrixType>::Scalar>, Dense2Dense> { typedef FullPivLU<MatrixType> LuType; typedef Inverse<LuType> SrcXprType; - static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &) + static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename MatrixType::Scalar> &) { dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols())); } @@ -858,14 +879,12 @@ struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_ * * \sa class FullPivLU */ -#ifndef __CUDACC__ template<typename Derived> inline const FullPivLU<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::fullPivLu() const { return FullPivLU<PlainObject>(eval()); } -#endif } // end namespace Eigen diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h index e202a55cb..3134632e1 100644 --- a/Eigen/src/LU/InverseImpl.h +++ b/Eigen/src/LU/InverseImpl.h @@ -286,11 +286,11 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4> namespace internal { // Specialization for "dense = dense_xpr.inverse()" -template<typename DstXprType, typename XprType, typename Scalar> -struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<Scalar>, Dense2Dense, Scalar> +template<typename DstXprType, typename XprType> +struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense> { typedef Inverse<XprType> SrcXprType; - static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &) + static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &) { // FIXME shall we resize dst here? const int Size = EIGEN_PLAIN_ENUM_MIN(XprType::ColsAtCompileTime,DstXprType::ColsAtCompileTime); diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 2e6d91939..d43961887 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -26,6 +26,17 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> > }; }; +template<typename T,typename Derived> +struct enable_if_ref; +// { +// typedef Derived type; +// }; + +template<typename T,typename Derived> +struct enable_if_ref<Ref<T>,Derived> { + typedef Derived type; +}; + } // end namespace internal /** \ingroup LU_Module @@ -57,6 +68,8 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> > * * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP(). * + * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. + * * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU */ template<typename _MatrixType> class PartialPivLU @@ -102,8 +115,22 @@ template<typename _MatrixType> class PartialPivLU template<typename InputType> explicit PartialPivLU(const EigenBase<InputType>& matrix); + /** Constructor for \link InplaceDecomposition inplace decomposition \endlink + * + * \param matrix the matrix of which to compute the LU decomposition. + * + * \warning The matrix should have full rank (e.g. if it's square, it should be invertible). + * If you need to deal with non-full rank, use class FullPivLU instead. + */ template<typename InputType> - PartialPivLU& compute(const EigenBase<InputType>& matrix); + explicit PartialPivLU(EigenBase<InputType>& matrix); + + template<typename InputType> + PartialPivLU& compute(const EigenBase<InputType>& matrix) { + m_lu = matrix.derived(); + compute(); + return *this; + } /** \returns the LU decomposition matrix: the upper-triangular part is U, the * unit-lower-triangular part is L (at least for square matrices; in the non-square @@ -251,11 +278,13 @@ template<typename _MatrixType> class PartialPivLU EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } + void compute(); + MatrixType m_lu; PermutationType m_p; TranspositionType m_rowsTranspositions; - Index m_det_p; RealScalar m_l1_norm; + signed char m_det_p; bool m_isInitialized; }; @@ -264,8 +293,8 @@ PartialPivLU<MatrixType>::PartialPivLU() : m_lu(), m_p(), m_rowsTranspositions(), - m_det_p(0), m_l1_norm(0), + m_det_p(0), m_isInitialized(false) { } @@ -275,8 +304,8 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size) : m_lu(size, size), m_p(size), m_rowsTranspositions(size), - m_det_p(0), m_l1_norm(0), + m_det_p(0), m_isInitialized(false) { } @@ -284,16 +313,29 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size) template<typename MatrixType> template<typename InputType> PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix) - : m_lu(matrix.rows(), matrix.rows()), + : m_lu(matrix.rows(),matrix.cols()), m_p(matrix.rows()), m_rowsTranspositions(matrix.rows()), - m_det_p(0), m_l1_norm(0), + m_det_p(0), m_isInitialized(false) { compute(matrix.derived()); } +template<typename MatrixType> +template<typename InputType> +PartialPivLU<MatrixType>::PartialPivLU(EigenBase<InputType>& matrix) + : m_lu(matrix.derived()), + m_p(matrix.rows()), + m_rowsTranspositions(matrix.rows()), + m_l1_norm(0), + m_det_p(0), + m_isInitialized(false) +{ + compute(); +} + namespace internal { /** \internal This is the blocked version of fullpivlu_unblocked() */ @@ -434,7 +476,7 @@ struct partial_lu_impl // update permutations and apply them to A_0 for(Index i=k; i<k+bs; ++i) { - Index piv = (row_transpositions[i] += k); + Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k)); A_0.row(i).swap(A_0.row(piv)); } @@ -470,19 +512,17 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t } // end namespace internal template<typename MatrixType> -template<typename InputType> -PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<InputType>& matrix) +void PartialPivLU<MatrixType>::compute() { check_template_parameters(); // the row permutation is stored as int indices, so just to be sure: - eigen_assert(matrix.rows()<NumTraits<int>::highest()); + eigen_assert(m_lu.rows()<NumTraits<int>::highest()); - m_lu = matrix.derived(); m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); - eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); - const Index size = matrix.rows(); + eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); + const Index size = m_lu.rows(); m_rowsTranspositions.resize(size); @@ -493,7 +533,6 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu m_p = m_rowsTranspositions; m_isInitialized = true; - return *this; } template<typename MatrixType> @@ -525,12 +564,12 @@ MatrixType PartialPivLU<MatrixType>::reconstructedMatrix() const namespace internal { /***** Implementation of inverse() *****************************************************/ -template<typename DstXprType, typename MatrixType, typename Scalar> -struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar> +template<typename DstXprType, typename MatrixType> +struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename PartialPivLU<MatrixType>::Scalar>, Dense2Dense> { typedef PartialPivLU<MatrixType> LuType; typedef Inverse<LuType> SrcXprType; - static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &) + static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename LuType::Scalar> &) { dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols())); } @@ -545,14 +584,12 @@ struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assi * * \sa class PartialPivLU */ -#ifndef __CUDACC__ template<typename Derived> inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::partialPivLu() const { return PartialPivLU<PlainObject>(eval()); } -#endif /** \lu_module * @@ -562,14 +599,12 @@ MatrixBase<Derived>::partialPivLu() const * * \sa class PartialPivLU */ -#ifndef __CUDACC__ template<typename Derived> inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::lu() const { return PartialPivLU<PlainObject>(eval()); } -#endif } // end namespace Eigen diff --git a/Eigen/src/LU/PartialPivLU_MKL.h b/Eigen/src/LU/PartialPivLU_LAPACKE.h index 9035953c8..755168a94 100644 --- a/Eigen/src/LU/PartialPivLU_MKL.h +++ b/Eigen/src/LU/PartialPivLU_LAPACKE.h @@ -25,7 +25,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to LAPACKe * LU decomposition with partial pivoting based on LAPACKE_?getrf function. ******************************************************************************** */ @@ -33,20 +33,18 @@ #ifndef EIGEN_PARTIALLU_LAPACK_H #define EIGEN_PARTIALLU_LAPACK_H -#include "Eigen/src/Core/util/MKL_support.h" - namespace Eigen { namespace internal { -/** \internal Specialization for the data types supported by MKL */ +/** \internal Specialization for the data types supported by LAPACKe */ -#define EIGEN_MKL_LU_PARTPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \ +#define EIGEN_LAPACKE_LU_PARTPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \ template<int StorageOrder> \ struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \ { \ /* \internal performs the LU decomposition in-place of the matrix represented */ \ - static lapack_int blocked_lu(lapack_int rows, lapack_int cols, EIGTYPE* lu_data, lapack_int luStride, lapack_int* row_transpositions, lapack_int& nb_transpositions, lapack_int maxBlockSize=256) \ + static lapack_int blocked_lu(Index rows, Index cols, EIGTYPE* lu_data, Index luStride, lapack_int* row_transpositions, lapack_int& nb_transpositions, lapack_int maxBlockSize=256) \ { \ EIGEN_UNUSED_VARIABLE(maxBlockSize);\ lapack_int matrix_order, first_zero_pivot; \ @@ -54,14 +52,14 @@ struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \ EIGTYPE* a; \ /* Set up parameters for ?getrf */ \ matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \ - lda = luStride; \ + lda = convert_index<lapack_int>(luStride); \ a = lu_data; \ ipiv = row_transpositions; \ - m = rows; \ - n = cols; \ + m = convert_index<lapack_int>(rows); \ + n = convert_index<lapack_int>(cols); \ nb_transpositions = 0; \ \ - info = LAPACKE_##MKLPREFIX##getrf( matrix_order, m, n, (MKLTYPE*)a, lda, ipiv ); \ + info = LAPACKE_##LAPACKE_PREFIX##getrf( matrix_order, m, n, (LAPACKE_TYPE*)a, lda, ipiv ); \ \ for(int i=0;i<m;i++) { ipiv[i]--; if (ipiv[i]!=i) nb_transpositions++; } \ \ @@ -73,10 +71,10 @@ struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \ } \ }; -EIGEN_MKL_LU_PARTPIV(double, double, d) -EIGEN_MKL_LU_PARTPIV(float, float, s) -EIGEN_MKL_LU_PARTPIV(dcomplex, MKL_Complex16, z) -EIGEN_MKL_LU_PARTPIV(scomplex, MKL_Complex8, c) +EIGEN_LAPACKE_LU_PARTPIV(double, double, d) +EIGEN_LAPACKE_LU_PARTPIV(float, float, s) +EIGEN_LAPACKE_LU_PARTPIV(dcomplex, lapack_complex_double, z) +EIGEN_LAPACKE_LU_PARTPIV(scomplex, lapack_complex_float, c) } // end namespace internal diff --git a/Eigen/src/LU/arch/CMakeLists.txt b/Eigen/src/LU/arch/CMakeLists.txt deleted file mode 100644 index f6b7ed9ec..000000000 --- a/Eigen/src/LU/arch/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_LU_arch_SRCS "*.h") - -INSTALL(FILES - ${Eigen_LU_arch_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU/arch COMPONENT Devel - ) diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h index e1470c664..ebb64a62b 100644 --- a/Eigen/src/LU/arch/Inverse_SSE.h +++ b/Eigen/src/LU/arch/Inverse_SSE.h @@ -153,10 +153,12 @@ struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType> iC = _mm_mul_ps(rd,iC); iD = _mm_mul_ps(rd,iD); - result.template writePacket<ResultAlignment>( 0, _mm_shuffle_ps(iA,iB,0x77)); - result.template writePacket<ResultAlignment>( 4, _mm_shuffle_ps(iA,iB,0x22)); - result.template writePacket<ResultAlignment>( 8, _mm_shuffle_ps(iC,iD,0x77)); - result.template writePacket<ResultAlignment>(12, _mm_shuffle_ps(iC,iD,0x22)); + Index res_stride = result.outerStride(); + float* res = result.data(); + pstoret<float, Packet4f, ResultAlignment>(res+0, _mm_shuffle_ps(iA,iB,0x77)); + pstoret<float, Packet4f, ResultAlignment>(res+res_stride, _mm_shuffle_ps(iA,iB,0x22)); + pstoret<float, Packet4f, ResultAlignment>(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77)); + pstoret<float, Packet4f, ResultAlignment>(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22)); } }; @@ -316,14 +318,16 @@ struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType> iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1); iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2); - result.template writePacket<ResultAlignment>( 0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); // iA# / det - result.template writePacket<ResultAlignment>( 4, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); - result.template writePacket<ResultAlignment>( 2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); // iB# / det - result.template writePacket<ResultAlignment>( 6, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); - result.template writePacket<ResultAlignment>( 8, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); // iC# / det - result.template writePacket<ResultAlignment>(12, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); - result.template writePacket<ResultAlignment>(10, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); // iD# / det - result.template writePacket<ResultAlignment>(14, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); + Index res_stride = result.outerStride(); + double* res = result.data(); + pstoret<double, Packet2d, ResultAlignment>(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); + pstoret<double, Packet2d, ResultAlignment>(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); + pstoret<double, Packet2d, ResultAlignment>(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); + pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); + pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); + pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); + pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); + pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); } }; |