aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Alexey Frunze <afrunze@wavecomp.com>2018-07-06 16:04:30 -0700
committerGravatar Alexey Frunze <afrunze@wavecomp.com>2018-07-06 16:04:30 -0700
commit3875fb05aaa019c700d68b074e61471a96c85b36 (patch)
treea3ca21891fb7f07bb4b8f521939b4d382e8ae4f8
parent44ea5f7623d1d5211de753b4ea719b63d55b228a (diff)
Add support for MIPS SIMD (MSA)
-rw-r--r--CMakeLists.txt6
-rw-r--r--Eigen/src/Core/arch/MSA/Complex.h759
-rw-r--r--Eigen/src/Core/arch/MSA/MathFunctions.h387
-rw-r--r--Eigen/src/Core/arch/MSA/PacketMath.h1317
-rw-r--r--Eigen/src/Core/util/ConfigureVectorization.h2
-rw-r--r--Eigen/src/Core/util/Constants.h3
-rw-r--r--cmake/EigenTesting.cmake8
7 files changed, 2481 insertions, 1 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91545bdb0..0547ee681 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -256,6 +256,12 @@ if(NOT MSVC)
message(STATUS "Enabling VSX in tests/examples")
endif()
+ # Opt-in switch (default OFF): when enabled, -mmsa is appended to
+ # CMAKE_CXX_FLAGS and a status line is printed at configure time.
+ option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF)
+ if(EIGEN_TEST_MSA)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa")
+ message(STATUS "Enabling MSA in tests/examples")
+ endif()
+
option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
if(EIGEN_TEST_NEON)
if(EIGEN_TEST_FMA)
diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h
new file mode 100644
index 000000000..9a45cf51e
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/Complex.h
@@ -0,0 +1,759 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+// Chris Larsen
+// Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_MSA_H
+#define EIGEN_COMPLEX_MSA_H
+
+#include <iostream>
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+// Packet of two std::complex<float> values kept interleaved in one Packet4f
+// as { re0, im0, re1, im1 }.
+struct Packet2cf {
+  // Leaves the payload uninitialized.
+  EIGEN_STRONG_INLINE Packet2cf() {
+  }
+  // Builds the packet { a, b }.
+  EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
+                                         const std::complex<float>& b) {
+    Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
+    v = t;
+  }
+  // Reinterprets four floats as two interleaved complex values.
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
+  }
+  EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
+    v = b.v;
+    return *this;
+  }
+  // Returns the complex conjugate of both values.
+  // Bit 63 of every 64-bit lane is toggled, i.e. the sign bit of the float
+  // stored in the upper half of the lane.
+  // NOTE(review): this assumes the imaginary part occupies the upper 32 bits
+  // of each 64-bit lane (little-endian element order) -- confirm for
+  // big-endian targets.
+  EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
+    return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
+  }
+  // Complex multiplication of both values: *this = *this * b.
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    Packet4f v1, v2;
+
+    // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+    v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
+    // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
+    v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
+    // Multiply the real a with b
+    v1 = pmul(v1, b.v);
+    // Multiply the imag a with b
+    v2 = pmul(v2, b.v);
+    // Conjugate v2
+    v2 = Packet2cf(v2).conjugate().v;
+    // Swap real/imag elements in v2.
+    v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+    // Add and return the result
+    v = padd(v1, v2);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+    return Packet2cf(*this) *= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+    return Packet2cf(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+    return Packet2cf(*this) -= b;
+  }
+  // Complex division: *this = (*this * conj(b)) / |b|^2.
+  EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
+    // Compute |b|^2 before *this is modified: b may alias *this (x /= x), and
+    // reading b.v after the multiplication would then use the wrong value.
+    Packet4f s = pmul<Packet4f>(b.v, b.v);
+    // Add each squared component to its real/imag partner so that every
+    // element holds re^2 + im^2 of its own complex value.
+    s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+    *this *= b.conjugate();
+    v = pdiv(v, s);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
+    return Packet2cf(*this) /= b;
+  }
+  // Unary minus: negates all four floats.
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+    return Packet2cf(pnegate(v));
+  }
+
+  // Raw storage: { re0, im0, re1, im1 }.
+  Packet4f v;
+};
+
+// Debug helper: streams both complex entries as "[ (re, imi), (re, imi) ]".
+inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
+  const Packet4f& lanes = value.v;
+  os << "[ (" << lanes[0] << ", " << lanes[1] << "i), (";
+  os << lanes[2] << ", " << lanes[3] << "i) ]";
+  return os;
+}
+
+// Vectorization traits for std::complex<float>: the packet type is Packet2cf
+// and one packet carries two complex values (size = 2).
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+ typedef Packet2cf type;
+ // HasHalfPacket is 0 below, so `half` simply names the full packet type.
+ typedef Packet2cf half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 2,
+ HasHalfPacket = 0,
+
+ // Arithmetic operations advertised for this packet type.
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ // Operations that are not advertised for complex packets.
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0,
+ HasBlend = 1
+ };
+};
+
+// Reverse mapping from the packet type back to its scalar type, element count
+// and alignment constant.
+template <>
+struct unpacket_traits<Packet2cf> {
+ typedef std::complex<float> type;
+ enum { size = 2, alignment = Aligned16 };
+ typedef Packet2cf half;
+};
+
+// Broadcasts one complex value into both slots of the packet.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+  EIGEN_MSA_DEBUG;
+
+  const float re = from.real();
+  const float im = from.imag();
+  Packet4f all_re = { re, re, re, re };
+  Packet4f all_im = { im, im, im, im };
+  // Interleave the two splatted vectors so real and imaginary parts alternate.
+  Packet4i mixed = __builtin_msa_ilvr_w((Packet4i)all_im, (Packet4i)all_re);
+  return Packet2cf((Packet4f)mixed);
+}
+
+// Element-wise complex addition: a + b.
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2cf sum(a);
+  sum += b;
+  return sum;
+}
+
+// Element-wise complex subtraction: a - b.
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2cf difference(a);
+  difference -= b;
+  return difference;
+}
+
+// Negates both complex values (delegates to Packet2cf's unary minus).
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2cf negated = a.operator-();
+  return negated;
+}
+
+// Complex conjugate of both values (delegates to Packet2cf::conjugate).
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2cf conjugated = a.conjugate();
+  return conjugated;
+}
+
+// Element-wise complex multiplication: a * b.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2cf product(a);
+  product *= b;
+  return product;
+}
+
+// Forwards to pand on the underlying Packet4f storage of a and b.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f bits = pand(a.v, b.v);
+  return Packet2cf(bits);
+}
+
+// Forwards to por on the underlying Packet4f storage of a and b.
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f bits = por(a.v, b.v);
+  return Packet2cf(bits);
+}
+
+// Forwards to pxor on the underlying Packet4f storage of a and b.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f bits = pxor(a.v, b.v);
+  return Packet2cf(bits);
+}
+
+// Forwards to pandnot on the underlying Packet4f storage of a and b.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f bits = pandnot(a.v, b.v);
+  return Packet2cf(bits);
+}
+
+// Loads two complex floats through the aligned Packet4f load, treating the
+// complex array as a flat array of floats.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  const float* const src = reinterpret_cast<const float*>(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(src));
+}
+
+// Loads two complex floats through the unaligned Packet4f load, treating the
+// complex array as a flat array of floats.
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  const float* const src = reinterpret_cast<const float*>(from);
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(src));
+}
+
+// Reads the single complex value at `from` and duplicates it into both slots.
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  const std::complex<float>& value = *from;
+  return pset1<Packet2cf>(value);
+}
+
+// Stores both complex values to `to` through the aligned float store.
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
+                                                      const Packet2cf& from) {
+  EIGEN_MSA_DEBUG;
+
+  float* const dst = reinterpret_cast<float*>(to);
+  EIGEN_DEBUG_ALIGNED_STORE pstore<float>(dst, from.v);
+}
+
+// Stores both complex values to `to` through the unaligned float store.
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
+                                                       const Packet2cf& from) {
+  EIGEN_MSA_DEBUG;
+
+  float* const dst = reinterpret_cast<float*>(to);
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>(dst, from.v);
+}
+
+// Gathers two complex values that lie `stride` elements apart in memory.
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
+    const std::complex<float>* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  const std::complex<float>& first = from[0 * stride];
+  const std::complex<float>& second = from[1 * stride];
+  return Packet2cf(first, second);
+}
+
+// Writes the two complex values of `from` to to[0] and to[stride].
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
+                                                                       const Packet2cf& from,
+                                                                       Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f& lanes = from.v;
+  to[0] = std::complex<float>(lanes[0], lanes[1]);
+  to[stride] = std::complex<float>(lanes[2], lanes[3]);
+}
+
+// Forwards to the float prefetch for the same address.
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  EIGEN_MSA_DEBUG;
+
+  const float* const as_floats = reinterpret_cast<const float*>(addr);
+  prefetch(as_floats);
+}
+
+// Extracts the first complex value (elements 0 and 1 of the storage).
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  const float re = a.v[0];
+  const float im = a.v[1];
+  return std::complex<float>(re, im);
+}
+
+// Reverses the order of the two complex values with a single word shuffle.
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  const v4i32 shuffled = __builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  return Packet2cf((Packet4f)shuffled);
+}
+
+// Swaps the real and imaginary part inside each complex value.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  const v4i32 shuffled = __builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+  return Packet2cf((Packet4f)shuffled);
+}
+
+// Horizontal sum: returns the sum of the two complex values in the packet.
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Reversing the storage viewed as two doubles exchanges the two complex
+  // values, so the vector add leaves their sum in elements 0 and 1.
+  const Packet4f swapped = (Packet4f)preverse((Packet2d)a.v);
+  const Packet4f total = swapped + a.v;
+  return std::complex<float>(total[0], total[1]);
+}
+
+// Reduces two packets at once: the result holds the horizontal sum of vecs[0]
+// followed by the horizontal sum of vecs[1].
+template <>
+EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) {
+  EIGEN_MSA_DEBUG;
+
+  const v2i64 row0 = (v2i64)vecs[0].v;
+  const v2i64 row1 = (v2i64)vecs[1].v;
+
+  // Pair up the matching 64-bit halves (one complex value each) of the two
+  // inputs, then add the two pairs.
+  const Packet4f right_halves = (Packet4f)__builtin_msa_ilvr_d(row1, row0);
+  const Packet4f left_halves = (Packet4f)__builtin_msa_ilvl_d(row1, row0);
+
+  return Packet2cf(padd(right_halves, left_halves));
+}
+
+// Horizontal product: multiplies the two complex values of the packet in
+// scalar code.
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f& lanes = a.v;
+  return std::complex<float>((lanes[0] * lanes[2]) - (lanes[1] * lanes[3]),
+                             (lanes[0] * lanes[3]) + (lanes[1] * lanes[2]));
+}
+
+// Packet alignment helper for Packet2cf. Only Offset == 1 changes `first`;
+// every other offset leaves it untouched.
+template <int Offset>
+struct palign_impl<Offset, Packet2cf> {
+  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) {
+    if (Offset == 1) {
+      // Byte-wise slide across the (second, first) pair by one complex value
+      // (8 bytes).
+      const v16i8 upper = (v16i8)second.v;
+      const v16i8 lower = (v16i8)first.v;
+      first.v = (Packet4f)__builtin_msa_sldi_b(upper, lower, Offset * 8);
+    }
+  }
+};
+
+// conj_helper for flags <false, true>: the product conjugates the right
+// operand, i.e. a * conj(b).
+template <>
+struct conj_helper<Packet2cf, Packet2cf, false, true> {
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
+    const Packet2cf b_conj = pconj(b);
+    return internal::pmul(a, b_conj);
+  }
+
+  // Multiply-accumulate built on the member pmul above: pmul(x, y) + c.
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
+                                      const Packet2cf& c) const {
+    return padd(this->pmul(x, y), c);
+  }
+};
+
+// conj_helper for flags <true, false>: the product conjugates the left
+// operand, i.e. conj(a) * b.
+template <>
+struct conj_helper<Packet2cf, Packet2cf, true, false> {
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
+    const Packet2cf a_conj = pconj(a);
+    return internal::pmul(a_conj, b);
+  }
+
+  // Multiply-accumulate built on the member pmul above: pmul(x, y) + c.
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
+                                      const Packet2cf& c) const {
+    return padd(this->pmul(x, y), c);
+  }
+};
+
+// conj_helper for flags <true, true>: conjugates the plain product,
+// i.e. conj(a * b).
+template <>
+struct conj_helper<Packet2cf, Packet2cf, true, true> {
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
+    const Packet2cf product = internal::pmul(a, b);
+    return pconj(product);
+  }
+
+  // Multiply-accumulate built on the member pmul above: pmul(x, y) + c.
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
+                                      const Packet2cf& c) const {
+    return padd(this->pmul(x, y), c);
+  }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+// Element-wise complex division: a / b.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2cf quotient(a);
+  quotient /= b;
+  return quotient;
+}
+
+// Debug helper: streams the two packets of the block on separate lines.
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
+  os << "[ " << value.packet[0] << ", " << std::endl;
+  os << " " << value.packet[1] << " ]";
+  return os;
+}
+
+// In-place transpose of a 2x2 block of complex floats: each complex value is
+// moved as one 64-bit unit.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  // Snapshot both rows before either one is overwritten.
+  const v2i64 row0 = (v2i64)kernel.packet[0].v;
+  const v2i64 row1 = (v2i64)kernel.packet[1].v;
+
+  kernel.packet[0].v = (Packet4f)__builtin_msa_ilvr_d(row1, row0);
+  kernel.packet[1].v = (Packet4f)__builtin_msa_ilvl_d(row1, row0);
+}
+
+// Per-complex-value select between thenPacket and elsePacket. Each complex
+// float is 64 bits wide, so the work is delegated to the Packet2d blend.
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
+  const Packet2d then_bits = (Packet2d)thenPacket.v;
+  const Packet2d else_bits = (Packet2d)elsePacket.v;
+  const Packet2d chosen = pblend<Packet2d>(ifPacket, then_bits, else_bits);
+  return Packet2cf((Packet4f)chosen);
+}
+
+//---------- double ----------
+
+// Packet holding a single std::complex<double>, stored in a Packet2d as
+// { re, im }.
+struct Packet1cd {
+  // Leaves the payload uninitialized.
+  EIGEN_STRONG_INLINE Packet1cd() {
+  }
+  // Builds the packet from one complex value.
+  EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
+    v[0] = std::real(a);
+    v[1] = std::imag(a);
+  }
+  // Reinterprets two doubles as { re, im }.
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
+  }
+  EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
+    v = b.v;
+    return *this;
+  }
+  // Returns the complex conjugate: XORs the sign bit of element 1 (the
+  // imaginary part) and leaves element 0 unchanged.
+  EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
+    static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
+    return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
+  }
+  // Complex multiplication: *this = *this * b.
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    Packet2d v1, v2;
+
+    // Get the real values of a | a1_re | a1_re
+    v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
+    // Get the imag values of a | a1_im | a1_im
+    v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
+    // Multiply the real a with b
+    v1 = pmul(v1, b.v);
+    // Multiply the imag a with b
+    v2 = pmul(v2, b.v);
+    // Conjugate v2
+    v2 = Packet1cd(v2).conjugate().v;
+    // Swap real/imag elements in v2.
+    v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+    // Add and return the result
+    v = padd(v1, v2);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+    return Packet1cd(*this) *= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+    return Packet1cd(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+    return Packet1cd(*this) -= b;
+  }
+  // Complex division: *this = (*this * conj(b)) / |b|^2.
+  EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
+    // Compute |b|^2 before *this is modified: b may alias *this (x /= x), and
+    // reading b.v after the multiplication would then use the wrong value.
+    Packet2d s = pmul<Packet2d>(b.v, b.v);
+    // Both elements end up holding re^2 + im^2.
+    s = padd(s, preverse<Packet2d>(s));
+    *this *= b.conjugate();
+    v = pdiv(v, s);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
+    return Packet1cd(*this) /= b;
+  }
+  // Unary minus: negates both doubles.
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+    return Packet1cd(pnegate(v));
+  }
+
+  // Raw storage: { re, im }.
+  Packet2d v;
+};
+
+// Debug helper: streams the complex value as "[ (re, imi) ]".
+inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
+  const Packet2d& lanes = value.v;
+  os << "[ (" << lanes[0] << ", ";
+  os << lanes[1] << "i) ]";
+  return os;
+}
+
+// Vectorization traits for std::complex<double>: the packet type is Packet1cd
+// and one packet carries a single complex value (size = 1).
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+ typedef Packet1cd type;
+ // HasHalfPacket is 0 below, so `half` simply names the full packet type.
+ typedef Packet1cd half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 0,
+ size = 1,
+ HasHalfPacket = 0,
+
+ // Arithmetic operations advertised for this packet type.
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ // Operations that are not advertised for complex packets.
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0
+ };
+};
+
+// Reverse mapping from the packet type back to its scalar type, element count
+// and alignment constant.
+template <>
+struct unpacket_traits<Packet1cd> {
+ typedef std::complex<double> type;
+ enum { size = 1, alignment = Aligned16 };
+ typedef Packet1cd half;
+};
+
+// Loads one complex double through the aligned Packet2d load.
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  const double* const src = reinterpret_cast<const double*>(from);
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(src));
+}
+
+// Loads one complex double through the unaligned Packet2d load.
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  const double* const src = reinterpret_cast<const double*>(from);
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(src));
+}
+
+// Wraps a single complex double in a packet (the packet holds one value).
+template <>
+EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet1cd packed(from);
+  return packed;
+}
+
+// Complex addition: a + b.
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  Packet1cd sum(a);
+  sum += b;
+  return sum;
+}
+
+// Complex subtraction: a - b.
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  Packet1cd difference(a);
+  difference -= b;
+  return difference;
+}
+
+// Negates the complex value (delegates to Packet1cd's unary minus).
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet1cd negated = a.operator-();
+  return negated;
+}
+
+// Complex conjugate (delegates to Packet1cd::conjugate).
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet1cd conjugated = a.conjugate();
+  return conjugated;
+}
+
+// Complex multiplication: a * b.
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  Packet1cd product(a);
+  product *= b;
+  return product;
+}
+
+// Forwards to pand on the underlying Packet2d storage of a and b.
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d bits = pand(a.v, b.v);
+  return Packet1cd(bits);
+}
+
+// Forwards to por on the underlying Packet2d storage of a and b.
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d bits = por(a.v, b.v);
+  return Packet1cd(bits);
+}
+
+// Forwards to pxor on the underlying Packet2d storage of a and b.
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d bits = pxor(a.v, b.v);
+  return Packet1cd(bits);
+}
+
+// Forwards to pandnot on the underlying Packet2d storage of a and b.
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d bits = pandnot(a.v, b.v);
+  return Packet1cd(bits);
+}
+
+// Reads the complex value at `from`; with one value per packet this is the
+// same as pset1.
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  const std::complex<double>& value = *from;
+  return pset1<Packet1cd>(value);
+}
+
+// Stores the complex value to `to` through the aligned double store.
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
+                                                       const Packet1cd& from) {
+  EIGEN_MSA_DEBUG;
+
+  double* const dst = reinterpret_cast<double*>(to);
+  EIGEN_DEBUG_ALIGNED_STORE pstore<double>(dst, from.v);
+}
+
+// Stores the complex value to `to` through the unaligned double store.
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
+                                                        const Packet1cd& from) {
+  EIGEN_MSA_DEBUG;
+
+  double* const dst = reinterpret_cast<double*>(to);
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>(dst, from.v);
+}
+
+// Forwards to the double prefetch for the same address.
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  EIGEN_MSA_DEBUG;
+
+  const double* const as_doubles = reinterpret_cast<const double*>(addr);
+  prefetch(as_doubles);
+}
+
+// Gather for a one-element packet: only from[0] is read, so `stride` is
+// unused.
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
+    const std::complex<double>* from, Index stride __attribute__((unused))) {
+  EIGEN_MSA_DEBUG;
+
+  // The converting constructor copies real() and imag() into elements 0 and 1.
+  return Packet1cd(from[0]);
+}
+
+// Scatter for a one-element packet: a plain store to `to`; `stride` is unused.
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
+                                                                        const Packet1cd& from,
+                                                                        Index stride
+                                                                        __attribute__((unused))) {
+  EIGEN_MSA_DEBUG;
+
+  pstore<std::complex<double> >(to, from);
+}
+
+// Extracts the complex value stored in the packet.
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  const double re = a.v[0];
+  const double im = a.v[1];
+  return std::complex<double>(re, im);
+}
+
+// Reversing a one-element packet is the identity.
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet1cd& unchanged = a;
+  return unchanged;
+}
+
+// Horizontal sum of a one-element packet: the element itself.
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  const std::complex<double> only = pfirst(a);
+  return only;
+}
+
+// Packet-wise reduction with one element per packet: returns vecs[0] as is.
+template <>
+EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet1cd& only = vecs[0];
+  return only;
+}
+
+// Horizontal product of a one-element packet: the element itself.
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  const std::complex<double> only = pfirst(a);
+  return only;
+}
+
+// Packet alignment helper for Packet1cd. The body is intentionally empty:
+// both arguments are left untouched for every Offset.
+template <int Offset>
+struct palign_impl<Offset, Packet1cd> {
+ static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) {
+ // FIXME: are we sure that a Packet1cd never has to be aligned?
+ // Even though a std::complex<double> occupies 16 bytes, it is not
+ // necessarily aligned on a 16-byte boundary...
+ }
+};
+
+// conj_helper for flags <false, true>: the product conjugates the right
+// operand, i.e. a * conj(b).
+template <>
+struct conj_helper<Packet1cd, Packet1cd, false, true> {
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+    const Packet1cd b_conj = pconj(b);
+    return internal::pmul(a, b_conj);
+  }
+
+  // Multiply-accumulate built on the member pmul above: pmul(x, y) + c.
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+                                      const Packet1cd& c) const {
+    return padd(this->pmul(x, y), c);
+  }
+};
+
+// conj_helper for flags <true, false>: the product conjugates the left
+// operand, i.e. conj(a) * b.
+template <>
+struct conj_helper<Packet1cd, Packet1cd, true, false> {
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+    const Packet1cd a_conj = pconj(a);
+    return internal::pmul(a_conj, b);
+  }
+
+  // Multiply-accumulate built on the member pmul above: pmul(x, y) + c.
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+                                      const Packet1cd& c) const {
+    return padd(this->pmul(x, y), c);
+  }
+};
+
+// conj_helper for flags <true, true>: conjugates the plain product,
+// i.e. conj(a * b).
+template <>
+struct conj_helper<Packet1cd, Packet1cd, true, true> {
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+    const Packet1cd product = internal::pmul(a, b);
+    return pconj(product);
+  }
+
+  // Multiply-accumulate built on the member pmul above: pmul(x, y) + c.
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+                                      const Packet1cd& c) const {
+    return padd(this->pmul(x, y), c);
+  }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+// Complex division: a / b.
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  Packet1cd quotient(a);
+  quotient /= b;
+  return quotient;
+}
+
+// Swaps the real and imaginary part by reversing the two doubles.
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d flipped = preverse(Packet2d(x.v));
+  return Packet1cd(flipped);
+}
+
+// Debug helper: streams the two packets of the block on separate lines.
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
+  os << "[ " << value.packet[0] << ", " << std::endl;
+  os << " " << value.packet[1] << " ]";
+  return os;
+}
+
+// Regroups the doubles of the two packets: packet[0] receives the even
+// (index 0) doubles of both inputs, packet[1] the odd (index 1) doubles.
+// NOTE(review): element order inside each result follows the
+// ilvev.d / ilvod.d operand order -- confirm against the MSA reference if the
+// exact layout matters to a caller.
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  // Snapshot both inputs before either one is overwritten.
+  const v2i64 first = (v2i64)kernel.packet[0].v;
+  const v2i64 second = (v2i64)kernel.packet[1].v;
+
+  kernel.packet[0].v = (Packet2d)__builtin_msa_ilvev_d(first, second);
+  kernel.packet[1].v = (Packet2d)__builtin_msa_ilvod_d(first, second);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_MSA_H
diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h b/Eigen/src/Core/arch/MSA/MathFunctions.h
new file mode 100644
index 000000000..98e23e36f
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/MathFunctions.h
@@ -0,0 +1,387 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+// Chris Larsen
+// Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+/* The tanh function of this file is an adaptation of
+ * template<typename T> T generic_fast_tanh_float(const T&)
+ * from MathFunctionsImpl.h.
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
+#define EIGEN_MATH_FUNCTIONS_MSA_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Packet4f specialization of plog: evaluates the logarithm of each of the
+// four floats in _x with the cephes-style polynomial scheme credited in the
+// file header. Special cases handled explicitly below: negative arguments
+// yield NAN, zero (either sign) yields -INFINITY and +INFINITY is returned
+// unchanged.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+plog<Packet4f>(const Packet4f& _x) {
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+ static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+ // Convert negative argument into NAN (quiet negative, to be specific).
+ Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
+ Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
+ Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
+ Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask); // Add 0.0 or NAN.
+ Packet4f x = non_neg_x_or_nan;
+
+ // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
+ // N.B. the exponent is one less of what frexpf() would return.
+ Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
+ // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
+ // (The bitwise NOT of e_int equals -e_int - 1 in two's complement.)
+ x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
+
+ /*
+ if (x < SQRTHF) {
+ x = x + x - 1.0;
+ } else {
+ e += 1;
+ x = x - 1.0;
+ }
+ */
+ // Branch-free form of the pseudo-code above: ge_mask is used both as an
+ // integer (subtracting it from e_int implements `e += 1`) and as a bit
+ // selector between xx = x + x and x.
+ Packet4f xx = padd(x, x);
+ Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
+ e_int = psub(e_int, ge_mask);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
+ x = psub(x, p4f_1);
+ Packet4f e = __builtin_msa_ffint_s_w(e_int);
+
+ Packet4f x2 = pmul(x, x);
+ Packet4f x3 = pmul(x2, x);
+
+ // Evaluate the degree-8 polynomial in p0..p8 as three interleaved partial
+ // polynomials (y, y1, y2) that are then combined through powers of x3.
+ Packet4f y, y1, y2;
+ y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+ y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+ y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+ y = pmadd(y, x, p4f_cephes_log_p2);
+ y1 = pmadd(y1, x, p4f_cephes_log_p5);
+ y2 = pmadd(y2, x, p4f_cephes_log_p8);
+ y = pmadd(y, x3, y1);
+ y = pmadd(y, x3, y2);
+ y = pmul(y, x3);
+
+ // Add the exponent contribution e * (q1 + q2) in two steps around the
+ // x - 0.5 * x**2 correction.
+ y = pmadd(e, p4f_cephes_log_q1, y);
+ x = __builtin_msa_fmsub_w(x, x2, p4f_half);
+ x = padd(x, y);
+ x = pmadd(e, p4f_cephes_log_q2, x);
+
+ // x is now the logarithm result candidate. We still need to handle the
+ // extreme arguments of zero and positive infinity, though.
+ // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
+ // contain infinities of both signs (see the coefficients and code above).
+ // INFINITY - INFINITY is NAN.
+
+ // If the argument is +INFINITY, make it the new result candidate.
+ // To achieve that we choose the smaller of the result candidate and the
+ // argument.
+ // This is correct for all finite pairs of values (the logarithm is smaller
+ // than the argument).
+ // This is also correct in the special case when the argument is +INFINITY
+ // and the result candidate is NAN. This is because the fmin.df instruction
+ // prefers non-NANs to NANs.
+ x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
+
+ // If the argument is zero (including -0.0), the result becomes -INFINITY.
+ // (An all-ones mask shifted left by 23 bits is 0xFF800000, the bit pattern
+ // of -INFINITY.)
+ Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
+
+ return x;
+}
+
+// Packet4f specialization of pexp: evaluates the exponential of each of the
+// four floats in _x. The argument is clamped, split into an integer power of
+// two and a small remainder, the remainder goes through a polynomial, and
+// the result is rescaled by the power of two.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+pexp<Packet4f>(const Packet4f& _x) {
+ // Limiting single-precision pexp's argument to [-128, +128] lets pexp
+ // reach 0 and INFINITY naturally.
+ static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
+ static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+ Packet4f x = _x;
+
+ // Clamp x.
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
+ (v16u8)p4f_exp_lo);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
+ (v16u8)p4f_exp_hi);
+
+ // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+ Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
+ Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
+ Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
+ Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
+
+ // Remove the integer multiple of ln(2) from x in two steps; C1 + C2 together
+ // approximate ln(2).
+ x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
+ x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
+
+ Packet4f z = pmul(x, x);
+
+ // Horner evaluation of the polynomial in p0..p5, then y = y * x**2 + x + 1.
+ Packet4f y = p4f_cephes_exp_p0;
+ y = pmadd(y, x, p4f_cephes_exp_p1);
+ y = pmadd(y, x, p4f_cephes_exp_p2);
+ y = pmadd(y, x, p4f_cephes_exp_p3);
+ y = pmadd(y, x, p4f_cephes_exp_p4);
+ y = pmadd(y, x, p4f_cephes_exp_p5);
+ y = pmadd(y, z, x);
+ y = padd(y, p4f_1);
+
+ // y *= 2**exponent.
+ y = __builtin_msa_fexp2_w(y, x2_int);
+
+ return y;
+}
+
+// Packet4f specialization of ptanh: evaluates tanh of each of the four floats
+// in _x as a rational function p(x) / q(x) of the clamped magnitude |x|, then
+// restores the sign. Arguments smaller than tanh_tiny in magnitude are
+// returned unchanged.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& _x) {
+ static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
+ // The monomial coefficients of the numerator polynomial (odd).
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
+ // The monomial coefficients of the denominator polynomial (even).
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
+
+ // Work on the magnitude; the sign is put back at the end.
+ Packet4f x = pabs(_x);
+ Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
+
+ // Clamp the inputs to the range [-9, 9] since anything outside
+ // this range is -/+1.0f in single-precision.
+ // (x is already non-negative here, so only the upper bound is applied.)
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
+ (v16u8)p4f_tanh_hi);
+
+ // Since the polynomials are odd/even, we need x**2.
+ Packet4f x2 = pmul(x, x);
+
+ // Evaluate the numerator polynomial p.
+ Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
+ p = pmadd(x2, p, p4f_alpha_9);
+ p = pmadd(x2, p, p4f_alpha_7);
+ p = pmadd(x2, p, p4f_alpha_5);
+ p = pmadd(x2, p, p4f_alpha_3);
+ p = pmadd(x2, p, p4f_alpha_1);
+ p = pmul(x, p);
+
+ // Evaluate the denominator polynomial q.
+ Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
+ q = pmadd(x2, q, p4f_beta_2);
+ q = pmadd(x2, q, p4f_beta_0);
+
+ // Divide the numerator by the denominator.
+ p = pdiv(p, q);
+
+ // Reinstate the sign.
+ p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
+
+ // When the argument is very small in magnitude it's more accurate to just return it.
+ p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
+
+ return p;
+}
+
+template <bool sine>  // sine == true evaluates sin(_x); sine == false evaluates cos(_x).
+Packet4f psincos_inner_msa_float(const Packet4f& _x) {  // Shared kernel behind psin<Packet4f> and pcos<Packet4f>.
+ static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi).
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);  // -DP1, -DP2, -DP3: negated terms of the 3-step reduction below.
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+ static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);  // sincof_p*: coefficients of the sin(x) polynomial.
+ static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);  // coscof_p*: coefficients of the cos(x) polynomial.
+ static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi.
+ static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+ Packet4f x = pabs(_x);  // Work on |_x|; for sine the input's sign is folded back in via sign_mask.
+
+ // Translate infinite arguments into NANs.
+ Packet4f zero_or_nan_if_inf = psub(_x, _x);  // inf - inf is NaN; finite - finite is 0.
+ x = padd(x, zero_or_nan_if_inf);
+ // Prevent sin/cos from generating values larger than 1.0 in magnitude
+ // for very large arguments by setting x to 0.0.
+ Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);  // Set where x < max_arg or the compare is unordered (NaN).
+ x = pand(x, (Packet4f)small_or_nan_mask);  // Lanes that fail the test become all-zero bits, i.e. +0.0.
+
+ // Scale x by 4/Pi to find x's octant.
+ Packet4f y = pmul(x, p4f_cephes_FOPI);
+ // Get the octant. We'll reduce x by this number of octants or by one more than it.
+ Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
+ // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
+ // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
+ // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
+ Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
+ Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);  // Clear bit 0: the "& (~1)" step.
+ y = __builtin_msa_ffint_s_w(y_int2);  // Even octant count converted back to float for the reduction.
+
+ // Compute the sign to apply to the polynomial. Only bit 31 of sign_mask is consumed (see the final binsli_w).
+ Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)  // Bit 2 of (octant + 1), combined with the sign of _x.
+ : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);  // Bit 2 of (octant + 3).
+
+ // Get the polynomial selection mask.
+ // We'll calculate both (sin and cos) polynomials and then select from the two.
+ Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);  // All-ones where bit 1 of the even octant is clear.
+
+ // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
+ // The magic pass: "Extended precision modular arithmetic"
+ // x = ((x - y * DP1) - y * DP2) - y * DP3
+ Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
+ Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
+ Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
+ x = padd(x, tmp1);
+ x = padd(x, tmp2);
+ x = padd(x, tmp3);
+
+ // Evaluate the cos(x) polynomial: 1 - z/2 + z*z*((p0*z + p1)*z + p2), with z = x*x.
+ y = p4f_coscof_p0;
+ Packet4f z = pmul(x, x);
+ y = pmadd(y, z, p4f_coscof_p1);
+ y = pmadd(y, z, p4f_coscof_p2);
+ y = pmul(y, z);
+ y = pmul(y, z);
+ y = __builtin_msa_fmsub_w(y, z, p4f_half);  // y = y - z * 0.5.
+ y = padd(y, p4f_1);
+
+ // Evaluate the sin(x) polynomial: x + x*z*((p0*z + p1)*z + p2).
+ Packet4f y2 = p4f_sincof_p0;
+ y2 = pmadd(y2, z, p4f_sincof_p1);
+ y2 = pmadd(y2, z, p4f_sincof_p2);
+ y2 = pmul(y2, z);
+ y2 = pmadd(y2, x, x);
+
+ // Select the correct result from the two polynomials (bsel_v takes its last operand where the mask bit is set).
+ y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)  // sine: sin polynomial where poly_mask is set.
+ : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);  // cosine: cos polynomial where poly_mask is set.
+
+ // Update the sign: XOR the computed sign with y's own sign, then insert only that top bit into y.
+ sign_mask = pxor(sign_mask, (Packet4i)y);
+ y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);
+ return y;
+}
+
+// Packet4f sine: forwards to the shared sin/cos kernel with the sine flag set.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+psin<Packet4f>(const Packet4f& x) {
+  const bool kComputeSine = true;
+  return psincos_inner_msa_float<kComputeSine>(x);
+}
+
+// Packet4f cosine: forwards to the shared sin/cos kernel with the sine flag cleared.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+pcos<Packet4f>(const Packet4f& x) {
+  const bool kComputeSine = false;
+  return psincos_inner_msa_float<kComputeSine>(x);
+}
+
+// Double-precision exp(): clamp the argument, split off n = round(x * log2(e)),
+// approximate exp of the remainder with a rational function, then scale by 2**n.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d
+pexp<Packet2d>(const Packet2d& _x) {
+  // Clamping the argument to [-1024, +1024] still lets the result reach
+  // 0 and INFINITY naturally.
+  static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
+  static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+  static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+  static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
+  static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
+
+  // Clamp from below, then from above.
+  const v16u8 below_lo = (v16u8)__builtin_msa_fclt_d(_x, p2d_exp_lo);
+  Packet2d arg = (Packet2d)__builtin_msa_bsel_v(below_lo, (v16u8)_x, (v16u8)p2d_exp_lo);
+  const v16u8 above_hi = (v16u8)__builtin_msa_fclt_d(p2d_exp_hi, arg);
+  arg = (Packet2d)__builtin_msa_bsel_v(above_hi, (v16u8)arg, (v16u8)p2d_exp_hi);
+
+  // n = arg * log2(e) rounded to nearest: add 0.5 carrying arg's sign, then truncate.
+  const Packet2d signed_half = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)arg, 0);
+  const Packet2d scaled = pmadd(arg, p2d_cephes_LOG2EF, signed_half);
+  const Packet2l n = __builtin_msa_ftrunc_s_d(scaled);
+  const Packet2d n_as_double = __builtin_msa_ffint_s_d(n);
+
+  // Remove n * (C1 + C2) from the argument in two steps.
+  arg = __builtin_msa_fmsub_d(arg, n_as_double, p2d_cephes_exp_C1);
+  arg = __builtin_msa_fmsub_d(arg, n_as_double, p2d_cephes_exp_C2);
+
+  const Packet2d arg_sq = pmul(arg, arg);
+
+  // Numerator: arg * P(arg^2).
+  Packet2d num = pmadd(p2d_cephes_exp_p0, arg_sq, p2d_cephes_exp_p1);
+  num = pmadd(num, arg_sq, p2d_cephes_exp_p2);
+  num = pmul(num, arg);
+
+  // Denominator polynomial Q(arg^2).
+  Packet2d den = pmadd(p2d_cephes_exp_q0, arg_sq, p2d_cephes_exp_q1);
+  den = pmadd(den, arg_sq, p2d_cephes_exp_q2);
+  den = pmadd(den, arg_sq, p2d_cephes_exp_q3);
+
+  // 1 + 2 * num / (den - num).
+  const Packet2d ratio = pdiv(num, psub(den, num));
+  const Packet2d reduced_exp = pmadd(p2d_2, ratio, p2d_1);
+
+  // Multiply by 2**n.
+  return __builtin_msa_fexp2_d(reduced_exp, n);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_MSA_H
diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
new file mode 100644
index 000000000..94c15d132
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -0,0 +1,1317 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+// Chris Larsen
+// Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_MSA_H
+#define EIGEN_PACKET_MATH_MSA_H
+
+#include <iostream>
+#include <string>
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD  // Default only; an earlier definition wins.
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD  // Defined (empty) unless already defined.
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD  // Defined (empty) unless already defined.
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  // Default only; an earlier definition wins.
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+
+#if 0  // Change to 1 to make EIGEN_MSA_DEBUG print file:line:function the first time each use site runs.
+#define EIGEN_MSA_DEBUG \
+ static bool firstTime = true; \
+ do { \
+ if (firstTime) { \
+ std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
+ firstTime = false; \
+ } \
+ } while (0)
+#else
+#define EIGEN_MSA_DEBUG  // Tracing disabled: expands to nothing.
+#endif
+
+#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))  // Packs four 2-bit indices (a lowest) into the 8-bit shuffle immediate passed to __builtin_msa_shf_w.
+
+typedef v4f32 Packet4f;  // Four floats.
+typedef v4i32 Packet4i;  // Four signed 32-bit integers.
+typedef v4u32 Packet4ui;  // Four unsigned 32-bit integers.
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }  // Declares p4f_NAME with X in every lane.
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }  // Declares p4i_NAME with X in every lane.
+#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }  // Declares p4ui_NAME with X in every lane.
+
+// Streams the four lanes of a Packet4f as "[ v0, v1, v2, v3 ]".
+inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
+  os << "[ ";
+  for (int lane = 0; lane < 4; ++lane) {
+    if (lane != 0) os << ", ";
+    os << value[lane];
+  }
+  os << " ]";
+  return os;
+}
+
+// Streams the four lanes of a Packet4i as "[ v0, v1, v2, v3 ]".
+inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
+  os << "[ ";
+  for (int lane = 0; lane < 4; ++lane) {
+    if (lane != 0) os << ", ";
+    os << value[lane];
+  }
+  os << " ]";
+  return os;
+}
+
+// Streams the four lanes of a Packet4ui as "[ v0, v1, v2, v3 ]".
+inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
+  os << "[ ";
+  for (int lane = 0; lane < 4; ++lane) {
+    if (lane != 0) os << ", ";
+    os << value[lane];
+  }
+  os << " ]";
+  return os;
+}
+
+// Vectorization traits for float on MSA: four floats per packet.
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  // Packet2f intrinsics not implemented yet, so `half` is the full packet.
+  typedef Packet4f half;
+  enum {
+    // Packet shape.
+    size = 4,
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    HasHalfPacket = 0,  // Packet2f intrinsics not implemented yet
+
+    // Supported operations. FIXME check the Has*
+    HasBlend = 1,
+    HasDiv = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasExp = 1,
+    HasLog = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+
+    // Enabled only together with EIGEN_FAST_MATH.
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH
+  };
+};
+
+// Vectorization traits for int32_t on MSA: four integers per packet.
+template <>
+struct packet_traits<int32_t> : default_packet_traits {
+  typedef Packet4i type;
+  // Packet2i intrinsics not implemented yet, so `half` is the full packet.
+  typedef Packet4i half;
+  enum {
+    // Packet shape.
+    size = 4,
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    HasHalfPacket = 0,  // Packet2i intrinsics not implemented yet
+
+    // Supported operations. FIXME check the Has*
+    HasBlend = 1,
+    HasDiv = 1
+  };
+};
+
+// Reverse mapping for Packet4f: scalar type, lane count and alignment.
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  typedef Packet4f half;
+  enum {
+    size = 4,
+    alignment = Aligned16
+  };
+};
+
+// Reverse mapping for Packet4i: scalar type, lane count and alignment.
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int32_t type;
+  typedef Packet4i half;
+  enum {
+    size = 4,
+    alignment = Aligned16
+  };
+};
+
+// Broadcasts one float into all four lanes.
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f broadcast = { from, from, from, from };
+  return broadcast;
+}
+
+// Broadcasts one 32-bit integer into all four lanes.
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i broadcast = __builtin_msa_fill_w(from);
+  return broadcast;
+}
+
+// Reads one float from memory and replicates it across all four lanes.
+template <>
+EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  const float scalar = from[0];
+  const Packet4f broadcast = { scalar, scalar, scalar, scalar };
+  return broadcast;
+}
+
+// Reads one 32-bit integer from memory and replicates it across all four lanes.
+template <>
+EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  const int32_t scalar = *from;
+  return __builtin_msa_fill_w(scalar);
+}
+
+// Lane-wise float addition.
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f sum = __builtin_msa_fadd_w(a, b);
+  return sum;
+}
+
+// Lane-wise 32-bit integer addition.
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i sum = __builtin_msa_addv_w(a, b);
+  return sum;
+}
+
+// Builds the ramp { a, a + 1, a + 2, a + 3 }.
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet4f lane_index = { 0.0f, 1.0f, 2.0f, 3.0f };
+  const Packet4f base = pset1<Packet4f>(a);
+  return padd(base, lane_index);
+}
+
+// Builds the ramp { a, a + 1, a + 2, a + 3 }.
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet4i lane_index = { 0, 1, 2, 3 };
+  const Packet4i base = pset1<Packet4i>(a);
+  return padd(base, lane_index);
+}
+
+// Lane-wise float subtraction (a - b).
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f difference = __builtin_msa_fsub_w(a, b);
+  return difference;
+}
+
+// Lane-wise 32-bit integer subtraction (a - b).
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i difference = __builtin_msa_subv_w(a, b);
+  return difference;
+}
+
+// Negates every lane by toggling bit 31, the float sign bit.
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  const v4u32 sign_toggled = __builtin_msa_bnegi_w((v4u32)a, 31);
+  return (Packet4f)sign_toggled;
+}
+
+// Two's-complement negation: ~a + 1 in every lane.
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  // NOR with 0 is a plain bitwise NOT.
+  const v4i32 inverted = (v4i32)__builtin_msa_nori_b((v16u8)a, 0);
+  return __builtin_msa_addvi_w(inverted, 1);
+}
+
+// Conjugate of a real packet: the input is returned unchanged.
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f unchanged = a;
+  return unchanged;
+}
+
+// Conjugate of an integer packet: the input is returned unchanged.
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i unchanged = a;
+  return unchanged;
+}
+
+// Lane-wise float multiplication.
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f product = __builtin_msa_fmul_w(a, b);
+  return product;
+}
+
+// Lane-wise 32-bit integer multiplication.
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i product = __builtin_msa_mulv_w(a, b);
+  return product;
+}
+
+// Lane-wise float division (a / b).
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f quotient = __builtin_msa_fdiv_w(a, b);
+  return quotient;
+}
+
+// Lane-wise signed 32-bit integer division (a / b).
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i quotient = __builtin_msa_div_s_w(a, b);
+  return quotient;
+}
+
+// Float multiply-add. The builtin takes the addend c as its first operand,
+// followed by the two factors a and b.
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f fused = __builtin_msa_fmadd_w(c, a, b);
+  return fused;
+}
+
+template <>  // Integer multiply-add: starts from c and accumulates a * b into it with maddv.w.
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+ EIGEN_MSA_DEBUG;
+
+ // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
+ Packet4i value = c;  // Accumulator, seeded with the addend.
+ __asm__("maddv.w %w[value], %w[a], %w[b]\n"
+ // Outputs ("+f": value is both read and written by the instruction)
+ : [value] "+f"(value)
+ // Inputs
+ : [a] "f"(a), [b] "f"(b));
+ return value;
+}
+
+// Bitwise AND of two float packets, computed on their raw bytes.
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  const v16u8 bits = __builtin_msa_and_v((v16u8)a, (v16u8)b);
+  return (Packet4f)bits;
+}
+
+// Bitwise AND of two integer packets.
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  const v16u8 bits = __builtin_msa_and_v((v16u8)a, (v16u8)b);
+  return (Packet4i)bits;
+}
+
+// Bitwise OR of two float packets, computed on their raw bytes.
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  const v16u8 bits = __builtin_msa_or_v((v16u8)a, (v16u8)b);
+  return (Packet4f)bits;
+}
+
+// Bitwise OR of two integer packets.
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  const v16u8 bits = __builtin_msa_or_v((v16u8)a, (v16u8)b);
+  return (Packet4i)bits;
+}
+
+// Bitwise XOR of two float packets, computed on their raw bytes.
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  const v16u8 bits = __builtin_msa_xor_v((v16u8)a, (v16u8)b);
+  return (Packet4f)bits;
+}
+
+// Bitwise XOR of two integer packets.
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  const v16u8 bits = __builtin_msa_xor_v((v16u8)a, (v16u8)b);
+  return (Packet4i)bits;
+}
+
+// Computes a & ~b on the raw bits of two float packets.
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  // XOR of every byte with 255 inverts all bits of b.
+  const v16u8 b_inverted = __builtin_msa_xori_b((v16u8)b, 255);
+  return pand(a, (Packet4f)b_inverted);
+}
+
+// Computes a & ~b on two integer packets.
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  // XOR of every byte with 255 inverts all bits of b.
+  const v16u8 b_inverted = __builtin_msa_xori_b((v16u8)b, 255);
+  return pand(a, (Packet4i)b_inverted);
+}
+
+// Lane-wise float minimum. NaN handling depends on EIGEN_FAST_MATH (see below).
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmin_w(a, b);
+#else
+  // This prefers NaNs to numbers.
+  const Packet4i a_is_nan = __builtin_msa_fcun_w(a, a);
+  const Packet4i a_below_b = __builtin_msa_fclt_w(a, b);
+  const Packet4i take_a = por(a_below_b, a_is_nan);
+  // bsel_v yields its last operand (a) where the mask is set, b elsewhere.
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)take_a, (v16u8)b, (v16u8)a);
+#endif
+}
+
+// Lane-wise signed 32-bit integer minimum.
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i smaller = __builtin_msa_min_s_w(a, b);
+  return smaller;
+}
+
+// Lane-wise float maximum. NaN handling depends on EIGEN_FAST_MATH (see below).
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmax_w(a, b);
+#else
+  // This prefers NaNs to numbers.
+  const Packet4i a_is_nan = __builtin_msa_fcun_w(a, a);
+  const Packet4i a_above_b = __builtin_msa_fclt_w(b, a);
+  const Packet4i take_a = por(a_above_b, a_is_nan);
+  // bsel_v yields its last operand (a) where the mask is set, b elsewhere.
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)take_a, (v16u8)b, (v16u8)a);
+#endif
+}
+
+// Lane-wise signed 32-bit integer maximum.
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i larger = __builtin_msa_max_s_w(a, b);
+  return larger;
+}
+
+// Aligned load of four consecutive floats starting at `from`.
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  // The load builtin is given a non-const pointer; the data is only read.
+  float* const src = const_cast<float*>(from);
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return (Packet4f)__builtin_msa_ld_w(src, 0);
+}
+
+// Aligned load of four consecutive 32-bit integers starting at `from`.
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  // The load builtin is given a non-const pointer; the data is only read.
+  int32_t* const src = const_cast<int32_t*>(from);
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return __builtin_msa_ld_w(src, 0);
+}
+
+// Unaligned load of four consecutive floats; uses the same load builtin as pload.
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  // The load builtin is given a non-const pointer; the data is only read.
+  float* const src = const_cast<float*>(from);
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return (Packet4f)__builtin_msa_ld_w(src, 0);
+}
+
+// Unaligned load of four consecutive 32-bit integers; uses the same load builtin as pload.
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  // The load builtin is given a non-const pointer; the data is only read.
+  int32_t* const src = const_cast<int32_t*>(from);
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return (Packet4i)__builtin_msa_ld_w(src, 0);
+}
+
+// Loads from[0] and from[1], splats each across a packet, and combines the
+// low 64-bit halves of the two splats into one packet.
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  const float first = from[0];
+  const float second = from[1];
+  const Packet4f first_splat = { first, first, first, first };
+  const Packet4f second_splat = { second, second, second, second };
+  const v2i64 combined = __builtin_msa_ilvr_d((v2i64)second_splat, (v2i64)first_splat);
+  return (Packet4f)combined;
+}
+
+// Loads from[0] and from[1], splats each across a packet, and combines the
+// low 64-bit halves of the two splats into one packet.
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  const int32_t first = from[0];
+  const int32_t second = from[1];
+  const Packet4i first_splat = { first, first, first, first };
+  const Packet4i second_splat = { second, second, second, second };
+  const v2i64 combined = __builtin_msa_ilvr_d((v2i64)second_splat, (v2i64)first_splat);
+  return (Packet4i)combined;
+}
+
+// Aligned store of the four float lanes to `to`.
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_MSA_DEBUG;
+
+  // The store builtin takes an integer vector, so reinterpret the lanes first.
+  const Packet4i raw_lanes = (Packet4i)from;
+  EIGEN_DEBUG_ALIGNED_STORE
+  __builtin_msa_st_w(raw_lanes, to, 0);
+}
+
+// Aligned store of the four integer lanes to `to`.
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i lanes = from;
+  EIGEN_DEBUG_ALIGNED_STORE
+  __builtin_msa_st_w(lanes, to, 0);
+}
+
+// Unaligned store of the four float lanes; uses the same store builtin as pstore.
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  EIGEN_MSA_DEBUG;
+
+  // The store builtin takes an integer vector, so reinterpret the lanes first.
+  const Packet4i raw_lanes = (Packet4i)from;
+  EIGEN_DEBUG_UNALIGNED_STORE
+  __builtin_msa_st_w(raw_lanes, to, 0);
+}
+
+// Unaligned store of the four integer lanes; uses the same store builtin as pstore.
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i lanes = from;
+  EIGEN_DEBUG_UNALIGNED_STORE
+  __builtin_msa_st_w(lanes, to, 0);
+}
+
+// Strided gather: lane i is read from from[i * stride].
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f gathered = { from[0], from[stride], from[2 * stride], from[3 * stride] };
+  return gathered;
+}
+
+// Strided gather: lane i is read from from[i * stride].
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i gathered = { from[0], from[stride], from[2 * stride], from[3 * stride] };
+  return gathered;
+}
+
+// Strided scatter: lane i is written to to[i * stride].
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
+                                                        Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  to[0] = from[0];
+  to[stride] = from[1];
+  to[2 * stride] = from[2];
+  to[3 * stride] = from[3];
+}
+
+// Strided scatter: lane i is written to to[i * stride].
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
+                                                          Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  to[0] = from[0];
+  to[stride] = from[1];
+  to[2 * stride] = from[2];
+  to[3 * stride] = from[3];
+}
+
+// Issues a compiler prefetch hint for the given float address.
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  EIGEN_MSA_DEBUG;
+
+  const void* const target = addr;
+  __builtin_prefetch(target);
+}
+
+// Issues a compiler prefetch hint for the given int32_t address.
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+  EIGEN_MSA_DEBUG;
+
+  const void* const target = addr;
+  __builtin_prefetch(target);
+}
+
+// Extracts lane 0 of a float packet.
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  const float lane0 = a[0];
+  return lane0;
+}
+
+// Extracts lane 0 of an integer packet.
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  const int32_t lane0 = a[0];
+  return lane0;
+}
+
+// Reverses lane order with a single shuffle: result[i] = a[3 - i].
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  const v4i32 reversed = __builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+  return (Packet4f)reversed;
+}
+
+// Reverses lane order with a single shuffle: result[i] = a[3 - i].
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i reversed = __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+  return reversed;
+}
+
+// Absolute value of every float lane, obtained by clearing bit 31 (the sign bit).
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  const v4u32 sign_cleared = __builtin_msa_bclri_w((v4u32)a, 31);
+  return (Packet4f)sign_cleared;
+}
+
+// Absolute value of every integer lane, computed as |0| + |a| with add_a.
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4i all_zero = __builtin_msa_ldi_w(0);
+  const Packet4i magnitude = __builtin_msa_add_a_w(all_zero, a);
+  return magnitude;
+}
+
+// Horizontal sum of the four float lanes via two shuffle-and-add steps.
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Step 1: add the packet to itself with its 64-bit halves swapped.
+  const Packet4f halves_swapped =
+      (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  const Packet4f pair_sums = padd(a, halves_swapped);
+  // Step 2: add neighbouring lanes of the partial sums.
+  const Packet4f neighbours_swapped =
+      (Packet4f)__builtin_msa_shf_w((v4i32)pair_sums, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+  const Packet4f total = padd(pair_sums, neighbours_swapped);
+  return total[0];
+}
+
+// Reduces four packets at once: lane i of the result is the sum of the lanes
+// of vecs[i]. The inputs are transposed with interleaves, then summed.
+template <>
+EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) {
+  EIGEN_MSA_DEBUG;
+
+  // 32-bit interleaves of the right and left halves of each input pair.
+  const v4i32 right01 = __builtin_msa_ilvr_w((v4i32)vecs[1], (v4i32)vecs[0]);
+  const v4i32 right23 = __builtin_msa_ilvr_w((v4i32)vecs[3], (v4i32)vecs[2]);
+  const v4i32 left01 = __builtin_msa_ilvl_w((v4i32)vecs[1], (v4i32)vecs[0]);
+  const v4i32 left23 = __builtin_msa_ilvl_w((v4i32)vecs[3], (v4i32)vecs[2]);
+
+  // 64-bit interleaves complete the transpose; accumulate in the original order.
+  Packet4f total = (Packet4f)__builtin_msa_ilvr_d((v2i64)right23, (v2i64)right01);
+  total = padd(total, (Packet4f)__builtin_msa_ilvod_d((v2i64)right23, (v2i64)right01));
+  total = padd(total, (Packet4f)__builtin_msa_ilvr_d((v2i64)left23, (v2i64)left01));
+  total = padd(total, (Packet4f)__builtin_msa_ilvod_d((v2i64)left23, (v2i64)left01));
+
+  return total;
+}
+
+// Reduces four packets at once: lane i of the result is the sum of the lanes
+// of vecs[i]. The inputs are transposed with interleaves, then summed.
+template <>
+EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) {
+  EIGEN_MSA_DEBUG;
+
+  // 32-bit interleaves of the right and left halves of each input pair.
+  const v4i32 right01 = __builtin_msa_ilvr_w((v4i32)vecs[1], (v4i32)vecs[0]);
+  const v4i32 right23 = __builtin_msa_ilvr_w((v4i32)vecs[3], (v4i32)vecs[2]);
+  const v4i32 left01 = __builtin_msa_ilvl_w((v4i32)vecs[1], (v4i32)vecs[0]);
+  const v4i32 left23 = __builtin_msa_ilvl_w((v4i32)vecs[3], (v4i32)vecs[2]);
+
+  // 64-bit interleaves complete the transpose; accumulate in the original order.
+  Packet4i total = (Packet4i)__builtin_msa_ilvr_d((v2i64)right23, (v2i64)right01);
+  total = padd(total, (Packet4i)__builtin_msa_ilvod_d((v2i64)right23, (v2i64)right01));
+  total = padd(total, (Packet4i)__builtin_msa_ilvr_d((v2i64)left23, (v2i64)left01));
+  total = padd(total, (Packet4i)__builtin_msa_ilvod_d((v2i64)left23, (v2i64)left01));
+
+  return total;
+}
+
+// Horizontal sum of the four integer lanes via two shuffle-and-add steps.
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Step 1: add the packet to itself with its 64-bit halves swapped.
+  const Packet4i halves_swapped = __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  const Packet4i pair_sums = padd(a, halves_swapped);
+  // Step 2: add neighbouring lanes of the partial sums.
+  const Packet4i neighbours_swapped =
+      __builtin_msa_shf_w(pair_sums, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+  const Packet4i total = padd(pair_sums, neighbours_swapped);
+  return total[0];
+}
+
+// Other reduction functions:
+// mul
+// Horizontal product of the four float lanes via two shuffle-and-multiply steps.
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Step 1: multiply the packet by itself with its 64-bit halves swapped.
+  const Packet4f halves_swapped =
+      (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  const Packet4f pair_products = pmul(a, halves_swapped);
+  // Step 2: multiply neighbouring lanes of the partial products.
+  const Packet4f neighbours_swapped =
+      (Packet4f)__builtin_msa_shf_w((v4i32)pair_products, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+  const Packet4f total = pmul(pair_products, neighbours_swapped);
+  return total[0];
+}
+
+// Horizontal product of the four integer lanes via two shuffle-and-multiply steps.
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Step 1: multiply the packet by itself with its 64-bit halves swapped.
+  const Packet4i halves_swapped = __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  const Packet4i pair_products = pmul(a, halves_swapped);
+  // Step 2: multiply neighbouring lanes of the partial products.
+  const Packet4i neighbours_swapped =
+      __builtin_msa_shf_w(pair_products, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+  const Packet4i total = pmul(pair_products, neighbours_swapped);
+  return total[0];
+}
+
+// min
+// Horizontal minimum of the four float lanes. Without EIGEN_FAST_MATH the
+// result is forced to a quiet NaN when a NaN is detected among the inputs.
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Swap 64-bit halves of a.
+  const Packet4f halves_swapped =
+      (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+  // Per-pair NaN detection (a[0]-a[2] and a[1]-a[3]): two 32-bit masks of
+  // all zeroes/ones in the low 64 bits.
+  v16u8 nan_free = (v16u8)__builtin_msa_fcun_w(a, halves_swapped);
+  // Merge both masks: 64 ones if no NaNs, otherwise 64 zeroes.
+  nan_free = (v16u8)__builtin_msa_ceqi_d((v2i64)nan_free, 0);
+#endif
+  // Min of the swapped halves, then min of neighbouring lanes.
+  const Packet4f pair_min = __builtin_msa_fmin_w(a, halves_swapped);
+  Packet4f result = __builtin_msa_fmin_w(
+      pair_min, (Packet4f)__builtin_msa_shf_w((Packet4i)pair_min, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+  // Use the mask to choose between the computed minimum and 4 qNaNs.
+  const v16u8 quiet_nans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+  result = (Packet4f)__builtin_msa_bsel_v(nan_free, quiet_nans, (v16u8)result);
+#endif
+  return result[0];
+}
+
+// Horizontal minimum of the four integer lanes via two shuffle-and-min steps.
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Step 1: min of the packet and itself with 64-bit halves swapped.
+  const Packet4i halves_swapped = __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  const Packet4i pair_min = pmin(a, halves_swapped);
+  // Step 2: min of neighbouring lanes.
+  const Packet4i neighbours_swapped =
+      __builtin_msa_shf_w(pair_min, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+  const Packet4i result = pmin(pair_min, neighbours_swapped);
+  return result[0];
+}
+
+// max
+// Horizontal maximum of the four float lanes. Without EIGEN_FAST_MATH the
+// result is forced to a quiet NaN when a NaN is detected among the inputs.
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Swap 64-bit halves of a.
+  const Packet4f halves_swapped =
+      (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+  // Per-pair NaN detection (a[0]-a[2] and a[1]-a[3]): two 32-bit masks of
+  // all zeroes/ones in the low 64 bits.
+  v16u8 nan_free = (v16u8)__builtin_msa_fcun_w(a, halves_swapped);
+  // Merge both masks: 64 ones if no NaNs, otherwise 64 zeroes.
+  nan_free = (v16u8)__builtin_msa_ceqi_d((v2i64)nan_free, 0);
+#endif
+  // Max of the swapped halves, then max of neighbouring lanes.
+  const Packet4f pair_max = __builtin_msa_fmax_w(a, halves_swapped);
+  Packet4f result = __builtin_msa_fmax_w(
+      pair_max, (Packet4f)__builtin_msa_shf_w((Packet4i)pair_max, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+  // Use the mask to choose between the computed maximum and 4 qNaNs.
+  const v16u8 quiet_nans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+  result = (Packet4f)__builtin_msa_bsel_v(nan_free, quiet_nans, (v16u8)result);
+#endif
+  return result[0];
+}
+
+// Horizontal maximum of the four integer lanes via two shuffle-and-max steps.
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Step 1: max of the packet and itself with 64-bit halves swapped.
+  const Packet4i halves_swapped = __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  const Packet4i pair_max = pmax(a, halves_swapped);
+  // Step 2: max of neighbouring lanes.
+  const Packet4i neighbours_swapped =
+      __builtin_msa_shf_w(pair_max, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+  const Packet4i result = pmax(pair_max, neighbours_swapped);
+  return result[0];
+}
+
+// Generates palign_impl<Offset, Type>::run(first, second). For a non-zero
+// Offset, `first` is overwritten with Command(second, first, Offset * 4), the
+// offset being Offset 4-byte elements expressed in bytes. Offset 0 is a no-op.
+#define PALIGN_MSA(Offset, Type, Command)                                     \
+  template <>                                                                 \
+  struct palign_impl<Offset, Type> {                                          \
+    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) {    \
+      if (Offset != 0) {                                                      \
+        first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 4));     \
+      }                                                                       \
+    }                                                                         \
+  };
+
+// Float packets, offsets 0..3.
+PALIGN_MSA(0, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(1, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(2, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(3, Packet4f, __builtin_msa_sldi_b)
+// Integer packets, offsets 0..3.
+PALIGN_MSA(0, Packet4i, __builtin_msa_sldi_b)
+PALIGN_MSA(1, Packet4i, __builtin_msa_sldi_b)
+PALIGN_MSA(2, Packet4i, __builtin_msa_sldi_b)
+PALIGN_MSA(3, Packet4i, __builtin_msa_sldi_b)
+
+#undef PALIGN_MSA
+
+// Streams a block of four float packets, one packet per output line.
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
+  os << "[ " << value.packet[0];
+  for (int row = 1; row < 4; ++row) {
+    os << "," << std::endl << " " << value.packet[row];
+  }
+  os << " ]";
+  return os;
+}
+
+// In-place transpose of a 4x4 float block using 32-bit and then 64-bit interleaves.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  // 32-bit interleaves of the right and left halves of each row pair.
+  const v4i32 right01 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+  const v4i32 right23 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+  const v4i32 left01 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+  const v4i32 left23 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+
+  // 64-bit interleaves assemble the transposed rows.
+  kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)right23, (v2i64)right01);
+  kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)right23, (v2i64)right01);
+  kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)left23, (v2i64)left01);
+  kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)left23, (v2i64)left01);
+}
+
+// Streams a block of four integer packets, one packet per output line.
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
+  os << "[ " << value.packet[0];
+  for (int row = 1; row < 4; ++row) {
+    os << "," << std::endl << " " << value.packet[row];
+  }
+  os << " ]";
+  return os;
+}
+
+// In-place transpose of a 4x4 integer block using 32-bit and then 64-bit interleaves.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  // 32-bit interleaves of the right and left halves of each row pair.
+  const v4i32 right01 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
+  const v4i32 right23 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
+  const v4i32 left01 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
+  const v4i32 left23 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
+
+  // 64-bit interleaves assemble the transposed rows.
+  kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)right23, (v2i64)right01);
+  kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)right23, (v2i64)right01);
+  kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)left23, (v2i64)left01);
+  kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)left23, (v2i64)left01);
+}
+
+// Lane-wise float square root.
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet4f root = __builtin_msa_fsqrt_w(a);
+  return root;
+}
+
+// Lane-wise reciprocal square root. With EIGEN_FAST_MATH the dedicated builtin
+// is used; otherwise it is computed as 1 / sqrt(a).
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  return __builtin_msa_frsqrt_w(a);
+#else
+  // Integer 1 in every lane, converted to float.
+  const Packet4i int_ones = __builtin_msa_ldi_w(1);
+  const Packet4f float_ones = __builtin_msa_ffint_s_w(int_ones);
+  return pdiv(float_ones, psqrt(a));
+#endif
+}
+
+template <>  // Rounds every lane to an integral value towards -INFINITY, leaving control register $1 as it was found.
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+ Packet4f v = a;  // Rounded in place by the asm below.
+ int32_t old_mode, new_mode;  // Saved and temporary values of MSA control register $1.
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"  // Save the current contents of control register $1.
+ "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY.
+ "ctcmsa $1, %[new_mode]\n"  // Install the temporary rounding mode.
+ "frint.w %w[v], %w[v]\n"  // Round each lane using the installed mode.
+ "ctcmsa $1, %[old_mode]\n"  // Restore the saved control register value.
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>  // Rounds every lane to an integral value towards +INFINITY, leaving control register $1 as it was found.
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+ Packet4f v = a;  // Rounded in place by the asm below.
+ int32_t old_mode, new_mode;  // Saved and temporary values of MSA control register $1.
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"  // Save the current contents of control register $1.
+ "ori %[new_mode], %[old_mode], 3\n"  // Set both low bits...
+ "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY.
+ "ctcmsa $1, %[new_mode]\n"  // Install the temporary rounding mode.
+ "frint.w %w[v], %w[v]\n"  // Round each lane using the installed mode.
+ "ctcmsa $1, %[old_mode]\n"  // Restore the saved control register value.
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>  // Rounds every lane to the nearest integral value (ties to even), leaving control register $1 as it was found.
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+ Packet4f v = a;  // Rounded in place by the asm below.
+ int32_t old_mode, new_mode;  // Saved and temporary values of MSA control register $1.
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"  // Save the current contents of control register $1.
+ "ori %[new_mode], %[old_mode], 3\n"  // Set both low bits...
+ "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even.
+ "ctcmsa $1, %[new_mode]\n"  // Install the temporary rounding mode.
+ "frint.w %w[v], %w[v]\n"  // Round each lane using the installed mode.
+ "ctcmsa $1, %[old_mode]\n"  // Restore the saved control register value.
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+// Per-lane select: thenPacket where the selector is non-zero, elsePacket where it is zero.
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
+  const Packet4ui selector = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
+                               ifPacket.select[3] };
+  // All-ones in the lanes whose selector equals zero.
+  const Packet4i pick_else = __builtin_msa_ceqi_w((Packet4i)selector, 0);
+  // bsel_v yields its last operand where the mask is set, the middle one elsewhere.
+  const v16u8 blended =
+      __builtin_msa_bsel_v((v16u8)pick_else, (v16u8)thenPacket, (v16u8)elsePacket);
+  return (Packet4f)blended;
+}
+
+// Per-lane select: thenPacket where the selector is non-zero, elsePacket where it is zero.
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+                                    const Packet4i& elsePacket) {
+  const Packet4ui selector = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
+                               ifPacket.select[3] };
+  // All-ones in the lanes whose selector equals zero.
+  const Packet4i pick_else = __builtin_msa_ceqi_w((Packet4i)selector, 0);
+  // bsel_v yields its last operand where the mask is set, the middle one elsewhere.
+  const v16u8 blended =
+      __builtin_msa_bsel_v((v16u8)pick_else, (v16u8)thenPacket, (v16u8)elsePacket);
+  return (Packet4i)blended;
+}
+
+//---------- double ----------
+
+typedef v2f64 Packet2d;  // Two doubles.
+typedef v2i64 Packet2l;  // Two signed 64-bit integers.
+typedef v2u64 Packet2ul;  // Two unsigned 64-bit integers.
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }  // Declares p2d_NAME with X in both lanes.
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }  // Declares p2l_NAME with X in both lanes.
+#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }  // Declares p2ul_NAME with X in both lanes.
+
+// Streams the two lanes of a Packet2d as "[ v0, v1 ]".
+inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
+  os << "[ ";
+  os << value[0];
+  os << ", ";
+  os << value[1];
+  os << " ]";
+  return os;
+}
+
+// Streams the two lanes of a Packet2l as "[ v0, v1 ]".
+inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
+  os << "[ ";
+  os << value[0];
+  os << ", ";
+  os << value[1];
+  os << " ]";
+  return os;
+}
+
+// Streams the two lanes of a Packet2ul as "[ v0, v1 ]".
+inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
+  os << "[ ";
+  os << value[0];
+  os << ", ";
+  os << value[1];
+  os << " ]";
+  return os;
+}
+
+// Vectorization traits for double on MSA: two doubles per packet.
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    // Packet shape.
+    size = 2,
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    HasHalfPacket = 0,
+
+    // Supported operations. FIXME check the Has*
+    HasBlend = 1,
+    HasDiv = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasExp = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+  };
+};
+
+// Reverse mapping for Packet2d: scalar type, lane count and alignment.
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  typedef Packet2d half;
+  enum {
+    size = 2,
+    alignment = Aligned16
+  };
+};
+
+// Broadcasts one double into both lanes.
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d broadcast = { from, from };
+  return broadcast;
+}
+
+// Lane-wise double addition.
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d sum = __builtin_msa_fadd_d(a, b);
+  return sum;
+}
+
+// Builds the linear sequence { a, a + 1 }.
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Per-lane offsets added on top of the broadcast start value.
+  static const Packet2d laneOffsets = { 0.0, 1.0 };
+  const Packet2d base = pset1<Packet2d>(a);
+  return padd(base, laneOffsets);
+}
+
+// Lane-wise difference a - b.
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d difference = __builtin_msa_fsub_d(a, b);
+  return difference;
+}
+
+// Negates both lanes by toggling bit 63 (the IEEE-754 sign bit) of each 64-bit element.
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  const v2u64 signFlipped = __builtin_msa_bnegi_d((v2u64)a, 63);
+  return (Packet2d)signFlipped;
+}
+
+// Complex conjugate of a real packet: returns the input unchanged.
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d& unchanged = a;
+  return unchanged;
+}
+
+// Lane-wise product a * b.
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d product = __builtin_msa_fmul_d(a, b);
+  return product;
+}
+
+// Lane-wise quotient a / b.
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d quotient = __builtin_msa_fdiv_d(a, b);
+  return quotient;
+}
+
+// Multiply-add a * b + c, lane by lane.
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  EIGEN_MSA_DEBUG;
+
+  // The builtin takes the addend first, followed by the two factors.
+  const Packet2d fused = __builtin_msa_fmadd_d(c, a, b);
+  return fused;
+}
+
+// MSA has no bitwise logical instructions on floating-point vectors, so the logical
+// operations below reinterpret the packet as raw bytes (v16u8), use the integer
+// intrinsic, and cast the result back to Packet2d.
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  const v16u8 bits = __builtin_msa_and_v((v16u8)a, (v16u8)b);
+  return (Packet2d)bits;
+}
+
+// Bitwise OR of the raw 128-bit contents of a and b.
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  const v16u8 bits = __builtin_msa_or_v((v16u8)a, (v16u8)b);
+  return (Packet2d)bits;
+}
+
+// Bitwise XOR of the raw 128-bit contents of a and b.
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  const v16u8 bits = __builtin_msa_xor_v((v16u8)a, (v16u8)b);
+  return (Packet2d)bits;
+}
+
+// Bitwise a AND (NOT b).
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  // XOR-ing every byte with 0xFF yields the bitwise complement of b.
+  const v16u8 complementOfB = __builtin_msa_xori_b((v16u8)b, 255);
+  return pand(a, (Packet2d)complementOfB);
+}
+
+// Aligned load: reads two consecutive doubles starting at `from` into a Packet2d.
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  // pload is the aligned entry point, so it must trigger the aligned-load debug hook.
+  // Using the unaligned hook here (as before) makes instrumentation count these loads as
+  // unaligned; that hook belongs to ploadu. This mirrors pstore/pstoreu below.
+  // The const_cast is only needed because the builtin takes a non-const pointer.
+  EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+// Lane-wise minimum. NaN handling depends on EIGEN_FAST_MATH (see the branches below).
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmin_d(a, b);
+#else
+  // This prefers NaNs to numbers.
+  v2i64 aIsNaN = __builtin_msa_fcun_d(a, a);   // a unordered with itself <=> a is NaN
+  v2i64 aIsLess = __builtin_msa_fclt_d(a, b);
+  v2i64 takeA = por(aIsLess, aIsNaN);
+  // Where the mask is set the lane comes from a, otherwise from b.
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)takeA, (v16u8)b, (v16u8)a);
+#endif
+}
+
+// Lane-wise maximum. NaN handling depends on EIGEN_FAST_MATH (see the branches below).
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmax_d(a, b);
+#else
+  // This prefers NaNs to numbers.
+  v2i64 aIsNaN = __builtin_msa_fcun_d(a, a);   // a unordered with itself <=> a is NaN
+  v2i64 aIsGreater = __builtin_msa_fclt_d(b, a);
+  v2i64 takeA = por(aIsGreater, aIsNaN);
+  // Where the mask is set the lane comes from a, otherwise from b.
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)takeA, (v16u8)b, (v16u8)a);
+#endif
+}
+
+// Unaligned load: reads two consecutive doubles starting at `from` into a Packet2d.
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  // The builtin takes a non-const pointer; the memory is only read.
+  double* source = const_cast<double*>(from);
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(source, 0);
+}
+
+// Loads a single double and duplicates it: { from[0], from[0] }.
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d duplicated = { from[0], from[0] };
+  return duplicated;
+}
+
+// Aligned store: writes both lanes of `from` to two consecutive doubles at `to`.
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_MSA_DEBUG;
+
+  // The store builtin works on the integer view of the vector; the bits are unchanged.
+  const v2i64 rawBits = (v2i64)from;
+  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d(rawBits, to, 0);
+}
+
+// Unaligned store: writes both lanes of `from` to two consecutive doubles at `to`.
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_MSA_DEBUG;
+
+  // The store builtin works on the integer view of the vector; the bits are unchanged.
+  const v2i64 rawBits = (v2i64)from;
+  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d(rawBits, to, 0);
+}
+
+// Strided gather: lane 0 comes from from[0], lane 1 from from[stride].
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d gathered;
+  gathered[0] = from[0];
+  gathered[1] = from[stride];
+  return gathered;
+}
+
+// Strided scatter: lane 0 goes to to[0], lane 1 to to[stride] (written in that order).
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
+                                                         Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  to[0] = from[0];
+  to[stride] = from[1];
+}
+
+// Issues a prefetch hint for the memory at `addr` via the compiler builtin.
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  EIGEN_MSA_DEBUG;
+
+  const double* const target = addr;
+  __builtin_prefetch(target);
+}
+
+// Extracts lane 0 of the packet.
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  const double lane0 = a[0];
+  return lane0;
+}
+
+// Swaps the two double lanes: { a[1], a[0] }.
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Done as a 32-bit word shuffle: word pairs (2, 3) and (0, 1) trade places, which
+  // exchanges the two 64-bit halves.
+  const v4i32 swappedWords = __builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  return (Packet2d)swappedWords;
+}
+
+// Absolute value: clears bit 63 (the IEEE-754 sign bit) of each 64-bit lane.
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  const v2u64 signCleared = __builtin_msa_bclri_d((v2u64)a, 63);
+  return (Packet2d)signCleared;
+}
+
+// Horizontal sum a[0] + a[1].
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Adding the lane-swapped copy leaves the full sum in both lanes; read lane 0.
+  const Packet2d flipped = preverse(a);
+  Packet2d total = padd(a, flipped);
+  return total[0];
+}
+
+// Reduces two packets at once: returns { vecs[0][0] + vecs[0][1], vecs[1][0] + vecs[1][1] }.
+template <>
+EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) {
+  EIGEN_MSA_DEBUG;
+
+  const v2i64 first = (v2i64)vecs[0];
+  const v2i64 second = (v2i64)vecs[1];
+
+  // Interleave so that matching lanes of the two inputs sit side by side.
+  Packet2d evenLanes = (Packet2d)__builtin_msa_ilvev_d(second, first);  // lane 0 of each input
+  Packet2d oddLanes = (Packet2d)__builtin_msa_ilvod_d(second, first);   // lane 1 of each input
+
+  return padd(evenLanes, oddLanes);
+}
+
+// Other reduction functions:
+// mul -- horizontal product a[0] * a[1].
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Multiplying by the lane-swapped copy leaves the product in both lanes; read lane 0.
+  const Packet2d flipped = preverse(a);
+  Packet2d product = pmul(a, flipped);
+  return product[0];
+}
+
+// min -- horizontal minimum of the two lanes.
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // Compare the packet against its lane-swapped copy and read lane 0.
+  Packet2d flipped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  Packet2d smallest = __builtin_msa_fmin_d(a, flipped);
+  return smallest[0];
+#else
+  const double lane0 = a[0];
+  const double lane1 = a[1];
+  // A NaN in lane 0 is returned as is; a NaN in lane 1 fails the comparison and is
+  // returned by the fall-through.
+  if ((std::isnan)(lane0) || lane0 < lane1) {
+    return lane0;
+  }
+  return lane1;
+#endif
+}
+
+// max -- horizontal maximum of the two lanes.
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // Compare the packet against its lane-swapped copy and read lane 0.
+  Packet2d flipped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  Packet2d largest = __builtin_msa_fmax_d(a, flipped);
+  return largest[0];
+#else
+  const double lane0 = a[0];
+  const double lane1 = a[1];
+  // A NaN in lane 0 is returned as is; a NaN in lane 1 fails the comparison and is
+  // returned by the fall-through.
+  if ((std::isnan)(lane0) || lane0 > lane1) {
+    return lane0;
+  }
+  return lane1;
+#endif
+}
+
+// Lane-wise square root.
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  const Packet2d root = __builtin_msa_fsqrt_d(a);
+  return root;
+}
+
+// Lane-wise reciprocal square root, 1 / sqrt(a).
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // Single-instruction reciprocal square root.
+  return __builtin_msa_frsqrt_d(a);
+#else
+  // Build { 1.0, 1.0 } by converting an integer splat of 1, then divide by the square root.
+  const v2i64 integerOnes = __builtin_msa_ldi_d(1);
+  Packet2d unit = __builtin_msa_ffint_s_d(integerOnes);
+  return pdiv(unit, psqrt(a));
+#endif
+}
+
+// Generates the palign_impl<Offset, Type> specialization. For a non-zero Offset, run()
+// overwrites `first` with the result of Command applied to the byte views of `second`
+// and `first` with a byte count of Offset * 8 (8 = sizeof(double), i.e. Offset elements).
+// For Offset == 0, `first` is left untouched.
+// The macro is local to this section and is #undef'd right after its two instantiations.
+#define PALIGN_MSA(Offset, Type, Command) \
+ template <> \
+ struct palign_impl<Offset, Type> { \
+ EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \
+ if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 8)); \
+ } \
+ };
+
+// Packet2d has two lanes, so only offsets 0 and 1 are instantiated.
+PALIGN_MSA(0, Packet2d, __builtin_msa_sldi_b)
+PALIGN_MSA(1, Packet2d, __builtin_msa_sldi_b)
+
+#undef PALIGN_MSA
+
+// Streams a 2x2 block of doubles, one packet per output line, wrapped in "[ ... ]".
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
+  os << "[ " << value.packet[0] << ",";
+  os << std::endl;  // also flushes the stream
+  os << " " << value.packet[1] << " ]";
+  return os;
+}
+
+// In-place transpose of a 2x2 block held as two Packet2d rows.
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  const v2i64 row0 = (v2i64)kernel.packet[0];
+  const v2i64 row1 = (v2i64)kernel.packet[1];
+
+  // New row 0 gathers lane 0 of both rows, new row 1 gathers lane 1 of both rows.
+  Packet2d column0 = (Packet2d)__builtin_msa_ilvev_d(row1, row0);
+  Packet2d column1 = (Packet2d)__builtin_msa_ilvod_d(row1, row0);
+
+  kernel.packet[0] = column0;
+  kernel.packet[1] = column1;
+}
+
+// Lane-wise floor: rounds each double to an integral value towards -INFINITY.
+// Implemented by temporarily changing the rounding mode in MSA control register $1
+// (MSACSR), executing frint.d, and then restoring the saved register value.
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+ Packet2d v = a;
+ // old_mode keeps the original control word for the restore; new_mode is scratch.
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.d %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+// Lane-wise ceil: rounds each double to an integral value towards +INFINITY.
+// Implemented by temporarily changing the rounding mode in MSA control register $1
+// (MSACSR), executing frint.d, and then restoring the saved register value.
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+ Packet2d v = a;
+ // old_mode keeps the original control word for the restore; new_mode is scratch.
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ // Force the two low bits to 11, then clear bit 0 to obtain 10 (= 2).
+ "ori %[new_mode], %[old_mode], 3\n"
+ "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.d %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+// Lane-wise round: rounds each double to the nearest integral value, ties to even.
+// Implemented by temporarily changing the rounding mode in MSA control register $1
+// (MSACSR), executing frint.d, and then restoring the saved register value.
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+ Packet2d v = a;
+ // old_mode keeps the original control word for the restore; new_mode is scratch.
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ // Force the two low bits to 11, then clear both to obtain 00 (= 0).
+ "ori %[new_mode], %[old_mode], 3\n"
+ "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.d %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+// Per-lane select for Packet2d: lane i of the result is thenPacket[i] when
+// ifPacket.select[i] is non-zero and elsePacket[i] otherwise.
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+                                    const Packet2d& elsePacket) {
+  // Spread the two selector entries over 64-bit lanes.
+  Packet2ul lanes = { ifPacket.select[0], ifPacket.select[1] };
+  // All-ones in every lane whose selector is zero, i.e. where the "else" value wins.
+  Packet2l pickElse = __builtin_msa_ceqi_d((Packet2l)lanes, 0);
+  // bsel.v takes bits from its third operand where the mask is set, else from the second.
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)pickElse, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_MSA_H
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 27f65f672..653e979b1 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -94,7 +94,7 @@
// certain common platform (compiler+architecture combinations) to avoid these problems.
// Only static alignment is really problematic (relies on nonstandard compiler extensions),
// try to keep heap alignment even when we have to disable static alignment.
- #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
+ #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS)
#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
#elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
// Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 5d37e5d04..f1afb4db9 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -468,6 +468,7 @@ namespace Architecture
AltiVec = 0x2,
VSX = 0x3,
NEON = 0x4,
+ MSA = 0x5,
#if defined EIGEN_VECTORIZE_SSE
Target = SSE
#elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -476,6 +477,8 @@ namespace Architecture
Target = VSX
#elif defined EIGEN_VECTORIZE_NEON
Target = NEON
+#elif defined EIGEN_VECTORIZE_MSA
+ Target = MSA
#else
Target = Generic
#endif
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 1d4486c05..7712f109a 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -452,6 +452,12 @@ macro(ei_testing_print_summary)
message(STATUS "VSX: Using architecture defaults")
endif()
+ if(EIGEN_TEST_MSA)
+ message(STATUS "MIPS MSA: ON")
+ else()
+ message(STATUS "MIPS MSA: Using architecture defaults")
+ endif()
+
if(EIGEN_TEST_NEON)
message(STATUS "ARM NEON: ON")
else()
@@ -655,6 +661,8 @@ macro(ei_get_cxxflags VAR)
set(${VAR} SSE3)
elseif(EIGEN_TEST_SSE2 OR IS_64BIT_ENV)
set(${VAR} SSE2)
+ elseif(EIGEN_TEST_MSA)
+ set(${VAR} MSA)
endif()
if(EIGEN_TEST_OPENMP)