From 3875fb05aaa019c700d68b074e61471a96c85b36 Mon Sep 17 00:00:00 2001 From: Alexey Frunze Date: Fri, 6 Jul 2018 16:04:30 -0700 Subject: Add support for MIPS SIMD (MSA) --- CMakeLists.txt | 6 + Eigen/src/Core/arch/MSA/Complex.h | 759 +++++++++++++++ Eigen/src/Core/arch/MSA/MathFunctions.h | 387 ++++++++ Eigen/src/Core/arch/MSA/PacketMath.h | 1317 ++++++++++++++++++++++++++ Eigen/src/Core/util/ConfigureVectorization.h | 2 +- Eigen/src/Core/util/Constants.h | 3 + cmake/EigenTesting.cmake | 8 + 7 files changed, 2481 insertions(+), 1 deletion(-) create mode 100644 Eigen/src/Core/arch/MSA/Complex.h create mode 100644 Eigen/src/Core/arch/MSA/MathFunctions.h create mode 100644 Eigen/src/Core/arch/MSA/PacketMath.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 91545bdb0..0547ee681 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -256,6 +256,12 @@ if(NOT MSVC) message(STATUS "Enabling VSX in tests/examples") endif() + option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF) + if(EIGEN_TEST_MSA) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa") + message(STATUS "Enabling MSA in tests/examples") + endif() + option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF) if(EIGEN_TEST_NEON) if(EIGEN_TEST_FMA) diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h new file mode 100644 index 000000000..9a45cf51e --- /dev/null +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -0,0 +1,759 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_COMPLEX_MSA_H +#define EIGEN_COMPLEX_MSA_H + +#include + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet2cf { + EIGEN_STRONG_INLINE Packet2cf() { + } + EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex& a, + const std::complex& b) { + Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) }; + v = t; + } + EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) { + } + EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) { + } + EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) { + v = b.v; + return *this; + } + EIGEN_STRONG_INLINE Packet2cf conjugate(void) const { + return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63)); + } + EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) { + Packet4f v1, v2; + + // Get the real values of a | a1_re | a1_re | a2_re | a2_re | + v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v); + // Get the imag values of a | a1_im | a1_im | a2_im | a2_im | + v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v); + // Multiply the real a with b + v1 = pmul(v1, b.v); + // Multiply the imag a with b + v2 = pmul(v2, b.v); + // Conjugate v2 + v2 = Packet2cf(v2).conjugate().v; + // Swap real/imag elements in v2. 
+ v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2)); + // Add and return the result + v = padd(v1, v2); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { + return Packet2cf(*this) *= b; + } + EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { + return Packet2cf(*this) += b; + } + EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { + return Packet2cf(*this) -= b; + } + EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) { + *this *= b.conjugate(); + Packet4f s = pmul(b.v, b.v); + s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + v = pdiv(v, s); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const { + return Packet2cf(*this) /= b; + } + EIGEN_STRONG_INLINE Packet2cf operator-(void) const { + return Packet2cf(pnegate(v)); + } + + Packet4f v; +}; + +inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) { + os << "[ (" << value.v[0] << ", " << value.v[1] + << "i)," + " (" + << value.v[2] << ", " << value.v[3] << "i) ]"; + return os; +} + +template <> +struct packet_traits > : default_packet_traits { + typedef Packet2cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + enum { size = 2, alignment = Aligned16 }; + typedef Packet2cf half; +}; + +template <> +EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { + EIGEN_MSA_DEBUG; + + float f0 = 
from.real(), f1 = from.imag(); + Packet4f v0 = { f0, f0, f0, f0 }; + Packet4f v1 = { f1, f1, f1, f1 }; + return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a + b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a - b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return -a; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return a.conjugate(); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a * b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pand(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(por(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pxor(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pandnot(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { + EIGEN_MSA_DEBUG; + + return pset1(*from); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, + const 
Packet2cf& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, + const Packet2cf& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>( + const std::complex* from, Index stride) { + EIGEN_MSA_DEBUG; + + return Packet2cf(from[0 * stride], from[1 * stride]); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, + const Packet2cf& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = std::complex(from.v[0], from.v[1]); + to += stride; + *to = std::complex(from.v[2], from.v[3]); +} + +template <> +EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { + EIGEN_MSA_DEBUG; + + prefetch(reinterpret_cast(addr)); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return std::complex(a.v[0], a.v[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + Packet4f value = (Packet4f)preverse((Packet2d)a.v); + value += a.v; + return std::complex(value[0], value[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) { + EIGEN_MSA_DEBUG; + + Packet4f sum1, sum2, sum; + + // Add the first two 64-bit float32x2_t of vecs[0] + sum1 = (Packet4f)__builtin_msa_ilvr_d((v2i64)vecs[1].v, (v2i64)vecs[0].v); + sum2 = (Packet4f)__builtin_msa_ilvl_d((v2i64)vecs[1].v, (v2i64)vecs[0].v); + sum = padd(sum1, sum2); + + return 
Packet2cf(sum); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return std::complex((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]), + (a.v[0] * a.v[3]) + (a.v[1] * a.v[2])); +} + +template +struct palign_impl { + EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) { + if (Offset == 1) { + first.v = (Packet4f)__builtin_msa_sldi_b((v16i8)second.v, (v16i8)first.v, Offset * 8); + } + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, + const Packet2cf& c) const { + return padd(pmul(x, y), c); + } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { + return internal::pmul(a, pconj(b)); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, + const Packet2cf& c) const { + return padd(pmul(x, y), c); + } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { + return internal::pmul(pconj(a), b); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, + const Packet2cf& c) const { + return padd(pmul(x, y), c); + } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f) + +template <> +EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a / b; +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet4f tmp = + (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v); + kernel.packet[0].v = + 
(Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v); + kernel.packet[1].v = tmp; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, + const Packet2cf& elsePacket) { + return (Packet2cf)(Packet4f)pblend(ifPacket, (Packet2d)thenPacket.v, + (Packet2d)elsePacket.v); +} + +//---------- double ---------- + +struct Packet1cd { + EIGEN_STRONG_INLINE Packet1cd() { + } + EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex& a) { + v[0] = std::real(a); + v[1] = std::imag(a); + } + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) { + } + EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) { + } + EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) { + v = b.v; + return *this; + } + EIGEN_STRONG_INLINE Packet1cd conjugate(void) const { + static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 }; + return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR); + } + EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) { + Packet2d v1, v2; + + // Get the real values of a | a1_re | a1_re + v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v); + // Get the imag values of a | a1_im | a1_im + v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v); + // Multiply the real a with b + v1 = pmul(v1, b.v); + // Multiply the imag a with b + v2 = pmul(v2, b.v); + // Conjugate v2 + v2 = Packet1cd(v2).conjugate().v; + // Swap real/imag elements in v2. 
+ v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + // Add and return the result + v = padd(v1, v2); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { + return Packet1cd(*this) *= b; + } + EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { + return Packet1cd(*this) += b; + } + EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { + return Packet1cd(*this) -= b; + } + EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) { + *this *= b.conjugate(); + Packet2d s = pmul(b.v, b.v); + s = padd(s, preverse(s)); + v = pdiv(v, s); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const { + return Packet1cd(*this) /= b; + } + EIGEN_STRONG_INLINE Packet1cd operator-(void) const { + return Packet1cd(pnegate(v)); + } + + Packet2d v; +}; + +inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) { + os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]"; + return os; +} + +template <> +struct packet_traits > : default_packet_traits { + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + enum { size = 1, alignment = Aligned16 }; + typedef Packet1cd half; +}; + +template <> +EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* 
from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { + EIGEN_MSA_DEBUG; + + return Packet1cd(from); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a + b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a - b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return -a; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return a.conjugate(); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a * b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pand(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(por(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pxor(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pandnot(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { + EIGEN_MSA_DEBUG; + + return pset1(*from); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, + const Packet1cd& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, + const Packet1cd& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); +} + +template <> 
+EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { + EIGEN_MSA_DEBUG; + + prefetch(reinterpret_cast(addr)); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>( + const std::complex* from, Index stride __attribute__((unused))) { + EIGEN_MSA_DEBUG; + + Packet1cd res; + res.v[0] = std::real(from[0]); + res.v[1] = std::imag(from[0]); + return res; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, + const Packet1cd& from, + Index stride + __attribute__((unused))) { + EIGEN_MSA_DEBUG; + + pstore(to, from); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return std::complex(a.v[0], a.v[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return pfirst(a); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { + EIGEN_MSA_DEBUG; + + return vecs[0]; +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return pfirst(a); +} + +template +struct palign_impl { + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes + // boundary... 
+ } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, + const Packet1cd& c) const { + return padd(pmul(x, y), c); + } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { + return internal::pmul(a, pconj(b)); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, + const Packet1cd& c) const { + return padd(pmul(x, y), c); + } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { + return internal::pmul(pconj(a), b); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, + const Packet1cd& c) const { + return padd(pmul(x, y), c); + } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d) + +template <> +EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a / b; +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip /**/ (const Packet1cd& x) { + EIGEN_MSA_DEBUG; + + return Packet1cd(preverse(Packet2d(x.v))); +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet2d v1, v2; + + v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v); + // Get the imag values of a + v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v); + + kernel.packet[0].v = v1; + kernel.packet[1].v = v2; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_MSA_H diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h 
b/Eigen/src/Core/arch/MSA/MathFunctions.h new file mode 100644 index 000000000..98e23e36f --- /dev/null +++ b/Eigen/src/Core/arch/MSA/MathFunctions.h @@ -0,0 +1,387 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) +// Copyright (C) 2016 Gael Guennebaud +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +/* The tanh function of this file is an adaptation of + * template T generic_fast_tanh_float(const T&) + * from MathFunctionsImpl.h. 
+ */ + +#ifndef EIGEN_MATH_FUNCTIONS_MSA_H +#define EIGEN_MATH_FUNCTIONS_MSA_H + +namespace Eigen { + +namespace internal { + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +plog(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + // Convert negative argument into NAN (quiet negative, to be specific). + Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0); + Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero); + Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero); + Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask); // Add 0.0 or NAN. + Packet4f x = non_neg_x_or_nan; + + // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0. + // N.B. the exponent is one less of what frexpf() would return. + Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x)); + // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf(). 
+ x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0)); + + /* + if (x < SQRTHF) { + x = x + x - 1.0; + } else { + e += 1; + x = x - 1.0; + } + */ + Packet4f xx = padd(x, x); + Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x); + e_int = psub(e_int, ge_mask); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x); + x = psub(x, p4f_1); + Packet4f e = __builtin_msa_ffint_s_w(e_int); + + Packet4f x2 = pmul(x, x); + Packet4f x3 = pmul(x2, x); + + Packet4f y, y1, y2; + y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); + y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); + y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); + y = pmadd(y, x, p4f_cephes_log_p2); + y1 = pmadd(y1, x, p4f_cephes_log_p5); + y2 = pmadd(y2, x, p4f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y = pmadd(e, p4f_cephes_log_q1, y); + x = __builtin_msa_fmsub_w(x, x2, p4f_half); + x = padd(x, y); + x = pmadd(e, p4f_cephes_log_q2, x); + + // x is now the logarithm result candidate. We still need to handle the + // extreme arguments of zero and positive infinity, though. + // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms + // contain infinities of both signs (see the coefficients and code above). + // INFINITY - INFINITY is NAN. + + // If the argument is +INFINITY, make it the new result candidate. + // To achieve that we choose the smaller of the result candidate and the + // argument. + // This is correct for all finite pairs of values (the logarithm is smaller + // than the argument). + // This is also correct in the special case when the argument is +INFINITY + // and the result candidate is NAN. This is because the fmin.df instruction + // prefers non-NANs to NANs. + x = __builtin_msa_fmin_w(x, non_neg_x_or_nan); + + // If the argument is zero (including -0.0), the result becomes -INFINITY. 
+ Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs); + + return x; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +pexp(const Packet4f& _x) { + // Limiting single-precision pexp's argument to [-128, +128] lets pexp + // reach 0 and INFINITY naturally. + static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f); + static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + Packet4f x = _x; + + // Clamp x. + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x, + (v16u8)p4f_exp_lo); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x, + (v16u8)p4f_exp_hi); + + // Round to nearest integer by adding 0.5 (with x's sign) and truncating. 
+ Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0); + Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add); + Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2); + Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int); + + x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1); + x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2); + + Packet4f z = pmul(x, x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // y *= 2**exponent. + y = __builtin_msa_fexp2_w(y, x2_int); + + return y; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f); + // The monomial coefficients of the numerator polynomial (odd). + static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f); + // The monomial coefficients of the denominator polynomial (even). 
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f); + + Packet4f x = pabs(_x); + Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny); + + // Clamp the inputs to the range [-9, 9] since anything outside + // this range is -/+1.0f in single-precision. + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x, + (v16u8)p4f_tanh_hi); + + // Since the polynomials are odd/even, we need x**2. + Packet4f x2 = pmul(x, x); + + // Evaluate the numerator polynomial p. + Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11); + p = pmadd(x2, p, p4f_alpha_9); + p = pmadd(x2, p, p4f_alpha_7); + p = pmadd(x2, p, p4f_alpha_5); + p = pmadd(x2, p, p4f_alpha_3); + p = pmadd(x2, p, p4f_alpha_1); + p = pmul(x, p); + + // Evaluate the denominator polynomial q. + Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4); + q = pmadd(x2, q, p4f_beta_2); + q = pmadd(x2, q, p4f_beta_0); + + // Divide the numerator by the denominator. + p = pdiv(p, q); + + // Reinstate the sign. + p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0); + + // When the argument is very small in magnitude it's more accurate to just return it. + p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x); + + return p; +} + +template +Packet4f psincos_inner_msa_float(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi). 
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f); + static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi. + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + Packet4f x = pabs(_x); + + // Translate infinite arguments into NANs. + Packet4f zero_or_nan_if_inf = psub(_x, _x); + x = padd(x, zero_or_nan_if_inf); + // Prevent sin/cos from generating values larger than 1.0 in magnitude + // for very large arguments by setting x to 0.0. + Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg); + x = pand(x, (Packet4f)small_or_nan_mask); + + // Scale x by 4/Pi to find x's octant. + Packet4f y = pmul(x, p4f_cephes_FOPI); + // Get the octant. We'll reduce x by this number of octants or by one more than it. + Packet4i y_int = __builtin_msa_ftrunc_s_w(y); + // x's from even-numbered octants will translate to octant 0: [0, +Pi/4]. + // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0]. + // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1). + Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1); + Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); + y = __builtin_msa_ffint_s_w(y_int2); + + // Compute the sign to apply to the polynomial. + Packet4i sign_mask = sine ? 
pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x) + : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29); + + // Get the polynomial selection mask. + // We'll calculate both (sin and cos) polynomials and then select from the two. + Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0); + + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. + // The magic pass: "Extended precision modular arithmetic" + // x = ((x - y * DP1) - y * DP2) - y * DP3 + Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1); + Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2); + Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3); + x = padd(x, tmp1); + x = padd(x, tmp2); + x = padd(x, tmp3); + + // Evaluate the cos(x) polynomial. + y = p4f_coscof_p0; + Packet4f z = pmul(x, x); + y = pmadd(y, z, p4f_coscof_p1); + y = pmadd(y, z, p4f_coscof_p2); + y = pmul(y, z); + y = pmul(y, z); + y = __builtin_msa_fmsub_w(y, z, p4f_half); + y = padd(y, p4f_1); + + // Evaluate the sin(x) polynomial. + Packet4f y2 = p4f_sincof_p0; + y2 = pmadd(y2, z, p4f_sincof_p1); + y2 = pmadd(y2, z, p4f_sincof_p2); + y2 = pmul(y2, z); + y2 = pmadd(y2, x, x); + + // Select the correct result from the two polynomials. + y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2) + : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y); + + // Update the sign. 
+ sign_mask = pxor(sign_mask, (Packet4i)y); + y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); + return y; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +psin(const Packet4f& x) { + return psincos_inner_msa_float(x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +pcos(const Packet4f& x) { + return psincos_inner_msa_float(x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d +pexp(const Packet2d& _x) { + // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp + // reach 0 and INFINITY naturally. + static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0); + static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0); + static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0); + + Packet2d x = _x; + + // Clamp x. 
+ x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x, + (v16u8)p2d_exp_lo); + x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x, + (v16u8)p2d_exp_hi); + + // Round to nearest integer by adding 0.5 (with x's sign) and truncating. + Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0); + Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add); + Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2); + Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long); + + x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1); + x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2); + + x2 = pmul(x, x); + + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul(px, x); + + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + + x = pdiv(px, psub(qx, px)); + x = pmadd(p2d_2, x, p2d_1); + + // x *= 2**exponent. + x = __builtin_msa_fexp2_d(x, x2_long); + + return x; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_MSA_H diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h new file mode 100644 index 000000000..94c15d132 --- /dev/null +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -0,0 +1,1317 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_PACKET_MATH_MSA_H +#define EIGEN_PACKET_MATH_MSA_H + +#include +#include + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#endif + +#if 0 +#define EIGEN_MSA_DEBUG \ + static bool firstTime = true; \ + do { \ + if (firstTime) { \ + std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \ + firstTime = false; \ + } \ + } while (0) +#else +#define EIGEN_MSA_DEBUG +#endif + +#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a)) + +typedef v4f32 Packet4f; +typedef v4i32 Packet4i; +typedef v4u32 Packet4ui; + +#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X } +#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X } +#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X } + +inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4f type; + typedef Packet4f half; // Packet2f intrinsics not implemented yet + enum { + Vectorizable = 
1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, // Packet2f intrinsics not implemented yet + // FIXME check the Has* + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4i type; + typedef Packet4i half; // Packet2i intrinsics not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, // Packet2i intrinsics not implemented yet + // FIXME check the Has* + HasDiv = 1, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + enum { size = 4, alignment = Aligned16 }; + typedef Packet4f half; +}; + +template <> +struct unpacket_traits { + typedef int32_t type; + enum { size = 4, alignment = Aligned16 }; + typedef Packet4i half; +}; + +template <> +EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { + EIGEN_MSA_DEBUG; + + Packet4f v = { from, from, from, from }; + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fill_w(from); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pload1(const float* from) { + EIGEN_MSA_DEBUG; + + float f = *from; + Packet4f v = { f, f, f, f }; + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pload1(const int32_t* from) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fill_w(*from); +} + +template <> +EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fadd_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_addv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f plset(const float& a) { + EIGEN_MSA_DEBUG; + + static const Packet4f countdown = { 0.0f, 
1.0f, 2.0f, 3.0f }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { + EIGEN_MSA_DEBUG; + + static const Packet4i countdown = { 0, 1, 2, 3 }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsub_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_subv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmul_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_mulv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fdiv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_div_s_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmadd_w(c, a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { + EIGEN_MSA_DEBUG; + 
+ // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug. + Packet4i value = c; + __asm__("maddv.w %w[value], %w[a], %w[b]\n" + // Outputs + : [value] "+f"(value) + // Inputs + : [a] "f"(a), [b] "f"(b)); + return value; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmin_w(a, b); +#else + // This prefers NaNs to numbers. 
+ Packet4i aNaN = __builtin_msa_fcun_w(a, a); + Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN); + return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_min_s_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmax_w(a, b); +#else + // This prefers NaNs to numbers. + Packet4i aNaN = __builtin_msa_fcun_w(a, a); + Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN); + return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_max_s_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pload(const float* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { + EIGEN_MSA_DEBUG; + + float f0 = from[0], f1 = from[1]; + Packet4f v0 = { f0, f0, f0, f0 }; + Packet4f v1 = { f1, f1, f1, f1 }; + return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); +} + +template <> +EIGEN_STRONG_INLINE Packet4i ploaddup(const 
int32_t* from) { + EIGEN_MSA_DEBUG; + + int32_t i0 = from[0], i1 = from[1]; + Packet4i v0 = { i0, i0, i0, i0 }; + Packet4i v1 = { i1, i1, i1, i1 }; + return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { + EIGEN_MSA_DEBUG; + + float f = *from; + Packet4f v = { f, f, f, f }; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + return v; +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) { + EIGEN_MSA_DEBUG; + + int32_t i = *from; + Packet4i v = { i, i, i, i }; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + return v; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; + to += stride; + *to = from[2]; + to += stride; + *to = from[3]; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; + to += stride; + *to = from[2]; + to += stride; + *to = from[3]; +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const float* addr) { + 
EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { + EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); +} + +template <> +EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i zero = __builtin_msa_ldi_w(0); + return __builtin_msa_add_a_w(zero, a); +} + +template <> +EIGEN_STRONG_INLINE float predux(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return s[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + Packet4f sum; + + tmp1 = __builtin_msa_ilvr_w((v4i32)vecs[1], (v4i32)vecs[0]); + tmp2 = __builtin_msa_ilvr_w((v4i32)vecs[3], (v4i32)vecs[2]); + tmp3 = __builtin_msa_ilvl_w((v4i32)vecs[1], (v4i32)vecs[0]); + tmp4 = __builtin_msa_ilvl_w((v4i32)vecs[3], (v4i32)vecs[2]); + + sum = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + sum = padd(sum, (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1)); + sum = padd(sum, (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3)); + sum = padd(sum, 
(Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3)); + + return sum; +} + +template <> +EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + Packet4i sum; + + tmp1 = __builtin_msa_ilvr_w((v4i32)vecs[1], (v4i32)vecs[0]); + tmp2 = __builtin_msa_ilvr_w((v4i32)vecs[3], (v4i32)vecs[2]); + tmp3 = __builtin_msa_ilvl_w((v4i32)vecs[1], (v4i32)vecs[0]); + tmp4 = __builtin_msa_ilvl_w((v4i32)vecs[3], (v4i32)vecs[2]); + + sum = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + sum = padd(sum, (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1)); + sum = padd(sum, (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3)); + sum = padd(sum, (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3)); + + return sum; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return s[0]; +} + +// Other reduction functions: +// mul +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return p[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return p[0]; +} + +// min +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + // Swap 64-bit halves of a. 
+ Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); +#if !EIGEN_FAST_MATH + // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit + // masks of all zeroes/ones in low 64 bits. + v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped); + // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes. + unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0); +#endif + // Continue with min computation. + Packet4f v = __builtin_msa_fmin_w(a, swapped); + v = __builtin_msa_fmin_w( + v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); +#if !EIGEN_FAST_MATH + // Based on the mask select between v and 4 qNaNs. + v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); + v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); +#endif + return v[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return m[0]; +} + +// max +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + // Swap 64-bit halves of a. + Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); +#if !EIGEN_FAST_MATH + // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit + // masks of all zeroes/ones in low 64 bits. + v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped); + // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes. + unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0); +#endif + // Continue with max computation. + Packet4f v = __builtin_msa_fmax_w(a, swapped); + v = __builtin_msa_fmax_w( + v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); +#if !EIGEN_FAST_MATH + // Based on the mask select between v and 4 qNaNs. 
+ v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); + v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); +#endif + return v[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return m[0]; +} + +#define PALIGN_MSA(Offset, Type, Command) \ + template <> \ + struct palign_impl { \ + EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \ + if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 4)); \ + } \ + }; + +PALIGN_MSA(0, Packet4f, __builtin_msa_sldi_b) +PALIGN_MSA(1, Packet4f, __builtin_msa_sldi_b) +PALIGN_MSA(2, Packet4f, __builtin_msa_sldi_b) +PALIGN_MSA(3, Packet4f, __builtin_msa_sldi_b) +PALIGN_MSA(0, Packet4i, __builtin_msa_sldi_b) +PALIGN_MSA(1, Packet4i, __builtin_msa_sldi_b) +PALIGN_MSA(2, Packet4i, __builtin_msa_sldi_b) +PALIGN_MSA(3, Packet4i, __builtin_msa_sldi_b) + +#undef PALIGN_MSA + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl + << " " << value.packet[1] << "," << std::endl + << " " << value.packet[2] << "," << std::endl + << " " << value.packet[3] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + + tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); + tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); + tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); + tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); + + kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[2] = 
(Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); + kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl + << " " << value.packet[1] << "," << std::endl + << " " << value.packet[2] << "," << std::endl + << " " << value.packet[3] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + + tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]); + tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]); + tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]); + tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]); + + kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); + kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); +} + +template <> +EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsqrt_w(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + return __builtin_msa_frsqrt_w(a); +#else + Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1)); + return pdiv(ones, psqrt(a)); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. 
+ "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. + "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, + const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], + ifPacket.select[3] }; + Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); + return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, + const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], + ifPacket.select[3] }; + Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); + return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, 
(v16u8)thenPacket, (v16u8)elsePacket); +} + +//---------- double ---------- + +typedef v2f64 Packet2d; +typedef v2i64 Packet2l; +typedef v2u64 Packet2ul; + +#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X } +#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X } +#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X } + +inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) { + os << "[ " << value[0] << ", " << value[1] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) { + os << "[ " << value[0] << ", " << value[1] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) { + os << "[ " << value[0] << ", " << value[1] << " ]"; + return os; +} + +template <> +struct packet_traits : default_packet_traits { + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + // FIXME check the Has* + HasDiv = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + enum { size = 2, alignment = Aligned16 }; + typedef Packet2d half; +}; + +template <> +EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { + EIGEN_MSA_DEBUG; + + Packet2d value = { from, from }; + return value; +} + +template <> +EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fadd_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d plset(const double& a) { + EIGEN_MSA_DEBUG; + + static const Packet2d countdown = { 0.0, 1.0 }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsub_d(a, b); +} + +template <> 
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmul_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fdiv_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmadd_d(c, a, b); +} + +// Bitwise logical operations are not defined on floating-point vectors, so the operands are +// reinterpreted as v16u8 and passed to the MSA bitwise intrinsics +template <> +EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pload(const double* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmin_d(a, b); +#else + // This prefers NaNs to numbers. 
+ v2i64 aNaN = __builtin_msa_fcun_d(a, a); + v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN); + return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmax_d(a, b); +#else + // This prefers NaNs to numbers. + v2i64 aNaN = __builtin_msa_fcun_d(a, a); + v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN); + return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { + EIGEN_MSA_DEBUG; + + Packet2d value = { *from, *from }; + return value; +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { + EIGEN_MSA_DEBUG; + + Packet2d value; + value[0] = *from; + from += stride; + value[1] = *from; + return value; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const double* addr) { + EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet2d 
preverse(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63); +} + +template <> +EIGEN_STRONG_INLINE double predux(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + Packet2d s = padd(a, preverse(a)); + return s[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { + EIGEN_MSA_DEBUG; + + Packet2d v0 = (Packet2d)__builtin_msa_ilvev_d((v2i64)vecs[1], (v2i64)vecs[0]); + Packet2d v1 = (Packet2d)__builtin_msa_ilvod_d((v2i64)vecs[1], (v2i64)vecs[0]); + + return padd(v0, v1); +} + +// Other reduction functions: +// mul +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + Packet2d p = pmul(a, preverse(a)); + return p[0]; +} + +// min +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + Packet2d v = __builtin_msa_fmin_d(a, swapped); + return v[0]; +#else + double a0 = a[0], a1 = a[1]; + return ((std::isnan)(a0) || a0 < a1) ? a0 : a1; +#endif +} + +// max +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + Packet2d v = __builtin_msa_fmax_d(a, swapped); + return v[0]; +#else + double a0 = a[0], a1 = a[1]; + return ((std::isnan)(a0) || a0 > a1) ? 
a0 : a1; +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsqrt_d(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + return __builtin_msa_frsqrt_d(a); +#else + Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1)); + return pdiv(ones, psqrt(a)); +#endif +} + +#define PALIGN_MSA(Offset, Type, Command) \ + template <> \ + struct palign_impl { \ + EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \ + if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 8)); \ + } \ + }; + +PALIGN_MSA(0, Packet2d, __builtin_msa_sldi_b) +PALIGN_MSA(1, Packet2d, __builtin_msa_sldi_b) + +#undef PALIGN_MSA + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); + Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); + kernel.packet[0] = trn1; + kernel.packet[1] = trn2; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. 
+ "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. + "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, + const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0); + return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_MSA_H diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 27f65f672..653e979b1 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -94,7 +94,7 @@ // certain common platform (compiler+architecture combinations) to 
avoid these problems. // Only static alignment is really problematic (relies on nonstandard compiler extensions), // try to keep heap alignment even when we have to disable static alignment. - #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64) + #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 5d37e5d04..f1afb4db9 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -468,6 +468,7 @@ namespace Architecture AltiVec = 0x2, VSX = 0x3, NEON = 0x4, + MSA = 0x5, #if defined EIGEN_VECTORIZE_SSE Target = SSE #elif defined EIGEN_VECTORIZE_ALTIVEC @@ -476,6 +477,8 @@ namespace Architecture Target = VSX #elif defined EIGEN_VECTORIZE_NEON Target = NEON +#elif defined EIGEN_VECTORIZE_MSA + Target = MSA #else Target = Generic #endif diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 1d4486c05..7712f109a 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -452,6 +452,12 @@ macro(ei_testing_print_summary) message(STATUS "VSX: Using architecture defaults") endif() + if(EIGEN_TEST_MSA) + message(STATUS "MIPS MSA: ON") + else() + message(STATUS "MIPS MSA: Using architecture defaults") + endif() + if(EIGEN_TEST_NEON) message(STATUS "ARM NEON: ON") else() @@ -655,6 +661,8 @@ macro(ei_get_cxxflags VAR) set(${VAR} SSE3) elseif(EIGEN_TEST_SSE2 OR IS_64BIT_ENV) set(${VAR} SSE2) + elseif(EIGEN_TEST_MSA) + set(${VAR} MSA) endif() if(EIGEN_TEST_OPENMP) -- cgit v1.2.3