#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ #define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_ #ifdef _MSC_VER #include #include #include #endif inline int _mm256_extract_epi16_N0(const __m256i X) { return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8); } inline int _mm256_extract_epi16_N1(const __m256i X) { return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8); } inline int _mm256_extract_epi8_N0(const __m256i X) { return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16); } inline int _mm256_extract_epi8_N1(const __m256i X) { return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16); } namespace Eigen { namespace internal { typedef struct Packet32q8i { __m256i val; operator __m256i() const { return val; } Packet32q8i(); Packet32q8i(__m256i val) : val(val) {} } Packet32q8i; typedef struct Packet16q16i { __m256i val; operator __m256i() const { return val; } Packet16q16i(); Packet16q16i(__m256i val) : val(val) {} } Packet16q16i; typedef struct Packet32q8u { __m256i val; operator __m256i() const { return val; } Packet32q8u(); Packet32q8u(__m256i val) : val(val) {} } Packet32q8u; typedef struct Packet16q8i { __m128i val; operator __m128i() const { return val; } Packet16q8i(); Packet16q8i(__m128i val) : val(val) {} } Packet16q8i; typedef struct Packet16q8u { __m128i val; operator __m128i() const { return val; } Packet16q8u(); Packet16q8u(__m128i val) : val(val) {} } Packet16q8u; typedef struct Packet8q16i { __m128i val; operator __m128i() const { return val; } Packet8q16i(); Packet8q16i(__m128i val) : val(val) {} } Packet8q16i; typedef struct Packet8q32i { __m256i val; operator __m256i() const { return val; } Packet8q32i(); Packet8q32i(__m256i val) : val(val) {} } Packet8q32i; typedef struct Packet4q32i { __m128i val; operator __m128i() const { return val; } Packet4q32i(); Packet4q32i(__m128i val) : val(val) {} } Packet4q32i; #ifndef EIGEN_VECTORIZE_AVX512 template <> struct packet_traits : default_packet_traits { 
typedef Packet32q8i type; typedef Packet16q8i half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 32, }; enum { HasAdd = 0, HasSub = 0, HasMul = 0, HasNegate = 0, HasAbs = 0, HasAbs2 = 0, HasMin = 1, HasMax = 1, HasConj = 0, HasSetLinear = 0 }; }; template <> struct packet_traits : default_packet_traits { typedef Packet32q8u type; typedef Packet16q8u half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 32, }; enum { HasAdd = 0, HasSub = 0, HasMul = 0, HasNegate = 0, HasAbs = 0, HasAbs2 = 0, HasMin = 1, HasMax = 1, HasConj = 0, HasSetLinear = 0 }; }; template <> struct packet_traits : default_packet_traits { typedef Packet16q16i type; typedef Packet8q16i half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 16, }; enum { HasAdd = 0, HasSub = 0, HasMul = 0, HasNegate = 0, HasAbs = 0, HasAbs2 = 0, HasMin = 1, HasMax = 1, HasConj = 0, HasSetLinear = 0 }; }; template <> struct packet_traits : default_packet_traits { typedef Packet8q32i type; typedef Packet4q32i half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 8, }; enum { HasAdd = 1, HasSub = 1, HasMul = 1, HasNegate = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 1, HasMax = 1, HasConj = 0, HasSetLinear = 0 }; }; #endif template <> struct unpacket_traits { typedef QInt8 type; typedef Packet16q8i half; enum { size = 32, alignment=Aligned32 }; }; template <> struct unpacket_traits { typedef QInt16 type; typedef Packet8q16i half; enum { size = 16, alignment=Aligned32 }; }; template <> struct unpacket_traits { typedef QUInt8 type; typedef Packet16q8u half; enum { size = 32, alignment=Aligned32 }; }; template <> struct unpacket_traits { typedef QInt32 type; typedef Packet4q32i half; enum { size = 8, alignment=Aligned32 }; }; // Unaligned load template <> EIGEN_STRONG_INLINE Packet32q8i ploadu(const QInt8* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet32q8u ploadu(const QUInt8* from) { EIGEN_DEBUG_UNALIGNED_LOAD return 
_mm256_loadu_si256( reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet16q16i ploadu(const QInt16* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet8q32i ploadu(const QInt32* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( reinterpret_cast(from)); } // Aligned load template <> EIGEN_STRONG_INLINE Packet32q8i pload(const QInt8* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet32q8u pload(const QUInt8* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet16q16i pload(const QInt16* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet8q32i pload(const QInt32* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( reinterpret_cast(from)); } // Unaligned store template <> EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet32q8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( reinterpret_cast<__m256i*>(to), from.val); } template <> EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet32q8u& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( reinterpret_cast<__m256i*>(to), from.val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet16q16i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( reinterpret_cast<__m256i*>(to), from.val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet8q32i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( reinterpret_cast<__m256i*>(to), from.val); } // Aligned store template <> EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet8q32i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from.val); } template <> EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet16q16i& from) { 
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from.val); } template <> EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet32q8u& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from.val); } template <> EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet32q8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from.val); } // Extract first element. template <> EIGEN_STRONG_INLINE QInt32 pfirst(const Packet8q32i& a) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); } template <> EIGEN_STRONG_INLINE QInt16 pfirst(const Packet16q16i& a) { return _mm256_extract_epi16_N0(a.val); } template <> EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet32q8u& a) { return static_cast(_mm256_extract_epi8_N0(a.val)); } template <> EIGEN_STRONG_INLINE QInt8 pfirst(const Packet32q8i& a) { return _mm256_extract_epi8_N0(a.val); } // Initialize to constant value. template <> EIGEN_STRONG_INLINE Packet32q8i pset1(const QInt8& from) { return _mm256_set1_epi8(from.value); } template <> EIGEN_STRONG_INLINE Packet32q8u pset1(const QUInt8& from) { return _mm256_set1_epi8(static_cast(from.value)); } template <> EIGEN_STRONG_INLINE Packet8q32i pset1(const QInt32& from) { return _mm256_set1_epi32(from.value); } // Basic arithmetic packet ops for QInt32. template <> EIGEN_STRONG_INLINE Packet8q32i padd(const Packet8q32i& a, const Packet8q32i& b) { return _mm256_add_epi32(a.val, b.val); } template <> EIGEN_STRONG_INLINE Packet16q16i pset1(const QInt16& from) { return _mm256_set1_epi16(from.value); } template <> EIGEN_STRONG_INLINE Packet8q32i psub(const Packet8q32i& a, const Packet8q32i& b) { return _mm256_sub_epi32(a.val, b.val); } // Note: mullo truncates the result to 32 bits. 
// Lane-wise 32-bit product; only the low 32 bits of each product are kept.
template <>
EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a,
                                                  const Packet8q32i& b) {
  const __m256i product = _mm256_mullo_epi32(a.val, b.val);
  return product;
}

// Lane-wise negation, computed as 0 - a.
template <>
EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) {
  const __m256i zero = _mm256_setzero_si256();
  return _mm256_sub_epi32(zero, a.val);
}

// Lane-wise minimum / maximum. The signed packet types use the signed
// compare intrinsics (epi32 / epi16 / epi8); the unsigned 8-bit packet uses
// the unsigned one (epu8).
template <>
EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a,
                                                  const Packet8q32i& b) {
  const __m256i lowest = _mm256_min_epi32(a.val, b.val);
  return lowest;
}

template <>
EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
                                                  const Packet8q32i& b) {
  const __m256i highest = _mm256_max_epi32(a.val, b.val);
  return highest;
}

template <>
EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
                                                    const Packet16q16i& b) {
  const __m256i lowest = _mm256_min_epi16(a.val, b.val);
  return lowest;
}

template <>
EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
                                                    const Packet16q16i& b) {
  const __m256i highest = _mm256_max_epi16(a.val, b.val);
  return highest;
}

template <>
EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
                                                  const Packet32q8u& b) {
  const __m256i lowest = _mm256_min_epu8(a.val, b.val);
  return lowest;
}

template <>
EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a,
                                                  const Packet32q8u& b) {
  const __m256i highest = _mm256_max_epu8(a.val, b.val);
  return highest;
}

template <>
EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a,
                                                  const Packet32q8i& b) {
  const __m256i lowest = _mm256_min_epi8(a.val, b.val);
  return lowest;
}

template <>
EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a,
                                                  const Packet32q8i& b) {
  const __m256i highest = _mm256_max_epi8(a.val, b.val);
  return highest;
}

// Reductions.
// Every reduction below uses the same folding ladder:
//   1. fold the upper 128-bit half onto the lower one,
//   2. fold the upper 64 bits of each half onto the lower 64 bits,
//   3. fold 32-bit element 1 onto element 0,
//   4. (16- and 8-bit scalars) fold 16-bit word 1 onto word 0,
//   5. (8-bit scalars) fold byte 1 onto byte 0.
// After the last step lane 0 holds the result and is read back with pfirst.
// The whole fold stays in the vector domain on purpose: lanes extracted with
// the SSE extract intrinsics come back zero-extended, so comparing them as
// plain ints would order negative values as large positive ones.

template <>
EIGEN_STRONG_INLINE QInt32 predux_min<Packet8q32i>(const Packet8q32i& a) {
  __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1));
  tmp = _mm256_min_epi32(tmp,
                         _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return pfirst<Packet8q32i>(
      _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
}

template <>
EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) {
  __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1));
  tmp = _mm256_max_epi32(tmp,
                         _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return pfirst<Packet8q32i>(
      _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
}

template <>
EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) {
  __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1));
  tmp = _mm256_min_epi16(tmp,
                         _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
  // Immediate 1 moves 16-bit word 1 into word 0 for the final signed compare.
  tmp = _mm256_min_epi16(tmp, _mm256_shufflelo_epi16(tmp, 1));
  return pfirst<Packet16q16i>(tmp);
}

template <>
EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) {
  __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1));
  tmp = _mm256_max_epi16(tmp,
                         _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
  // Immediate 1 moves 16-bit word 1 into word 0 for the final signed compare.
  tmp = _mm256_max_epi16(tmp, _mm256_shufflelo_epi16(tmp, 1));
  return pfirst<Packet16q16i>(tmp);
}

template <>
EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) {
  __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1));
  tmp = _mm256_min_epu8(tmp,
                        _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
  // Word 1 -> word 0, so bytes 2 and 3 of element 0 take part in the result.
  tmp = _mm256_min_epu8(tmp, _mm256_shufflelo_epi16(tmp, 1));
  // Shifting each 16-bit word right by 8 bits moves byte 1 into byte 0.
  tmp = _mm256_min_epu8(tmp, _mm256_srli_epi16(tmp, 8));
  return pfirst<Packet32q8u>(tmp);
}

template <>
EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) {
  __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1));
  tmp = _mm256_max_epu8(tmp,
                        _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
  // Word 1 -> word 0, so bytes 2 and 3 of element 0 take part in the result.
  tmp = _mm256_max_epu8(tmp, _mm256_shufflelo_epi16(tmp, 1));
  // Shifting each 16-bit word right by 8 bits moves byte 1 into byte 0.
  tmp = _mm256_max_epu8(tmp, _mm256_srli_epi16(tmp, 8));
  return pfirst<Packet32q8u>(tmp);
}

template <>
EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) {
  __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1));
  tmp = _mm256_min_epi8(tmp,
                        _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
  // Word 1 -> word 0, so bytes 2 and 3 of element 0 take part in the result.
  tmp = _mm256_min_epi8(tmp, _mm256_shufflelo_epi16(tmp, 1));
  // Shifting each 16-bit word right by 8 bits moves byte 1 into byte 0; the
  // compare itself is signed.
  tmp = _mm256_min_epi8(tmp, _mm256_srli_epi16(tmp, 8));
  return pfirst<Packet32q8i>(tmp);
}

template <>
EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) {
  __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1));
  tmp = _mm256_max_epi8(tmp,
                        _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
  // Word 1 -> word 0, so bytes 2 and 3 of element 0 take part in the result.
  tmp = _mm256_max_epi8(tmp, _mm256_shufflelo_epi16(tmp, 1));
  // Shifting each 16-bit word right by 8 bits moves byte 1 into byte 0; the
  // compare itself is signed.
  tmp = _mm256_max_epi8(tmp, _mm256_srli_epi16(tmp, 8));
  return pfirst<Packet32q8i>(tmp);
}

// Vectorized scaling of Packet8q32i by double.
template<> struct scalar_product_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; #ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) #else scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const QInt32& a, const double& b) const { return a * b; } EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a, const double& b) const { __m256d scale = _mm256_set1_pd(b); __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a)); __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo)); __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1)); __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi)); return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); } }; template <> struct functor_traits> { enum { Cost = 4 * NumTraits::MulCost, PacketAccess = true }; }; } // end namespace internal } // end namespace Eigen #endif // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_