diff options
author | Everton Constantino <everton.constantino@ibm.com> | 2020-05-19 19:24:11 +0000 |
---|---|---|
committer | Rasmus Munk Larsen <rmlarsen@google.com> | 2020-05-19 19:24:11 +0000 |
commit | 8a7f360ec3160acdb545d05712435d8ca0334357 (patch) | |
tree | 1180b7764e12d75ef84835e73939b06d312ebfed /Eigen | |
parent | a145e4adf5e71391d64c0ab150a8c26851cf332d (diff) |
- Vectorizing MMA packing.
- Optimizing MMA kernel.
- Adding PacketBlock store to blas_data_mapper.
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/Core | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 311 | ||||
-rwxr-xr-x | Eigen/src/Core/util/BlasUtil.h | 54 |
3 files changed, 369 insertions, 0 deletions
diff --git a/Eigen/Core b/Eigen/Core index 426f9cbc0..2594a7dd9 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -330,6 +330,10 @@ using std::ptrdiff_t; #include "src/Core/CoreIterators.h" #include "src/Core/ConditionEstimator.h" +#if defined(EIGEN_ARCH_PPC) +#include "src/Core/arch/AltiVec/MatrixProduct.h" +#endif + #include "src/Core/BooleanRedux.h" #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h new file mode 100644 index 000000000..3bfbfdc87 --- /dev/null +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -0,0 +1,311 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H +#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H + +#ifdef __MMA__ + +namespace Eigen { + +namespace internal { + +const int accRows = 4; +const int accCols = 4; +const int accCount = 4; +const int floatVectorSize = 4; + +typedef struct +{ + __vector float v0; + __vector float v1; + __vector float v2; + __vector float v3; +} Packet4fx4; + +union PacketQuad +{ + __struct_quad sc; + Packet4fx4 sf; +}; + +template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> +{ + void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode> +void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> + ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + int ri = 0, j; + for(j = 0; j + floatVectorSize < rows; j+=floatVectorSize) + { + int i; + for(i = 0; i + floatVectorSize < depth; i+=floatVectorSize) + { + PacketBlock<Packet4f, 4> block; + block.packet[0] = lhs.template loadPacket<Packet4f>(j, i + 0); + block.packet[1] = lhs.template loadPacket<Packet4f>(j, i + 1); + block.packet[2] = lhs.template loadPacket<Packet4f>(j, i + 2); + block.packet[3] = lhs.template loadPacket<Packet4f>(j, i + 3); + + pstore<float>((float *)(blockA + ri ), block.packet[0]); + pstore<float>((float *)(blockA + ri + 4), block.packet[1]); + pstore<float>((float *)(blockA + ri + 8), block.packet[2]); + pstore<float>((float *)(blockA + ri + 12), block.packet[3]); + ri += 4*floatVectorSize; + } + for(; i < depth; i++) + { + Packet4f lhsV = lhs.template loadPacket<Packet4f>(j, i); + pstore<float>((float *)(blockA + ri), lhsV); + ri += floatVectorSize; + } + } + for(int i = 0; i < depth; i++) + { + int k = j; + for(; k < rows; k++) + { + blockA[ri] = lhs(k, i); + ri += 1; + } + } +} + +template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> +{ + void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> + ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + int ri = 0, j; + for(j = 0; j + floatVectorSize < cols; j+=floatVectorSize) + { + int i; + for(i = 0; i + floatVectorSize < depth; i+=floatVectorSize) + { + PacketBlock<Packet4f, 4> block; + block.packet[0] = rhs.template loadPacket<Packet4f>(i, j + 0); + block.packet[1] = rhs.template loadPacket<Packet4f>(i, j + 1); + block.packet[2] = rhs.template loadPacket<Packet4f>(i, j + 2); + block.packet[3] = rhs.template loadPacket<Packet4f>(i, j + 3); + + ptranspose(block); + + pstore<float>((float *)(blockB + ri ), block.packet[0]); + pstore<float>((float *)(blockB + ri + 4), block.packet[1]); + pstore<float>((float *)(blockB + ri + 8), block.packet[2]); + pstore<float>((float *)(blockB + ri + 12), block.packet[3]); + + ri += 4*floatVectorSize; + } + for(; i < depth; i++) + { + blockB[ri+0] = rhs(i, j+0); + blockB[ri+1] = rhs(i, j+1); + blockB[ri+2] = rhs(i, j+2); + blockB[ri+3] = rhs(i, j+3); + ri += floatVectorSize; + } + } + for(int i = 0; i < depth; i++) + { + int k = j; + for(; k < cols; k++) + { + blockB[ri] = rhs(i, k); + ri += 1; + } + } +} + +template<typename DataMapper, typename Index, typename Scalar> +EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, Scalar alpha, __vector_quad *acc) +{ + //[TODO] + // + //Packet4fx4 r; + // + //__builtin_mma_disassemble_acc((void *)&r, *acc); + // + PacketQuad result; + result.sc = __builtin_mma_disassemble_acc(*acc); + + Packet4f pAlpha = pset1<Packet4f>(alpha); + + PacketBlock<Packet4f, 4> block; + block.packet[0] = pAlpha*result.sf.v3; + block.packet[1] = pAlpha*result.sf.v2; + block.packet[2] = pAlpha*result.sf.v1; + block.packet[3] = pAlpha*result.sf.v0; + + data.template storePacketBlock<Packet4f, 4>(i, j, block); +} + +template<typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel<float, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + void operator()(const DataMapper& res, const float* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, float alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template<typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +void gebp_kernel<float, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const float* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, float alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const int remaining_rows = rows % accRows; + const int remaining_cols = cols % accCols; + const int remaining_depth = depth % floatVectorSize; + const int timesRows = (rows / accRows); + const int timesCols = (cols / accCols); + + int row; + for(row = 0; row + accRows <= rows; row += accRows) + { + const float *rhs_base = blockB; + const float *lhs_base = blockA + (row/accRows)*depth*floatVectorSize; + + int col; + for(col = 0; col + accCount*accCols <= cols; col += accCount*accCols){ + const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize; + const float *rhs_ptr2 = rhs_base + ((col/accCols) + 1)*depth*floatVectorSize; + const float *rhs_ptr3 = rhs_base + ((col/accCols) + 2)*depth*floatVectorSize; + const float *rhs_ptr4 = rhs_base + ((col/accCols) + 3)*depth*floatVectorSize; + const float *lhs_ptr = lhs_base; + + __vector_quad acc, acc2, acc3, acc4; + __builtin_mma_xxsetaccz(&acc); + __builtin_mma_xxsetaccz(&acc2); + __builtin_mma_xxsetaccz(&acc3); + __builtin_mma_xxsetaccz(&acc4); + + for(int k = 0; k < depth; k++) + { + __vector float lhsV = *((__vector float *)lhs_ptr ); + __vector float rhsV = *((__vector float *)rhs_ptr ); + __vector float rhs2V = *((__vector float *)rhs_ptr2); + __vector float rhs3V = *((__vector float *)rhs_ptr3); + __vector float rhs4V = *((__vector float *)rhs_ptr4); + + __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char) rhsV, (__vector unsigned char) lhsV); + __builtin_mma_xvf32gerpp(&acc2, (__vector unsigned char) rhs2V, (__vector unsigned char) lhsV); + __builtin_mma_xvf32gerpp(&acc3, (__vector unsigned char) rhs3V, (__vector unsigned char) lhsV); + __builtin_mma_xvf32gerpp(&acc4, (__vector unsigned char) rhs4V, (__vector unsigned char) lhsV); + + lhs_ptr += floatVectorSize; + rhs_ptr += floatVectorSize; + rhs_ptr2 += floatVectorSize; + rhs_ptr3 += floatVectorSize; + rhs_ptr4 += floatVectorSize; + } + + storeAccumulator<DataMapper, Index, float>(row, col , res, alpha, &acc ); + storeAccumulator<DataMapper, Index, float>(row, col + 1*accCols, res, alpha, &acc2); + storeAccumulator<DataMapper, Index, float>(row, col + 2*accCols, res, alpha, &acc3); + storeAccumulator<DataMapper, Index, float>(row, col + 3*accCols, res, alpha, &acc4); + } + for(; col + accCols <= cols; col += accCols){ + const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize; + const float *lhs_ptr = lhs_base; + + __vector_quad acc; + __builtin_mma_xxsetaccz(&acc); + for(int k = 0; k < depth; k++) + { + __vector float lhsV = *((__vector float *)lhs_ptr); + __vector float rhsV = *((__vector float *)rhs_ptr); + + __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char) rhsV, (__vector unsigned char) lhsV); + + lhs_ptr += floatVectorSize; + rhs_ptr += floatVectorSize; + } + + storeAccumulator<DataMapper, Index, float>(row, col, res, alpha, &acc); + } + + if(remaining_cols > 0) + { + const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize; + const float *lhs_ptr = lhs_base; + for(int k = 0; k < depth; k++) + { + for(int arow = 0; arow < accRows; arow++) + { + for(int acol = 0; acol < remaining_cols; acol++ ) + { + res(row + arow, col + acol) += lhs_ptr[arow]*rhs_ptr[acol]; + } + } + rhs_ptr += remaining_cols; + lhs_ptr += floatVectorSize; + } + } + } + + if(remaining_rows > 0) + { + const float *rhs_base = blockB; + const float *lhs_base = blockA + (row/accRows)*depth*floatVectorSize; + + int col; + for(col = 0; col + accCols <= cols; col += accCols) + { + const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize; + const float *lhs_ptr = lhs_base; + for(int k = 0; k < depth; k++) + { + for(int arow = 0; arow < remaining_rows; arow++) + { + for(int acol = 0; acol < accCols; acol++ ) + { + res(row + arow, col + acol) += lhs_ptr[arow]*rhs_ptr[acol]; + } + } + rhs_ptr += floatVectorSize; + lhs_ptr += remaining_rows; + } + } + + if(remaining_cols > 0) + { + const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize; + const float *lhs_ptr = lhs_base; + for(int k = 0; k < depth; k++) + { + for(int arow = 0; arow < remaining_rows; arow++) + { + for(int acol = 0; acol < remaining_cols; acol++ ) + { + res(row + arow, col + acol) += lhs_ptr[arow]*rhs_ptr[acol]; + } + } + rhs_ptr += remaining_cols; + lhs_ptr += remaining_rows; + } + } + } + } + +} // end namespace internal + +} // end namespace Eigen + +#endif // __MMA__ +#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 643558cba..01e647f17 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -195,6 +195,55 @@ protected: template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned, int Incr = 1> class blas_data_mapper; +// TMP to help PacketBlock store implementation. +// There's currently no known use case for PacketBlock load. +// The default implementation assumes ColMajor order. +// It always store each packet sequentially one `stride` apart. +template<typename Index, typename Scalar, typename Packet, int n, int idx, int StorageOrder> +struct PacketBlockManagement +{ + PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, StorageOrder> pbm; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const { + pbm.store(to, stride, i, j, block); + pstoreu<Scalar>(to + i + (j + idx)*stride, block.packet[idx]); + } +}; + +// PacketBlockManagement specialization to take care of RowMajor order without ifs. +template<typename Index, typename Scalar, typename Packet, int n, int idx> +struct PacketBlockManagement<Index, Scalar, Packet, n, idx, RowMajor> +{ + PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, RowMajor> pbm; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const { + pbm.store(to, stride, i, j, block); + pstoreu<Scalar>(to + j + (i + idx)*stride, block.packet[idx]); + } +}; + +template<typename Index, typename Scalar, typename Packet, int n, int StorageOrder> +struct PacketBlockManagement<Index, Scalar, Packet, n, -1, StorageOrder> +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const { + EIGEN_UNUSED_VARIABLE(to); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(block); + } +}; + +template<typename Index, typename Scalar, typename Packet, int n> +struct PacketBlockManagement<Index, Scalar, Packet, n, -1, RowMajor> +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const { + EIGEN_UNUSED_VARIABLE(to); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(block); + } +}; + template<typename Scalar, typename Index, int StorageOrder, int AlignmentType> class blas_data_mapper<Scalar,Index,StorageOrder,AlignmentType,1> { @@ -258,6 +307,11 @@ public: return internal::first_default_aligned(m_data, size); } + template<typename SubPacket, int n> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock<SubPacket, n> &block) const { + PacketBlockManagement<Index, Scalar, SubPacket, n, n-1, StorageOrder> pbm; + pbm.store(m_data, m_stride, i, j, block); + } protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; |