author    Everton Constantino <everton.constantino@ibm.com>  2020-05-19 19:24:11 +0000
committer Rasmus Munk Larsen <rmlarsen@google.com>  2020-05-19 19:24:11 +0000
commit    8a7f360ec3160acdb545d05712435d8ca0334357 (patch)
tree      1180b7764e12d75ef84835e73939b06d312ebfed /Eigen
parent    a145e4adf5e71391d64c0ab150a8c26851cf332d (diff)
- Vectorizing MMA packing.
- Optimizing MMA kernel.
- Adding PacketBlock store to blas_data_mapper.
Diffstat (limited to 'Eigen')
-rw-r--r--  Eigen/Core                                     4
-rw-r--r--  Eigen/src/Core/arch/AltiVec/MatrixProduct.h  311
-rwxr-xr-x  Eigen/src/Core/util/BlasUtil.h                54
3 files changed, 369 insertions, 0 deletions
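
Background for the diff below: POWER10 MMA (Matrix Multiply Assist) accumulators
(__vector_quad) each hold a 4x4 float tile, and __builtin_mma_xvf32gerpp applies a
positive-accumulating rank-1 (outer-product) update to one. A minimal scalar model
of that operation, for illustration only:

    // Scalar sketch of __builtin_mma_xvf32gerpp(&acc, a, b):
    // the 4x4 accumulator tile gains the outer product of the two 4-float inputs.
    void xvf32gerpp_model(float acc[4][4], const float a[4], const float b[4])
    {
      for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
          acc[i][j] += a[i] * b[j];
    }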
diff --git a/Eigen/Core b/Eigen/Core
index 426f9cbc0..2594a7dd9 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -330,6 +330,10 @@ using std::ptrdiff_t;
#include "src/Core/CoreIterators.h"
#include "src/Core/ConditionEstimator.h"
+#if EIGEN_ARCH_PPC
+#include "src/Core/arch/AltiVec/MatrixProduct.h"
+#endif
+
#include "src/Core/BooleanRedux.h"
#include "src/Core/Select.h"
#include "src/Core/VectorwiseOp.h"
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
new file mode 100644
index 000000000..3bfbfdc87
--- /dev/null
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -0,0 +1,311 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+
+#ifdef __MMA__
+
+namespace Eigen {
+
+namespace internal {
+
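+// The MMA accumulator tile is 4x4 floats: the kernel produces accRows x accCols
+// output tiles and unrolls accCount accumulators across consecutive column blocks.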
+const int accRows = 4;
+const int accCols = 4;
+const int accCount = 4;
+const int floatVectorSize = 4;
+
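+// Plain-struct view of the four 4-float rows obtained when an accumulator is
+// disassembled.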
+typedef struct
+{
+ __vector float v0;
+ __vector float v1;
+ __vector float v2;
+ __vector float v3;
+} Packet4fx4;
+
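+// Overlays the opaque accumulator type with the four named float vectors above.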
+union PacketQuad
+{
+ __vector_quad sc;
+ Packet4fx4 sf;
+};
+
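+// Packs the LHS in panels of floatVectorSize rows: within a panel, for each depth
+// index k the four values lhs(j..j+3, k) are stored contiguously. Leftover rows
+// are packed scalar-wise in the same per-k order.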
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+ void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+ ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+ int ri = 0, j;
+ for(j = 0; j + floatVectorSize < rows; j+=floatVectorSize)
+ {
+ int i;
+ for(i = 0; i + floatVectorSize < depth; i+=floatVectorSize)
+ {
+ PacketBlock<Packet4f, 4> block;
+ block.packet[0] = lhs.template loadPacket<Packet4f>(j, i + 0);
+ block.packet[1] = lhs.template loadPacket<Packet4f>(j, i + 1);
+ block.packet[2] = lhs.template loadPacket<Packet4f>(j, i + 2);
+ block.packet[3] = lhs.template loadPacket<Packet4f>(j, i + 3);
+
+ pstore<float>((float *)(blockA + ri ), block.packet[0]);
+ pstore<float>((float *)(blockA + ri + 4), block.packet[1]);
+ pstore<float>((float *)(blockA + ri + 8), block.packet[2]);
+ pstore<float>((float *)(blockA + ri + 12), block.packet[3]);
+ ri += 4*floatVectorSize;
+ }
+ for(; i < depth; i++)
+ {
+ Packet4f lhsV = lhs.template loadPacket<Packet4f>(j, i);
+ pstore<float>((float *)(blockA + ri), lhsV);
+ ri += floatVectorSize;
+ }
+ }
+ for(int i = 0; i < depth; i++)
+ {
+ int k = j;
+ for(; k < rows; k++)
+ {
+ blockA[ri] = lhs(k, i);
+ ri += 1;
+ }
+ }
+}
+
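+// Packs the RHS in panels of floatVectorSize columns; the 4x4 ptranspose turns
+// four column-major loads into rows of consecutive rhs(k, j..j+3) values so the
+// kernel can stream them as vectors.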
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+ void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+ ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+ int ri = 0, j;
+ for(j = 0; j + floatVectorSize < cols; j+=floatVectorSize)
+ {
+ int i;
+ for(i = 0; i + floatVectorSize < depth; i+=floatVectorSize)
+ {
+ PacketBlock<Packet4f, 4> block;
+ block.packet[0] = rhs.template loadPacket<Packet4f>(i, j + 0);
+ block.packet[1] = rhs.template loadPacket<Packet4f>(i, j + 1);
+ block.packet[2] = rhs.template loadPacket<Packet4f>(i, j + 2);
+ block.packet[3] = rhs.template loadPacket<Packet4f>(i, j + 3);
+
+ ptranspose(block);
+
+ pstore<float>((float *)(blockB + ri ), block.packet[0]);
+ pstore<float>((float *)(blockB + ri + 4), block.packet[1]);
+ pstore<float>((float *)(blockB + ri + 8), block.packet[2]);
+ pstore<float>((float *)(blockB + ri + 12), block.packet[3]);
+
+ ri += 4*floatVectorSize;
+ }
+ for(; i < depth; i++)
+ {
+ blockB[ri+0] = rhs(i, j+0);
+ blockB[ri+1] = rhs(i, j+1);
+ blockB[ri+2] = rhs(i, j+2);
+ blockB[ri+3] = rhs(i, j+3);
+ ri += floatVectorSize;
+ }
+ }
+ for(int i = 0; i < depth; i++)
+ {
+ int k = j;
+ for(; k < cols; k++)
+ {
+ blockB[ri] = rhs(i, k);
+ ri += 1;
+ }
+ }
+}
+
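+// Disassembles one MMA accumulator, scales the 4x4 tile by alpha and writes it
+// back through the data mapper's new storePacketBlock.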
+template<typename DataMapper, typename Index, typename Scalar>
+EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, Scalar alpha, __vector_quad *acc)
+{
+ // GCC's MMA builtin writes the four accumulator rows out to memory; on
+ // little-endian targets they are stored in reverse order, which is why the
+ // code below reads v3..v0.
+ PacketQuad result;
+ __builtin_mma_disassemble_acc(&result.sf, acc);
+
+ Packet4f pAlpha = pset1<Packet4f>(alpha);
+
+ PacketBlock<Packet4f, 4> block;
+ block.packet[0] = pAlpha*result.sf.v3;
+ block.packet[1] = pAlpha*result.sf.v2;
+ block.packet[2] = pAlpha*result.sf.v1;
+ block.packet[3] = pAlpha*result.sf.v0;
+
+ data.template storePacketBlock<Packet4f, 4>(i, j, block);
+}
+
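+// gebp kernel specialized for float on MMA: walks the packed blocks in
+// accRows x (accCount*accCols) tiles, with scalar fall-back loops for the
+// remaining rows and columns.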
+template<typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+ void operator()(const DataMapper& res, const float* blockA, const RhsScalar* blockB,
+ Index rows, Index depth, Index cols, float alpha,
+ Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+ ::operator()(const DataMapper& res, const float* blockA, const RhsScalar* blockB,
+ Index rows, Index depth, Index cols, float alpha,
+ Index strideA, Index strideB, Index offsetA, Index offsetB)
+ {
+ const int remaining_rows = rows % accRows;
+ const int remaining_cols = cols % accCols;
+
+ int row;
+ for(row = 0; row + accRows <= rows; row += accRows)
+ {
+ const float *rhs_base = blockB;
+ const float *lhs_base = blockA + (row/accRows)*depth*floatVectorSize;
+
+ int col;
+ for(col = 0; col + accCount*accCols <= cols; col += accCount*accCols){
+ const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize;
+ const float *rhs_ptr2 = rhs_base + ((col/accCols) + 1)*depth*floatVectorSize;
+ const float *rhs_ptr3 = rhs_base + ((col/accCols) + 2)*depth*floatVectorSize;
+ const float *rhs_ptr4 = rhs_base + ((col/accCols) + 3)*depth*floatVectorSize;
+ const float *lhs_ptr = lhs_base;
+
+ __vector_quad acc, acc2, acc3, acc4;
+ __builtin_mma_xxsetaccz(&acc);
+ __builtin_mma_xxsetaccz(&acc2);
+ __builtin_mma_xxsetaccz(&acc3);
+ __builtin_mma_xxsetaccz(&acc4);
+
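+ // Rank-1 update step: each xvf32gerpp does acc_n[i][j] += rhs_n[i]*lhs[j],
+ // so one pass over depth accumulates a full 4x16 strip of the result.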
+ for(int k = 0; k < depth; k++)
+ {
+ __vector float lhsV = *((__vector float *)lhs_ptr );
+ __vector float rhsV = *((__vector float *)rhs_ptr );
+ __vector float rhs2V = *((__vector float *)rhs_ptr2);
+ __vector float rhs3V = *((__vector float *)rhs_ptr3);
+ __vector float rhs4V = *((__vector float *)rhs_ptr4);
+
+ __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char) rhsV, (__vector unsigned char) lhsV);
+ __builtin_mma_xvf32gerpp(&acc2, (__vector unsigned char) rhs2V, (__vector unsigned char) lhsV);
+ __builtin_mma_xvf32gerpp(&acc3, (__vector unsigned char) rhs3V, (__vector unsigned char) lhsV);
+ __builtin_mma_xvf32gerpp(&acc4, (__vector unsigned char) rhs4V, (__vector unsigned char) lhsV);
+
+ lhs_ptr += floatVectorSize;
+ rhs_ptr += floatVectorSize;
+ rhs_ptr2 += floatVectorSize;
+ rhs_ptr3 += floatVectorSize;
+ rhs_ptr4 += floatVectorSize;
+ }
+
+ storeAccumulator<DataMapper, Index, float>(row, col , res, alpha, &acc );
+ storeAccumulator<DataMapper, Index, float>(row, col + 1*accCols, res, alpha, &acc2);
+ storeAccumulator<DataMapper, Index, float>(row, col + 2*accCols, res, alpha, &acc3);
+ storeAccumulator<DataMapper, Index, float>(row, col + 3*accCols, res, alpha, &acc4);
+ }
+ for(; col + accCols <= cols; col += accCols){
+ const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize;
+ const float *lhs_ptr = lhs_base;
+
+ __vector_quad acc;
+ __builtin_mma_xxsetaccz(&acc);
+ for(int k = 0; k < depth; k++)
+ {
+ __vector float lhsV = *((__vector float *)lhs_ptr);
+ __vector float rhsV = *((__vector float *)rhs_ptr);
+
+ __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char) rhsV, (__vector unsigned char) lhsV);
+
+ lhs_ptr += floatVectorSize;
+ rhs_ptr += floatVectorSize;
+ }
+
+ storeAccumulator<DataMapper, Index, float>(row, col, res, alpha, &acc);
+ }
+
+ if(remaining_cols > 0)
+ {
+ const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize;
+ const float *lhs_ptr = lhs_base;
+ for(int k = 0; k < depth; k++)
+ {
+ for(int arow = 0; arow < accRows; arow++)
+ {
+ for(int acol = 0; acol < remaining_cols; acol++ )
+ {
+ res(row + arow, col + acol) += lhs_ptr[arow]*rhs_ptr[acol];
+ }
+ }
+ rhs_ptr += remaining_cols;
+ lhs_ptr += floatVectorSize;
+ }
+ }
+ }
+
+ if(remaining_rows > 0)
+ {
+ const float *rhs_base = blockB;
+ const float *lhs_base = blockA + (row/accRows)*depth*floatVectorSize;
+
+ int col;
+ for(col = 0; col + accCols <= cols; col += accCols)
+ {
+ const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize;
+ const float *lhs_ptr = lhs_base;
+ for(int k = 0; k < depth; k++)
+ {
+ for(int arow = 0; arow < remaining_rows; arow++)
+ {
+ for(int acol = 0; acol < accCols; acol++ )
+ {
+ res(row + arow, col + acol) += lhs_ptr[arow]*rhs_ptr[acol];
+ }
+ }
+ rhs_ptr += floatVectorSize;
+ lhs_ptr += remaining_rows;
+ }
+ }
+
+ if(remaining_cols > 0)
+ {
+ const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize;
+ const float *lhs_ptr = lhs_base;
+ for(int k = 0; k < depth; k++)
+ {
+ for(int arow = 0; arow < remaining_rows; arow++)
+ {
+ for(int acol = 0; acol < remaining_cols; acol++ )
+ {
+ res(row + arow, col + acol) += lhs_ptr[arow]*rhs_ptr[acol];
+ }
+ }
+ rhs_ptr += remaining_cols;
+ lhs_ptr += remaining_rows;
+ }
+ }
+ }
+ }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // __MMA__
+#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
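
Nothing in the public API changes: any float matrix product picks up this path
automatically when Eigen is built for a target where __MMA__ is defined (e.g.,
assuming a GCC 10+ toolchain, with flags along the lines of -mcpu=power10). A
quick smoke test, with odd sizes chosen to exercise the remainder paths:

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::MatrixXf a = Eigen::MatrixXf::Random(67, 129);
      Eigen::MatrixXf b = Eigen::MatrixXf::Random(129, 83);
      Eigen::MatrixXf c = a * b;  // dispatched to the gebp_kernel above on MMA targets
      std::cout << c.sum() << std::endl;  // sanity check against NaNs/garbage
    }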
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 643558cba..01e647f17 100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -195,6 +195,55 @@ protected:
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned, int Incr = 1>
class blas_data_mapper;
+// Template-metaprogramming helper for the PacketBlock store implementation.
+// There is currently no known use case for a PacketBlock load.
+// The default implementation assumes ColMajor order.
+// It always stores each packet sequentially, one `stride` apart.
+template<typename Index, typename Scalar, typename Packet, int n, int idx, int StorageOrder>
+struct PacketBlockManagement
+{
+ PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, StorageOrder> pbm;
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+ pbm.store(to, stride, i, j, block);
+ pstoreu<Scalar>(to + i + (j + idx)*stride, block.packet[idx]);
+ }
+};
+
+// PacketBlockManagement specialization that handles RowMajor order without
+// runtime branches.
+template<typename Index, typename Scalar, typename Packet, int n, int idx>
+struct PacketBlockManagement<Index, Scalar, Packet, n, idx, RowMajor>
+{
+ PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, RowMajor> pbm;
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+ pbm.store(to, stride, i, j, block);
+ pstoreu<Scalar>(to + j + (i + idx)*stride, block.packet[idx]);
+ }
+};
+
+template<typename Index, typename Scalar, typename Packet, int n, int StorageOrder>
+struct PacketBlockManagement<Index, Scalar, Packet, n, -1, StorageOrder>
+{
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+ EIGEN_UNUSED_VARIABLE(to);
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(i);
+ EIGEN_UNUSED_VARIABLE(j);
+ EIGEN_UNUSED_VARIABLE(block);
+ }
+};
+
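+// A dedicated base case for RowMajor is required as well: without it,
+// instantiating PacketBlockManagement<..., n, -1, RowMajor> would be ambiguous
+// between the RowMajor and the idx == -1 partial specializations.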
+template<typename Index, typename Scalar, typename Packet, int n>
+struct PacketBlockManagement<Index, Scalar, Packet, n, -1, RowMajor>
+{
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+ EIGEN_UNUSED_VARIABLE(to);
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(i);
+ EIGEN_UNUSED_VARIABLE(j);
+ EIGEN_UNUSED_VARIABLE(block);
+ }
+};
+
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType>
class blas_data_mapper<Scalar,Index,StorageOrder,AlignmentType,1>
{
@@ -258,6 +307,11 @@ public:
return internal::first_default_aligned(m_data, size);
}
+ template<typename SubPacket, int n>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock<SubPacket, n> &block) const {
+ PacketBlockManagement<Index, Scalar, SubPacket, n, n-1, StorageOrder> pbm;
+ pbm.store(m_data, m_stride, i, j, block);
+ }
protected:
Scalar* EIGEN_RESTRICT m_data;
const Index m_stride;
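
For intuition, the recursive PacketBlockManagement above unrolls into n unaligned
stores at compile time. Hand-expanding the ColMajor case for n = 4 (illustration
only, not part of the patch):

    // storePacketBlock<Packet4f, 4>(i, j, block) on a ColMajor mapper becomes:
    pstoreu<float>(m_data + i + (j + 0) * m_stride, block.packet[0]);
    pstoreu<float>(m_data + i + (j + 1) * m_stride, block.packet[1]);
    pstoreu<float>(m_data + i + (j + 2) * m_stride, block.packet[2]);
    pstoreu<float>(m_data + i + (j + 3) * m_stride, block.packet[3]);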