author    Rasmus Munk Larsen <rmlarsen@google.com>  2018-09-26 16:47:13 -0700
committer Rasmus Munk Larsen <rmlarsen@google.com>  2018-09-26 16:47:13 -0700
commit    3815aeed7a0304ea7703adf96124bd7f2d0530c1
tree      6c75c53471ff3c600952f775e8988dfd693afa47  /unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
parent    0a3356f4ece30cd486b616eb1da9128aa4f245be
Parallelize tensor contraction over the inner dimension in cases where one or both of the outer dimensions (m and n) are small but k is large. This speeds up individual matmul microbenchmarks by up to 85%.
Naming below is BM_Matmul_M_K_N_THREADS, measured on a 2-socket Intel Broadwell-based server.

Benchmark                 Base (ns)  New (ns)  Improvement
----------------------------------------------------------
BM_Matmul_1_80_13522_1       387457    396013        -2.2%
BM_Matmul_1_80_13522_2       406487    230789       +43.2%
BM_Matmul_1_80_13522_4       395821    123211       +68.9%
BM_Matmul_1_80_13522_6       391625     97002       +75.2%
BM_Matmul_1_80_13522_8       408986    113828       +72.2%
BM_Matmul_1_80_13522_16      399988     67600       +83.1%
BM_Matmul_1_80_13522_22      411546     60044       +85.4%
BM_Matmul_1_80_13522_32      393528     57312       +85.4%
BM_Matmul_1_80_13522_44      390047     63525       +83.7%
BM_Matmul_1_80_13522_88      387876     63592       +83.6%
BM_Matmul_1_1500_500_1       245359    248119        -1.1%
BM_Matmul_1_1500_500_2       401833    143271       +64.3%
BM_Matmul_1_1500_500_4       210519    100231       +52.4%
BM_Matmul_1_1500_500_6       251582     86575       +65.6%
BM_Matmul_1_1500_500_8       211499     80444       +62.0%
BM_Matmul_3_250_512_1         70297     68551        +2.5%
BM_Matmul_3_250_512_2         70141     52450       +25.2%
BM_Matmul_3_250_512_4         67872     58204       +14.2%
BM_Matmul_3_250_512_6         71378     63340       +11.3%
BM_Matmul_3_250_512_8         69595     41652       +40.2%
BM_Matmul_3_250_512_16        72055     42549       +40.9%
BM_Matmul_3_250_512_22        70158     54023       +23.0%
BM_Matmul_3_250_512_32        71541     56042       +21.7%
BM_Matmul_3_250_512_44        71843     57019       +20.6%
BM_Matmul_3_250_512_88        69951     54045       +22.7%
BM_Matmul_3_1500_512_1       369328    374284        -1.4%
BM_Matmul_3_1500_512_2       428656    223603       +47.8%
BM_Matmul_3_1500_512_4       205599    139508       +32.1%
BM_Matmul_3_1500_512_6       214278    139071       +35.1%
BM_Matmul_3_1500_512_8       184149    142338       +22.7%
BM_Matmul_3_1500_512_16      156462    156983        -0.3%
BM_Matmul_3_1500_512_22      163905    158259        +3.4%
BM_Matmul_3_1500_512_32      155314    157662        -1.5%
BM_Matmul_3_1500_512_44      235434    158657       +32.6%
BM_Matmul_3_1500_512_88      156779    160275        -2.2%
BM_Matmul_1500_4_512_1       363358    349528        +3.8%
BM_Matmul_1500_4_512_2       303134    263319       +13.1%
BM_Matmul_1500_4_512_4       176208    130086       +26.2%
BM_Matmul_1500_4_512_6       148026    115449       +22.0%
BM_Matmul_1500_4_512_8       131656     98421       +25.2%
BM_Matmul_1500_4_512_16      134011     82861       +38.2%
BM_Matmul_1500_4_512_22      134950     85685       +36.5%
BM_Matmul_1500_4_512_32      133165     90081       +32.4%
BM_Matmul_1500_4_512_44      133203     90644       +32.0%
BM_Matmul_1500_4_512_88      134106    100566       +25.0%
BM_Matmul_4_1500_512_1       439243    435058        +1.0%
BM_Matmul_4_1500_512_2       451830    257032       +43.1%
BM_Matmul_4_1500_512_4       276434    164513       +40.5%
BM_Matmul_4_1500_512_6       182542    144827       +20.7%
BM_Matmul_4_1500_512_8       179411    166256        +7.3%
BM_Matmul_4_1500_512_16      158101    155560        +1.6%
BM_Matmul_4_1500_512_22      152435    155448        -1.9%
BM_Matmul_4_1500_512_32      155150    149538        +3.6%
BM_Matmul_4_1500_512_44      193842    149777       +22.7%
BM_Matmul_4_1500_512_88      149544    154468        -3.3%
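The gist of the optimization, as a standalone sketch: when m and n are small, sharding over rows or columns leaves threads idle, so shard the reduction dimension k instead, giving each thread a private zero-initialized accumulator and summing the partial products at the end. A minimal plain-C++ rendering of that idea follows; the names matmul_partial and matmul_sharded_by_k are illustrative, and this is not Eigen's actual ThreadPoolDevice dispatch.

    #include <algorithm>
    #include <cstddef>
    #include <thread>
    #include <vector>

    // Accumulate C += A(:, k_start:k_end) * B(k_start:k_end, :) for
    // column-major A (m x k), B (k x n), C (m x n).
    void matmul_partial(const float* A, const float* B, float* C,
                        std::ptrdiff_t m, std::ptrdiff_t n, std::ptrdiff_t k,
                        std::ptrdiff_t k_start, std::ptrdiff_t k_end) {
      for (std::ptrdiff_t j = 0; j < n; ++j)
        for (std::ptrdiff_t p = k_start; p < k_end; ++p)
          for (std::ptrdiff_t i = 0; i < m; ++i)
            C[i + j * m] += A[i + p * m] * B[p + j * k];
    }

    // Shard k across threads: each thread fills a private zero-initialized
    // buffer; the partial buffers are reduced into C afterwards. C is assumed
    // zero-initialized by the caller (mirroring the memset in evalGemm).
    void matmul_sharded_by_k(const float* A, const float* B, float* C,
                             std::ptrdiff_t m, std::ptrdiff_t n,
                             std::ptrdiff_t k, int num_threads) {
      std::vector<std::vector<float> > partial(
          num_threads, std::vector<float>(m * n, 0.0f));
      std::vector<std::thread> workers;
      const std::ptrdiff_t slice = (k + num_threads - 1) / num_threads;
      for (int t = 0; t < num_threads; ++t) {
        const std::ptrdiff_t k0 = t * slice;
        const std::ptrdiff_t k1 = std::min<std::ptrdiff_t>(k0 + slice, k);
        if (k0 >= k1) break;  // k exhausted; fewer slices than threads
        workers.emplace_back(matmul_partial, A, B, partial[t].data(),
                             m, n, k, k0, k1);
      }
      for (std::size_t w = 0; w < workers.size(); ++w) workers[w].join();
      for (int t = 0; t < num_threads; ++t)  // reduce partial buffers into C
        for (std::ptrdiff_t i = 0; i < m * n; ++i) C[i] += partial[t][i];
    }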
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h  |  25
1 file changed, 22 insertions(+), 3 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 3b22e43e7..ea17a897d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -590,6 +590,25 @@ struct TensorContractionEvaluatorBase
// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+ this->template evalGemmPartial<lhs_inner_dim_contiguous,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Alignment>(buffer,
+ 0, k, 1);
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+
+ eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= k);
+ const Index k_slice = k_end - k_start;
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
// define mr, nr, and all of my data mapper types
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
@@ -620,7 +639,7 @@ struct TensorContractionEvaluatorBase
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
// Declare GEBP packing and kernel structs
- internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> pack_lhs;
+ internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> pack_lhs;
internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
@@ -635,7 +654,7 @@ struct TensorContractionEvaluatorBase
OutputMapper output(buffer, m);
// Sizes of the blocks to load in cache. See the Goto paper for details.
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, 1);
+ internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k_slice, m, n, num_threads);
const Index kc = blocking.kc();
const Index mc = numext::mini(m, blocking.mc());
const Index nc = numext::mini(n, blocking.nc());
@@ -648,7 +667,7 @@ struct TensorContractionEvaluatorBase
for(Index i2=0; i2<m; i2+=mc)
{
const Index actual_mc = numext::mini(i2+mc,m)-i2;
- for (Index k2 = 0; k2 < k; k2 += kc) {
+ for (Index k2 = k_start; k2 < k_end; k2 += kc) {
// make sure we don't overshoot right edge of left matrix, then pack vertical panel
const Index actual_kc = numext::mini(k2 + kc, k) - k2;
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
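For reference, the single-threaded evalGemm path above now simply delegates to evalGemmPartial over the full range [0, k) with num_threads == 1. A parallel caller (the thread-pool side of this change, which is not part of this diff) can hand every thread a disjoint k-slice and a private zero-filled buffer, then reduce. Roughly, under hypothetical helper names (thread_buffers, enqueue, wait_for_all):

    // Hypothetical caller-side sketch only; the real dispatch is not shown in
    // this diff, and enqueue/wait_for_all/thread_buffers are assumed helpers.
    const Index slice = (k + num_threads - 1) / num_threads;
    for (int t = 0; t < num_threads; ++t) {
      const Index k0 = t * slice;
      const Index k1 = numext::mini(k0 + slice, k);
      if (k0 >= k1) break;
      enqueue([=]() {
        this->template evalGemmPartial<lhs_inner_dim_contiguous,
                                       rhs_inner_dim_contiguous,
                                       rhs_inner_dim_reordered, Alignment>(
            thread_buffers[t], k0, k1, num_threads);
      });
    }
    wait_for_all();
    // Sum thread_buffers[1..num_threads-1] into thread_buffers[0], the result.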