Improve performance of contraction kernels

* Force-inline implementations. They pass around pointers to shared memory blocks. Without inlining, the compiler must operate via generic pointers. Inlining allows the compiler to detect that we're operating on shared memory, which allows generation of substantially faster code.
* Fixed a long-standing typo which resulted in launching 8x more threads per block than we needed (the .z dimension of the block is unused by the kernel).
author: Artem Belevich <tra@google.com> 2019-12-05 12:48:34 -0800
committer: Artem Belevich <tra@google.com> 2019-12-05 12:48:34 -0800
commit: 25230d1862ecfe3f1bf91c12eefe52dbdc0179b9 (patch)
tree: 3db318567c010c65bf9539332d3fce38bff7fe18 /unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
parent: 08eeb648ea6c329b9b1fb3063993572c21404974 (diff)
1 files changed, 3 insertions, 3 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
index 3471d1056..2a224eddf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ -531,7 +531,7 @@ EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
 template<typename Index, typename LhsMapper,
          typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
          bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
+__device__ __forceinline__ void
 EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
                        const OutputMapper output, float2 lhs_shmem2[][16],
                        float2 rhs_shmem2[][8], const Index m_size,
@@ -771,7 +771,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
 template<typename Index, typename LhsMapper,
          typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
          bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
+__device__ __forceinline__ void
 EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
                        const OutputMapper output, float2 lhs_shmem2[][32],
                        float2 rhs_shmem2[][8], const Index m_size,
@@ -1335,7 +1335,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     const Index m_blocks = (m + 63) / 64;
     const Index n_blocks = (n + 63) / 64;
     const dim3 num_blocks(m_blocks, n_blocks, 1);
-    const dim3 block_size(8, 8, 8);
+    const dim3 block_size(8, 8, 1);
     LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
     }
   };
author	Artem Belevich <tra@google.com>	2019-12-05 12:48:34 -0800
committer	Artem Belevich <tra@google.com>	2019-12-05 12:48:34 -0800
commit	25230d1862ecfe3f1bf91c12eefe52dbdc0179b9 (patch)
tree	3db318567c010c65bf9539332d3fce38bff7fe18 /unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
parent	08eeb648ea6c329b9b1fb3063993572c21404974 (diff)