From f85038b7f3e9a0bd7d2bfbed96cc966863aeea57 Mon Sep 17 00:00:00 2001
From: Antonio Sanchez
Date: Wed, 3 Feb 2021 08:18:28 -0800
Subject: Fix excessive GEBP register spilling for 32-bit NEON.

Clang does a poor job of optimizing the GEBP microkernel on 32-bit ARM,
leading to excessive 16-byte register spills, slowing down basic f32
matrix multiplication by approx 50%.

By specializing `gebp_traits`, we can eliminate the register spills.
Volatile inline ASM both acts as a barrier to prevent reordering and
enforces strict register use. In a simple f32 matrix multiply example,
this modification reduces 16-byte spills from 109 instances to zero,
leading to a 1.5x speed increase (search for `16-byte Spill` in the
assembly in https://godbolt.org/z/chsPbE).

This is a replacement of !379. See there for further discussion.

Also moved `gebp_traits` specializations for NEON to
`Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h`
to be alongside other NEON-specific code.

Fixes #2138.
---
 Eigen/Core | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'Eigen/Core')

diff --git a/Eigen/Core b/Eigen/Core
index 4d9a3309c..1a60dcba4 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -340,7 +340,9 @@ using std::ptrdiff_t;
 #include "src/Core/ConditionEstimator.h"
 
 #if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
-#include "src/Core/arch/AltiVec/MatrixProduct.h"
+ #include "src/Core/arch/AltiVec/MatrixProduct.h"
+#elif defined EIGEN_VECTORIZE_NEON
+ #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
 #endif
 
 #include "src/Core/BooleanRedux.h"
-- 
cgit v1.2.3