diff options
author | Mehdi Goli <mehdi.goli@codeplay.com> | 2019-11-28 10:08:54 +0000 |
---|---|---|
committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2019-11-28 10:08:54 +0000 |
commit | 00f32752f7d0b193c6788691c3cf0b76457a044d (patch) | |
tree | 792e46110f0751ea8802fa9d403d1472d5977ac3 /Eigen/src/Core/util | |
parent | ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8 (diff) |
[SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.
* Unifying all loadLocalTile from lhs and rhs to an extract_block function.
* Adding get_tensor operation which was missing in TensorContractionMapper.
* Adding the -D option that was missing from CMake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into Scan parameter struct.
* Fixing typo in Device SYCL
* Unifying load to private register for tall/skinny no shared
* Unifying load to vector tile for tensor-vector/vector-tensor operation
* Removing all the LHS/RHS class for extracting data from global
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining General Tensor-Vector and VectorTensor contraction into one kernel.
* Making double buffering optional for Tensor contraction when the local memory version is used.
* Modifying benchmark to accept custom Reduction Sizes
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host
* Adding Test for SYCL
* Modifying SYCL CMake
Diffstat (limited to 'Eigen/src/Core/util')
-rw-r--r-- | Eigen/src/Core/util/ConfigureVectorization.h | 30 | ||||
-rw-r--r-- | Eigen/src/Core/util/Macros.h | 2 |
2 files changed, 20 insertions, 12 deletions
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index fdb1627a1..d52805d32 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -240,15 +240,19 @@ #define EIGEN_VECTORIZE_SSE4_2 #endif #ifdef __AVX__ - #define EIGEN_VECTORIZE_AVX + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX + #endif #define EIGEN_VECTORIZE_SSE3 #define EIGEN_VECTORIZE_SSSE3 #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif #ifdef __AVX2__ - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif #define EIGEN_VECTORIZE_SSE3 #define EIGEN_VECTORIZE_SSSE3 #define EIGEN_VECTORIZE_SSE4_1 @@ -267,19 +271,23 @@ #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638). #endif #endif - #define EIGEN_VECTORIZE_AVX512 - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX512 + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif #define EIGEN_VECTORIZE_FMA #define EIGEN_VECTORIZE_SSE3 #define EIGEN_VECTORIZE_SSSE3 #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 - #ifdef __AVX512DQ__ - #define EIGEN_VECTORIZE_AVX512DQ - #endif - #ifdef __AVX512ER__ - #define EIGEN_VECTORIZE_AVX512ER + #ifndef EIGEN_USE_SYCL + #ifdef __AVX512DQ__ + #define EIGEN_VECTORIZE_AVX512DQ + #endif + #ifdef __AVX512ER__ + #define EIGEN_VECTORIZE_AVX512ER + #endif #endif #endif diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index e7bf75a81..2b40c5fd0 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -854,7 +854,7 @@ #ifndef EIGEN_DONT_VECTORIZE #define EIGEN_DONT_VECTORIZE #endif - #define EIGEN_DEVICE_FUNC __attribute__((always_inline)) + #define EIGEN_DEVICE_FUNC 
__attribute__((flatten)) __attribute__((always_inline)) // All functions callable from CUDA/HIP code must be qualified with __device__ #elif defined(EIGEN_GPUCC) #define EIGEN_DEVICE_FUNC __host__ __device__ |