From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.

* Unifying all loadLocalTile from lhs and rhs to an extract_block function.
* Adding the get_tensor operation, which was missing from TensorContractionMapper.
* Adding the -D definition missing from cmake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in Device SYCL.
* Unifying the load to private registers for the tall/skinny no-shared-memory case.
* Unifying the load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining general Tensor-Vector and Vector-Tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local-memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake.
---
 CMakeLists.txt | 15 +
 Eigen/src/Core/arch/SYCL/InteropHeaders.h | 4 +
 Eigen/src/Core/util/ConfigureVectorization.h | 30 +-
 Eigen/src/Core/util/Macros.h | 2 +-
 bench/tensors/README | 17 +-
 bench/tensors/eigen_sycl_bench.sh | 30 +
 bench/tensors/eigen_sycl_bench_contract.sh | 7 +
 bench/tensors/tensor_benchmarks.h | 102 +-
 bench/tensors/tensor_benchmarks_sycl.cc | 133 +-
 .../tensor_benchmarks_sycl_include_headers.cc | 2 -
 bench/tensors/tensor_contract_sycl_bench.cc | 325 ++++
 cmake/EigenTesting.cmake | 161 +-
 cmake/FindComputeCpp.cmake | 539 ++++--
 unsupported/Eigen/CXX11/Tensor | 23 +-
 .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 152 --
 .../CXX11/src/Tensor/TensorContractionMapper.h | 4 +
 .../Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 1890 ++++++++++++++++----
 .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 689 +++----
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 245 ++-
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 150 +-
 .../CXX11/src/Tensor/TensorForwardDeclarations.h | 2 +-
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 4 +-
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +-
 .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 673 +++++--
 .../Eigen/CXX11/src/Tensor/TensorScanSycl.h | 512 ++++++
 unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h | 120 --
 .../Tensor/TensorSyclConvertToDeviceExpression.h | 205 ---
 .../CXX11/src/Tensor/TensorSyclExprConstructor.h | 514 ------
 .../CXX11/src/Tensor/TensorSyclExtractAccessor.h | 310 ----
 .../CXX11/src/Tensor/TensorSyclExtractFunctors.h | 467 -----
 .../Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 248 ---
 .../Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 213 ---
 .../CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 302 ----
 unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h | 96 -
 .../Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 239 ---
 unsupported/doc/Overview.dox | 3 +
 unsupported/doc/SYCL.dox | 9 +
 unsupported/doc/examples/CMakeLists.txt | 4 +
 unsupported/doc/examples/SYCL/CMakeLists.txt | 38 +
 unsupported/doc/examples/SYCL/CwiseMul.cpp | 63 +
 unsupported/test/CMakeLists.txt | 129 +-
 unsupported/test/cxx11_tensor_argmax_sycl.cpp | 136 +-
unsupported/test/cxx11_tensor_builtins_sycl.cpp | 497 ++--- unsupported/test/cxx11_tensor_chipping_sycl.cpp | 7 +- unsupported/test/cxx11_tensor_contract_sycl.cpp | 1010 +++++++++-- unsupported/test/cxx11_tensor_custom_op_sycl.cpp | 5 + unsupported/test/cxx11_tensor_forced_eval_sycl.cpp | 5 +- unsupported/test/cxx11_tensor_image_op_sycl.cpp | 103 ++ unsupported/test/cxx11_tensor_math_sycl.cpp | 105 ++ unsupported/test/cxx11_tensor_morphing_sycl.cpp | 138 ++ unsupported/test/cxx11_tensor_random_sycl.cpp | 100 ++ unsupported/test/cxx11_tensor_reduction_sycl.cpp | 941 +++++++++- unsupported/test/cxx11_tensor_reverse_sycl.cpp | 128 +- unsupported/test/cxx11_tensor_scan_sycl.cpp | 141 ++ unsupported/test/cxx11_tensor_shuffling_sycl.cpp | 52 +- unsupported/test/cxx11_tensor_sycl.cpp | 91 +- 56 files changed, 7321 insertions(+), 4811 deletions(-) create mode 100755 bench/tensors/eigen_sycl_bench.sh create mode 100644 bench/tensors/eigen_sycl_bench_contract.sh delete mode 100644 bench/tensors/tensor_benchmarks_sycl_include_headers.cc create mode 100644 bench/tensors/tensor_contract_sycl_bench.cc delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h mode change 100644 => 100755 unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h create mode 100644 unsupported/doc/SYCL.dox create mode 100644 unsupported/doc/examples/SYCL/CMakeLists.txt create mode 100644 unsupported/doc/examples/SYCL/CwiseMul.cpp create mode 100644 unsupported/test/cxx11_tensor_image_op_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_math_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_random_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_scan_sycl.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 619bd18f8..36a155133 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -492,6 +492,21 @@ if(EIGEN_TEST_SYCL) else() message(STATUS "Using ComputeCPP SYCL") include(FindComputeCpp) + set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF) + if (NOT MSVC) + set(COMPUTECPP_DRIVER_DEFAULT_VALUE ON) + endif() + option(COMPUTECPP_USE_COMPILER_DRIVER + "Use ComputeCpp driver instead of a 2 steps compilation" + ${COMPUTECPP_DRIVER_DEFAULT_VALUE} + ) + endif(EIGEN_SYCL_TRISYCL) + option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." 
OFF) + if(EIGEN_DONT_VECTORIZE_SYCL) + message(STATUS "Disabling SYCL vectorization in tests/examples") + # When disabling SYCL vectorization, also disable Eigen default vectorization + add_definitions(-DEIGEN_DONT_VECTORIZE=1) + add_definitions(-DEIGEN_DONT_VECTORIZE_SYCL=1) endif() endif() diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h index d76030419..5cef1a49f 100644 --- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h +++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -161,6 +161,8 @@ struct PacketWrapper { eigen_assert(false && "INDEX MUST BE BETWEEN 0 and 3"); abort(); } + __builtin_unreachable(); + } EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type( Scalar in, Scalar other) { @@ -203,6 +205,8 @@ struct PacketWrapper { eigen_assert(false && "INDEX MUST BE BETWEEN 0 and 1"); abort(); } + __builtin_unreachable(); + } EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type( Scalar in, Scalar other) { diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index fdb1627a1..d52805d32 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -240,15 +240,19 @@ #define EIGEN_VECTORIZE_SSE4_2 #endif #ifdef __AVX__ - #define EIGEN_VECTORIZE_AVX + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX + #endif #define EIGEN_VECTORIZE_SSE3 #define EIGEN_VECTORIZE_SSSE3 #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif #ifdef __AVX2__ - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif #define EIGEN_VECTORIZE_SSE3 #define EIGEN_VECTORIZE_SSSE3 #define EIGEN_VECTORIZE_SSE4_1 @@ -267,19 +271,23 @@ #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638). 
#endif #endif - #define EIGEN_VECTORIZE_AVX512 - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX512 + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif #define EIGEN_VECTORIZE_FMA #define EIGEN_VECTORIZE_SSE3 #define EIGEN_VECTORIZE_SSSE3 #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 - #ifdef __AVX512DQ__ - #define EIGEN_VECTORIZE_AVX512DQ - #endif - #ifdef __AVX512ER__ - #define EIGEN_VECTORIZE_AVX512ER + #ifndef EIGEN_USE_SYCL + #ifdef __AVX512DQ__ + #define EIGEN_VECTORIZE_AVX512DQ + #endif + #ifdef __AVX512ER__ + #define EIGEN_VECTORIZE_AVX512ER + #endif #endif #endif diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index e7bf75a81..2b40c5fd0 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -854,7 +854,7 @@ #ifndef EIGEN_DONT_VECTORIZE #define EIGEN_DONT_VECTORIZE #endif - #define EIGEN_DEVICE_FUNC __attribute__((always_inline)) + #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline)) // All functions callable from CUDA/HIP code must be qualified with __device__ #elif defined(EIGEN_GPUCC) #define EIGEN_DEVICE_FUNC __host__ __device__ diff --git a/bench/tensors/README b/bench/tensors/README index 69342cc9c..dcbf0217a 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -11,15 +11,10 @@ nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBU We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu -last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call -g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu +To compile and run the benchmark for SYCL, using ComputeCpp, simply run the +following commands: +1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY} +2. bash eigen_sycl_bench.sh -To compile and run the benchmark for SYCL, using ComputeCpp you currently need following passes (only for translation units containing device code): -1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code. -{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1 -2. The host compilation pass that generates the final host binary. 
-clang++ -O3 -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o -clang++ -O3 tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl -export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib -3. Run the benchmark -./tensor_benchmark_sycl +Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call +g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu diff --git a/bench/tensors/eigen_sycl_bench.sh b/bench/tensors/eigen_sycl_bench.sh new file mode 100755 index 000000000..3f67b3d86 --- /dev/null +++ b/bench/tensors/eigen_sycl_bench.sh @@ -0,0 +1,30 @@ +rm -f tensor_benchmark_sycl +: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}" +echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR +${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ \ +tensor_benchmarks_sycl.cc \ +benchmark_main.cc \ +-I ../../ \ +-I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ \ +-std=c++11 \ +-march=native \ +-O3 \ +-DNDEBUG \ +-DEIGEN_MPL2_ONLY \ +-DEIGEN_USE_SYCL=1 \ +-DEIGEN_SYCL_LOCAL_MEM=1 \ +-no-serial-memop \ +-mllvm \ +-inline-threshold=10000 \ +-fsycl-ih-last \ +-sycl-driver \ +-Xclang -cl-mad-enable \ +-lOpenCL \ +-lComputeCpp \ +-lpthread \ +-o \ +tensor_benchmark_sycl\ +${@:1} + +export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH +./tensor_benchmark_sycl diff --git a/bench/tensors/eigen_sycl_bench_contract.sh b/bench/tensors/eigen_sycl_bench_contract.sh new file mode 100644 index 000000000..73fd6c4a0 --- /dev/null +++ b/bench/tensors/eigen_sycl_bench_contract.sh @@ -0,0 +1,7 @@ +rm -f tensor_contract_sycl_bench +: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}" +echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR +${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ tensor_contract_sycl_bench.cc -I ../../ -I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ -std=c++11 -O3 -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -no-serial-memop -mllvm -inline-threshold=10000 -fsycl-ih-last -sycl-driver -Xclang -cl-mad-enable -lOpenCL -lComputeCpp -lpthread -o tensor_contract_sycl_bench ${@:1} +export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH +./tensor_contract_sycl_bench + diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 3a640ede4..0825e1563 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -27,6 +27,11 @@ template class BenchmarkSuite { initialize(); } + BenchmarkSuite(const Device& device, size_t m, size_t k) + : m_(1), k_(k), n_(m), device_(device) { + initialize(); + } + ~BenchmarkSuite() { device_.deallocate(a_); device_.deallocate(b_); @@ -79,6 +84,11 @@ template class BenchmarkSuite { sizes[0] = m_; sizes[1] = m_; TensorMap, Eigen::Aligned> C(c_, sizes); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = C.random(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = C.random(); @@ -264,6 +274,7 @@ template class BenchmarkSuite { finalizeBenchmark(static_cast(m_) * k_ * num_iters); } + void broadcasting(int num_iters) { 
Eigen::array size_a; size_a[0] = m_; @@ -406,8 +417,8 @@ for (int iter = 0; iter < 10; ++iter) { b_, input_size); Eigen::array output_size; output_size[0] = k_; - TensorMap, Eigen::Aligned> C( - c_, output_size); + TensorMap, Eigen::Aligned> A( + a_, output_size); #ifndef EIGEN_HAS_INDEX_LIST Eigen::array sum_along_dim; @@ -419,12 +430,12 @@ for (int iter = 0; iter < 10; ++iter) { #endif #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { - C.device(device_) = B.sum(sum_along_dim); + A.device(device_) = B.sum(sum_along_dim); } #endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - C.device(device_) = B.sum(sum_along_dim); + A.device(device_) = B.sum(sum_along_dim); } // Record the number of FLOP executed per second (assuming one operation // per value) @@ -455,37 +466,27 @@ for (int iter = 0; iter < 10; ++iter) { finalizeBenchmark(static_cast(k_) * n_ * num_iters); } + + // do a contraction which is equivalent to a matrix multiplication void contraction(int num_iters) { - Eigen::array sizeA; - sizeA[0] = m_; - sizeA[1] = k_; - Eigen::array sizeB; - sizeB[0] = k_; - sizeB[1] = n_; - Eigen::array sizeC; - sizeC[0] = m_; - sizeC[1] = n_; + contraction(Eigen::ColMajor)>(num_iters, false, false); + } - const TensorMap, Eigen::Aligned> A(a_, sizeA); - const TensorMap, Eigen::Aligned> B(b_, sizeB); - TensorMap, Eigen::Aligned> C(c_, sizeC); + void contractionRowMajor(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, false, false); + } + + void contractionRowMajorAT(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, true, false); + } - typedef typename Tensor::DimensionPair DimPair; - Eigen::array dims; - dims[0] = DimPair(1, 0); -#ifdef EIGEN_USE_SYCL // warmup for sycl - for (int iter = 0; iter < 10; ++iter) { - C.device(device_) = A.contract(B, dims); - } -#endif - StartBenchmarkTiming(); - for (int iter = 0; iter < num_iters; ++iter) { - C.device(device_) = A.contract(B, dims); - } - // Record the number of FLOP executed per second (size_ multiplications and - // additions for each value in the resulting tensor) - finalizeBenchmark(static_cast(2) * m_ * n_ * k_ * num_iters); + void contractionRowMajorBT(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, false, true); + } + + void contractionRowMajorABT(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, true, true); } void convolution(int num_iters, int kernel_x, int kernel_y) { @@ -513,13 +514,49 @@ for (int iter = 0; iter < 10; ++iter) { for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.convolve(B, dims); } - // Record the number of FLOP executed per second (kernel_size + // Record the number of FLOPs executed per second (kernel_size // multiplications and additions for each value in the resulting tensor) finalizeBenchmark(static_cast(2) * (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters); } private: + // do a contraction which is equivalent to a matrix multiplication + template + void contraction(int num_iters, bool trans_a, bool trans_b) { + Eigen::array sizeA; + sizeA[0] = (trans_a ? k_: m_); + sizeA[1] = (trans_a ? m_: k_); + Eigen::array sizeB; + sizeB[0] = (trans_b ? n_: k_); + sizeB[1] = (trans_b ? 
k_: n_); + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + TensorIndex a_contract_dim = (trans_a ? 0 : 1); + TensorIndex b_contract_dim = (trans_b ? 1 : 0); + dims[0] = DimPair(a_contract_dim, b_contract_dim); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + // Record the number of FLOP executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(static_cast(2) * m_ * n_ * k_ * num_iters); + } + void initialize() { a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); @@ -531,7 +568,6 @@ for (int iter = 0; iter < 10; ++iter) { device_.memset(b_, 23, k_ * n_ * sizeof(T)); device_.memset(c_, 31, m_ * n_ * sizeof(T)); - //BenchmarkUseRealTime(); } inline void finalizeBenchmark(int64_t num_items) { diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc index cb6daac15..b8a096684 100644 --- a/bench/tensors/tensor_benchmarks_sycl.cc +++ b/bench/tensors/tensor_benchmarks_sycl.cc @@ -5,19 +5,76 @@ #include "tensor_benchmarks.h" -#define BM_FuncGPU(FUNC) \ - static void BM_##FUNC(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ - Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters); \ - } \ +cl::sycl::gpu_selector selector; +Eigen::QueueInterface queue(selector); +#define BM_FuncWithInput2DimsGPU(FUNC, D1, D2) \ + static void BM_##FUNC##_##D1##x##D2(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, D1, D2); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2, 10, 10); + +BM_FuncWithInput2DimsGPU(rowReduction, 256, 100352); +BM_FuncWithInput2DimsGPU(rowReduction, 64, 100352); +BM_FuncWithInput2DimsGPU(rowReduction, 512, 25088); +BM_FuncWithInput2DimsGPU(rowReduction, 128, 25088); +BM_FuncWithInput2DimsGPU(rowReduction, 102, 6272); +BM_FuncWithInput2DimsGPU(rowReduction, 256, 6272); +BM_FuncWithInput2DimsGPU(rowReduction, 204, 1568); +BM_FuncWithInput2DimsGPU(rowReduction, 512, 1568); +BM_FuncWithInput2DimsGPU(rowReduction, 1024, 1568); +BM_FuncWithInput2DimsGPU(rowReduction, 2048, 1568); + +BM_FuncWithInput2DimsGPU(colReduction, 100352, 256); +BM_FuncWithInput2DimsGPU(colReduction, 100352, 64); +BM_FuncWithInput2DimsGPU(colReduction, 25088, 512); +BM_FuncWithInput2DimsGPU(colReduction, 6272, 102); +BM_FuncWithInput2DimsGPU(colReduction, 25088, 128); +BM_FuncWithInput2DimsGPU(colReduction, 6272, 256); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 204); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 512); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 1024); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 2048); +BM_FuncWithInput2DimsGPU(fullReduction, 1001, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2050048, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2097152, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2048, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 262144, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 256, 1); 
+BM_FuncWithInput2DimsGPU(fullReduction, 589824, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 1024, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 524288, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 512, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2359296, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 1048576, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 131072, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 16384, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 9408, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 64, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 4096, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 36864, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 32768, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 128, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 147456, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 65536, 1); +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); + BM_FuncGPU(memcpy); BM_FuncGPU(typeCasting); +BM_FuncGPU(random); BM_FuncGPU(slicing); BM_FuncGPU(rowChip); BM_FuncGPU(colChip); @@ -28,40 +85,50 @@ BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); BM_FuncGPU(algebraicFunc); BM_FuncGPU(transcendentalFunc); -BM_FuncGPU(rowReduction); -BM_FuncGPU(colReduction); -BM_FuncGPU(fullReduction); - - // Contractions -#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ - static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ - Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, D1, D2, D3); \ - suite.FUNC(iters); \ - } \ +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); - BM_FuncWithInputDimsGPU(contraction, N, N, N); BM_FuncWithInputDimsGPU(contraction, 64, N, N); BM_FuncWithInputDimsGPU(contraction, N, 64, N); BM_FuncWithInputDimsGPU(contraction, N, N, 64); +BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajor, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajor, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, 64); + +BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorAT, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, 64); + +BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorBT, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, 64); + + +BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorABT, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, 64); // Convolutions -#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ - static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ 
- Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters, DIM1, DIM2); \ - } \ +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); BM_FuncWithKernelDimsGPU(convolution, 7, 1); diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc deleted file mode 100644 index bcc3c4c79..000000000 --- a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc +++ /dev/null @@ -1,2 +0,0 @@ -#include "tensor_benchmarks_sycl.cc" -#include "tensor_benchmarks_sycl.sycl" diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc new file mode 100644 index 000000000..8f2defe42 --- /dev/null +++ b/bench/tensors/tensor_contract_sycl_bench.cc @@ -0,0 +1,325 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_BENCH_CONTRACT_SYCL +#define EIGEN_BENCH_CONTRACT_SYCL +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#include +#include +#include +#include +#include + +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +std::ofstream out("Result.txt"); + +std::chrono::time_point get_time(){ + std::chrono::time_point start, end; + return std::chrono::system_clock::now(); +} + +template +void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){ + + std::chrono::duration elapsed_seconds = end-start; + std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << + static_cast((static_cast(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; + out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << + static_cast((static_cast(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; +} + +// do a contraction which is equivalent to a matrix multiplication +template +void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. 
+ device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + + + +// do a contraction which is equivalent to a matrix multiplication +template +void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + + +template +void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. 
+ device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + Eigen::array sizeA; + sizeA[0] = k_; + sizeA[1] = m_; + Eigen::array sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(0, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); + +} + +template +void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array sizeB; + sizeB[0] = n_; + sizeB[1] = k_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 1); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); + +} + +template +void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. 
+ device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = k_; + sizeA[1] = m_; + Eigen::array sizeB; + sizeB[0] = n_; + sizeB[1] = k_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(0, 1); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + +int main() { + cl::sycl::gpu_selector selector; + Eigen::QueueInterface queue(selector); + Eigen::SyclDevice device(&queue); + int64_t num_iters =20; + for(int64_t m = 32; m <= 4096; m *= 2) + for(int64_t k = 32; k <= 4096; k *= 2) + for(int64_t n = 32; n <= 4096; n*= 2){ + (contraction(device, num_iters, m, k, n)); + (contractionRowMajor(device, num_iters, m, k, n)); + (contractionAT(device, num_iters, m, k, n)); + (contractionBT(device, num_iters, m, k, n)); + (contractionABT(device, num_iters, m, k, n)); + } + return 0; + } + +#endif // EIGEN_BENCH_CONTRACT_SYCL diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index f8ffe2387..524223717 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -113,111 +113,28 @@ macro(ei_add_test_internal testname testname_with_suffix) add_dependencies("Build${current_subproject}" ${targetname}) set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}") endif() - -endmacro() - -# SYCL -macro(ei_add_test_internal_sycl testname testname_with_suffix) - set(targetname ${testname_with_suffix}) - - if(EIGEN_ADD_TEST_FILENAME_EXTENSION) - set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION}) - else() - set(filename ${testname}.cpp) - endif() - - set( include_file "${CMAKE_CURRENT_BINARY_DIR}/inc_${filename}") - set( bc_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.sycl") - set( host_file "${CMAKE_CURRENT_SOURCE_DIR}/${filename}") - - if(NOT EIGEN_SYCL_TRISYCL) - include_directories( SYSTEM ${COMPUTECPP_PACKAGE_ROOT_DIR}/include) - - add_custom_command( - OUTPUT ${include_file} - COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file} - COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}\\\"" >> ${include_file} - DEPENDS ${filename} ${bc_file} - COMMENT "Building ComputeCpp integration header file ${include_file}" - ) - - # Add a custom target for the generated integration header - add_custom_target("${testname}_integration_header_sycl" DEPENDS ${include_file}) - - add_executable(${targetname} ${include_file}) - add_dependencies(${targetname} "${testname}_integration_header_sycl") - else() - add_executable(${targetname} ${host_file}) - endif() - - add_sycl_to_target(${targetname} ${CMAKE_CURRENT_BINARY_DIR} ${filename}) - - if (targetname MATCHES "^eigen2_") - add_dependencies(eigen2_buildtests ${targetname}) - else() - 
add_dependencies(buildtests ${targetname}) - endif() - - if(EIGEN_NO_ASSERTION_CHECKING) - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1") - else() - if(EIGEN_DEBUG_ASSERTS) - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1") - endif() - endif() - - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}") - - if(MSVC AND NOT EIGEN_SPLIT_LARGE_TESTS) - ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj") - endif() - - # let the user pass flags. - if(${ARGC} GREATER 2) - ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}") - endif() - - if(EIGEN_TEST_CUSTOM_CXX_FLAGS) - ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") - endif() - - if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) - target_link_libraries(${targetname} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) - endif() - if(EXTERNAL_LIBS) - target_link_libraries(${targetname} ${EXTERNAL_LIBS}) - endif() - if(EIGEN_TEST_CUSTOM_LINKER_FLAGS) - target_link_libraries(${targetname} ${EIGEN_TEST_CUSTOM_LINKER_FLAGS}) - endif() - - if(${ARGC} GREATER 3) - set(libs_to_link ${ARGV3}) - # it could be that some cmake module provides a bad library string " " (just spaces), - # and that severely breaks target_link_libraries ("can't link to -l-lstdc++" errors). - # so we check for strings containing only spaces. - string(STRIP "${libs_to_link}" libs_to_link_stripped) - string(LENGTH "${libs_to_link_stripped}" libs_to_link_stripped_length) - if(${libs_to_link_stripped_length} GREATER 0) - # notice: no double quotes around ${libs_to_link} here. It may be a list. - target_link_libraries(${targetname} ${libs_to_link}) - endif() - endif() - - add_test(${testname_with_suffix} "${targetname}") - - # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT - get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT) - if ((current_subproject) AND (NOT (current_subproject STREQUAL ""))) - set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}") - add_dependencies("Build${current_subproject}" ${targetname}) - set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}") - endif() - - -endmacro() - - + if(EIGEN_SYCL) + # Force include of the SYCL file at the end to avoid errors. + set_property(TARGET ${targetname} PROPERTY COMPUTECPP_INCLUDE_AFTER 1) + # Set COMPILE_FLAGS to COMPILE_DEFINITIONS instead to avoid having to duplicate the flags + # to the device compiler. 
+ get_target_property(target_compile_flags ${targetname} COMPILE_FLAGS) + separate_arguments(target_compile_flags) + foreach(flag ${target_compile_flags}) + if(${flag} MATCHES "^-D.*") + string(REPLACE "-D" "" definition_flag ${flag}) + set_property(TARGET ${targetname} APPEND PROPERTY COMPILE_DEFINITIONS ${definition_flag}) + list(REMOVE_ITEM target_compile_flags ${flag}) + endif() + endforeach() + set_property(TARGET ${targetname} PROPERTY COMPILE_FLAGS ${target_compile_flags}) + # Link against pthread and add sycl to target + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + target_link_libraries(${targetname} Threads::Threads) + add_sycl_to_target(TARGET ${targetname} SOURCES ${filename}) + endif(EIGEN_SYCL) +endmacro(ei_add_test_internal) # Macro to add a test # # the unique mandatory parameter testname must correspond to a file @@ -296,40 +213,6 @@ macro(ei_add_test testname) endif() endmacro() -macro(ei_add_test_sycl testname) - get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST) - set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}${testname}\n") - set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}") - - if(EIGEN_ADD_TEST_FILENAME_EXTENSION) - set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION}) - else() - set(filename ${testname}.cpp) - endif() - - file(READ "${filename}" test_source) - set(parts 0) - string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+" - occurrences "${test_source}") - string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}") - list(REMOVE_DUPLICATES suffixes) - if(EIGEN_SPLIT_LARGE_TESTS AND suffixes) - add_custom_target(${testname}) - foreach(suffix ${suffixes}) - ei_add_test_internal_sycl(${testname} ${testname}_${suffix} - "${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}") - add_dependencies(${testname} ${testname}_${suffix}) - endforeach() - else() - set(symbols_to_enable_all_parts "") - foreach(suffix ${suffixes}) - set(symbols_to_enable_all_parts - "${symbols_to_enable_all_parts} -DEIGEN_TEST_PART_${suffix}=1") - endforeach() - ei_add_test_internal_sycl(${testname} ${testname} "${ARGV1} ${symbols_to_enable_all_parts}" "${ARGV2}") - endif() -endmacro() - # adds a failtest, i.e. a test that succeed if the program fails to compile # note that the test runner for these is CMake itself, when passed -DEIGEN_FAILTEST=ON # so here we're just running CMake commands immediately, we're not adding any targets. diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index f84a2554d..c926ee292 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -2,7 +2,7 @@ # FindComputeCpp #--------------- # -# Copyright 2016 Codeplay Software Ltd. +# Copyright 2016-2018 Codeplay Software Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use these files except in compliance with the License. @@ -23,244 +23,421 @@ # # Tools for finding and building with ComputeCpp. # -# User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp -# installation. +# User must define ComputeCpp_DIR pointing to the ComputeCpp +# installation. 
# # Latest version of this file can be found at: # https://github.com/codeplaysoftware/computecpp-sdk -# Require CMake version 3.2.2 or higher -cmake_minimum_required(VERSION 3.2.2) - -# Check that a supported host compiler can be found -if(CMAKE_COMPILER_IS_GNUCXX) - # Require at least gcc 4.8 - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) - message(FATAL_ERROR - "host compiler - Not found! (gcc version must be at least 4.8)") - else() - message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}") - endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - # Require at least clang 3.6 - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6) - message(FATAL_ERROR - "host compiler - Not found! (clang version must be at least 3.6)") - else() - message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}") - endif() -else() - message(WARNING - "host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)") -endif() - -set(COMPUTECPP_64_BIT_DEFAULT ON) -option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode" - ${COMPUTECPP_64_BIT_DEFAULT}) -mark_as_advanced(COMPUTECPP_64_BIT_CODE) - -option(COMPUTECPP_DISABLE_GCC_DUAL_ABI "Compile with pre-5.1 ABI" OFF) -mark_as_advanced(COMPUTECPP_DISABLE_GCC_DUAL_ABI) +cmake_minimum_required(VERSION 3.4.3) +include(FindPackageHandleStandardArgs) set(COMPUTECPP_USER_FLAGS "" CACHE STRING "User flags for compute++") +separate_arguments(COMPUTECPP_USER_FLAGS) mark_as_advanced(COMPUTECPP_USER_FLAGS) -# Find OpenCL package -find_package(OpenCL REQUIRED) +set(COMPUTECPP_BITCODE "spir64" CACHE STRING + "Bitcode type to use as SYCL target in compute++") +mark_as_advanced(COMPUTECPP_BITCODE) -# Find ComputeCpp packagee -if(NOT COMPUTECPP_PACKAGE_ROOT_DIR) - message(FATAL_ERROR - "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR") -else() - message(STATUS "ComputeCpp package - Found") -endif() +find_package(OpenCL REQUIRED) -# Obtain the path to compute++ -find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS - ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) -if (EXISTS ${COMPUTECPP_DEVICE_COMPILER}) - mark_as_advanced(COMPUTECPP_DEVICE_COMPILER) - message(STATUS "compute++ - Found") -else() - message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})") -endif() +# Find ComputeCpp package -# Obtain the path to computecpp_info -find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS - ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) -if (EXISTS ${COMPUTECPP_INFO_TOOL}) - mark_as_advanced(${COMPUTECPP_INFO_TOOL}) - message(STATUS "computecpp_info - Found") -else() - message(FATAL_ERROR "computecpp_info - Not found! 
(${COMPUTECPP_INFO_TOOL})") +if(DEFINED ComputeCpp_DIR) + set(computecpp_find_hint ${ComputeCpp_DIR}) +elseif(DEFINED ENV{COMPUTECPP_DIR}) + set(computecpp_find_hint $ENV{COMPUTECPP_DIR}) endif() -# Obtain the path to the ComputeCpp runtime library -find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR} - HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib - DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH) +# Used for running executables on the host +set(computecpp_host_find_hint ${computecpp_find_hint}) -if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY}) - mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY) - message(STATUS "libComputeCpp.so - Found") -else() - message(FATAL_ERROR "libComputeCpp.so - Not found!") +if(CMAKE_CROSSCOMPILING) + # ComputeCpp_HOST_DIR is used to find executables that are run on the host + if(DEFINED ComputeCpp_HOST_DIR) + set(computecpp_host_find_hint ${ComputeCpp_HOST_DIR}) + elseif(DEFINED ENV{COMPUTECPP_HOST_DIR}) + set(computecpp_host_find_hint $ENV{COMPUTECPP_HOST_DIR}) + endif() endif() -# Obtain the ComputeCpp include directory -set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/) -if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY}) - message(FATAL_ERROR "ComputeCpp includes - Not found!") +find_program(ComputeCpp_DEVICE_COMPILER_EXECUTABLE compute++ + HINTS ${computecpp_host_find_hint} + PATH_SUFFIXES bin) + +find_program(ComputeCpp_INFO_EXECUTABLE computecpp_info + HINTS ${computecpp_host_find_hint} + PATH_SUFFIXES bin) + +find_library(COMPUTECPP_RUNTIME_LIBRARY + NAMES ComputeCpp ComputeCpp_vs2015 + HINTS ${computecpp_find_hint} + PATH_SUFFIXES lib + DOC "ComputeCpp Runtime Library") + +find_library(COMPUTECPP_RUNTIME_LIBRARY_DEBUG + NAMES ComputeCpp ComputeCpp_vs2015_d + HINTS ${computecpp_find_hint} + PATH_SUFFIXES lib + DOC "ComputeCpp Debug Runtime Library") + +find_path(ComputeCpp_INCLUDE_DIRS + NAMES "CL/sycl.hpp" + HINTS ${computecpp_find_hint}/include + DOC "The ComputeCpp include directory") +get_filename_component(ComputeCpp_INCLUDE_DIRS ${ComputeCpp_INCLUDE_DIRS} ABSOLUTE) + +get_filename_component(computecpp_canonical_root_dir "${ComputeCpp_INCLUDE_DIRS}/.." 
ABSOLUTE) +set(ComputeCpp_ROOT_DIR "${computecpp_canonical_root_dir}" CACHE PATH + "The root of the ComputeCpp install") + +if(NOT ComputeCpp_INFO_EXECUTABLE) + message(WARNING "Can't find computecpp_info - check ComputeCpp_DIR") else() - message(STATUS "ComputeCpp includes - Found") -endif() + execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-version" + OUTPUT_VARIABLE ComputeCpp_VERSION + RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0") + message(WARNING "Package version - Error obtaining version!") + endif() -# Obtain the package version -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version" - OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "Package version - Error obtaining version!") -else() - mark_as_advanced(COMPUTECPP_PACKAGE_VERSION) - message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}") + execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-is-supported" + OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED + RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0") + message(WARNING "platform - Error checking platform support!") + else() + mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED) + if (COMPUTECPP_PLATFORM_IS_SUPPORTED) + message(STATUS "platform - your system can support ComputeCpp") + else() + message(WARNING "platform - your system CANNOT support ComputeCpp") + endif() + endif() endif() -# Obtain the device compiler flags -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags" - OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!") -else() - mark_as_advanced(COMPUTECPP_COMPILER_FLAGS) - message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") +find_package_handle_standard_args(ComputeCpp + REQUIRED_VARS ComputeCpp_ROOT_DIR + ComputeCpp_DEVICE_COMPILER_EXECUTABLE + ComputeCpp_INFO_EXECUTABLE + COMPUTECPP_RUNTIME_LIBRARY + COMPUTECPP_RUNTIME_LIBRARY_DEBUG + ComputeCpp_INCLUDE_DIRS + VERSION_VAR ComputeCpp_VERSION) +mark_as_advanced(ComputeCpp_ROOT_DIR + ComputeCpp_DEVICE_COMPILER_EXECUTABLE + ComputeCpp_INFO_EXECUTABLE + COMPUTECPP_RUNTIME_LIBRARY + COMPUTECPP_RUNTIME_LIBRARY_DEBUG + ComputeCpp_INCLUDE_DIRS + ComputeCpp_VERSION) + +if(NOT ComputeCpp_FOUND) + return() endif() -# Check if the platform is supported -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported" - OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "platform - Error checking platform support!") -else() - mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED) - if (COMPUTECPP_PLATFORM_IS_SUPPORTED) - message(STATUS "platform - your system can support ComputeCpp") - else() - message(STATUS "platform - your system CANNOT support ComputeCpp") +list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -O2 -mllvm -inline-threshold=1000 -intelspirmetadata) +mark_as_advanced(COMPUTECPP_DEVICE_COMPILER_FLAGS) + +if(CMAKE_CROSSCOMPILING) + if(NOT COMPUTECPP_DONT_USE_TOOLCHAIN) + 
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --gcc-toolchain=${COMPUTECPP_TOOLCHAIN_DIR}) endif() + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --sysroot=${COMPUTECPP_SYSROOT_DIR}) + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -target ${COMPUTECPP_TARGET_TRIPLE}) +endif() + +list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -sycl-target ${COMPUTECPP_BITCODE}) +message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") + +if(NOT TARGET OpenCL::OpenCL) + add_library(OpenCL::OpenCL UNKNOWN IMPORTED) + set_target_properties(OpenCL::OpenCL PROPERTIES + IMPORTED_LOCATION "${OpenCL_LIBRARIES}" + INTERFACE_INCLUDE_DIRECTORIES "${OpenCL_INCLUDE_DIRS}" + ) endif() -set(COMPUTECPP_USER_FLAGS - -sycl-compress-name - -Wall - -no-serial-memop - -DEIGEN_NO_ASSERTION_CHECKING=1 +if(NOT TARGET ComputeCpp::ComputeCpp) + add_library(ComputeCpp::ComputeCpp UNKNOWN IMPORTED) + set_target_properties(ComputeCpp::ComputeCpp PROPERTIES + IMPORTED_LOCATION_DEBUG "${COMPUTECPP_RUNTIME_LIBRARY_DEBUG}" + IMPORTED_LOCATION_RELWITHDEBINFO "${COMPUTECPP_RUNTIME_LIBRARY_DEBUG}" + IMPORTED_LOCATION "${COMPUTECPP_RUNTIME_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${ComputeCpp_INCLUDE_DIRS}" + INTERFACE_LINK_LIBRARIES "OpenCL::OpenCL" ) +endif() + +# This property allows targets to specify that their sources should be +# compiled with the integration header included after the user's +# sources, not before (e.g. when an enum is used in a kernel name, this +# is not technically valid SYCL code but can work with ComputeCpp) +define_property( + TARGET PROPERTY COMPUTECPP_INCLUDE_AFTER + BRIEF_DOCS "Include integration header after user source" + FULL_DOCS "Changes compiler arguments such that the source file is + actually the integration header, and the .cpp file is included on + the command line so that it is seen by the compiler first. Enables + non-standards-conformant SYCL code to compile with ComputeCpp." +) +define_property( + TARGET PROPERTY INTERFACE_COMPUTECPP_FLAGS + BRIEF_DOCS "Interface compile flags to provide compute++" + FULL_DOCS "Set additional compile flags to pass to compute++ when compiling + any target which links to this one." +) +define_property( + SOURCE PROPERTY COMPUTECPP_SOURCE_FLAGS + BRIEF_DOCS "Source file compile flags for compute++" + FULL_DOCS "Set additional compile flags for compiling the SYCL integration + header for the given source file." +) #################### -# __build_sycl +# __build_ir #################### # # Adds a custom target for running compute++ and adding a dependency for the # resulting integration header. # -# targetName : Name of the target. -# sourceFile : Source file to be compiled. -# binaryDir : Intermediate directory to output the integration header. -# fileCounter : Counter included in name of custom target. Different counter -# values prevent duplicated names of custom target when source files with the same name, -# but located in different directories, are used for the same target. +# TARGET : Name of the target. +# SOURCE : Source file to be compiled. +# COUNTER : Counter included in name of custom target. Different counter +# values prevent duplicated names of custom target when source files with +# the same name, but located in different directories, are used for the +# same target. # -function(__build_spir targetName sourceFile binaryDir fileCounter) - - # Retrieve source file name. - get_filename_component(sourceFileName ${sourceFile} NAME) - - # Set the path to the Sycl file. 
- set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl) - - # Add any user-defined include to the device compiler - set(device_compiler_includes "") - get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY - INCLUDE_DIRECTORIES) - foreach(directory ${includeDirectories}) - set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) - endforeach() - get_target_property(targetIncludeDirectories ${targetName} INCLUDE_DIRECTORIES) - foreach(directory ${targetIncludeDirectories}) - set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) - endforeach() - if (CMAKE_INCLUDE_PATH) - foreach(directory ${CMAKE_INCLUDE_PATH}) - set(device_compiler_includes "-I${directory}" - ${device_compiler_includes}) +function(__build_ir) + set(options) + set(one_value_args + TARGET + SOURCE + COUNTER + ) + set(multi_value_args) + cmake_parse_arguments(SDK_BUILD_IR + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + get_filename_component(sourceFileName ${SDK_BUILD_IR_SOURCE} NAME) + + # Set the path to the integration header. + # The .sycl filename must depend on the target so that different targets + # using the same source file will be generated with a different rule. + set(baseSyclName ${CMAKE_CURRENT_BINARY_DIR}/${SDK_BUILD_IR_TARGET}_${sourceFileName}) + set(outputSyclFile ${baseSyclName}.sycl) + set(depFileName ${baseSyclName}.sycl.d) + + set(include_directories "$") + set(compile_definitions "$") + set(generated_include_directories + $<$:-I\"$\">) + set(generated_compile_definitions + $<$:-D$>) + + # Obtain language standard of the file + set(device_compiler_cxx_standard) + get_target_property(targetCxxStandard ${SDK_BUILD_IR_TARGET} CXX_STANDARD) + if (targetCxxStandard MATCHES 17) + set(device_compiler_cxx_standard "-std=c++1z") + elseif (targetCxxStandard MATCHES 14) + set(device_compiler_cxx_standard "-std=c++14") + elseif (targetCxxStandard MATCHES 11) + set(device_compiler_cxx_standard "-std=c++11") + elseif (targetCxxStandard MATCHES 98) + message(FATAL_ERROR "SYCL applications cannot be compiled using C++98") + else () + set(device_compiler_cxx_standard "") + endif() + + get_property(source_compile_flags + SOURCE ${SDK_BUILD_IR_SOURCE} + PROPERTY COMPUTECPP_SOURCE_FLAGS + ) + separate_arguments(source_compile_flags) + if(source_compile_flags) + list(APPEND computecpp_source_flags ${source_compile_flags}) + endif() + + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS + ${device_compiler_cxx_standard} + ${COMPUTECPP_USER_FLAGS} + ${computecpp_source_flags} + ) + + set(ir_dependencies ${SDK_BUILD_IR_SOURCE}) + get_target_property(target_libraries ${SDK_BUILD_IR_TARGET} LINK_LIBRARIES) + if(target_libraries) + foreach(library ${target_libraries}) + list(APPEND ir_dependencies ${library}) endforeach() endif() - set(COMPUTECPP_DEVICE_COMPILER_FLAGS - ${COMPUTECPP_DEVICE_COMPILER_FLAGS} - ${COMPUTECPP_USER_FLAGS}) - # Convert argument list format - separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS) + # Depfile support was only added in CMake 3.7 + # CMake throws an error if it is unsupported by the generator (i. e. 
not ninja) + if((NOT CMAKE_VERSION VERSION_LESS 3.7.0) AND + CMAKE_GENERATOR MATCHES "Ninja") + file(RELATIVE_PATH relOutputFile ${CMAKE_BINARY_DIR} ${outputSyclFile}) + set(generate_depfile -MMD -MF ${depFileName} -MT ${relOutputFile}) + set(enable_depfile DEPFILE ${depFileName}) + endif() # Add custom command for running compute++ add_custom_command( OUTPUT ${outputSyclFile} - COMMAND ${COMPUTECPP_DEVICE_COMPILER} + COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE} ${COMPUTECPP_DEVICE_COMPILER_FLAGS} - -isystem ${COMPUTECPP_INCLUDE_DIRECTORY} - ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS} - ${device_compiler_includes} + ${generated_include_directories} + ${generated_compile_definitions} -o ${outputSyclFile} - -c ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} - IMPLICIT_DEPENDS CXX "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}" - WORKING_DIRECTORY ${binaryDir} + -c ${SDK_BUILD_IR_SOURCE} + ${generate_depfile} + DEPENDS ${ir_dependencies} + IMPLICIT_DEPENDS CXX ${SDK_BUILD_IR_SOURCE} + ${enable_depfile} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") - # Add a custom target for the generated integration header - add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile}) + # Name: (user-defined name)_(source file)_(counter)_ih + set(headerTargetName + ${SDK_BUILD_IR_TARGET}_${sourceFileName}_${SDK_BUILD_IR_COUNTER}_ih) + + if(NOT MSVC) + # Add a custom target for the generated integration header + add_custom_target(${headerTargetName} DEPENDS ${outputSyclFile}) + add_dependencies(${SDK_BUILD_IR_TARGET} ${headerTargetName}) + endif() + + # This property can be set on a per-target basis to indicate that the + # integration header should appear after the main source listing + get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER) + + if(includeAfter) + # Change the source file to the integration header - e.g. + # g++ -c source_file_name.cpp.sycl + get_target_property(current_sources ${SDK_BUILD_IR_TARGET} SOURCES) + # Remove absolute path to source file + list(REMOVE_ITEM current_sources ${SDK_BUILD_IR_SOURCE}) + # Remove relative path to source file + string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" + rel_source_file ${SDK_BUILD_IR_SOURCE} + ) + list(REMOVE_ITEM current_sources ${rel_source_file}) + # Add SYCL header to source list + list(APPEND current_sources ${outputSyclFile}) + set_property(TARGET ${SDK_BUILD_IR_TARGET} + PROPERTY SOURCES ${current_sources}) + # CMake/gcc don't know what language a .sycl file is, so tell them + set_property(SOURCE ${outputSyclFile} PROPERTY LANGUAGE CXX) + set(includedFile ${SDK_BUILD_IR_SOURCE}) + set(cppFile ${outputSyclFile}) + else() + set_property(SOURCE ${outputSyclFile} PROPERTY HEADER_FILE_ONLY ON) + set(includedFile ${outputSyclFile}) + set(cppFile ${SDK_BUILD_IR_SOURCE}) + endif() - # Add a dependency on the integration header - add_dependencies(${targetName} ${targetName}_integration_header) + # Force inclusion of the integration header for the host compiler + if(MSVC) + # Group SYCL files inside Visual Studio + source_group("SYCL" FILES ${outputSyclFile}) + + if(includeAfter) + # Allow the source file to be edited using Visual Studio. + # It will be added as a header file so it won't be compiled. 
+ set_property(SOURCE ${SDK_BUILD_IR_SOURCE} PROPERTY HEADER_FILE_ONLY true) + endif() - # Set the host compiler C++ standard to C++11 - set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11) + # Add both the source and the SYCL files to the VS solution. + target_sources(${SDK_BUILD_IR_TARGET} PUBLIC ${SDK_BUILD_IR_SOURCE} ${outputSyclFile}) - # Disable GCC dual ABI on GCC 5.1 and higher - if(COMPUTECPP_DISABLE_GCC_DUAL_ABI) - set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS - "_GLIBCXX_USE_CXX11_ABI=0") + set(forceIncludeFlags "/FI${includedFile} /TP") + else() + set(forceIncludeFlags "-include ${includedFile} -x c++") endif() -endfunction() + set_property( + SOURCE ${cppFile} + APPEND_STRING PROPERTY COMPILE_FLAGS "${forceIncludeFlags}" + ) + +endfunction(__build_ir) ####################### # add_sycl_to_target ####################### # # Adds a SYCL compilation custom command associated with an existing -# target and sets a dependency on that new command. +# target and sets a dependency on that new command. # -# targetName : Name of the target to add a SYCL to. -# binaryDir : Intermediate directory to output the integration header. -# sourceFiles : Source files to be compiled for SYCL. +# TARGET : Name of the target to add SYCL to. +# SOURCES : Source files to be compiled for SYCL. # -function(add_sycl_to_target targetName binaryDir sourceFiles) +function(add_sycl_to_target) + set(options) + set(one_value_args + TARGET + ) + set(multi_value_args + SOURCES + ) + cmake_parse_arguments(SDK_ADD_SYCL + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) - set(sourceFiles ${sourceFiles} ${ARGN}) - set(fileCounter 0) - # Add custom target to run compute++ and generate the integration header - foreach(sourceFile ${sourceFiles}) - __build_spir(${targetName} ${sourceFile} ${binaryDir} ${fileCounter}) - math(EXPR fileCounter "${fileCounter} + 1") - endforeach() + # If the CXX compiler is set to compute++, enable the driver.
+ get_filename_component(cmakeCxxCompilerFileName "${CMAKE_CXX_COMPILER}" NAME) + if("${cmakeCxxCompilerFileName}" STREQUAL "compute++") + if(MSVC) + message(FATAL_ERROR "The compiler driver is not supported by this system, + revert the CXX compiler to your default host compiler.") + endif() - # Link with the ComputeCpp runtime library - target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY} - PUBLIC ${OpenCL_LIBRARIES}) + get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER) + if(includeAfter) + list(APPEND COMPUTECPP_USER_FLAGS -fsycl-ih-last) + endif() + list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl-driver) + # Prepend COMPUTECPP_DEVICE_COMPILER_FLAGS and append COMPUTECPP_USER_FLAGS + foreach(prop COMPILE_OPTIONS INTERFACE_COMPILE_OPTIONS) + get_target_property(target_compile_options ${SDK_ADD_SYCL_TARGET} ${prop}) + if(NOT target_compile_options) + set(target_compile_options "") + endif() + set_property( + TARGET ${SDK_ADD_SYCL_TARGET} + PROPERTY ${prop} + ${COMPUTECPP_DEVICE_COMPILER_FLAGS} + ${target_compile_options} + ${COMPUTECPP_USER_FLAGS} + ) + endforeach() + else() + set(fileCounter 0) + list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl) + # Add custom target to run compute++ and generate the integration header + foreach(sourceFile ${SDK_ADD_SYCL_SOURCES}) + if(NOT IS_ABSOLUTE ${sourceFile}) + set(sourceFile "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}") + endif() + __build_ir( + TARGET ${SDK_ADD_SYCL_TARGET} + SOURCE ${sourceFile} + COUNTER ${fileCounter} + ) + MATH(EXPR fileCounter "${fileCounter} + 1") + endforeach() + endif() -endfunction() + set_property(TARGET ${SDK_ADD_SYCL_TARGET} + APPEND PROPERTY LINK_LIBRARIES ComputeCpp::ComputeCpp) + set_property(TARGET ${SDK_ADD_SYCL_TARGET} + APPEND PROPERTY INTERFACE_LINK_LIBRARIES ComputeCpp::ComputeCpp) +endfunction(add_sycl_to_target) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 6a8dc2cd8..f8a62253c 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -15,19 +15,6 @@ #if EIGEN_HAS_CXX11 -#if defined(EIGEN_USE_SYCL) -#undef min -#undef max -#undef isnan -#undef isinf -#undef isfinite -#include -#include -#include -#include -#include -#endif - #include "../SpecialFunctions" #include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" @@ -72,7 +59,7 @@ typedef unsigned __int64 uint64_t; #include #endif -#ifdef EIGEN_USE_THREADS +#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) #include "ThreadPool" #endif @@ -147,7 +134,13 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorScan.h" #include "src/Tensor/TensorTrace.h" -#include "src/Tensor/TensorSycl.h" +#ifdef EIGEN_USE_SYCL +#include "src/Tensor/TensorReductionSycl.h" +#include "src/Tensor/TensorConvolutionSycl.h" +#include "src/Tensor/TensorContractionSycl.h" +#include "src/Tensor/TensorScanSycl.h" +#endif + #include "src/Tensor/TensorExecutor.h" #include "src/Tensor/TensorDevice.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h deleted file mode 100644 index 2184c94b3..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ /dev/null @@ -1,152 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. 
-// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorArgMaxSycl.h - * \brief: - * TensorArgMaxSycl - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP -namespace Eigen { -namespace internal { - template - struct eval, Eigen::Dense> - { - typedef const TensorTupleReducerDeviceOp& type; - }; - - template - struct nested, 1, - typename eval >::type> - { - typedef TensorTupleReducerDeviceOp type; - }; - -template -struct traits > : public traits -{ - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef Index Scalar; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - - -}// end namespace internal -template -class TensorTupleReducerDeviceOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - typedef typename XprType::CoeffReturnType TupleType; - typedef Index CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr, - const Index return_dim, - const StrideDims strides, - const Index stride_mod, const Index stride_div) - :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - Index return_dim() const { return m_return_dim; } - - EIGEN_DEVICE_FUNC - const StrideDims& strides() const { return m_strides; } - - EIGEN_DEVICE_FUNC - const Index& stride_mod() const { return m_stride_mod; } - - EIGEN_DEVICE_FUNC - const Index& stride_div() const { return m_stride_div; } - - protected: - typename Eigen::internal::remove_all::type m_xpr; - const Index m_return_dim; - const StrideDims m_strides; - const Index m_stride_mod; - const Index m_stride_div; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, SyclKernelDevice> -{ - typedef TensorTupleReducerDeviceOp XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::TupleType TupleType; - typedef typename TensorEvaluator::Dimensions Dimensions; - - enum { - IsAligned = false, - PacketAccess = false, - BlockAccessV2 = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device) - : 
m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()), - m_stride_div(op.stride_div()){} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - const TupleType v = m_impl.coeff(index); - return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; - } -typedef typename MakeGlobalPointer::CoeffReturnType >::Type ptr_Dev_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast(m_impl.data()); } - -protected: - TensorEvaluator m_impl; - const Index m_return_dim; - const StrideDims m_strides; - const Index m_stride_mod; - const Index m_stride_div; -}; -} // end namespace Eigen -#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 50865d404..9ab900b4a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -545,6 +545,10 @@ class TensorContractionInputMapper EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { return VectorMapper(*this, i, j); } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader& get_tensor() const { + return Base::m_tensor; + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h old mode 100644 new mode 100755 index 35f931c53..a6ca1777a --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -1,136 +1,1386 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. +// This file is part of Eigen, a lightweight C++ template library for linear algebra. // // Mehdi Goli Codeplay Software Ltd. // Ralph Potter Codeplay Software Ltd. // Luke Iwanski Codeplay Software Ltd. // Contact: // -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not +// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
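For orientation, the contraction kernels defined in this file are not called directly; they are reached through the ordinary Eigen Tensor API when an expression is assigned on a SyclDevice. The following is a minimal usage sketch, in the spirit of the cxx11_tensor_contract_sycl.cpp test touched by this patch; the buffer-management calls follow the Eigen SyclDevice interface, and the sizes, selector choice, and variable names are illustrative only, not part of the patch.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  using Tensor2f = Eigen::Tensor<float, 2>;
  const Eigen::Index m = 64, k = 128, n = 32;

  // Build a SYCL device from the default selector.
  Eigen::QueueInterface queue_interface((cl::sycl::default_selector()));
  Eigen::SyclDevice sycl_device(&queue_interface);

  Tensor2f lhs(m, k), rhs(k, n), out(m, n);
  lhs.setRandom();
  rhs.setRandom();

  // Device-side buffers managed through the SyclDevice allocator.
  float* d_lhs = static_cast<float*>(sycl_device.allocate(lhs.size() * sizeof(float)));
  float* d_rhs = static_cast<float*>(sycl_device.allocate(rhs.size() * sizeof(float)));
  float* d_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));

  Eigen::TensorMap<Tensor2f> gpu_lhs(d_lhs, m, k);
  Eigen::TensorMap<Tensor2f> gpu_rhs(d_rhs, k, n);
  Eigen::TensorMap<Tensor2f> gpu_out(d_out, m, n);

  sycl_device.memcpyHostToDevice(d_lhs, lhs.data(), lhs.size() * sizeof(float));
  sycl_device.memcpyHostToDevice(d_rhs, rhs.data(), rhs.size() * sizeof(float));

  // Contract lhs dimension 1 with rhs dimension 0 (a plain matrix product);
  // evaluating on a SyclDevice dispatches to the kernels defined below.
  Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> dims = {{Eigen::IndexPair<Eigen::Index>(1, 0)}};
  gpu_out.device(sycl_device) = gpu_lhs.contract(gpu_rhs, dims);

  sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));
  sycl_device.deallocate(d_lhs);
  sycl_device.deallocate(d_rhs);
  sycl_device.deallocate(d_out);
  return 0;
}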
/***************************************************************** - * TensorTensorContractionsycl.h + * TensorContractionSycl.h * * \brief: - * TensorContractionsycl + * TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend * -*****************************************************************/ + *****************************************************************/ #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H + namespace Eigen { -template struct LaunchSyclKernels; -template -struct TensorEvaluator, const Eigen::SyclDevice> : - public TensorContractionEvaluatorBase, const Eigen::SyclDevice> > { +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_DISABLE_GEMV +/*! + * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector + * contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor/vector + * + * \tparam StorageIndex determines the Index type. + * + * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group + * + * \tparam CFactor: determines the number of contracting element to be process by each thread + * + * \tparam NCFactor: determines the number of non-contracting element to be process by each thread + */ +template +struct TVPanelSize { + // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimNC: determines the tile size for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor; + // TileSizeDimC: determines the tile size for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; + // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; + // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = false; +}; +#endif + +/*! + * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor + contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor + * + * \tparam StorageIndex: determines the Index type. + * + * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro). + * + * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro). + * + * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered + */ + +template +struct TTPanelSize { + // TileSizeDimK: determines Tile size for dimension K. 
The packet size is assumed to be accounted for. + static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; + // WorkLoadPerThreadM: determines the workload per thread for loading the M dimension. This can be varied based on the + // available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro). +#ifndef EIGEN_SYCL_REG_M + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; +#endif +// WorkLoadPerThreadN: determines the workload per thread for loading the N dimension. This can be varied based on the +// available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro). +#ifndef EIGEN_SYCL_REG_N + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; +#endif + // LocalThreadSizeM: determines the total number of threads per workgroup for the m dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeN: determines the total number of threads per workgroup for the n dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimM: determines the tile size for the m dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; + // TileSizeDimN: determines the tile size for the n dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; + // LoadPerThreadLhs: determines the workload per thread for loading the Lhs Tensor. This must be divisible by the packet size + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); + // LoadPerThreadRhs: determines the workload per thread for loading the Rhs Tensor. This must be divisible by the packet size + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = true; + // DoubleBuffer: determines if the double buffering technique should be used (this can be disabled via the + // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory) + static EIGEN_CONSTEXPR bool DoubleBuffer = +#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER + false; +#else + true; +#endif +}; + +/*! + * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to + * specialize the contraction algorithm based on device support for dedicated local memory. + */ +enum class contraction_type { local, no_local }; +/*! + * \brief data_source: an enum class determining the location of the data in a memory hierarchy (global, local, private). + */ +enum class data_source { global_mem, local_mem, private_mem }; + +/*! + * \brief read, a template function used for loading the data from global + memory. This function is used to guarantee a coalesced and vectorized load whenever possible. + * + * \tparam PacketLoad: determines if each element of this tensor block should be loaded in packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in memory can be accessed coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex is the contracting dim index + * + * \param ld: is the leading dimension of the flattened tensor + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { + const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; + const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; + return tensorMapper.get_tensor().template packet(row + (col * ld)); +} + +/*! + * \brief read, special overload of read function, when the read access is not vectorized + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex: is the contracting dim index + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { + const StorageIndex row = (IsRhs) ? CIndex : NCIndex; + const StorageIndex col = (IsRhs) ? NCIndex : CIndex; + return tensorMapper(row, col); +} + +/*! + * \brief write, a template function used for storing the data to local memory. This function is used to guarantee + * coalesced and vectorized store whenever possible. + * + * \tparam StorageIndex: determines the Index type + * + * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + * + * \param CIndex is the contracting dim index + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if
::type + write(PacketType &packet_data, DataScalar ptr) { + EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits::size; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; i++) { + *ptr = PacketWrapper::scalarize(i, packet_data); + ptr += ld; + } +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function + * is used to guarantee coalesced and vectorized store whenever possible. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits::size != 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + ::Eigen::internal::pstoreu(ptr, packet_data); +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits::size == 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + *ptr = packet_data; +} + +/*! + * \brief check_boundary: is used to check the edge condition for non-internal blocks. + * + * \tparam is_internal: determines if the block is internal + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { + return true; +} + +/*! + * \brief check_boundary: specialization of the check_boundary for non-internal blocks. + * + * \param cond: true when the data is in range. Otherwise false + */ +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool cond) { + return cond; +} + +/*! + * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed + * by each workgroup. + * + * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed + * + * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode + * + * \tparam PacketType: determines the type of packet + * + * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be + * packetType; Otherwise it will be scalar Type + * + * \param elements_per_access determines the size of each element based on OutType + * + * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and + * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. 
+ * + * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + * + * \param c_stride determines the stride of contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + */ +template +struct BlockProperties { + static EIGEN_CONSTEXPR bool packet_load = packet_load_; + typedef typename Eigen::internal::unpacket_traits::type OutScalar; + static EIGEN_CONSTEXPR bool is_rhs = is_rhs_; + typedef typename Eigen::internal::conditional::type OutType; + static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits::size; + static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs); + static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1); + static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access); +}; + +/*! + * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see + * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup, + * work-items + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \param linearLocalThreadId: determines the linearized location of a thread within a work-group + * + * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when + * tall/skinny algorithm is used + * + * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of + * the flattened tensor. + * + * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the + * flattened tensor. It will be > 1 when tall/skinny algorithm is used. + * + * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a + * flattened tensor + * + * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a + * flattened tensor + * + * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the + * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used. + * + * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or + * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be + * resolve by compiler. 
+ */ +template +struct ThreadProperties { + const StorageIndex linearLocalThreadId; + const StorageIndex kGroupId; + const StorageIndex mGroupOffset; + const StorageIndex nGroupOffset; + const StorageIndex kGroupOffset; + const StorageIndex mLocalOffset; + const StorageIndex nLocalOffset; + const StorageIndex mGlobalOffset; + const StorageIndex nGlobalOffset; + StorageIndex kSize; + const bool is_internal; + // this is used to adjust the last block + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties( + const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_, + const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_, + const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_, + StorageIndex kSize_, const bool is_internal_) + : linearLocalThreadId(linearLocalThreadId_), + kGroupId(kGroupId_), + mGroupOffset(mGroupOffset_), + nGroupOffset(nGroupOffset_), + kGroupOffset(kGroupOffset_), + mLocalOffset(mLocalOffset_), + nLocalOffset(nLocalOffset_), + mGlobalOffset(mGlobalOffset_), + nGlobalOffset(nGlobalOffset_), + kSize(kSize_), + is_internal(is_internal_) {} +}; + +/*! + * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory + access is used to guarantee that always the memory access are coalesced. + * + * \tptaram IsFinal : determine if this is the final kernel. If so, the result will be written in a final output. + Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny + contraction is used. So in this case, a final reduction step is required to compute final output. 
+ + * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of + the algorithm to be used + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param groupSizeM: a logical number determining the number of work-group for m dimension + * + * \param groupSizeN: a logical number determining the number of work-group for n dimension + * + * \param numTiles: determines total number of tiles on the k dimension + * + * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + */ +template +class TensorContractionKernel { + public: + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + static EIGEN_CONSTEXPR bool is_lhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + + typedef BlockProperties + LHSBlockProperties; + + typedef BlockProperties + RHSBlockProperties; + + static EIGEN_CONSTEXPR StorageIndex NStride = + contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; + + typedef cl::sycl::accessor Scratch; + typedef cl::sycl::multi_ptr local_ptr; + typedef OutScalar * /*cl::sycl::multi_ptr*/ private_ptr; + typedef + typename ::Eigen::internal::conditional::type + tile_ptr; + static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local + ? Properties::TileSizeDimM + Properties::BC + : Properties::WorkLoadPerThreadM; + static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local + ? Properties::TileSizeDimN + Properties::BC + : Properties::WorkLoadPerThreadN; + static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + + /** + * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not + * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to + * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting + * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out + * different type of memory needed when local/no_local memory computation is called. + * + * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation + of the algorithm to be used + * \tparam the private memory size + * \param ptr the tile memory pointer type + */ + template + struct MemHolder { + tile_ptr ptr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} + }; + /** + * \brief specialization of memHolder class when no local memory kernel is used. 
+ */ + template + struct MemHolder { + OutScalar ptr[MemSize] = {OutScalar{0}}; + }; + /** + * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from + * global memory to local/private memory when local/no_local algorithm used. + * + * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the + * same as lhs_scratch_extract for private memory. + * + * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the + * same as rhs_scratch_extract for private memory. + */ + struct TiledMemory { + MemHolder lhs_scratch_extract; + MemHolder rhs_scratch_extract; + tile_ptr lhs_scratch_ptr_compute; + tile_ptr rhs_scratch_ptr_compute; + const std::pair lhs_extract_index; + const std::pair rhs_extract_index; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties &, local_ptr, + typename ::Eigen::internal::enable_if::type * = 0) + : lhs_scratch_extract{}, + rhs_scratch_extract{}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), + lhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})), + rhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties &thread_properties, local_ptr block_start_ptr, + typename ::Eigen::internal::enable_if::type * = 0) + : lhs_scratch_extract{block_start_ptr}, + rhs_scratch_extract{lhs_scratch_extract.ptr + + ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), + lhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)), + rhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)) {} + }; + + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex groupSizeM; + const StorageIndex groupSizeN; + const StorageIndex numTiles; + const TripleDim triple_dim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex groupSizeN_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : scratch(scratch_), + lhs(lhs_), + rhs(rhs_), + out_res(out_res_), + groupSizeM(groupSizeM_), + groupSizeN(groupSizeN_), + numTiles(numTiles_), + triple_dim(triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + 
const StorageIndex groupSizeM_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; + const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; + const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM; + const StorageIndex tmp = itemID.get_group(0) / groupSizeM; + const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN; + const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN; + const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM; + const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN; + const StorageIndex mLocalOffset = PacketSize * mLocalThreadId; + const StorageIndex nLocalOffset = NStride * nLocalThreadId; + const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset; + const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset; + + const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK; + StorageIndex kGroupOffset = kGroupId * kSizePerWG; + const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM && + triple_dim.N - nGroupOffset >= Properties::TileSizeDimN && + triple_dim.K - kGroupOffset >= kSizePerWG; + // this is used to adjust the last block + StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset); + // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to + // tile + kGroupOffset += kSize; + + auto thread_properties = + ThreadProperties(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset, + mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal); + + auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); + + (thread_properties.is_internal) ? compute_panel(itemID, thread_properties, out_ptr) + : compute_panel(itemID, thread_properties, out_ptr); + } + // The compute block computes the contraction operation private block for each thread and store the resutl in the + // privateRes memory of Each computation the compute block function is independent of local and no local concepts as + // it only compute the block on each thread's private memory space + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, + PacketReturnType *privateRes) { + StorageIndex idx = 0; + EIGEN_CONSTEXPR StorageIndex lhs_stride = + contraction_tp == contraction_type::local ? 
(PacketSize * Properties::LocalThreadSizeM) : 1; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { + auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; + StorageIndex lhs_index = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType lhsPack{}; + Eigen::TensorSycl::internal::PacketWrapper::set_packet(lhsPack, + lhs_block_ptr + lhs_index); + privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); + + lhs_index += lhs_stride; + idx++; + } + } + } + // The store function writes the contraction result held in the private memory of each thread to the global + // memory. The store function is independent of the local and no-local concepts so that it can be abstracted out in the + // base class. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) { + auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { + return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); + }; + // when local memory is not used, M and N are both accessed in a coalesced way. However, when local memory is + // available, the k*N tile is transposed in local memory to N*K; therefore, each block operates on a blockId* + // WorkLoadPerThreadN slice of N + EIGEN_CONSTEXPR StorageIndex GlobalNStride = + contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { + // output leading dimension + StorageIndex outputLD = 0; + // When local memory is used, PrivateNStride is always 1 because the coalesced access on N is loaded into local + // memory and extracting from local to global is the same as the non-transposed version. However, when local memory + // is not used and RHS is transposed, we packetize the load for RHS. + EIGEN_UNROLL_LOOP + for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { + StorageIndex globalRow = mGlobalOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType privetOut = privateRes[wLPTM]; + if (check_boundary(chk_bound(globalRow, nId))) { + // Store the final results in C.
The C matrix has always M as a first StorageIndex and N as a second + // StorageIndex Therefore it is always coalesced layout + write(privetOut, out_ptr + outputLD + globalRow); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex mId = 0; mId < PacketSize; mId++) { + StorageIndex mOffset = globalRow + mId; + if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { + out_ptr[mOffset + outputLD] = + Eigen::TensorSycl::internal::PacketWrapper::scalarize(mId, privetOut); + } + } + } + globalRow += (PacketSize * Properties::LocalThreadSizeM); + } + outputLD += triple_dim.M; + privateRes += Properties::WorkLoadPerThreadM / PacketSize; + } + out_ptr += (GlobalNStride * outputLD); + + nGlobalOffset += (PrivateNStride * GlobalNStride); + } + } + // when no local memory is used the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair &, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = + InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; + EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = + InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + StorageIndex cIndex = cOffset; + + EIGEN_UNROLL_LOOP + for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { + StorageIndex ncIndex = ncOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { + if (check_boundary(chk_bound(cIndex, ncIndex))) { + auto val = + read(inpt, ncIndex, cIndex, ld); + + write(val, private_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (ncInd < NC && cInd < triple_dim.K) + ? read( + inpt, ncInd, cInd, ld) + : OutScalar(0); + write( + val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); + } + } + + // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So + // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread. + ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) + ? 
ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC + : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC); + private_ptr += InputBlockProperties::nc_stride; + } + // the previous for loop (private_ptr += (ncId * nc_stride)) has already advanced the pointer by one WorkLoadPerThreadNC + private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC; + cIndex += InputBlockProperties::c_stride; + } + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair local_id_extract( + const StorageIndex &linearLocalThreadId) { + const StorageIndex localThreadNC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + const StorageIndex localThreadC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + return std::pair(localThreadNC, localThreadC); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept { + db_offset = !db_offset; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept { + return; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_thread(const cl::sycl::nd_item<1> & +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID +#endif + ) noexcept { +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID.barrier(cl::sycl::access::fence_space::local_space); +#else + return; +#endif + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_thread(const cl::sycl::nd_item<1> &itemID) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type sync_thread( + const cl::sycl::nd_item<1> &) { + return; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + TiledMemory &tiled_input_block, + PacketReturnType *privateRes, bool &db_offset) { + // Tiling the Rhs block from global to local memory + extract_block( + rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), + tiled_input_block.rhs_extract_index, + contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + sync_thread(itemID); + + // Tiling the Lhs block from global to local memory + extract_block( + lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK), + tiled_input_block.lhs_extract_index, + contraction_tp == contraction_type::local ?
thread_properties.mGroupOffset : thread_properties.mGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + // itemID.barrier(cl::sycl::access::fence_space::local_space); + sync_thread(itemID); + // switch to compute mede + StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); + StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); + // Loop over the values of a single tile + for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { + compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, + tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); + lhs_offset += LSDL; + rhs_offset += LSDR; + } + // computing the K index for the next tile + thread_properties.kSize -= Properties::TileSizeDimK; + sync_mem(itemID, db_offset); + } + + // when local memory is available the following compute_panel will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + OutPtr out_ptr) { + auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; + // Allocate register space + PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { + PacketReturnType{0}}; + bool db_offset = 0; + + while (thread_properties.kSize >= Properties::TileSizeDimK) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + if (thread_properties.kSize > 0) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + + // Storing the final results in the output + store(1) : RHSBlockProperties::nc_stride>( + out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, + thread_properties.nGlobalOffset); + } + // When local memory is available the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + extract_block(const Input &inpt, Local local_ptr, const std::pair& local_index, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = + InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; + EIGEN_CONSTEXPR StorageIndex LoadPerThread = + InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; + EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; + static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && + (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), + " LocalOffset must be divisable by stride"); + const StorageIndex &NC = InputBlockProperties::is_rhs ? 
triple_dim.N : triple_dim.M; + StorageIndex localThreadNC = local_index.first; + StorageIndex localThreadC = local_index.second; + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + EIGEN_UNROLL_LOOP + for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { + const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); + const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + if (check_boundary(chk_bound(CIndex, NCIndex))) { + auto val = + read(inpt, NCIndex, CIndex, ld); + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::c_stride * localThreadC * LSD)); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (nCInd < NC && cInd < triple_dim.K) + ? read( + inpt, nCInd, cInd, ld) + : OutScalar(0); + + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::c_stride * localThreadC + + (InputBlockProperties::is_coalesced_layout ? 0 : i)) * + LSD)); + } + } + localThreadNC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + localThreadC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + } + } +}; + +#ifndef EIGEN_SYCL_DISABLE_GEMV + +/*! + * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special + * case of Tensor Tensor contraction. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs) + * + * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs) + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam KFactor: determines the number of elements in K dimension in a Tile + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector + * + * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output. + * Otherwise, the result of contraction will be written iin a temporary buffer. 
+ * + * \param scratch: determines the local memory containing the vector block for each work-group + * + * \param vec: determines the vector input (tensor mapper) + * + * \param mat: determines the tensor input (tensor mapper) + * + * \param out_res: determines the output vector containing the contraction result + * + * \param nonContractGroupSize: a logical number determining the number of work-groups for the non-contracting dimension + * + * \param nonContractDim: determines the size of the non-contracting dimension for the flattened tensor + * + * \param contractDim: determines the size of the contracting dimension for the flattened tensor + * + */ +template +struct GeneralVectorTensor { + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + typedef cl::sycl::accessor Scratch; + + static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = + KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + + // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make + // sure that the !^ is true. When RHS is a vector, we pass true and true to make sure that the !^ is true. + typedef BlockProperties + VecBlockProperties; + + Scratch scratch; + const VectorMapper vec; + const TensorMapper mat; + OutAccessor out_res; + const StorageIndex nonContractGroupSize; + const StorageIndex nonContractDim; + const StorageIndex contractDim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, + const TensorMapper mat_, OutAccessor out_res_, + const StorageIndex nonContractGroupSize_, + const StorageIndex nonContractDim_, + const StorageIndex contractDim_) + : scratch(scratch_), + vec(vec_), + mat(mat_), + out_res(out_res_), + nonContractGroupSize(nonContractGroupSize_), + nonContractDim(nonContractDim_), + contractDim(contractDim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto scratch_ptr = scratch.get_pointer(); + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC + : linearLocalThreadId % Properties::LocalThreadSizeNC; + StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC + : linearLocalThreadId / Properties::LocalThreadSizeNC; + const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; + const StorageIndex nonContractGroupId = + is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; + const StorageIndex contractGroupId = + is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; + auto out_ptr = out_res.get_pointer() + (IsFinal ?
0 : contractGroupId * nonContractDim); + + const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; + const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; + auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; + const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; + auto local_output = scratch_ptr + OutScratchOffset; + const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && + contractDim - contractGroupOffset >= Properties::TileSizeDimC; + is_internal + ? compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) + : compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( + const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, + OutPtr out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, +#endif + const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, + StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, + StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) { + OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; + // Reading the vector +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; + extract_block(vec, scratch_ptr, linearLocalThreadId, + vectorOffset, contractDim); + + itemID.barrier(cl::sycl::access::fence_space::local_space); + auto in_scratch_ptr = scratch_ptr + contractId; +#endif + + StorageIndex privateOffsetC = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { + StorageIndex privateOffsetNC = 0; + bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + auto vecScalar = *in_scratch_ptr; +#else + auto vecScalar = (check_boundary(contract_conds)) + ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, + is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) + : OutScalar(0); +#endif + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + auto matScalar = (check_boundary( + contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) + ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC + : globalNonContractDimOffset + privateOffsetNC, + is_lhs_vec ? 
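// The operator() of GeneralVectorTensor above splits the 1-D local id into a
// (non-contracting, contracting) coordinate pair, and which dimension varies
// fastest depends on whether the vector is the LHS or the RHS operand. A
// standalone sketch of that index math (hypothetical names, illustration only):
#include <cstddef>
#include <utility>

std::pair<std::size_t, std::size_t> split_local_id_sketch(std::size_t linear_id, bool is_lhs_vec,
                                                          std::size_t local_threads_c,
                                                          std::size_t local_threads_nc) {
  const std::size_t non_contract_id =
      is_lhs_vec ? linear_id / local_threads_c : linear_id % local_threads_nc;
  const std::size_t contract_id =
      is_lhs_vec ? linear_id % local_threads_c : linear_id / local_threads_nc;
  return {non_contract_id, contract_id};  // the same split is applied to the group id
}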
globalNonContractDimOffset + privateOffsetNC + : globalContractDimOffset + privateOffsetC) + : OutScalar(0); + + outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]); + privateOffsetNC += Properties::LocalThreadSizeNC; + } + privateOffsetC += Properties::LocalThreadSizeC; +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + in_scratch_ptr += Properties::LocalThreadSizeC; +#endif + } + + auto out_scratch_ptr = local_output + outScratchIndex; + // Each block of 16*16 element in shared memory should reduce to 16*1 + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + *out_scratch_ptr = outScalar[j]; + + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + if (is_lhs_vec) { + nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC; + contractId = linearLocalThreadId / Properties::LocalThreadSizeNC; + outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + } + + out_scratch_ptr = local_output + outScratchIndex; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + EIGEN_UNROLL_LOOP + for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (contractId < offset) { + StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset); + *out_scratch_ptr += out_scratch_ptr[myNeigbourId]; + } + } + // moving to next 16 by 16 block + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + + if (contractId == 0) { + out_scratch_ptr = local_output + nonContractId; + StorageIndex global_final_offset = nonContractGroupOffset + nonContractId; + out_ptr += global_final_offset; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + if (check_boundary(global_final_offset < nonContractDim)) { + auto res = *out_scratch_ptr; + + *out_ptr = res; + out_ptr += Properties::LocalThreadSizeNC; + } + // moving to next 16 by 16 block to ge the next 16 reduced elements + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC; + } + } + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, + const StorageIndex &linearLocalThreadId, + const StorageIndex &cOffset, const StorageIndex &C) { + local_ptr += InputBlockProperties::c_stride * linearLocalThreadId; + StorageIndex cIndex = cOffset; + for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) { + if (check_boundary(cIndex + InputBlockProperties::c_stride - 1 < C)) { + auto val = read(inpt, StorageIndex(0), + cIndex, StorageIndex(1)); + write(val, local_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + OutScalar val = + (cIndex + i < C) + ? read( + inpt, StorageIndex(0), cIndex + i, StorageIndex(1)) + : OutScalar(0); + write(val, local_ptr + i); + } + } + local_ptr += InputBlockProperties::c_stride * GroupSize; + cIndex += InputBlockProperties::c_stride * GroupSize; + } + } +}; +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + +/*! + * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction + * operation, when all the dimensions are contracting dimensions. 
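// The offset-halving loops above are the usual barrier-synchronised tree
// reduction over local memory: after log2(LocalThreadSizeC) steps, lane 0 of
// each column holds the reduced value. A serial sketch of one work-group's
// reduction (illustration only; the LocalThreadSizeNC interleaving of the real
// scratch layout is ignored and the size is assumed to be a power of two):
#include <cstddef>
#include <vector>

float tree_reduce_sketch(std::vector<float> partial) {
  for (std::size_t offset = partial.size() >> 1; offset > 0; offset >>= 1) {
    // itemID.barrier(...) would go here in the kernel; serially nothing is needed.
    for (std::size_t tid = 0; tid < offset; ++tid) {
      partial[tid] += partial[tid + offset];
    }
  }
  return partial.empty() ? 0.f : partial[0];
}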
This Kernel reduces two tensors to a scalar + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param rng: determines the total input data size + */ +template +struct GeneralScalarContraction { + typedef cl::sycl::accessor Scratch; + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex rng; + + EIGEN_DEVICE_FUNC + GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex rng_) + : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} + + EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) { + auto out_ptr = out_res.get_pointer(); + auto scratch_ptr = scratch.get_pointer().get(); + + StorageIndex globalid = itemID.get_global_id(0); + StorageIndex localid = itemID.get_local_id(0); + OutScalar accumulator = OutScalar(0); + for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) { + accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator); + } + auto out_scratch_ptr = scratch_ptr + localid; + *out_scratch_ptr = accumulator; + for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]); + } + } + if (localid == 0) { + out_ptr[itemID.get_group(0)] = accumulator; + } + } +}; +#endif + +} // namespace internal +} // namespace TensorSycl + +template +struct TensorEvaluator, + Eigen::SyclDevice> + : public TensorContractionEvaluatorBase, Eigen::SyclDevice>> { static_assert(std::is_same::value, "SYCL tensor contraction does not support output kernels."); - typedef const Eigen::SyclDevice Device; + typedef Eigen::SyclDevice Device; typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; + typedef typename XprType::Index StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - + typedef typename Base::Storage Storage; + typedef typename Base::EvaluatorPointerType EvaluatorPointerType; + struct TripleDim { + const StorageIndex M; + const StorageIndex N; + const StorageIndex K; + TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) :
M(M_), N(N_), K(K_) {} + }; enum { Layout = TensorEvaluator::Layout, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, }; - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + static EIGEN_CONSTEXPR int LDims = Base::LDims; + static EIGEN_CONSTEXPR int RDims = Base::RDims; + static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims; - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; static const int NumDims = LDims + RDims - 2 * ContractDims; - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + typedef DSizes Dimensions; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + typedef typename Eigen::internal::remove_const::type LhsScalar; + typedef typename Eigen::internal::remove_const::type RhsScalar; typedef typename LeftEvaluator::Dimensions LeftDimensions; typedef typename RightEvaluator::Dimensions RightDimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} + template + struct input_mapper_propertis { + static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_matrix = + (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {} // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { this->m_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; + if (!data) { + this->m_result = this->m_device.get( + static_cast(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); + data = this->m_result; } + evalToSycl(data); + return (this->m_result != NULL); } - const Eigen::SyclDevice& device() const {return this->m_device;} - void evalTo(Scalar* buffer) const { - // Here is the result + const Eigen::SyclDevice &device() const { return this->m_device; 
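// GeneralScalarContraction above combines a grid-stride accumulation loop with
// the same tree reduction. A serial sketch of the grid-stride part for one
// work-item (illustration only; hypothetical names, lhs/rhs flattened to 1-D):
#include <cstddef>
#include <vector>

float scalar_contraction_sketch(const std::vector<float> &lhs, const std::vector<float> &rhs,
                                std::size_t global_id, std::size_t global_range) {
  float acc = 0.f;
  // Each work-item strides over the whole range, so a fixed launch size covers any rng.
  for (std::size_t i = global_id; i < lhs.size() && i < rhs.size(); i += global_range) {
    acc += lhs[i] * rhs[i];  // cl::sycl::mad(lhs(0, i), rhs(i, 0), acc) in the kernel
  }
  return acc;  // per-work-item partial, reduced in local memory afterwards
}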
} + void evalToSycl(typename Base::EvaluatorPointerType buffer) const { if (this->m_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { evalTyped(buffer); - } - else { + } else { evalTyped(buffer); } - } - else { - if (this->m_rhs_inner_dim_reordered) { + } else { + if (this->m_rhs_inner_dim_reordered) { evalTyped(buffer); - } - else { + } else { evalTyped(buffer); } } - } - else { + } else { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { evalTyped(buffer); - } - else { + } else { evalTyped(buffer); } - } - else { - if (this->m_rhs_inner_dim_reordered) { + } else { + if (this->m_rhs_inner_dim_reordered) { evalTyped(buffer); - } - else { + } else { evalTyped(buffer); } } @@ -138,267 +1388,263 @@ struct TensorEvaluator - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - // rows in left side - const Index m = this->m_i_size; - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - LaunchSyclKernels::Run(*this, buffer, m, n, k, - this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, - this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); + void evalTyped(typename Base::EvaluatorPointerType buffer) const { + const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; + typedef internal::TensorContractionInputMapper< + LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, + PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer> + LhsMapper; + + typedef internal::TensorContractionInputMapper::size, rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer> + RhsMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + if (triple_dim.M == 1 && triple_dim.N == 1) { + launchSC(buffer, lhs, rhs, triple_dim.K); + } else +#endif +#ifndef EIGEN_SYCL_DISABLE_GEMV + if (triple_dim.M != 1 && triple_dim.N == 1) { + LaunchVT(buffer, rhs, lhs, triple_dim.M, triple_dim.K); + } else if (triple_dim.M == 1 && triple_dim.N != 1) { + LaunchVT(buffer, lhs, rhs, triple_dim.N, triple_dim.K); + } else // This is equivalent of if (m!=1 && n!=1) +#endif + { + typedef input_mapper_propertis + inpt_mapper_properties; +#ifndef EIGEN_SYCL_DISABLE_SKINNY + bool skinny = false; + auto platform_name = this->device().getPlatformName(); + // This is based on empirical calculation for AMD r9-nano and Fiji + if (platform_name.find("AMD") == 0) { + skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && + ((triple_dim.M < 1024 && triple_dim.N < 1024) || + (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); + } else { + skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || + ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) || + ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, 
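// evalToSycl above turns three runtime flags into compile-time template
// parameters by enumerating all eight combinations, so evalTyped can specialise
// the input mappers. A minimal sketch of the same dispatch pattern with a
// hypothetical stub in place of evalTyped (illustration only):
#include <cstdio>

template <bool lhs_contig, bool rhs_contig, bool rhs_reordered>
void eval_typed_stub() {
  std::printf("dispatch: %d %d %d\n", lhs_contig, rhs_contig, rhs_reordered);
}

void eval_dispatch_sketch(bool lhs_contig, bool rhs_contig, bool rhs_reordered) {
  if (lhs_contig) {
    if (rhs_contig) {
      if (rhs_reordered) eval_typed_stub<true, true, true>();
      else eval_typed_stub<true, true, false>();
    } else {
      if (rhs_reordered) eval_typed_stub<true, false, true>();
      else eval_typed_stub<true, false, false>();
    }
  } else {
    if (rhs_contig) {
      if (rhs_reordered) eval_typed_stub<false, true, true>();
      else eval_typed_stub<false, true, false>();
    } else {
      if (rhs_reordered) eval_typed_stub<false, false, true>();
      else eval_typed_stub<false, false, false>();
    }
  }
}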
triple_dim.M)) > 100)); + } + if (skinny) + adjustTT(buffer, lhs, rhs, triple_dim); + else +#endif // EIGEN_SYCL_DISABLE_SKINNY + adjustTT(buffer, lhs, rhs, triple_dim); + } } - // required by sycl to construct the expr on the device. Returns original left_impl - const TensorEvaluator& left_impl() const { - return choose(Cond(Layout) == static_cast(ColMajor)>(), this->m_leftImpl, this->m_rightImpl); + + template + void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + if (device().has_local_memory()) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF + if (!(device().has_local_memory())) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif } - // required by sycl to construct the expr on the device. Returns original right_impl - const TensorEvaluator& right_impl() const { - return choose(Cond(Layout) == static_cast(ColMajor)>(), this->m_rightImpl, this->m_leftImpl); + + template + void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { + const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); + const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); + const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; + const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; + + const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); + StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; + StorageIndex groupSizeK = + skinny + ? std::max(std::min(totalTilesK, + (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / + (groupSizeM * groupSizeN)), + StorageIndex(1)) + : StorageIndex(1); + + const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; + + const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; + + const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + const StorageIndex globalRange = totalGroupSize * localRange; + + const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) + ? 
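// evalTyped above picks one of three kernels purely from the contraction shape,
// plus a "skinny" heuristic for the general case. A compact sketch of that
// decision logic (illustration only; only the non-AMD branch of the skinny test
// is shown, and the enum/function names are hypothetical):
#include <algorithm>
#include <cstdint>

enum class ContractionKernelKind { Scalar, TensorVector, GeneralTT };

ContractionKernelKind pick_kernel_sketch(std::int64_t M, std::int64_t N) {
  if (M == 1 && N == 1) return ContractionKernelKind::Scalar;        // launchSC
  if (M == 1 || N == 1) return ContractionKernelKind::TensorVector;  // LaunchVT
  return ContractionKernelKind::GeneralTT;                           // adjustTT
}

// Any pair of dimensions differing by more than two orders of magnitude marks
// the problem as skinny, which enables splitting K across work-groups.
bool is_skinny_sketch(std::int64_t M, std::int64_t N, std::int64_t K) {
  auto ratio = [](std::int64_t a, std::int64_t b) { return std::max(a, b) / std::min(a, b); };
  return ratio(K, N) > 100 || ratio(K, M) > 100 || ratio(N, M) > 100;
}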
((Properties::DoubleBuffer + 1) * + (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + + ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * + (Properties::TileSizeDimN + Properties::BC)) + : StorageIndex(1); + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (groupSizeK == 1) { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + device().template binary_kernel_launcher( + lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim); + } else { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + CoeffReturnType *temp_pointer = static_cast( + device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device().template binary_kernel_launcher( + lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, + triple_dim); + + typedef Eigen::internal::SumReducer Op; + auto op = Op(); + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( + Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), + cl::sycl::range<1>(localRange)), + StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK); + + device().deallocate_temp(temp_pointer); + } } -}; -template struct KernelConstructor{ - typedef typename Eigen::internal::traits::_LhsNested LHSHostExpr; - typedef typename Eigen::internal::traits::_RhsNested RHSHostExpr; - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type LHSPlaceHolderExpr; - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type RHSPlaceHolderExpr; - LHSFunctorExpr lhs_functors; - RHSFunctorExpr rhs_functors; - LhsLocalAcc localLhs; - RhsLocalAcc localRhs; - OutAccessor out_res; - size_t out_offset; - Index roundUpK, M, N, K; - ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; - LeftNocontractT m_i_strides, m_left_nocontract_strides; - RightNocontractT m_j_strides, m_right_nocontract_strides; - LHSTupleType left_tuple_of_accessors; - RHSTupleType right_tuple_of_accessors; - Device dev; - - - KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_, - Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, - ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, - LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) - :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), - out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), - m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), - m_right_contracting_strides(m_right_contracting_strides_), - m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), - m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), - 
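// The launch configuration computed in launchTT above boils down to one
// round-up helper plus group/tile counting. A sketch of that arithmetic for the
// local-memory path (illustration only; max_k_groups stands in for the
// device-derived bound used above, and all names here are hypothetical):
#include <algorithm>
#include <cstdint>

std::int64_t round_up_sketch(std::int64_t x, std::int64_t y) { return ((x + y - 1) / y) * y; }

struct TTConfigSketch {
  std::int64_t groupSizeM, groupSizeN, groupSizeK, numTilesPerGroup, localRange, globalRange;
};

TTConfigSketch tt_config_sketch(std::int64_t M, std::int64_t N, std::int64_t K,
                                std::int64_t tileM, std::int64_t tileN, std::int64_t tileK,
                                std::int64_t localM, std::int64_t localN, bool skinny,
                                std::int64_t max_k_groups) {
  TTConfigSketch c{};
  c.groupSizeM = round_up_sketch(M, tileM) / tileM;
  c.groupSizeN = round_up_sketch(N, tileN) / tileN;
  const std::int64_t totalTilesK = round_up_sketch(K, tileK) / tileK;
  // Only the skinny path splits K over several work-groups; the partial outputs
  // are then summed by a second, partial-reduction kernel.
  c.groupSizeK = skinny ? std::max<std::int64_t>(std::min(totalTilesK, max_k_groups), 1) : 1;
  c.numTilesPerGroup = round_up_sketch(totalTilesK, c.groupSizeK) / c.groupSizeK;
  c.localRange = localM * localN;
  c.globalRange = c.groupSizeM * c.groupSizeN * c.groupSizeK * c.localRange;
  return c;
}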
left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){} - - void operator()(cl::sycl::nd_item<2> itemID) { - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type LHSDevExpr; - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type RHSDevExpr; - auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression(lhs_functors, left_tuple_of_accessors); - auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression(rhs_functors, right_tuple_of_accessors); - typedef decltype(lhs_dev_expr.expr) LeftArgType; - typedef decltype(rhs_dev_expr.expr) RightArgType; - typedef typename internal::conditional(Eigen::internal::traits::Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional(Eigen::internal::traits::Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - // initialize data mappers must happen inside the kernel for device eval - LhsMapper lhs(LeftEvaluator(choose(Cond(Eigen::internal::traits::Layout) == static_cast(ColMajor)>(), - lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); - RhsMapper rhs(RightEvaluator(choose(Cond(Eigen::internal::traits::Layout) == static_cast(ColMajor)>(), - rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); - auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); - // Matmul Kernel - // Thread identifiers - const Index mLocalThreadId = itemID.get_local(0); // Local ID row - const Index nLocalThreadId = itemID.get_local(1); // Local ID col - const Index mGroupId = itemID.get_group(0); // Work-group ID row - const Index nGroupId = itemID.get_group(1); // Work-group ID localCol - const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID - // Allocate register space - LhsScalar privateLhs; - RhsScalar privateRhs[WorkLoadPerThreadN]; - OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; - // Initialise the privateResumulation registers - for (Index wLPTM=0; wLPTM(0); - } - } +#ifndef EIGEN_SYCL_DISABLE_GEMV + template + void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, + StorageIndex NC, StorageIndex C) const { + const StorageIndex nonContractDim = NC; + EIGEN_CONSTEXPR StorageIndex NCFactor = 1; + EIGEN_CONSTEXPR StorageIndex CFactor = 1; + EIGEN_CONSTEXPR StorageIndex NCWindow = 16; + typedef Eigen::TensorSycl::internal::TVPanelSize + Properties; + const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); + const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); + const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); + const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); + const StorageIndex globalRange = + (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / 
(Properties::WorkLoadPerThreadC)); + const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; + const StorageIndex scratchSize = + (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (cNumGroups > 1) { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); - // Tile Lhs - for (Index lPTL=0; lPTL(0); - } - // Tile Rhs - for (Index lPTR=0; lPTR(0); + device().template binary_kernel_launcher( + vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C); - } - // Loop over all tiles - const Index numTiles = roundUpK/TileSizeDimK; - Index firstHalf=0; - do { - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - // Load the next tile of Lhs and Rhs into local memory - Index nextHalf = firstHalf + 1; - if (nextHalf < numTiles) { - // Tile A - for (Index lPTL=0; lPTL(0); - } - // Tile B - for (Index lPTR=0; lPTR(0); - } - } - // Loop over the values of a single tile - for (Index k=0; k Op; + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), + cl::sycl::range<1>(localRange)), + StorageIndex(1), Op(), nonContractDim, cNumGroups); + + device().deallocate_temp(temp_pointer); + } else { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + device().template binary_kernel_launcher( + vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C); } + } +#endif -}; -template struct LaunchSyclKernels { - -static const Index TileSizeDimM = 32ul; // Tile size for dimension M -static const Index TileSizeDimN = 32ul; // Tile size for dimension N -static const Index TileSizeDimK = 16ul; // Tile size for dimension K -static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M -static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N -static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) -static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) -static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression -static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression - -// RoundUp function to make sure that the global threadId is divisable by local threadId -static Index RoundUp(Index x, Index y) { - return ((((x) + (y) - 1) / (y))*(y)); -} +#ifndef EIGEN_SYCL_DISABLE_SCALAR + template + EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + StorageIndex K) const { + EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power 
of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; -template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT> - static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, - ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, - LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ - - typedef typename Self::XprType HostExpr; - typedef typename Eigen::internal::traits::_LhsNested LHSHostExpr; - typedef typename Eigen::internal::traits::_RhsNested RHSHostExpr; - typedef TensorEvaluator OrigLHSExpr; - typedef TensorEvaluator OrigRHSExpr; - typedef Eigen::TensorSycl::internal::FunctorExtractor LHSFunctorExpr; - typedef Eigen::TensorSycl::internal::FunctorExtractor RHSFunctorExpr; - // extract lhs functor list - LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); - // extract rhs functor list - RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl()); - - Index roundUpK = RoundUp(K, TileSizeDimK); - Index roundUpM = RoundUp(M, TileSizeDimM); - Index roundUpN = RoundUp(N, TileSizeDimN); - ptrdiff_t out_offset = self.device().get_offset(buffer); - self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { - /// work-around for gcc bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.left_impl())) LHSTupleType; - /// work-around for gcc bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.right_impl())) RHSTupleType; - // create lhs tuple of accessors - LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.left_impl()); - // create rhs tuple of accessors - RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.right_impl()); - - // Local memory for elements of Lhs - typedef cl::sycl::accessor LhsLocalAcc; - LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); - // Local memory for elements of Rhs - typedef cl::sycl::accessor RhsLocalAcc; - RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - - typedef cl::sycl::accessor OutAccessor; - //OutScalar memory - OutAccessor out_res= self.device(). template get_sycl_accessor(cgh, buffer); - // sycl parallel for - cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), - cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), - KernelConstructor(lhs_functors, rhs_functors, - localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice())); - }); - self.device().asynchronousExec(); + // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread + // reduces at least 512 elementss individually, we get better performance. + const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? 
local_range : 1); + const StorageIndex global_range = num_work_group * local_range; + + typedef Eigen::TensorSycl::internal::GeneralScalarContraction< + CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false> + ContractKernelName; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + device().template binary_kernel_launcher(lhs, rhs, tmp_global_accessor, + thread_range, local_range, K); + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()); + + device().deallocate_temp(temp_pointer); + } else { + device().template binary_kernel_launcher(lhs, rhs, buffer, thread_range, + local_range, K); + } } -}; +#endif -} // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + this->m_leftImpl.cleanup(); + this->m_rightImpl.cleanup(); + + if (this->m_result) { + this->m_device.deallocate_temp(this->m_result); + this->m_result = NULL; + } + } + // The placeholder accessors must bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + this->m_leftImpl.bind(cgh); + this->m_rightImpl.bind(cgh); + this->m_result.bind(cgh); + } +}; +} // namespace Eigen +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 5c94165d1..0218727d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -18,207 +18,252 @@ namespace Eigen { /** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. - * - * - */ -template -struct EigenConvolutionKernel1D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; -internal::IndexMapper::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize, range_x, range_y; -Buffer_accessor buffer_acc; -ptrdiff_t out_offset; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel1D(internal::IndexMapper::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, - Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), - buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} - + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
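// launchSC above sizes the scalar-contraction launch so that the second
// (full-reduction) kernel is only used when each work-item would otherwise have
// to accumulate more than roughly 512 elements. A sketch of that heuristic
// (illustration only; 512 and the power-of-two local range mirror the code above):
#include <cstdint>

std::int64_t scalar_reduction_groups_sketch(std::int64_t K, std::int64_t local_range) {
  // local_range is assumed to be a power of two so the tree reduction is exact.
  const bool needs_two_steps = (K + 512 * local_range - 1) / (512 * local_range) > 1;
  return needs_two_steps ? local_range : 1;  // number of work-groups in the first step
}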
+ * + * + */ + +enum class convolution_type { CONV1D, CONV2D, CONV3D }; +template +struct EigenConvolutionKernel; +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const size_t kernelSize; + const cl::sycl::range<2> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const size_t kernelSize_, const cl::sycl::range<2> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernelSize(kernelSize_), + input_range(input_range_) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) { + return (boolean_check[0] && boolean_check[1]); + } void operator()(cl::sycl::nd_item<2> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); //the required row to be calculated for the for each plane in shered memory - const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input; - const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1)); + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + // the required row to be calculated for the for each plane in shered memory + const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1); + const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input; + const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0]; + const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1)); /// fill the shared memory - for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { - const size_t local_index = i + plane_kernel_offset ; - const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start); - if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){ - local_acc[local_index] = device_evaluator.coeff(tensor_index); - } - else local_acc[local_index]=0.0f; + for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) { + const size_t local_index = i + plane_kernel_offset; + const size_t tensor_index = + plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset); + + local_acc[local_index] = + (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1]) + ? 
device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); } itemID.barrier(cl::sycl::access::fence_space::local_space); - // calculate the convolution - const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x - if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){ + // calculate the convolution // output start x + const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]); + if (boundary_check(itemID.get_global_id() < input_range)) { CoeffReturnType result = static_cast(0); - const size_t index = plane_kernel_offset+ itemID.get_local(0); + const size_t index = plane_kernel_offset + itemID.get_local_id(0); for (size_t k = 0; k < kernelSize; ++k) { result += (local_acc[k + index] * kernel_ptr[k]); } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1)) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start); - buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start); + buffer_ptr[tensor_index] = result; } } }; - -template -struct EigenConvolutionKernel2D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; -internal::IndexMapper::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; -Buffer_accessor buffer_acc; -ptrdiff_t out_offset; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel2D(internal::IndexMapper::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, - Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), - buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<2> kernel_size; + const cl::sycl::range<3> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } void operator()(cl::sycl::nd_item<3> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - 
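// The CONV1D kernel above stages one tile of the input plus a (kernelSize - 1)
// halo into local memory, zero-pads out-of-range reads, and then lets each
// work-item produce one output element. A serial sketch of one work-group's
// worth of that logic (illustration only; hypothetical names, no SYCL, and
// output is assumed to be sized for a "valid" convolution):
#include <cstddef>
#include <vector>

void conv1d_tile_sketch(const std::vector<float> &input, const std::vector<float> &kernel,
                        std::size_t tile_start, std::size_t tile_size, std::vector<float> &output) {
  const std::size_t kernel_size = kernel.size();
  std::vector<float> local(tile_size + kernel_size - 1, 0.f);  // tile + halo, zero padded
  for (std::size_t i = 0; i < local.size(); ++i) {
    const std::size_t g = tile_start + i;
    if (g < input.size()) local[i] = input[g];
  }
  // (the kernel issues a local barrier here before the compute phase)
  for (std::size_t x = 0; x < tile_size && tile_start + x < output.size(); ++x) {
    float result = 0.f;
    for (std::size_t k = 0; k < kernel_size; ++k) result += local[x + k] * kernel[k];
    output[tile_start + x] = result;
  }
}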
auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory - const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2)); - const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input; - - /// fill the shared memory - const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; - for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) { - const size_t local_input_offset = num_x_input * (j + plane_kernel_offset); - for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + // the required row to be calculated for the for each plane in shered memory + const auto num_input = cl::sycl::range<2>{ + (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)}; + + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2)); + const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1]; + + const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + // fill the local memory + bool in_range_dim2 = itemID.get_global_id(2) < input_range[2]; + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset); + bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { const size_t local_index = i + local_input_offset; - const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start ); - if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){ - local_acc[local_index] = device_evaluator.coeff(tensor_index); - } - else local_acc[local_index]=0.0f; + const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1]); + local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && + in_range_dim1 && in_range_dim2) + ? 
device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } } - } itemID.barrier(cl::sycl::access::fence_space::local_space); - // calculate the convolution - const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x - const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y - if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){ + // output offset start for each thread + const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + if (boundary_check(itemID.get_global_id() < input_range)) { CoeffReturnType result = static_cast(0); - for (size_t j = 0; j < kernelSize_y; j++) { - size_t kernel_offset =kernelSize_x * j; - const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0); - for (size_t i = 0; i < kernelSize_x; i++) { - result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]); + + for (size_t j = 0; j < kernel_size[1]; j++) { + size_t kernel_offset = kernel_size[0] * j; + const size_t index = + (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0); + for (size_t i = 0; i < kernel_size[0]; i++) { + result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]); } } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2)) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start); - buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0], + itemID.get_local_id(1) + output_offset[1]); + + buffer_ptr[tensor_index] = result; } } }; +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<3> kernel_size; + const cl::sycl::range<3> input_range; + const size_t numP; + + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_, + const size_t numP_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_), + numP(numP_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + void operator()(cl::sycl::nd_item<3> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; + const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; -template -struct EigenConvolutionKernel3D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression::Type 
PlaceHolderExpr; -internal::IndexMapper::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; -Buffer_accessor buffer_acc; -ptrdiff_t out_offset; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel3D(internal::IndexMapper::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , - const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, - Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), - kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), - buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + const auto output_offset = + cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()}; - void operator()(cl::sycl::nd_item<3> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); //the required row to be calculated for the for each plane in shered memory - const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; - const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2]; - for(size_t p=0; p(0); - for (size_t k = 0; k < kernelSize_z; k++) { - for (size_t j = 0; j < kernelSize_y; j++) { - for (size_t i = 0; i < kernelSize_x; i++) { - const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k); - const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2)))); + for (size_t k = 0; k < kernel_size[2]; k++) { + for (size_t j = 0; j < kernel_size[1]; j++) { + for (size_t i = 0; i < kernel_size[0]; i++) { + const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k); + const size_t local_index = + ((i + itemID.get_local_id(0)) + + num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2)))); + result += (local_acc[local_index] * kernel_ptr[kernel_index]); } } } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start ); - 
buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]); + buffer_ptr[tensor_index] = result; } itemID.barrier(cl::sycl::access::fence_space::local_space); @@ -226,25 +271,32 @@ EigenConvolutionKernel3D(internal::IndexMapper -struct TensorEvaluator, const Eigen::SyclDevice> -{ +template +struct TensorEvaluator, Eigen::SyclDevice> { typedef TensorConvolutionOp XprType; - static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumDims = + internal::array_size::Dimensions>::value; static const int NumKernelDims = internal::array_size::value; typedef typename XprType::Index Index; typedef DSizes Dimensions; - typedef typename TensorEvaluator::Dimensions KernelDimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; typedef const Eigen::SyclDevice Device; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory KernelStorage; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, PacketAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, + Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; @@ -253,13 +305,22 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device) + : m_inputImpl(op.inputExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernelImpl(op.kernelExpression(), device), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions &input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions &kernel_dims = + m_kernelImpl.dimensions(); m_dimensions = m_inputImpl.dimensions(); for (int i = 0; i < NumKernelDims; ++i) { @@ -271,21 +332,17 @@ struct TensorEvaluator::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = internal::unpacket_traits::size; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { preloadKernel(); m_inputImpl.evalSubExprsIfNeeded(NULL); if (data) { executeEval(data); return false; } else { - m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + m_buf = (EvaluatorPointerType)m_device.get( + (Scalar 
*)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); executeEval(m_buf); return true; } @@ -294,194 +351,194 @@ struct TensorEvaluator::PointerType data() const { return m_buf; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { // Don't make a local copy of the kernel unless we have to (i.e. it's an // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); + typename KernelStorage::Type in_place = m_kernelImpl.data(); if (in_place) { m_kernel = in_place; m_local_kernel = false; } else { ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz)); typedef TensorEvalToOp EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); + EvalTo evalToTmp(m_device.get(local), m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); m_kernel = local; m_local_kernel = true; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const { - typedef TensorEvaluator InputEvaluator; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const { + typedef TensorEvaluator InputEvaluator; typedef typename InputEvaluator::Dimensions InputDims; + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const auto input_dim = std::array{numX, numP}; + auto global_range = cl::sycl::range<2>{}; + auto local_range = cl::sycl::range<2>{}; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + + m_device.parallel_for_setup(input_dim, global_range, local_range); + const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]); + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[0]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + typedef EigenConvolutionKernel + ConvKernel; + + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])); + break; + } - typedef Eigen::TensorSycl::internal::FunctorExtractor InputFunctorExpr; - // extract input functor list - InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); - ptrdiff_t out_offset = m_device.get_offset(data); - - - m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { - - typedef cl::sycl::accessor InputLocalAcc; - /// work-around for gcc 4.8 auto bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, m_inputImpl)) InputTupleType; - // create input tuple of accessors - InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, m_inputImpl); - - typedef cl::sycl::accessor OutputAccessorType; - OutputAccessorType out_res= m_device. template get_sycl_accessor(cgh, data); - typedef cl::sycl::accessor KernelAccessorType; - KernelAccessorType kernel_acc= m_device. 
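For the 1D case, executeEval keeps the convolved dimension at its full length (numX) and flattens every other dimension into a plane count (numP), then sizes the staged tile as one padded row per plane and checks it against the device's local-memory budget. The helper below is a hypothetical standalone sketch of that sizing; the names and the element-based limit are assumptions, since Eigen obtains the tile sizes from parallel_for_setup and the limit from sharedMemPerBlock().

// Hypothetical helper mirroring the 1D launch sizing: the convolved dimension
// keeps its length, the rest is flattened, and the tile needs one padded row
// per plane. The local-memory limit is expressed in elements for simplicity.
#include <cassert>
#include <cstddef>
#include <iostream>

struct LaunchConfig1D {
  std::size_t numX, numP, local_memory_elems;
};

LaunchConfig1D make_conv1d_config(std::size_t total_size, std::size_t numX,
                                  std::size_t kernel_size, std::size_t tile_x,
                                  std::size_t tile_p,
                                  std::size_t local_mem_limit_elems) {
  LaunchConfig1D cfg;
  cfg.numX = numX;                                               // convolved dimension
  cfg.numP = total_size / numX;                                  // flattened remainder
  cfg.local_memory_elems = (tile_x + kernel_size - 1) * tile_p;  // padded rows
  assert(cfg.local_memory_elems <= local_mem_limit_elems);
  return cfg;
}

int main() {
  LaunchConfig1D cfg = make_conv1d_config(1024 * 64, 1024, 7, 64, 4, 48 * 1024);
  std::cout << cfg.numP << " " << cfg.local_memory_elems << "\n";  // 64 280
}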
template get_sycl_accessor(cgh, m_kernel); - - switch (NumKernelDims) { - case 1: { - const size_t numX = dimensions()[m_indices[0]]; - const size_t numP = dimensions().TotalSize() / numX; - const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y; - m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y ); - const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y); - gpu_assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range - auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - const array indices{{m_indices[0]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), - EigenConvolutionKernel1D( - indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); - break; - } - - case 2: { - const size_t idxX =static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; - const size_t idxY =static_cast(Layout) == static_cast(ColMajor) ? 1 : 0; - const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const size_t numX = dimensions()[m_indices[idxX]]; - const size_t numY = dimensions()[m_indices[idxY]]; - const size_t numP = dimensions().TotalSize() / (numX*numY); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; - m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); - const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z; - gpu_assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range - auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - const array indices {{m_indices[idxX], m_indices[idxY]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), - EigenConvolutionKernel2D( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); - break; - } + case 2: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0}; + auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]]}; + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numP = dimensions().TotalSize() / (numX * numY); + auto input_dim = std::array{numX, numY, numP}; + + auto global_range = cl::sycl::range<3>{}; + auto local_range = cl::sycl::range<3>{}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + + const size_t local_memory_size = + (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2]; + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}}; + const array kernel_dims{ + {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + typedef EigenConvolutionKernel + ConvKernel; + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}); + break; + } - case 3: { - const size_t idxX =static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; - const size_t idxY =static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; - const size_t idxZ =static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; - const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ]; - const size_t numX = dimensions()[m_indices[idxX]]; - const size_t numY = dimensions()[m_indices[idxY]]; - const size_t numZ = dimensions()[m_indices[idxZ]]; - const size_t numP = dimensions().TotalSize() / (numX*numY*numZ); - const array indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; - m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); - const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1); - gpu_assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range - auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), - EigenConvolutionKernel3D( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, - numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); - break; - } + case 3: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 2, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 
2 : 0}; + + auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]], + (size_t)m_kernelImpl.dimensions()[kernel_index[2]]}; + + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numZ = dimensions()[m_indices[kernel_index[2]]]; + auto input_dim = std::array{numX, numY, numZ}; + const size_t numP = dimensions().TotalSize() / (numX * numY * numZ); + + const array indices{ + {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]], + m_kernelImpl.dimensions()[kernel_index[1]], + m_kernelImpl.dimensions()[kernel_index[2]]}}; + + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + auto global_range = cl::sycl::range<3>{}; + auto local_range = cl::sycl::range<3>{}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + auto local_memory_range = (local_range + kernel_size - 1); + const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2]; + + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + typedef EigenConvolutionKernel + ConvKernel; + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP); + break; + } - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); } - }); - m_device.asynchronousExec(); + } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(m_buf); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf != NULL); eigen_assert(index < m_dimensions.TotalSize()); return m_buf[index]; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const - { - eigen_assert(m_buf); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const { + eigen_assert(m_buf != NULL); eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt(m_buf+index); + return internal::ploadt(m_buf + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost // model. const double kernel_size = m_kernelImpl.dimensions().TotalSize(); // We ignore the use of fused multiply-add. 
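The removed 3D path just above sized its z halo with kernel_size_y (visible in the deleted expression (tileSize_z + kernel_size_y - 1)), whereas the rewritten 2D/3D paths form local_range + kernel_size - 1 component-wise, so every dimension carries its own kernel extent. A host-only sketch of that component-wise sizing follows; the function name and the example numbers are assumptions for illustration.

// Sketch of the component-wise local-memory sizing used by the new 2D/3D paths:
// each dimension of the staged tile is local_range[d] + kernel_size[d] - 1.
#include <array>
#include <cstddef>
#include <iostream>

template <std::size_t N>
std::size_t local_memory_elems(const std::array<std::size_t, N> &local_range,
                               const std::array<std::size_t, N> &kernel_size) {
  std::size_t elems = 1;
  for (std::size_t d = 0; d < N; ++d)
    elems *= local_range[d] + kernel_size[d] - 1;  // halo in every dimension
  return elems;
}

int main() {
  std::array<std::size_t, 3> local{{8, 8, 4}}, kernel{{3, 5, 3}};
  std::cout << local_memory_elems(local, kernel) << "\n";  // 10 * 12 * 6 = 720
}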
- const double convolve_compute_cost = - TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); const double firstIndex_compute_cost = NumDims * - (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_kernelImpl.bind(cgh); + m_inputImpl.bind(cgh); + m_buf.bind(cgh); + m_kernel.bind(cgh); } private: // No assignment (copies are needed by the kernels) - TensorEvaluator& operator = (const TensorEvaluator&); - TensorEvaluator m_inputImpl; + TensorEvaluator &operator=(const TensorEvaluator &); + TensorEvaluator m_inputImpl; KernelArgType m_kernelArg; - TensorEvaluator m_kernelImpl; + TensorEvaluator m_kernelImpl; Indices m_indices; Dimensions m_dimensions; - Scalar* m_buf; - const Scalar* m_kernel; + EvaluatorPointerType m_buf; + typename KernelStorage::Type m_kernel; bool m_local_kernel; - const Eigen::SyclDevice& m_device; -}; + const Eigen::SyclDevice EIGEN_DEVICE_REF m_device; +}; // namespace Eigen -} // end namespace Eigen +} // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 6f8b6f193..df591c21d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -16,7 +16,6 @@ #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H #include - namespace Eigen { namespace TensorSycl { @@ -70,9 +69,9 @@ struct SyclDeviceInfo { } // end namespace TensorSycl typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; -// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and -// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently -// TensorFlow via the Eigen SYCL Backend. +// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. EIGEN_STRONG_INLINE auto get_sycl_supported_devices() -> decltype(cl::sycl::device::get_devices()) { #ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR @@ -421,6 +420,91 @@ class QueueInterface { return pMapper.get_offset(ptr); } + template + EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs, + const Rhs &rhs, OutPtr outptr, + Range thread_range, + Index scratchSize, + T... 
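The binary_kernel_launcher added to QueueInterface packages one fixed recipe: bind the placeholder accessors, allocate a local scratch accessor of scratchSize elements, and submit a parallel_for whose kernel functor is constructed from (scratch, lhs, rhs, output, trailing arguments...). The host-only C++ sketch below shows only that shape; the launcher and kernel names are illustrative assumptions, a std::vector stands in for the local accessor, and a serial loop stands in for parallel_for.

// Host-only sketch of the variadic launcher shape: the kernel functor owns the
// scratch, the operands and any trailing arguments, and is invoked per item.
#include <cstddef>
#include <iostream>
#include <vector>

template <typename Kernel, typename Lhs, typename Rhs, typename Out, typename... T>
void binary_kernel_launcher(const Lhs &lhs, const Rhs &rhs, Out &out,
                            std::size_t range, std::size_t scratch_size, T... var) {
  std::vector<float> scratch(scratch_size);  // stands in for the local accessor
  Kernel k{scratch, lhs, rhs, out, var...};  // kernel bundles everything it needs
  for (std::size_t i = 0; i < range; ++i) k(i);  // stands in for parallel_for
}

struct AxpyKernel {  // hypothetical kernel: out[i] = alpha * lhs[i] + rhs[i]
  std::vector<float> &scratch;
  const std::vector<float> &lhs, &rhs;
  std::vector<float> &out;
  float alpha;
  void operator()(std::size_t i) const { out[i] = alpha * lhs[i] + rhs[i]; }
};

int main() {
  std::vector<float> a(4, 1.f), b(4, 2.f), c(4, 0.f);
  binary_kernel_launcher<AxpyKernel>(a, b, c, c.size(), /*scratch*/ 0, 3.f);
  std::cout << c[0] << "\n";  // 5
}

The unary and nullary launchers follow the same pattern with fewer bound operands, which is why the evaluators only need to expose a bind(cgh) member.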
var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + lhs.bind(cgh); + rhs.bind(cgh); + outptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + template + EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr, + OutPtr &outptr, + Range thread_range, + Index scratchSize, + T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + inptr.bind(cgh); + outptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, inptr, outptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + template + EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr, + Range thread_range, + Index scratchSize, + T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + inptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, inptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + EIGEN_STRONG_INLINE void synchronize() const { #ifdef EIGEN_EXCEPTIONS m_queue.wait_and_throw(); @@ -429,6 +513,7 @@ class QueueInterface { #endif } + EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { set_latest_event(e); #ifndef EIGEN_SYCL_ASYNC_EXECUTION @@ -457,11 +542,10 @@ class QueueInterface { /// This is used to prepare the number of threads and also the number of /// threads per block for sycl kernels template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, - Index &tileSize0, - Index &tileSize1, Index &rng0, - Index &rng1, Index &GRange0, - Index &GRange1) const { + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + std::array input_range = input_dim; Index max_workgroup_Size = static_cast(getNearestPowerOfTwoWorkGroupSize()); max_workgroup_Size = @@ -469,26 +553,28 @@ class QueueInterface { EIGEN_SYCL_LOCAL_THREAD_DIM1), static_cast(max_workgroup_Size)); Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 = + local_range[1] = static_cast(std::pow(2, static_cast(pow_of_2 / 2))); - rng1 = dim1; - if (rng1 == 0) rng1 = static_cast(1); - GRange1 = rng1; - if (tileSize1 > GRange1) - tileSize1 = GRange1; - else if (GRange1 > tileSize1) { - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > 
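parallel_for_setup now fills cl::sycl range objects directly, but the per-dimension policy is unchanged: start from the nearest power-of-two work-group size, split it across the dimensions, and then either clamp the local range to the problem size or pad the global range up to the next multiple of the local range. The sketch below shows only that clamp-or-round-up rule for one dimension; the function name is an assumption.

// Host-only sketch of the rounding policy applied per dimension: clamp the tile
// to the problem size, or pad the global range to a multiple of the tile.
#include <cstddef>
#include <iostream>

void fit_range(std::size_t dim, std::size_t &local, std::size_t &global) {
  if (dim == 0) dim = 1;               // degenerate dimensions become 1
  global = dim;
  if (local > global) {
    local = global;                    // clamp the tile to the problem size
  } else if (global % local != 0) {
    global += local - global % local;  // round up to a multiple of the tile
  }
}

int main() {
  std::size_t local = 16, global = 0;
  fit_range(100, local, global);
  std::cout << local << " " << global << "\n";  // 16 112
}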
global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) + global_range[1] += static_cast(local_range[1] - xMode); } - tileSize0 = static_cast(max_workgroup_Size / tileSize1); - rng0 = dim0; - if (rng0 == 0) rng0 = static_cast(1); - GRange0 = rng0; - if (tileSize0 > GRange0) - tileSize0 = GRange0; - else if (GRange0 > tileSize0) { - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + local_range[0] = static_cast(max_workgroup_Size / local_range[1]); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) + global_range[0] += static_cast(local_range[0] - xMode); } } @@ -496,9 +582,9 @@ class QueueInterface { /// threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup( - Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, - Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, - Index &GRange1, Index &GRange2) const { + const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + std::array input_range = input_dim; Index max_workgroup_Size = static_cast(getNearestPowerOfTwoWorkGroupSize()); max_workgroup_Size = @@ -506,45 +592,48 @@ class QueueInterface { EIGEN_SYCL_LOCAL_THREAD_DIM1), static_cast(max_workgroup_Size)); Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 = + local_range[2] = static_cast(std::pow(2, static_cast(pow_of_2 / 3))); - rng2 = dim2; - if (rng2 == 0) rng1 = static_cast(1); - GRange2 = rng2; - if (tileSize2 > GRange2) - tileSize2 = GRange2; - else if (GRange2 > tileSize2) { - Index xMode = static_cast(GRange2 % tileSize2); - if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); + input_range[2] = input_dim[2]; + if (input_range[2] == 0) input_range[1] = static_cast(1); + global_range[2] = input_range[2]; + if (local_range[2] > global_range[2]) + local_range[2] = global_range[2]; + else if (global_range[2] > local_range[2]) { + Index xMode = static_cast(global_range[2] % local_range[2]); + if (xMode != 0) + global_range[2] += static_cast(local_range[2] - xMode); } pow_of_2 = static_cast( - std::log2(static_cast(max_workgroup_Size / tileSize2))); - tileSize1 = + std::log2(static_cast(max_workgroup_Size / local_range[2]))); + local_range[1] = static_cast(std::pow(2, static_cast(pow_of_2 / 2))); - rng1 = dim1; - if (rng1 == 0) rng1 = static_cast(1); - GRange1 = rng1; - if (tileSize1 > GRange1) - tileSize1 = GRange1; - else if (GRange1 > tileSize1) { - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) + global_range[1] += static_cast(local_range[1] - xMode); } - tileSize0 = - static_cast(max_workgroup_Size / (tileSize1 * tileSize2)); - rng0 = dim0; - if (rng0 == 0) rng0 = static_cast(1); - GRange0 
= rng0; - if (tileSize0 > GRange0) - tileSize0 = GRange0; - else if (GRange0 > tileSize0) { - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + local_range[0] = static_cast(max_workgroup_Size / + (local_range[1] * local_range[2])); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) + global_range[0] += static_cast(local_range[0] - xMode); } } EIGEN_STRONG_INLINE bool has_local_memory() const { -#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) +#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) return false; #elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) return true; @@ -768,25 +857,19 @@ struct SyclDevice : public SyclDeviceBase { /// This is used to prepare the number of threads and also the number of /// threads per block for sycl kernels template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, - Index &tileSize0, - Index &tileSize1, Index &rng0, - Index &rng1, Index &GRange0, - Index &GRange1) const { - queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, - rng1, GRange0, GRange1); + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); } /// This is used to prepare the number of threads and also the number of /// threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup( - Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, - Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, - Index &GRange1, Index &GRange2) const { - queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, - tileSize2, rng0, rng1, rng2, GRange0, - GRange1, GRange2); + const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); } /// allocate device memory @@ -943,6 +1026,22 @@ struct SyclDevice : public SyclDeviceBase { EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return queue_stream()->getDeviceVendor(); } + template + EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const { + queue_stream()->template binary_kernel_launcher( + var...); + } + template + EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const { + queue_stream()->template unary_kernel_launcher( + var...); + } + + template + EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... 
var) const { + queue_stream()->template nullary_kernel_launcher( + var...); + } }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 9926046b9..b83174ab7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -649,131 +649,75 @@ EIGEN_STRONG_INLINE void TensorExecutor -struct ExecExprFunctorKernel_impl { +template +struct ExecExprFunctorKernel { typedef typename Evaluator::Index Index; - const Index range; - const Index vectorizable_threads; Evaluator evaluator; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( - const Index range_, const Index vectorizable_threads_, - Evaluator evaluator_) - : range(range_), vectorizable_threads(vectorizable_threads_), - evaluator(evaluator_) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void - operator()(cl::sycl::nd_item<1> itemID) { + const Index range; + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel( + const Scratch, Evaluator evaluator_, const Index range_) + : evaluator(evaluator_), range(range_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()( + cl::sycl::nd_item<1> itemID) { + compute(itemID); + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { Index gId = static_cast(itemID.get_global_linear_id()); Index total_threads = itemID.get_global_range(0); - EIGEN_UNROLL_LOOP + for (Index i = gId; i < range; i += total_threads) { evaluator.evalScalar(i); } } -}; - -template -struct ExecExprFunctorKernel_impl { - typedef typename Evaluator::Index Index; - const Index range; - const Index vectorizable_threads; - Evaluator evaluator; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( - const Index range_, const Index vectorizable_threads_, - Evaluator evaluator_) - : range(range_), vectorizable_threads(vectorizable_threads_), - evaluator(evaluator_) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void - operator()(cl::sycl::nd_item<1> itemID) { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { + const Index vectorizedRange = + (range / Evaluator::PacketSize) * Evaluator::PacketSize; Index gId = static_cast(itemID.get_global_linear_id()); - if (gId < vectorizable_threads) { - const Index PacketSize = Eigen::internal::unpacket_traits< - typename Evaluator::PacketReturnType>::size; - evaluator.evalPacket(gId * PacketSize); - gId += (vectorizable_threads * PacketSize); - EIGEN_UNROLL_LOOP - for (Index i = gId; i < range; i += vectorizable_threads) { - evaluator.evalScalar(i); - } + const Index step = Evaluator::PacketSize * itemID.get_global_range(0); + const Index start = Evaluator::PacketSize * gId; + for (Index i = start; i < vectorizedRange; i += step) { + evaluator.evalPacket(i); + } + gId += vectorizedRange; + for (Index i = gId; i < range; i += itemID.get_global_range(0)) { + evaluator.evalScalar(i); } } }; -template -struct ExecExprFunctorKernel - : ExecExprFunctorKernel_impl< - ::Eigen::internal::IsVectorizable::value, - Evaluator> { - ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, - const Evaluator &evaluator) - : ExecExprFunctorKernel_impl< - ::Eigen::internal::IsVectorizable::value, - Evaluator>(range_, vectorizable_threads_, evaluator) {} -}; - -template -struct ExecExprFunctorKernel - : ExecExprFunctorKernel_impl { - 
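The rewritten ExecExprFunctorKernel splits the index range into a packet-aligned prefix, walked in strides of PacketSize times the global range, followed by a scalar tail, instead of dispatching to separate vectorized and non-vectorized kernel types. The plain C++ analogue below demonstrates that loop structure; the names are assumptions and a fixed packet of four floats stands in for Evaluator::PacketSize.

// Plain C++ analogue of the vectorised compute() path: process a packet-aligned
// prefix in PacketSize steps, then finish the remainder one element at a time.
#include <cstddef>
#include <iostream>
#include <vector>

static const std::size_t kPacketSize = 4;  // stands in for Evaluator::PacketSize

void eval_range(std::vector<float> &data, std::size_t thread_id,
                std::size_t num_threads) {
  const std::size_t range = data.size();
  const std::size_t vectorized_range = (range / kPacketSize) * kPacketSize;
  const std::size_t step = kPacketSize * num_threads;
  for (std::size_t i = kPacketSize * thread_id; i < vectorized_range; i += step)
    for (std::size_t k = 0; k < kPacketSize; ++k)  // stands in for evalPacket(i)
      data[i + k] += 1.f;
  for (std::size_t i = vectorized_range + thread_id; i < range; i += num_threads)
    data[i] += 1.f;                                // stands in for evalScalar(i)
}

int main() {
  std::vector<float> v(10, 0.f);
  for (std::size_t t = 0; t < 2; ++t) eval_range(v, t, 2);
  std::cout << v[9] << "\n";  // every element touched exactly once: prints 1
}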
ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, - const Evaluator &evaluator) - : ExecExprFunctorKernel_impl( - range_, vectorizable_threads_, evaluator) {} -}; - template class TensorExecutor { - public: + public: typedef typename Expression::Index Index; - static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { - Eigen::TensorEvaluator evaluator(expr, dev); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const Eigen::SyclDevice& dev) { + typedef Eigen::TensorEvaluator Evaluator; + Evaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { Index range, GRange, tileSize; Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); total_size = (total_size == 0) ? 1 : total_size; - const int PacketSize = Eigen::PacketType< - typename Eigen::TensorEvaluator::CoeffReturnType, - Eigen::SyclDevice>::size; - Index vectorizable_threads = - static_cast(total_size / PacketSize); + const int PacketSize = + Eigen::PacketType::size; + Index vectorizable_threads = static_cast(total_size / PacketSize); dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); range = total_size; - auto f = [&](cl::sycl::handler &cgh) { - evaluator.bind(cgh); - typedef ExecExprFunctorKernel> - conditional_vectorized_kernel; - - typedef ExecExprFunctorKernel> - non_vectorized_kernel; -// This is to make sure that an expression with a size less than vectorized size -// will not call the vectorized kernel. -// The reason for having this kernel is that the vectorisable parameter is a -// compile-time parameter, -// however, the size of a tensor is a run-time parameter - (vectorizable_threads) - ? 
cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - dev.program().template get_kernel(), -#endif - cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), - cl::sycl::range<1>(tileSize)), - conditional_vectorized_kernel(range, vectorizable_threads, - evaluator)) - : cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - dev.program().template get_kernel(), -#endif - cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), - cl::sycl::range<1>(tileSize)), - non_vectorized_kernel(range, vectorizable_threads, - evaluator)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f)); - dev.async_synchronize(e); + + dev.template nullary_kernel_launcher< + typename Evaluator::CoeffReturnType, + ExecExprFunctorKernel >( + evaluator, + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + Index(1), range); } evaluator.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 389d5d906..b115e502b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -123,7 +123,7 @@ struct StorageMemory : StorageMemory {}; namespace TensorSycl { namespace internal{ -template class ReductionFunctor; +template class GenericNondeterministicReducer; } } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 700337539..d3628f94e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -421,7 +421,7 @@ template struct MemcpyTrigge #ifdef EIGEN_USE_GPU template struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { return contiguous > 4*1024*1024; } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } }; #endif @@ -430,7 +430,7 @@ template struct MemcpyTriggerForSlicing struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { return contiguous > 4*1024*1024; } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 84604cf41..0bb1e643e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -946,7 +946,7 @@ struct TensorReductionEvaluatorBase friend class TensorSycl::internal::ReductionFunctor; + template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer; // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer template friend struct internal::GenericReducer; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index a379f5a94..387c3edf4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -11,167 +11,576 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
/***************************************************************** - * TensorSyclPlaceHolderExpr.h + * TensorReductionSycl.h * * \brief: - * This is the specialisation of the placeholder expression based on the - * operation type + * This is the specialization of the reduction operation. Two phase reduction approach + * is used since the GPU does not have Global Synchronization for global memory among + * different work-group/thread block. To solve the problem, we need to create two kernels + * to reduce the data, where the first kernel reduce the data locally and each local + * workgroup/thread-block save the input data into global memory. In the second phase (global reduction) + * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element. + * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU: + * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf * -*****************************************************************/ + *****************************************************************/ #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP - namespace Eigen { +namespace TensorSycl { namespace internal { -template struct syclGenericBufferReducer{ -template -static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ - do { - auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable { - cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, - cl::sycl::range<1>{std::min(length, local)}}; - /* Two accessors are used: one to the buffer that is being reduced, - * and a second to local memory, used to store intermediate data. */ - auto aI =bufI.template get_access(h); - auto aOut =bufOut.template get_access(h); - typedef decltype(aI) InputAccessor; - typedef decltype(aOut) OutputAccessor; - typedef cl::sycl::accessor LocalAccessor; - LocalAccessor scratch(cl::sycl::range<1>(local), h); - - /* The parallel_for invocation chosen is the variant with an nd_item - * parameter, since the code requires barriers for correctness. */ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer(op, aOut, out_offset, aI, scratch, length, local)); - }; - dev.sycl_queue().submit(f); - dev.asynchronousExec(); - - /* At this point, you could queue::wait_and_throw() to ensure that - * errors are caught quickly. However, this would likely impact - * performance negatively. 
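The new header comment describes the core constraint: work-groups cannot synchronize with each other through global memory, so the reduction runs in two phases, with each work-group writing one partial result in phase one and a single work-group folding those partials in phase two. The host-only sketch below illustrates that phase split for a sum; the function name and the serial loops standing in for work-groups are assumptions.

// Host-only sketch of the two-phase reduction described above: phase one
// produces one partial result per "work-group", phase two folds the partials.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

float two_phase_sum(const std::vector<float> &in, std::size_t num_groups) {
  std::vector<float> partials(num_groups, 0.f);
  const std::size_t chunk = (in.size() + num_groups - 1) / num_groups;
  for (std::size_t g = 0; g < num_groups; ++g) {  // phase 1: one slice per group
    const std::size_t begin = g * chunk;
    const std::size_t end = std::min(in.size(), begin + chunk);
    for (std::size_t i = begin; i < end; ++i) partials[g] += in[i];
  }
  // phase 2: a single group reduces the intermediate buffer to one element
  return std::accumulate(partials.begin(), partials.end(), 0.f);
}

int main() {
  std::vector<float> v(1000, 1.f);
  std::cout << two_phase_sum(v, 8) << "\n";  // 1000
}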
*/ - length = length / local; - - } while (length > 1); -} +template +struct OpDefiner { + typedef typename Vectorise::PacketReturnType PacketReturnType; + typedef Op type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, + const Index &) { + return accumulator; + } }; -template struct syclGenericBufferReducer, CoeffReturnType>{ -template -static void run(Eigen::internal::MeanReducer, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ - syclGenericBufferReducer, CoeffReturnType>::run(Eigen::internal::SumReducer(), - bufOut, out_offset, bufI, dev, length, local); -} +template +struct OpDefiner, CoeffReturnType, Index, false> { + typedef Eigen::internal::SumReducer type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer &) { + return type(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator, + const Index &scale) { + ::Eigen::internal::scalar_quotient_op quotient_op; + return quotient_op(accumulator, CoeffReturnType(scale)); + } }; -/// Self is useless here because in expression construction we are going to treat reduction as a leafnode. -/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the -/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as -// a leafNode. +template +struct OpDefiner, CoeffReturnType, Index, true> { + typedef typename Vectorise::PacketReturnType PacketReturnType; + typedef Eigen::internal::SumReducer type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer &) { + return type(); + } -template -struct FullReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, + const Index &scale) { + return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1(CoeffReturnType(scale))); + } +}; - typedef typename Self::CoeffReturnType CoeffReturnType; - static const bool HasOptimizedImplementation = false; - - static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { - typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef Eigen::TensorSycl::internal::FunctorExtractor > FunctorExpr; - FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); - int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread. - size_t inputSize =self.impl().dimensions().TotalSize(); - size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input - size_t remaining = inputSize% red_factor; - if(rng ==0) { - red_factor=1; - }; - size_t tileSize =dev.sycl_queue().get_device(). 
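OpDefiner maps MeanReducer onto SumReducer and defers the division to finalise_op, which scales the accumulator by the number of reduced coefficients once at the end. A minimal sketch of that sum-then-divide split is below; the SumOp and finalise_mean names are illustrative assumptions, not the Eigen reducer types.

// Minimal sketch of mean realised as sum plus a single final divide, matching
// the OpDefiner specialisation for MeanReducer (illustrative names only).
#include <cstddef>
#include <iostream>
#include <vector>

struct SumOp {
  float initialize() const { return 0.f; }
  void reduce(float v, float *acc) const { *acc += v; }
  float finalize(float acc) const { return acc; }
};

// finalise_op: the only mean-specific step, applied once after the reduction.
float finalise_mean(float accumulator, std::size_t scale) {
  return accumulator / static_cast<float>(scale);
}

int main() {
  std::vector<float> v{1.f, 2.f, 3.f, 4.f};
  SumOp op;
  float acc = op.initialize();
  for (float x : v) op.reduce(x, &acc);
  std::cout << finalise_mean(op.finalize(acc), v.size()) << "\n";  // 2.5
}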
template get_info()/2; - size_t GRange=std::max((size_t )1, rng); - - // convert global range to power of 2 for redecution - GRange--; - GRange |= GRange >> 1; - GRange |= GRange >> 2; - GRange |= GRange >> 4; - GRange |= GRange >> 8; - GRange |= GRange >> 16; -#if __x86_64__ || __ppc64__ || _WIN64 - GRange |= GRange >> 32; -#endif - GRange++; - size_t outTileSize = tileSize; - /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one. - if (GRange < outTileSize) outTileSize=GRange; - /// creating the shared memory for calculating reduction. - /// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can - /// recursively apply reduction on it in order to reduce the whole. - auto temp_global_buffer =cl::sycl::buffer(cl::sycl::range<1>(GRange)); - typedef typename Eigen::internal::remove_all::type Dims; - // Dims dims= self.xprDims(); - //Op functor = reducer; - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is a workaround for gcc 4.8 bug - typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType; - // create a tuple of accessors from Evaluator - TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto tmp_global_accessor = temp_global_buffer. template get_access(cgh); - typedef decltype(tmp_global_accessor) OutAccessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), - TensorSycl::internal::FullReductionKernelFunctor - (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors)); - }); - dev.asynchronousExec(); - - // getting final out buffer at the moment the created buffer is true because there is no need for assign - auto out_buffer =dev.get_sycl_buffer(output); - ptrdiff_t out_offset = dev.get_offset(output); - /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize); +template +struct SecondStepFullReducer { + typedef cl::sycl::accessor + LocalAccessor; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + LocalAccessor scratch; + InputAccessor aI; + OutputAccessor outAcc; + Op op; + SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_) + : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) { + // Our empirical research shows that the best performance will be achieved + // when there is only one element per thread to reduce in the second step. + // in this step the second step reduction time is almost negligible. + // Hence, in the second step of reduction the input size is fixed to the + // local size, thus, there is only one element read per thread. The + // algorithm must be changed if the number of reduce per thread in the + // second step is greater than 1. Otherwise, the result will be wrong. 
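Within a work-group the partials sit in local scratch and are folded by halving the active range each round, with a barrier between rounds, which is the pattern SecondStepFullReducer uses for the second phase. The serial analogue below shows only the halving fold; since one loop simulates all work-items, the barrier ordering is trivially satisfied, and the function name is an assumption.

// Serial analogue of the in-work-group tree reduction: halve the stride each
// round, folding scratch[i + offset] into scratch[i]; scratch[0] ends up with
// the result. (On the device a barrier separates the rounds.)
#include <cstddef>
#include <iostream>
#include <vector>

float tree_reduce(std::vector<float> scratch) {  // size must be a power of two
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2)
    for (std::size_t i = 0; i < offset; ++i)     // the "active" work-items
      scratch[i] += scratch[i + offset];
  return scratch[0];
}

int main() {
  std::vector<float> s{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
  std::cout << tree_reduce(s) << "\n";  // 36
}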
+ const Index localid = itemID.get_local_id(0); + auto aInPtr = aI.get_pointer() + localid; + auto aOutPtr = outAcc.get_pointer(); + CoeffReturnType *scratchptr = scratch.get_pointer(); + CoeffReturnType accumulator = *aInPtr; + + scratchptr[localid] = op.finalize(accumulator); +#pragma unroll 8 + for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratchptr[localid + offset], &accumulator); + scratchptr[localid] = op.finalize(accumulator); + } + } + if (localid == 0) *aOutPtr = op.finalize(accumulator); + } +}; + +// Full reduction first phase. In this version the vectorization is true and the reduction accept +// any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ). +template +class FullReductionKernelFunctor { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::Index Index; + typedef OpDefiner + OpDef; + + typedef typename OpDef::type Op; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::PacketReturnType PacketReturnType; + typedef + typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), + PacketReturnType, CoeffReturnType>::type OutType; + typedef cl::sycl::accessor + LocalAccessor; + LocalAccessor scratch; + Evaluator evaluator; + EvaluatorPointerType final_output; + Index rng; + Op op; + + FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, + Index rng_, OpType op_) + : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type compute_reduction( + const cl::sycl::nd_item<1> &itemID) { + auto output_ptr = final_output.get_pointer(); + Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize; + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + Index step = Evaluator::PacketSize * itemID.get_global_range(0); + Index start = Evaluator::PacketSize * globalid; + // vectorizable parts + PacketReturnType packetAccumulator = op.template initializePacket(); +#pragma unroll(8 / Evaluator::PacketSize) + for (Index i = start; i < VectorizedRange; i += step) { + op.template reducePacket(evaluator.impl().template packet(i), &packetAccumulator); + } + globalid += VectorizedRange; + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.template reducePacket( + ::Eigen::TensorSycl::internal::PacketWrapper::convert_to_packet_type( + evaluator.impl().coeff(i), op.initialize()), + &packetAccumulator); + } + scratch[localid] = packetAccumulator = + OpDef::finalise_op(op.template finalizePacket(packetAccumulator), rng); + // reduction parts // Local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.template reducePacket(scratch[localid + offset], &packetAccumulator); + scratch[localid] = op.template finalizePacket(packetAccumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = + op.finalizeBoth(op.initialize(), op.template 
finalizePacket(packetAccumulator)); + } } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type compute_reduction( + const cl::sycl::nd_item<1> &itemID) { + auto output_ptr = final_output.get_pointer(); + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + // vectorizable parts + CoeffReturnType accumulator = op.initialize(); + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.reduce(evaluator.impl().coeff(i), &accumulator); + } + scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng); + + // reduction parts. the local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratch[localid + offset], &accumulator); + scratch[localid] = op.finalize(accumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = op.finalize(accumulator); + } + } }; +template +class GenericNondeterministicReducer { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + template + GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, + Index range_, Index num_values_to_reduce_) + : evaluator(evaluator_), + output_accessor(output_accessor_), + functor(OpDef::get_op(functor_)), + range(range_), + num_values_to_reduce(num_values_to_reduce_) {} -template -struct InnerReducer { + void operator()(cl::sycl::nd_item<1> itemID) { + auto output_accessor_ptr = output_accessor.get_pointer(); + /// const cast added as a naive solution to solve the qualifier drop error + Index globalid = static_cast(itemID.get_global_linear_id()); + if (globalid < range) { + CoeffReturnType accum = functor.initialize(); + Eigen::internal::GenericDimReducer::reduce( + evaluator, evaluator.firstInput(globalid), functor, &accum); + output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce); + } + } + + private: + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op functor; + Index range; + Index num_values_to_reduce; +}; + +enum class reduction_dim { inner_most, outer_most }; +// default is preserver +template +struct PartialReductionKernel { + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAcc; + ScratchAcc scratch; + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op op; + const Index preserve_elements_num_groups; + const Index reduce_elements_num_groups; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, + const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, + const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_) + : scratch(scratch_), + evaluator(evaluator_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + 
preserve_elements_num_groups(preserve_elements_num_groups_), + reduce_elements_num_groups(reduce_elements_num_groups_), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, + CoeffReturnType &accumulator) { + if (globalPId >= num_coeffs_to_preserve) { + return; + } + Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve) + : globalRId + (globalPId * num_coeffs_to_reduce); + Index localOffset = globalRId; + + const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups; + const Index per_thread_global_stride = + rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride; +#pragma unroll 8 + for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) { + op.reduce(evaluator.impl().coeff(global_offset), &accumulator); + localOffset += per_thread_local_stride; + global_offset += per_thread_global_stride; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const Index linearLocalThreadId = itemID.get_local_id(0); + Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP + : linearLocalThreadId / PannelParameters::LocalThreadSizeR; + Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP + : linearLocalThreadId % PannelParameters::LocalThreadSizeR; + const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups + : itemID.get_group(0) / reduce_elements_num_groups; + const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups + : itemID.get_group(0) % reduce_elements_num_groups; + + Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId; + auto scratchPtr = scratch.get_pointer().get(); + auto outPtr = + output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0); + CoeffReturnType accumulator = op.initialize(); + + element_wise_reduce(globalRId, globalPId, accumulator); + + accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce); + scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] = + accumulator; + if (rt == reduction_dim::inner_most) { + pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP; + rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP; + globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + } + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. 
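The partial reducer addresses the input as a (num_coeffs_to_preserve x num_coeffs_to_reduce) matrix: when the reduced dimension is the outer one, the element for (p, r) lives at p + r * num_coeffs_to_preserve, otherwise at r + p * num_coeffs_to_reduce, and each thread then strides along the reduce dimension of its preserved coefficient. The host-only sketch below reproduces just those two addressing modes; the function name, the single-threaded loop, and the example sizes are assumptions.

// Host-only sketch of the two addressing modes used by the partial reducer:
// the input is viewed as a (num_preserve x num_reduce) matrix and each thread
// walks the reduce dimension of one preserved coefficient.
#include <cstddef>
#include <iostream>
#include <vector>

enum class reduction_dim { inner_most, outer_most };

float reduce_one_preserved(const std::vector<float> &in, std::size_t p,
                           std::size_t num_preserve, std::size_t num_reduce,
                           reduction_dim rt) {
  float acc = 0.f;
  for (std::size_t r = 0; r < num_reduce; ++r) {
    const std::size_t offset = rt == reduction_dim::outer_most
                                   ? p + r * num_preserve  // strided column walk
                                   : r + p * num_reduce;   // contiguous row walk
    acc += in[offset];
  }
  return acc;
}

int main() {
  std::vector<float> in(6);  // 2 preserved x 3 reduced coefficients
  for (std::size_t i = 0; i < in.size(); ++i) in[i] = float(i);
  std::cout << reduce_one_preserved(in, 1, 2, 3, reduction_dim::outer_most)
            << "\n";  // 1 + 3 + 5 = 9
}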
*/ + auto out_scratch_ptr = + scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC))); + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (rt == reduction_dim::inner_most) { + accumulator = *out_scratch_ptr; + } + // The Local LocalThreadSizeR is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) { + if (rLocalThreadId < offset) { + op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator); + // The result has already been divided for mean reducer in the + // previous reduction so no need to divide furthermore + *out_scratch_ptr = op.finalize(accumulator); + } + /* All threads collectively read from global memory into local. + * The barrier ensures all threads' IO is resolved before + * execution continues (strictly speaking, all threads within + * a single work-group - there is no co-ordination between + * work-groups, only work-items). */ + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) { + outPtr[globalPId] = op.finalize(accumulator); + } + } +}; + +template +struct SecondStepPartialReduction { + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAccessor; + InputAccessor input_accessor; + OutputAccessor output_accessor; + Op op; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, + OutputAccessor output_accessor_, OpType op_, + const Index num_coeffs_to_preserve_, + const Index num_coeffs_to_reduce_) + : input_accessor(input_accessor_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const Index globalId = itemID.get_global_id(0); + + if (globalId >= num_coeffs_to_preserve) return; + + auto in_ptr = input_accessor.get_pointer() + globalId; + + OutScalar accumulator = op.initialize(); +// num_coeffs_to_reduce is not bigger that 256 +#pragma unroll 8 + for (Index i = 0; i < num_coeffs_to_reduce; i++) { + op.reduce(*in_ptr, &accumulator); + in_ptr += num_coeffs_to_preserve; + } + output_accessor.get_pointer()[globalId] = op.finalize(accumulator); + } +}; // namespace internal + +template +struct ReductionPannel { + static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP; + static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR; + static EIGEN_CONSTEXPR bool BC = BC_; +}; + +template +struct PartialReducerLauncher { + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; typedef typename Self::CoeffReturnType CoeffReturnType; - static const bool HasOptimizedImplementation = false; + typedef typename Self::Storage Storage; + typedef typename Self::Index Index; + typedef ReductionPannel + PannelParameters; + + typedef PartialReductionKernel SyclReducerKerneType; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, + Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) { + Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP); + + // getPowerOfTwo makes sure local range is power of 2 and <= + // maxSyclThreadPerBlock this will 
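SecondStepPartialReduction gathers, for every preserved coefficient, the partial results written by the first kernel, reading with stride num_coeffs_to_preserve. A serial sketch of that second step, assuming a plain sum accumulator:

#include <vector>

// `temp` holds `groups` partial results per preserved coefficient, laid out
// as `groups` rows of length `preserved` (matching the stride in the kernel).
std::vector<float> second_step(const std::vector<float>& temp,
                               int preserved, int groups) {
  std::vector<float> out(preserved, 0.f);  // op.initialize()
  for (int p = 0; p < preserved; ++p) {
    const float* in = temp.data() + p;     // in_ptr = input + globalId
    for (int g = 0; g < groups; ++g) {
      out[p] += *in;                       // op.reduce(*in_ptr, &accum)
      in += preserved;                     // stride num_coeffs_to_preserve
    }
  }
  return out;
}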
help us to avoid extra check on the + // kernel + static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) & + (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + + EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR; + // In this step we force the code to be at most a 2-step reduction: our + // empirical research shows that if each thread reduces at least 64 + // elements individually, we get better performance. However, this can change + // on different platforms. For the inner_most dim reducer, it is better to + // have 8 groups in the reduce dimension for sizes > 1024 to achieve the best + // performance. + const Index reductionPerThread = 64; + Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true); + const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP; + Index rGroups = (cu + pNumGroups - 1) / pNumGroups; + const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1; + const Index globalRange = pNumGroups * rNumGroups * localRange; + + EIGEN_CONSTEXPR Index scratchSize = + PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC); + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (rNumGroups > 1) { + CoeffReturnType *temp_pointer = static_cast( + dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType temp_accessor = dev.get(temp_pointer); + dev.template unary_kernel_launcher( + self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, + num_coeffs_to_reduce); + + typedef SecondStepPartialReduction + SecondStepPartialReductionKernel; + + dev.template unary_kernel_launcher( + temp_accessor, output, + cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1), + reducer, num_coeffs_to_preserve, rNumGroups); + + self.device().deallocate_temp(temp_pointer); + } else { + dev.template unary_kernel_launcher( + self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, + num_coeffs_to_reduce); + } + return false; + } +}; +} // namespace internal +} // namespace TensorSycl + +namespace internal { - static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) { - typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef Eigen::TensorSycl::internal::FunctorExtractor > FunctorExpr; - FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); +template +struct FullReducer { + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ?
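The launch geometry computed above reduces to a handful of integer divisions. A standalone sketch of how pNumGroups and rNumGroups are derived (round_up is a stand-in for roundUp, and the compute-unit count is passed in instead of being queried from the device):

#include <algorithm>
#include <cstdint>

using Index = std::int64_t;

Index round_up(Index x, Index m) { return ((x + m - 1) / m) * m; }

// Number of work-groups along the preserved and reduced dimensions,
// mirroring PartialReducerLauncher::run (LTP/LTR are the local thread
// sizes along the preserved and reduced dimensions).
void partial_reduction_groups(Index preserved, Index reduced,
                              Index LTP, Index LTR, Index compute_units,
                              Index& pNumGroups, Index& rNumGroups) {
  const Index localRange = LTP * LTR;
  const Index reductionPerThread = 64;
  pNumGroups = round_up(preserved, LTP) / LTP;
  Index rGroups = (compute_units + pNumGroups - 1) / pNumGroups;
  rNumGroups = reduced > reductionPerThread * localRange
                   ? std::min(rGroups, localRange) : Index(1);
}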
Self::PacketSize : 1; + static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) { + typedef typename conditional::type OutType; + static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + typename Self::Index inputSize = self.impl().dimensions().TotalSize(); + // In this step we force the code to be at most a 2-step reduction: our + // empirical research shows that if each thread reduces at least 512 + // elements individually, we get better performance. + const Index reductionPerThread = 2048; + Index reductionGroup = dev.getPowerOfTwo( + (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true); + const Index num_work_group = std::min(reductionGroup, local_range); + const Index global_range = num_work_group * local_range; + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + typedef TensorSycl::internal::FullReductionKernelFunctor reduction_kernel_t; + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType))); + typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + dev.template unary_kernel_launcher(self, tmp_global_accessor, thread_range, + local_range, inputSize, reducer); + + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + dev.template unary_kernel_launcher( + tmp_global_accessor, data, + cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group, + reducer); + + dev.deallocate_temp(temp_pointer); + } else { + dev.template unary_kernel_launcher(self, data, thread_range, local_range, inputSize, + reducer); + } + } +}; +// vectorizable inner_most dim preserver +// col reduction +template +struct OuterReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + return ::Eigen::TensorSycl::internal::PartialReducerLauncher< + Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output, + num_coeffs_to_reduce, + num_coeffs_to_preserve); + } +}; +// row reduction +template +struct InnerReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + return ::Eigen::TensorSycl::internal::PartialReducerLauncher< + Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output, + num_coeffs_to_reduce, + num_coeffs_to_preserve); + } +}; + +// ArgMax uses this kernel for partial reduction. +// TODO(@mehdi.goli) come up with a better kernel +// generic partial reduction +template +struct GenericReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false; + static
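FullReducer picks between a single launch and a two-stage launch purely from the input size. A sketch of that decision, with power_of_two_ceil standing in for dev.getPowerOfTwo(x, true) and the EIGEN_SYCL_LOCAL_THREAD_DIM0/1 product passed in as local_range:

#include <algorithm>
#include <cstdint>

using Index = std::int64_t;

// Smallest power of two >= x (stand-in for dev.getPowerOfTwo(x, true)).
Index power_of_two_ceil(Index x) {
  Index p = 1;
  while (p < x) p <<= 1;
  return p;
}

// Number of work-groups for the first full-reduction kernel. A value > 1
// means a temporary buffer plus a SecondStepFullReducer launch are needed
// to combine the per-group results.
Index full_reduction_groups(Index input_size, Index local_range) {
  const Index reductionPerThread = 2048;
  const Index groups = power_of_two_ceil(
      (input_size + reductionPerThread * local_range - 1) /
      (reductionPerThread * local_range));
  return std::min(groups, local_range);
}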
bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, + typename Self::Index num_coeffs_to_preserve) { typename Self::Index range, GRange, tileSize; - typedef typename Eigen::internal::remove_all::type Dims; - - // getting final out buffer at the moment the created buffer is true because there is no need for assign - /// creating the shared memory for calculating reduction. - /// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can - /// recursively apply reduction on it in order to reduce the whole. - dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is workaround for gcc 4.8 bug. - typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; - // create a tuple of accessors from Evaluator - Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor(cgh, output); - ptrdiff_t out_offset = dev.get_offset(output); - Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast(1); - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), - TensorSycl::internal::ReductionFunctor - (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); - - }); - dev.asynchronousExec(); + dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); + + dev.template unary_kernel_launcher>( + self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), + reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast(1)); return false; } }; -} // end namespace internal +} // namespace internal } // namespace Eigen #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h new file mode 100644 index 000000000..0078692cd --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h @@ -0,0 +1,512 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorScanSycl.h + * + * \brief: + * Tensor Scan Sycl implements an extended version of + * "Efficient parallel scan algorithms for GPUs" for Tensor operations. + * The algorithm requires up to 3 stages (consequently 3 kernels) depending on + * the size of the tensor. In the first kernel (ScanKernelFunctor), each + * thread within the work-group individually reduces its allocated elements in + * order to reduce the total number of blocks. In the next step all + * threads within the work-group reduce the associated blocks into the + * temporary buffers.
In the next kernel (ScanBlockKernelFunctor), the temporary + * buffer is given as an input and all the threads within a work-group scan and + * reduce the boundaries between the blocks (generated by the previous + * kernel) and write the data back to the temporary buffer. If the second kernel is + * required, the third and final kernel (ScanAdjustmentKernelFunctor) + * adjusts the final result in the output buffer. + * The original algorithm for the parallel prefix sum can be found here: + * + * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel + * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003 + *1, no. 1 (2008): 1-17. + *****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE +#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4) +#endif + +template +struct ScanParameters { + // must be power of 2 + static EIGEN_CONSTEXPR index_t ScanPerThread = 8; + const index_t total_size; + const index_t non_scan_size; + const index_t scan_size; + const index_t non_scan_stride; + const index_t scan_stride; + const index_t panel_threads; + const index_t group_threads; + const index_t block_threads; + const index_t elements_per_group; + const index_t elements_per_block; + const index_t loop_range; + + ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, + index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, + index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_) + : total_size(total_size_), + non_scan_size(non_scan_size_), + scan_size(scan_size_), + non_scan_stride(non_scan_stride_), + scan_stride(scan_stride_), + panel_threads(panel_threads_), + group_threads(group_threads_), + block_threads(block_threads_), + elements_per_group(elements_per_group_), + elements_per_block(elements_per_block_), + loop_range(loop_range_) {} +}; + +enum class scan_step { first, second }; +template +struct ScanKernelFunctor { + typedef cl::sycl::accessor + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2; + + LocalAccessor scratch; + Evaluator dev_eval; + OutAccessor out_accessor; + OutAccessor temp_accessor; + const ScanParameters scanParameters; + Op accumulator; + const bool inclusive; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, + OutAccessor out_accessor_, OutAccessor temp_accessor_, + const ScanParameters scanParameters_, Op accumulator_, + const bool inclusive_) + : scratch(scratch_), + dev_eval(dev_eval_), + out_accessor(out_accessor_), + temp_accessor(temp_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_), + inclusive(inclusive_) {} + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE + read(const Input &inpt, Index global_id) { + return inpt.coeff(global_id); + } + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE + read(const Input &inpt, Index global_id) { + return inpt[global_id]; + } + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + first_step_inclusive_Operation(InclusiveOp
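As a reference for what the three-kernel pipeline computes, the following serial sketch scans one line of scan_size elements spaced scan_stride apart, in either inclusive or exclusive mode (plain C++, summation standing in for the accumulator):

#include <vector>

// Reference prefix scan of one line: data[offset + i * stride] for
// i in [0, scan_size). For a sum accumulator this is the result the
// multi-kernel pipeline produces for that line.
void scan_line(std::vector<float>& data, int offset, int scan_size,
               int stride, bool inclusive) {
  float running = 0.f;  // accumulator.initialize()
  for (int i = 0; i < scan_size; ++i) {
    const float v = data[offset + i * stride];
    if (inclusive) {
      running += v;
      data[offset + i * stride] = running;
    } else {
      data[offset + i * stride] = running;  // value before adding v
      running += v;
    }
  }
}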
inclusive_op) { + inclusive_op(); + } + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + first_step_inclusive_Operation(InclusiveOp) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto out_ptr = out_accessor.get_pointer(); + auto tmp_ptr = temp_accessor.get_pointer(); + auto scratch_ptr = scratch.get_pointer().get(); + + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + // we put one element per packet in scratch_mem + const Index scratch_stride = scanParameters.elements_per_block / PacketSize; + const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride; + CoeffReturnType private_scan[ScanParameters::ScanPerThread]; + CoeffReturnType inclusive_scan; + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = (ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride); + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + private_scan[i] = ((((block_id * scanParameters.elements_per_block) + + (ScanParameters::ScanPerThread * local_id) + i) < scanParameters.scan_size) && + (global_id < scanParameters.total_size)) + ? 
read(dev_eval, global_id) + : accumulator.initialize(); + next_elements += scanParameters.scan_stride; + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + inclusive_scan = private_scan[ScanParameters::ScanPerThread - 1]; + } + }); + // This for loop must be 2 + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters::ScanPerThread; packetIndex += PacketSize) { + Index private_offset = 1; + // build sum in place up the tree + EIGEN_UNROLL_LOOP + for (Index d = PacketSize >> 1; d > 0; d >>= 1) { + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[bi] = accumulator.finalize(accum); + } + private_offset *= 2; + } + scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] = + private_scan[PacketSize - 1 + packetIndex]; + private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize(); + // traverse down tree & build scan + EIGEN_UNROLL_LOOP + for (Index d = 1; d < PacketSize; d *= 2) { + private_offset >>= 1; + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[ai] = private_scan[bi]; + private_scan[bi] = accumulator.finalize(accum); + } + } + } + + Index offset = 1; + // build sum in place up the tree + for (Index d = scratch_stride >> 1; d > 0; d >>= 1) { + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(scratch_ptr[ai], &accum); + accumulator.reduce(scratch_ptr[bi], &accum); + scratch_ptr[bi] = accumulator.finalize(accum); + } + offset *= 2; + } + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + // next step optimisation + if (local_id == 0) { + if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) { + const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) * + scanParameters.non_scan_size + + group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) + + block_id; + tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset]; + } + // clear the last element + scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize(); + } + // traverse down tree & build scan + for (Index d = 1; d < scratch_stride; d *= 2) { + offset >>= 1; + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(scratch_ptr[ai], &accum); + accumulator.reduce(scratch_ptr[bi], &accum); + scratch_ptr[ai] = scratch_ptr[bi]; + scratch_ptr[bi] = accumulator.finalize(accum); + } + } + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); 
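The paired "build sum in place up the tree" / "traverse down tree & build scan" loops above are the classic Blelloch work-efficient scan, applied first to each thread's private packet and then to the work-group's scratch memory. A compact serial sketch of both sweeps on a power-of-two array, again with a sum accumulator:

#include <vector>

// Exclusive Blelloch scan in place; n = data.size() must be a power of 2.
void blelloch_exclusive_scan(std::vector<float>& data) {
  const int n = static_cast<int>(data.size());
  int offset = 1;
  for (int d = n >> 1; d > 0; d >>= 1) {   // up-sweep: build partial sums
    for (int l = 0; l < d; ++l) {
      const int ai = offset * (2 * l + 1) - 1;
      const int bi = offset * (2 * l + 2) - 1;
      data[bi] += data[ai];
    }
    offset <<= 1;
  }
  data[n - 1] = 0.f;                       // clear the last element
  for (int d = 1; d < n; d <<= 1) {        // down-sweep: distribute sums
    offset >>= 1;
    for (int l = 0; l < d; ++l) {
      const int ai = offset * (2 * l + 1) - 1;
      const int bi = offset * (2 * l + 2) - 1;
      const float t = data[ai];
      data[ai] = data[bi];
      data[bi] += t;
    }
  }
}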
+ // This for loop must be 2 + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters::ScanPerThread; packetIndex += PacketSize) { + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; i++) { + CoeffReturnType accum = private_scan[packetIndex + i]; + accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum); + private_scan[packetIndex + i] = accumulator.finalize(accum); + } + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + accumulator.reduce(private_scan[ScanParameters::ScanPerThread - 1], &inclusive_scan); + private_scan[0] = accumulator.finalize(inclusive_scan); + } + }); + next_elements = 0; + // right the first set of private param + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters::ScanPerThread) * (inclusive)); + out_ptr[global_id] = private_scan[private_id]; + } + next_elements += scanParameters.scan_stride; + } + } // end for loop + } +}; + +template +struct ScanAdjustmentKernelFunctor { + typedef cl::sycl::accessor + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2; + InAccessor in_accessor; + OutAccessor out_accessor; + const ScanParameters scanParameters; + Op accumulator; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_, + OutAccessor out_accessor_, + const ScanParameters scanParameters_, + Op accumulator_) + : in_accessor(in_accessor_), + out_accessor(out_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto in_ptr = in_accessor.get_pointer(); + auto out_ptr = out_accessor.get_pointer(); + + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + + // the actual panel size is scan_size * non_scan_size. 
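Both scan kernels recover their coordinates from a flat global id through the same chain of divisions and modulos against the thread counts held in ScanParameters. That decomposition in isolation:

#include <cstdint>

using Index = std::int64_t;

struct ScanCoords {
  Index panel_id, group_id, block_id, local_id;
};

// Split a flat thread index into (panel, group, block, local) coordinates,
// matching the / and % chain at the top of the kernel loops.
ScanCoords decompose(Index data_offset, Index panel_threads,
                     Index group_threads, Index block_threads) {
  ScanCoords c;
  Index tmp = data_offset % panel_threads;
  c.panel_id = data_offset / panel_threads;
  c.group_id = tmp / group_threads;
  tmp = tmp % group_threads;
  c.block_id = tmp / block_threads;
  c.local_id = tmp % block_threads;
  return c;
}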
+ // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride; + + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block; + const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id; + CoeffReturnType adjust_val = in_ptr[in_id]; + + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + CoeffReturnType accum = adjust_val; + accumulator.reduce(out_ptr[global_id], &accum); + out_ptr[global_id] = accumulator.finalize(accum); + } + next_elements += scanParameters.scan_stride; + } + } + } +}; + +template +struct ScanInfo { + const Index &total_size; + const Index &scan_size; + const Index &panel_size; + const Index &non_scan_size; + const Index &scan_stride; + const Index &non_scan_stride; + + Index max_elements_per_block; + Index block_size; + Index panel_threads; + Index group_threads; + Index block_threads; + Index elements_per_group; + Index elements_per_block; + Index loop_range; + Index global_range; + Index local_range; + const Eigen::SyclDevice &dev; + EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, + const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, + const Eigen::SyclDevice &dev_) + : total_size(total_size_), + scan_size(scan_size_), + panel_size(panel_size_), + non_scan_size(non_scan_size_), + scan_stride(scan_stride_), + non_scan_stride(non_scan_stride_), + dev(dev_) { + // must be power of 2 + local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()), + Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1)); + + max_elements_per_block = local_range * ScanParameters::ScanPerThread; + + elements_per_group = + dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters::ScanPerThread)), true); + const Index elements_per_panel = elements_per_group * non_scan_size; + elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block)); + panel_threads = elements_per_panel / ScanParameters::ScanPerThread; + group_threads = elements_per_group / ScanParameters::ScanPerThread; + block_threads = elements_per_block / ScanParameters::ScanPerThread; + block_size = elements_per_group / elements_per_block; +#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE + const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE)); +#else + const Index max_threads = panel_threads * panel_size; +#endif + global_range = roundUp(max_threads, local_range); + loop_range = Index( + std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters::ScanPerThread))); + } + inline ScanParameters get_scan_parameter() { + 
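ScanInfo derives the whole launch geometry from scan_size and the chosen work-group size. A sketch of the core quantities, with pow2_ceil and round_up_to standing in for dev.getPowerOfTwo and roundUp, and ScanPerThread fixed to 8 as in ScanParameters:

#include <algorithm>
#include <cstdint>

using Index = std::int64_t;

Index pow2_ceil(Index x) { Index p = 1; while (p < x) p <<= 1; return p; }
Index round_up_to(Index x, Index m) { return ((x + m - 1) / m) * m; }

// Per-group / per-block element counts and thread counts, as computed in
// the ScanInfo constructor (local_range is the chosen work-group size).
void scan_geometry(Index scan_size, Index local_range,
                   Index& elements_per_group, Index& elements_per_block,
                   Index& group_threads, Index& block_threads, Index& block_size) {
  const Index ScanPerThread = 8;
  const Index max_elements_per_block = local_range * ScanPerThread;
  elements_per_group = pow2_ceil(round_up_to(scan_size, ScanPerThread));
  elements_per_block = std::min(elements_per_group, max_elements_per_block);
  group_threads = elements_per_group / ScanPerThread;
  block_threads = elements_per_block / ScanPerThread;
  block_size = elements_per_group / elements_per_block;  // blocks per group
}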
return ScanParameters(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads, + group_threads, block_threads, elements_per_group, elements_per_block, loop_range); + } + inline cl::sycl::nd_range<1> get_thread_range() { + return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + } +}; + +template +struct SYCLAdjustBlockOffset { + EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, + Reducer &accumulator, const Index total_size, + const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + + typedef ScanAdjustmentKernelFunctor + AdjustFuctor; + dev.template unary_kernel_launcher(in_ptr, out_ptr, scan_info.get_thread_range(), + scan_info.max_elements_per_block, + scan_info.get_scan_parameter(), accumulator); + } +}; + +template +struct ScanLauncher_impl { + template + EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, + const Index total_size, const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const bool inclusive, + const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size; + const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters::ScanPerThread / 2); + CoeffReturnType *temp_pointer = + static_cast(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + + typedef ScanKernelFunctor ScanFunctor; + dev.template binary_kernel_launcher( + in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size, + scan_info.get_scan_parameter(), accumulator, inclusive); + + if (scan_info.block_size > 1) { + ScanLauncher_impl::scan_block( + tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size, + non_scan_size, Index(1), scan_info.block_size, false, dev); + + SYCLAdjustBlockOffset::adjust_scan_block_offset( + tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, + non_scan_stride, dev); + } + dev.deallocate_temp(temp_pointer); + } +}; + +} // namespace internal +} // namespace TensorSycl + +template +struct ScanLauncher { + typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::Storage Storage; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + void operator()(Self &self, EvaluatorPointerType data) { + const Index total_size = internal::array_prod(self.dimensions()); + const Index scan_size = self.size(); + const Index scan_stride = self.stride(); + // this is the scan op (can be sum or ...) 
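ScanLauncher_impl applies the same pattern recursively: scan each block, recursively scan the buffer of block totals, then add the scanned totals back (the adjustment step). A self-contained serial version of that structure for a one-dimensional inclusive sum:

#include <algorithm>
#include <cstddef>
#include <vector>

// Inclusive sum scan of `data`, processed in blocks of `block` elements:
// 1) scan each block locally, 2) recursively scan the block totals,
// 3) add the total of all preceding blocks to every element of a block.
void block_scan(std::vector<float>& data, std::size_t block) {
  const std::size_t n = data.size();
  const std::size_t num_blocks = (n + block - 1) / block;
  std::vector<float> totals(num_blocks, 0.f);
  for (std::size_t b = 0; b < num_blocks; ++b) {        // first kernel
    float running = 0.f;
    for (std::size_t i = b * block; i < std::min(n, (b + 1) * block); ++i) {
      running += data[i];
      data[i] = running;
    }
    totals[b] = running;
  }
  if (num_blocks > 1) {
    block_scan(totals, block);                           // recursive step
    for (std::size_t b = 1; b < num_blocks; ++b) {       // adjustment step
      for (std::size_t i = b * block; i < std::min(n, (b + 1) * block); ++i)
        data[i] += totals[b - 1];
    }
  }
}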
+ auto accumulator = self.accumulator(); + auto inclusive = !self.exclusive(); + auto consume_dim = self.consume_dim(); + auto dev = self.device(); + + auto dims = self.inner().dimensions(); + + Index non_scan_size = 1; + Index panel_size = 1; + if (static_cast(Self::Layout) == static_cast(ColMajor)) { + for (int i = 0; i < consume_dim; i++) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim + 1; i < Self::NumDims; i++) { + panel_size *= dims[i]; + } + } else { + for (int i = Self::NumDims - 1; i > consume_dim; i--) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim - 1; i >= 0; i--) { + panel_size *= dims[i]; + } + } + const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size; + auto eval_impl = self.inner(); + TensorSycl::internal::ScanLauncher_impl::scan_block( + eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, + inclusive, dev); + } +}; +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h deleted file mode 100644 index 7b8bd2df7..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ /dev/null @@ -1,120 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: eigen@codeplay.com -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// General include header of SYCL target for Tensor Module -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H - -#ifdef EIGEN_USE_SYCL - -// global pointer to set different attribute state for a class -template -struct MakeGlobalPointer { - typedef typename cl::sycl::global_ptr::pointer_t Type; - typedef typename cl::sycl::global_ptr::reference_t RefType; -}; - -// global pointer to set different attribute state for a class -template -struct MakeLocalPointer { - typedef typename cl::sycl::local_ptr::pointer_t Type; - typedef typename cl::sycl::local_ptr::reference_t RefType; -}; - - -namespace Eigen { - template class TensorTupleReducerDeviceOp; - template struct TensorEvaluator, SyclKernelDevice>; -namespace internal { - -#ifdef __SYCL_DEVICE_ONLY__ -template struct TypeConversion { - template - static typename MakeGlobalPointer::Type get_address_space_pointer(typename MakeGlobalPointer::Type p); - template - static typename MakeLocalPointer::Type get_address_space_pointer(typename MakeLocalPointer::Type p); - - template - static A* get_address_space_pointer(T* p); - typedef decltype(get_address_space_pointer(B())) type; -}; - -#endif -} -namespace TensorSycl { -namespace internal { - - template struct GenericKernelReducer; -/// This struct is used for special expression nodes with no operations (for example assign and selectOP). 
- struct NoOP; - -template struct GetType{ - typedef const T Type; -}; -template struct GetType{ - typedef T Type; -}; - -template struct ValueCondition { - static constexpr size_t Res =X; -}; -template struct ValueCondition { - static constexpr size_t Res =Y; -}; - -} -} -} - -// tuple construction -#include "TensorSyclTuple.h" - -// counting number of leaf at compile time -#include "TensorSyclLeafCount.h" - -// The index PlaceHolder takes the actual expression and replaces the actual -// data on it with the place holder. It uses the same pre-order expression tree -// traverse as the leaf count in order to give the right access number to each -// node in the expression -#include "TensorSyclPlaceHolderExpr.h" - -// creation of an accessor tuple from a tuple of SYCL buffers -#include "TensorSyclExtractAccessor.h" - -// this is used to change the address space type in tensor map for GPU -#include "TensorSyclConvertToDeviceExpression.h" - -// this is used to extract the functors -#include "TensorSyclExtractFunctors.h" - -// this is used to create tensormap on the device -// this is used to construct the expression on the device -#include "TensorSyclExprConstructor.h" - -/// this is used for extracting tensor reduction -#include "TensorReductionSycl.h" - -// TensorArgMaxSycl.h -#include "TensorArgMaxSycl.h" - -/// this is used for extracting tensor convolution -#include "TensorConvolutionSycl.h" - -// kernel execution using fusion -#include "TensorSyclRun.h" -//sycl functors -#include "TensorSyclFunctors.h" - -#include "TensorContractionSycl.h" - -#endif // end of EIGEN_USE_SYCL -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h deleted file mode 100644 index d6ac7b91f..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ /dev/null @@ -1,205 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclConvertToDeviceExpression.h - * - * \brief: - * Conversion from host pointer to device pointer - * inside leaf nodes of the expression. - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -/// \struct ConvertToDeviceExpression -/// \brief This struct is used to convert the MakePointer in the host expression -/// to the MakeGlobalPointer for the device expression. For the leafNodes -/// containing the pointer. This is due to the fact that the address space of -/// the pointer T* is different on the host and the device. -template -struct ConvertToDeviceExpression; - -template class NonOpCategory, bool IsConst, typename... 
Args> -struct NonOpConversion{ - typedef typename GetType::Type...> >::Type Type; -}; - - -template class > class NonOpCategory, bool IsConst, typename Args> -struct DeviceConvertor{ - typedef typename GetType::Type, MakeGlobalPointer> >::Type Type; -}; - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorMap -#define TENSORMAPCONVERT(CVQual)\ -template class MakePointer_>\ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorMap Type;\ -}; - -TENSORMAPCONVERT(const) -TENSORMAPCONVERT() -#undef TENSORMAPCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp -#define CATEGORYCONVERT(CVQual)\ -template class Category, typename OP, typename... subExprs>\ -struct ConvertToDeviceExpression > {\ - typedef CVQual Category::Type... > Type;\ -}; -CATEGORYCONVERT(const) -CATEGORYCONVERT() -#undef CATEGORYCONVERT - - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorCwiseSelectOp -#define SELECTOPCONVERT(CVQual, Res)\ -template \ -struct ConvertToDeviceExpression >\ -: NonOpConversion {}; -SELECTOPCONVERT(const, true) -SELECTOPCONVERT(, false) -#undef SELECTOPCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is const AssingOP -#define ASSIGNCONVERT(CVQual, Res)\ -template \ -struct ConvertToDeviceExpression >\ -: NonOpConversion{}; - -ASSIGNCONVERT(const, true) -ASSIGNCONVERT(, false) -#undef ASSIGNCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorEvalToOp -#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\ -template \ -struct ConvertToDeviceExpression > \ -: DeviceConvertor{}; - - -KERNELBROKERCONVERT(const, true, TensorEvalToOp) -KERNELBROKERCONVERT(, false, TensorEvalToOp) -#undef KERNELBROKERCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp -#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\ -template \ -struct ConvertToDeviceExpression > {\ - typedef CVQual ExprNode< typename ConvertToDeviceExpression::Type> Type;\ -}; - - -// TensorForcedEvalOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp) - -// TensorLayoutSwapOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp) - -//TensorIndexTupleOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp) -#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp -#define KERNELBROKERCONVERTREDUCTION(CVQual)\ -template class MakePointer_>\ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorReductionOp::Type, MakeGlobalPointer> Type;\ -}; - -KERNELBROKERCONVERTREDUCTION(const) -KERNELBROKERCONVERTREDUCTION() -#undef KERNELBROKERCONVERTREDUCTION - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp -#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\ -template \ -struct ConvertToDeviceExpression > {\ - typedef CVQual 
TensorTupleReducerOp::Type> Type;\ -}; - -KERNELBROKERCONVERTTUPLEREDUCTION(const) -KERNELBROKERCONVERTTUPLEREDUCTION() -#undef KERNELBROKERCONVERTTUPLEREDUCTION - -//TensorSlicingOp -#define KERNELBROKERCONVERTSLICEOP(CVQual)\ -template\ -struct ConvertToDeviceExpression >{\ - typedef CVQual TensorSlicingOp::Type> Type;\ -}; - -KERNELBROKERCONVERTSLICEOP(const) -KERNELBROKERCONVERTSLICEOP() -#undef KERNELBROKERCONVERTSLICEOP - -//TensorStridingSlicingOp -#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\ -template\ -struct ConvertToDeviceExpression >{\ - typedef CVQual TensorStridingSlicingOp::Type> Type;\ -}; - -KERNELBROKERCONVERTERSLICESTRIDEOP(const) -KERNELBROKERCONVERTERSLICESTRIDEOP() -#undef KERNELBROKERCONVERTERSLICESTRIDEOP - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp -#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ -template \ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorChippingOp::Type> Type;\ -}; -KERNELBROKERCONVERTCHIPPINGOP(const) -KERNELBROKERCONVERTCHIPPINGOP() -#undef KERNELBROKERCONVERTCHIPPINGOP - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp -#define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\ -template\ -struct ConvertToDeviceExpression >{\ - typedef CVQual TensorImagePatchOp::Type> Type;\ -}; -KERNELBROKERCONVERTIMAGEPATCHOP(const) -KERNELBROKERCONVERTIMAGEPATCHOP() -#undef KERNELBROKERCONVERTIMAGEPATCHOP - - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorVolumePatchOp -#define KERNELBROKERCONVERTVOLUMEPATCHOP(CVQual)\ -template\ -struct ConvertToDeviceExpression >{\ - typedef CVQual TensorVolumePatchOp::Type> Type;\ -}; -KERNELBROKERCONVERTVOLUMEPATCHOP(const) -KERNELBROKERCONVERTVOLUMEPATCHOP() -#undef KERNELBROKERCONVERTVOLUMEPATCHOP - -} // namespace internal -} // namespace TensorSycl -} // namespace Eigen - -#endif // UNSUPPORTED_EIGEN_CXX1 diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h deleted file mode 100644 index 67003daf5..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ /dev/null @@ -1,514 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclExprConstructor.h - * - * \brief: - * This file re-create an expression on the SYCL device in order - * to use the original tensor evaluator. 
- * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -template -struct DeviceFixedSizeTensor; - -template -struct DeviceFixedSizeTensor>{ - template - static EIGEN_ALWAYS_INLINE Expr instantiate(Data& dt) {return Expr(ConvertToActualTypeSycl(typename Expr::Scalar, dt), Indices...);} -}; -/// this class is used by EvalToOp in order to create an lhs expression which is -/// a pointer from an accessor on device-only buffer -template -struct EvalToLHSConstructor { - PtrType expr; - EvalToLHSConstructor(const utility::tuple::Tuple &t) : expr(ConvertToActualTypeSycl(typename Eigen::internal::remove_all::type, utility::tuple::get(t))) {} -}; - -/// struct ExprConstructor is used to reconstruct the expression on the device and -/// recreate the expression with MakeGlobalPointer containing the device address -/// space for the TensorMap pointers used in eval function. -/// It receives the original expression type, the functor of the node, the tuple -/// of accessors, and the device expression type to re-instantiate the -/// expression tree for the device -template -struct ExprConstructor; - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorMap -#define TENSORMAP(CVQual)\ -template class MakePointer_, size_t N, typename... Params>\ -struct ExprConstructor< CVQual TensorMap,\ -CVQual PlaceHolder, N>, Params...>{\ - typedef CVQual TensorMap Type;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ - : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())){}\ -}; - -TENSORMAP(const) -TENSORMAP() -#undef TENSORMAP - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorMap -#define TENSORMAPFIXEDSIZE(CVQual)\ -template class MakePointer_, size_t N, typename... Params>\ -struct ExprConstructor< CVQual TensorMap, Options_, MakeGlobalPointer>,\ -CVQual PlaceHolder, Options_, MakePointer_>, N>, Params...>{\ - typedef CVQual TensorMap, Options_, MakeGlobalPointer> Type;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &, const utility::tuple::Tuple &t)\ - : expr(DeviceFixedSizeTensor::instantiate(utility::tuple::get(t))){}\ -}; - -TENSORMAPFIXEDSIZE(const) -TENSORMAPFIXEDSIZE() -#undef TENSORMAPFIXEDSIZE - -#define UNARYCATEGORY(CVQual)\ -template class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\ -struct ExprConstructor, CVQual UnaryCategory, Params...> {\ - typedef ExprConstructor my_type;\ - my_type rhsExpr;\ - typedef CVQual UnaryCategory Type;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\ -}; - -UNARYCATEGORY(const) -UNARYCATEGORY() -#undef UNARYCATEGORY - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorBinaryOp -#define BINARYCATEGORY(CVQual)\ -template class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\ -typename RHSExpr, typename... 
Params>\ -struct ExprConstructor, CVQual BinaryCategory, Params...> {\ - typedef ExprConstructor my_left_type;\ - typedef ExprConstructor my_right_type;\ - typedef CVQual BinaryCategory Type;\ - my_left_type lhsExpr;\ - my_right_type rhsExpr;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\ -}; - -BINARYCATEGORY(const) -BINARYCATEGORY() -#undef BINARYCATEGORY - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorCwiseTernaryOp -#define TERNARYCATEGORY(CVQual)\ -template