From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Thu, 28 Nov 2019 10:08:54 +0000 Subject: [SYCL] Rebasing the SYCL support branch on top of the Einge upstream master branch. * Unifying all loadLocalTile from lhs and rhs to an extract_block function. * Adding get_tensor operation which was missing in TensorContractionMapper. * Adding the -D method missing from cmake for Disable_Skinny Contraction operation. * Wrapping all the indices in TensorScanSycl into Scan parameter struct. * Fixing typo in Device SYCL * Unifying load to private register for tall/skinny no shared * Unifying load to vector tile for tensor-vector/vector-tensor operation * Removing all the LHS/RHS class for extracting data from global * Removing Outputfunction from TensorContractionSkinnyNoshared. * Combining the local memory version of tall/skinny and normal tensor contraction into one kernel. * Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel. * Combining General Tensor-Vector and VectorTensor contraction into one kernel. * Making double buffering optional for Tensor contraction when local memory is version is used. * Modifying benchmark to accept custom Reduction Sizes * Disabling AVX optimization for SYCL backend on the host to allow SSE optimization to the host * Adding Test for SYCL * Modifying SYCL CMake --- bench/tensors/README | 17 +- bench/tensors/eigen_sycl_bench.sh | 30 ++ bench/tensors/eigen_sycl_bench_contract.sh | 7 + bench/tensors/tensor_benchmarks.h | 102 ++++--- bench/tensors/tensor_benchmarks_sycl.cc | 133 ++++++--- .../tensor_benchmarks_sycl_include_headers.cc | 2 - bench/tensors/tensor_contract_sycl_bench.cc | 325 +++++++++++++++++++++ 7 files changed, 537 insertions(+), 79 deletions(-) create mode 100755 bench/tensors/eigen_sycl_bench.sh create mode 100644 bench/tensors/eigen_sycl_bench_contract.sh delete mode 100644 bench/tensors/tensor_benchmarks_sycl_include_headers.cc create mode 100644 bench/tensors/tensor_contract_sycl_bench.cc (limited to 'bench') diff --git a/bench/tensors/README b/bench/tensors/README index 69342cc9c..dcbf0217a 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -11,15 +11,10 @@ nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBU We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu -last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call -g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu +To compile and run the benchmark for SYCL, using ComputeCpp, simply run the +following commands: +1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY} +2. bash eigen_sycl_bench.sh -To compile and run the benchmark for SYCL, using ComputeCpp you currently need following passes (only for translation units containing device code): -1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code. -{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1 -2. The host compilation pass that generates the final host binary. -clang++ -O3 -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o -clang++ -O3 tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl -export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib -3. Run the benchmark -./tensor_benchmark_sycl +Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call +g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu diff --git a/bench/tensors/eigen_sycl_bench.sh b/bench/tensors/eigen_sycl_bench.sh new file mode 100755 index 000000000..3f67b3d86 --- /dev/null +++ b/bench/tensors/eigen_sycl_bench.sh @@ -0,0 +1,30 @@ +rm -f tensor_benchmark_sycl +: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}" +echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR +${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ \ +tensor_benchmarks_sycl.cc \ +benchmark_main.cc \ +-I ../../ \ +-I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ \ +-std=c++11 \ +-march=native \ +-O3 \ +-DNDEBUG \ +-DEIGEN_MPL2_ONLY \ +-DEIGEN_USE_SYCL=1 \ +-DEIGEN_SYCL_LOCAL_MEM=1 \ +-no-serial-memop \ +-mllvm \ +-inline-threshold=10000 \ +-fsycl-ih-last \ +-sycl-driver \ +-Xclang -cl-mad-enable \ +-lOpenCL \ +-lComputeCpp \ +-lpthread \ +-o \ +tensor_benchmark_sycl\ +${@:1} + +export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH +./tensor_benchmark_sycl diff --git a/bench/tensors/eigen_sycl_bench_contract.sh b/bench/tensors/eigen_sycl_bench_contract.sh new file mode 100644 index 000000000..73fd6c4a0 --- /dev/null +++ b/bench/tensors/eigen_sycl_bench_contract.sh @@ -0,0 +1,7 @@ +rm -f tensor_contract_sycl_bench +: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}" +echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR +${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ tensor_contract_sycl_bench.cc -I ../../ -I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ -std=c++11 -O3 -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -no-serial-memop -mllvm -inline-threshold=10000 -fsycl-ih-last -sycl-driver -Xclang -cl-mad-enable -lOpenCL -lComputeCpp -lpthread -o tensor_contract_sycl_bench ${@:1} +export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH +./tensor_contract_sycl_bench + diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 3a640ede4..0825e1563 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -27,6 +27,11 @@ template class BenchmarkSuite { initialize(); } + BenchmarkSuite(const Device& device, size_t m, size_t k) + : m_(1), k_(k), n_(m), device_(device) { + initialize(); + } + ~BenchmarkSuite() { device_.deallocate(a_); device_.deallocate(b_); @@ -79,6 +84,11 @@ template class BenchmarkSuite { sizes[0] = m_; sizes[1] = m_; TensorMap, Eigen::Aligned> C(c_, sizes); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = C.random(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = C.random(); @@ -264,6 +274,7 @@ template class BenchmarkSuite { finalizeBenchmark(static_cast(m_) * k_ * num_iters); } + void broadcasting(int num_iters) { Eigen::array size_a; size_a[0] = m_; @@ -406,8 +417,8 @@ for (int iter = 0; iter < 10; ++iter) { b_, input_size); Eigen::array output_size; output_size[0] = k_; - TensorMap, Eigen::Aligned> C( - c_, output_size); + TensorMap, Eigen::Aligned> A( + a_, output_size); #ifndef EIGEN_HAS_INDEX_LIST Eigen::array sum_along_dim; @@ -419,12 +430,12 @@ for (int iter = 0; iter < 10; ++iter) { #endif #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { - C.device(device_) = B.sum(sum_along_dim); + A.device(device_) = B.sum(sum_along_dim); } #endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - C.device(device_) = B.sum(sum_along_dim); + A.device(device_) = B.sum(sum_along_dim); } // Record the number of FLOP executed per second (assuming one operation // per value) @@ -455,37 +466,27 @@ for (int iter = 0; iter < 10; ++iter) { finalizeBenchmark(static_cast(k_) * n_ * num_iters); } + + // do a contraction which is equivalent to a matrix multiplication void contraction(int num_iters) { - Eigen::array sizeA; - sizeA[0] = m_; - sizeA[1] = k_; - Eigen::array sizeB; - sizeB[0] = k_; - sizeB[1] = n_; - Eigen::array sizeC; - sizeC[0] = m_; - sizeC[1] = n_; + contraction(Eigen::ColMajor)>(num_iters, false, false); + } - const TensorMap, Eigen::Aligned> A(a_, sizeA); - const TensorMap, Eigen::Aligned> B(b_, sizeB); - TensorMap, Eigen::Aligned> C(c_, sizeC); + void contractionRowMajor(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, false, false); + } + + void contractionRowMajorAT(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, true, false); + } - typedef typename Tensor::DimensionPair DimPair; - Eigen::array dims; - dims[0] = DimPair(1, 0); -#ifdef EIGEN_USE_SYCL // warmup for sycl - for (int iter = 0; iter < 10; ++iter) { - C.device(device_) = A.contract(B, dims); - } -#endif - StartBenchmarkTiming(); - for (int iter = 0; iter < num_iters; ++iter) { - C.device(device_) = A.contract(B, dims); - } - // Record the number of FLOP executed per second (size_ multiplications and - // additions for each value in the resulting tensor) - finalizeBenchmark(static_cast(2) * m_ * n_ * k_ * num_iters); + void contractionRowMajorBT(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, false, true); + } + + void contractionRowMajorABT(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, true, true); } void convolution(int num_iters, int kernel_x, int kernel_y) { @@ -513,13 +514,49 @@ for (int iter = 0; iter < 10; ++iter) { for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.convolve(B, dims); } - // Record the number of FLOP executed per second (kernel_size + // Record the number of FLOPs executed per second (kernel_size // multiplications and additions for each value in the resulting tensor) finalizeBenchmark(static_cast(2) * (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters); } private: + // do a contraction which is equivalent to a matrix multiplication + template + void contraction(int num_iters, bool trans_a, bool trans_b) { + Eigen::array sizeA; + sizeA[0] = (trans_a ? k_: m_); + sizeA[1] = (trans_a ? m_: k_); + Eigen::array sizeB; + sizeB[0] = (trans_b ? n_: k_); + sizeB[1] = (trans_b ? k_: n_); + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + TensorIndex a_contract_dim = (trans_a ? 0 : 1); + TensorIndex b_contract_dim = (trans_b ? 1 : 0); + dims[0] = DimPair(a_contract_dim, b_contract_dim); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + // Record the number of FLOP executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(static_cast(2) * m_ * n_ * k_ * num_iters); + } + void initialize() { a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); @@ -531,7 +568,6 @@ for (int iter = 0; iter < 10; ++iter) { device_.memset(b_, 23, k_ * n_ * sizeof(T)); device_.memset(c_, 31, m_ * n_ * sizeof(T)); - //BenchmarkUseRealTime(); } inline void finalizeBenchmark(int64_t num_items) { diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc index cb6daac15..b8a096684 100644 --- a/bench/tensors/tensor_benchmarks_sycl.cc +++ b/bench/tensors/tensor_benchmarks_sycl.cc @@ -5,19 +5,76 @@ #include "tensor_benchmarks.h" -#define BM_FuncGPU(FUNC) \ - static void BM_##FUNC(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ - Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters); \ - } \ +cl::sycl::gpu_selector selector; +Eigen::QueueInterface queue(selector); +#define BM_FuncWithInput2DimsGPU(FUNC, D1, D2) \ + static void BM_##FUNC##_##D1##x##D2(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, D1, D2); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2, 10, 10); + +BM_FuncWithInput2DimsGPU(rowReduction, 256, 100352); +BM_FuncWithInput2DimsGPU(rowReduction, 64, 100352); +BM_FuncWithInput2DimsGPU(rowReduction, 512, 25088); +BM_FuncWithInput2DimsGPU(rowReduction, 128, 25088); +BM_FuncWithInput2DimsGPU(rowReduction, 102, 6272); +BM_FuncWithInput2DimsGPU(rowReduction, 256, 6272); +BM_FuncWithInput2DimsGPU(rowReduction, 204, 1568); +BM_FuncWithInput2DimsGPU(rowReduction, 512, 1568); +BM_FuncWithInput2DimsGPU(rowReduction, 1024, 1568); +BM_FuncWithInput2DimsGPU(rowReduction, 2048, 1568); + +BM_FuncWithInput2DimsGPU(colReduction, 100352, 256); +BM_FuncWithInput2DimsGPU(colReduction, 100352, 64); +BM_FuncWithInput2DimsGPU(colReduction, 25088, 512); +BM_FuncWithInput2DimsGPU(colReduction, 6272, 102); +BM_FuncWithInput2DimsGPU(colReduction, 25088, 128); +BM_FuncWithInput2DimsGPU(colReduction, 6272, 256); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 204); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 512); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 1024); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 2048); +BM_FuncWithInput2DimsGPU(fullReduction, 1001, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2050048, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2097152, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2048, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 262144, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 256, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 589824, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 1024, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 524288, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 512, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2359296, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 1048576, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 131072, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 16384, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 9408, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 64, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 4096, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 36864, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 32768, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 128, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 147456, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 65536, 1); +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); + BM_FuncGPU(memcpy); BM_FuncGPU(typeCasting); +BM_FuncGPU(random); BM_FuncGPU(slicing); BM_FuncGPU(rowChip); BM_FuncGPU(colChip); @@ -28,40 +85,50 @@ BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); BM_FuncGPU(algebraicFunc); BM_FuncGPU(transcendentalFunc); -BM_FuncGPU(rowReduction); -BM_FuncGPU(colReduction); -BM_FuncGPU(fullReduction); - - // Contractions -#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ - static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ - Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, D1, D2, D3); \ - suite.FUNC(iters); \ - } \ +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); - BM_FuncWithInputDimsGPU(contraction, N, N, N); BM_FuncWithInputDimsGPU(contraction, 64, N, N); BM_FuncWithInputDimsGPU(contraction, N, 64, N); BM_FuncWithInputDimsGPU(contraction, N, N, 64); +BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajor, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajor, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, 64); + +BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorAT, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, 64); + +BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorBT, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, 64); + + +BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorABT, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, 64); // Convolutions -#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ - static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ - Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters, DIM1, DIM2); \ - } \ +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); BM_FuncWithKernelDimsGPU(convolution, 7, 1); diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc deleted file mode 100644 index bcc3c4c79..000000000 --- a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc +++ /dev/null @@ -1,2 +0,0 @@ -#include "tensor_benchmarks_sycl.cc" -#include "tensor_benchmarks_sycl.sycl" diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc new file mode 100644 index 000000000..8f2defe42 --- /dev/null +++ b/bench/tensors/tensor_contract_sycl_bench.cc @@ -0,0 +1,325 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_BENCH_CONTRACT_SYCL +#define EIGEN_BENCH_CONTRACT_SYCL +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#include +#include +#include +#include +#include + +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +std::ofstream out("Result.txt"); + +std::chrono::time_point get_time(){ + std::chrono::time_point start, end; + return std::chrono::system_clock::now(); +} + +template +void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){ + + std::chrono::duration elapsed_seconds = end-start; + std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << + static_cast((static_cast(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; + out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << + static_cast((static_cast(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; +} + +// do a contraction which is equivalent to a matrix multiplication +template +void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + + + +// do a contraction which is equivalent to a matrix multiplication +template +void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + + +template +void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + Eigen::array sizeA; + sizeA[0] = k_; + sizeA[1] = m_; + Eigen::array sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(0, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); + +} + +template +void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array sizeB; + sizeB[0] = n_; + sizeB[1] = k_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 1); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); + +} + +template +void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = k_; + sizeA[1] = m_; + Eigen::array sizeB; + sizeB[0] = n_; + sizeB[1] = k_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(0, 1); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + +int main() { + cl::sycl::gpu_selector selector; + Eigen::QueueInterface queue(selector); + Eigen::SyclDevice device(&queue); + int64_t num_iters =20; + for(int64_t m = 32; m <= 4096; m *= 2) + for(int64_t k = 32; k <= 4096; k *= 2) + for(int64_t n = 32; n <= 4096; n*= 2){ + (contraction(device, num_iters, m, k, n)); + (contractionRowMajor(device, num_iters, m, k, n)); + (contractionAT(device, num_iters, m, k, n)); + (contractionBT(device, num_iters, m, k, n)); + (contractionABT(device, num_iters, m, k, n)); + } + return 0; + } + +#endif // EIGEN_BENCH_CONTRACT_SYCL -- cgit v1.2.3