diff options
author | Mehdi Goli <mehdi.goli@codeplay.com> | 2019-11-28 10:08:54 +0000 |
---|---|---|
committer | Mehdi Goli <mehdi.goli@codeplay.com> | 2019-11-28 10:08:54 +0000 |
commit | 00f32752f7d0b193c6788691c3cf0b76457a044d (patch) | |
tree | 792e46110f0751ea8802fa9d403d1472d5977ac3 /bench/tensors/tensor_contract_sycl_bench.cc | |
parent | ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8 (diff) |
[SYCL] Rebasing the SYCL support branch on top of the Einge upstream master branch.
* Unifying all loadLocalTile from lhs and rhs to an extract_block function.
* Adding get_tensor operation which was missing in TensorContractionMapper.
* Adding the -D method missing from cmake for Disable_Skinny Contraction operation.
* Wrapping all the indices in TensorScanSycl into Scan parameter struct.
* Fixing typo in Device SYCL
* Unifying load to private register for tall/skinny no shared
* Unifying load to vector tile for tensor-vector/vector-tensor operation
* Removing all the LHS/RHS class for extracting data from global
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining General Tensor-Vector and VectorTensor contraction into one kernel.
* Making double buffering optional for Tensor contraction when local memory is version is used.
* Modifying benchmark to accept custom Reduction Sizes
* Disabling AVX optimization for SYCL backend on the host to allow SSE optimization to the host
* Adding Test for SYCL
* Modifying SYCL CMake
Diffstat (limited to 'bench/tensors/tensor_contract_sycl_bench.cc')
-rw-r--r-- | bench/tensors/tensor_contract_sycl_bench.cc | 325 |
1 files changed, 325 insertions, 0 deletions
diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc new file mode 100644 index 000000000..8f2defe42 --- /dev/null +++ b/bench/tensors/tensor_contract_sycl_bench.cc @@ -0,0 +1,325 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_BENCH_CONTRACT_SYCL +#define EIGEN_BENCH_CONTRACT_SYCL +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#include <SYCL/sycl.hpp> +#include <fstream> +#include <iostream> +#include <chrono> +#include <ctime> + +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +std::ofstream out("Result.txt"); + +std::chrono::time_point<std::chrono::system_clock> get_time(){ + std::chrono::time_point<std::chrono::system_clock> start, end; + return std::chrono::system_clock::now(); +} + +template<typename Start, typename End, typename TensorIndex> +void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){ + + std::chrono::duration<double> elapsed_seconds = end-start; + std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << + static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; + out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << + static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; +} + +// do a contraction which is equivalent to a matrix multiplication +template<typename T, typename Device, typename TensorIndex> +void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array<TensorIndex, 2> sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array<TensorIndex, 2> sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array<TensorIndex, 2> sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<T, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + + + +// do a contraction which is equivalent to a matrix multiplication +template<typename T, typename Device, typename TensorIndex> +void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array<TensorIndex, 2> sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array<TensorIndex, 2> sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array<TensorIndex, 2> sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<T, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + + +template<typename T, typename Device, typename TensorIndex> +void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + Eigen::array<TensorIndex, 2> sizeA; + sizeA[0] = k_; + sizeA[1] = m_; + Eigen::array<TensorIndex, 2> sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array<TensorIndex, 2> sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<T, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims; + dims[0] = DimPair(0, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); + +} + +template<typename T, typename Device, typename TensorIndex> +void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array<TensorIndex, 2> sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array<TensorIndex, 2> sizeB; + sizeB[0] = n_; + sizeB[1] = k_; + Eigen::array<TensorIndex, 2> sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<T, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims; + dims[0] = DimPair(1, 1); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); + +} + +template<typename T, typename Device, typename TensorIndex> +void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array<TensorIndex, 2> sizeA; + sizeA[0] = k_; + sizeA[1] = m_; + Eigen::array<TensorIndex, 2> sizeB; + sizeB[0] = n_; + sizeB[1] = k_; + Eigen::array<TensorIndex, 2> sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<T, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims; + dims[0] = DimPair(0, 1); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + +int main() { + cl::sycl::gpu_selector selector; + Eigen::QueueInterface queue(selector); + Eigen::SyclDevice device(&queue); + int64_t num_iters =20; + for(int64_t m = 32; m <= 4096; m *= 2) + for(int64_t k = 32; k <= 4096; k *= 2) + for(int64_t n = 32; n <= 4096; n*= 2){ + (contraction<float>(device, num_iters, m, k, n)); + (contractionRowMajor<float>(device, num_iters, m, k, n)); + (contractionAT<float>(device, num_iters, m, k, n)); + (contractionBT<float>(device, num_iters, m, k, n)); + (contractionABT<float>(device, num_iters, m, k, n)); + } + return 0; + } + +#endif // EIGEN_BENCH_CONTRACT_SYCL |