From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream
 master branch.

* Unifying all loadLocalTile from lhs and rhs to an extract_block function.
* Adding get_tensor operation which was missing in TensorContractionMapper.
* Adding the -D option missing from cmake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in the SYCL Device.
* Unifying load to private register for tall/skinny no-shared.
* Unifying load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining general tensor-vector and vector-tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying SYCL CMake.
---
 bench/tensors/tensor_contract_sycl_bench.cc | 325 ++++++++++++++++++++++++++++
 1 file changed, 325 insertions(+)
 create mode 100644 bench/tensors/tensor_contract_sycl_bench.cc

diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc
new file mode 100644
index 000000000..8f2defe42
--- /dev/null
+++ b/bench/tensors/tensor_contract_sycl_bench.cc
@@ -0,0 +1,325 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact:
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_BENCH_CONTRACT_SYCL
+#define EIGEN_BENCH_CONTRACT_SYCL
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#include <SYCL/sycl.hpp>
+#include <fstream>
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+std::ofstream out("Result.txt");
+
+std::chrono::time_point<std::chrono::system_clock> get_time(){
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  return std::chrono::system_clock::now();
+}
+
+template<typename Start, typename End, typename TensorIndex>
+void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
+
+  std::chrono::duration<double> elapsed_seconds = end-start;
+  std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
+  static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
+  out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
+  static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
+}
+
+// do a contraction which is equivalent to a matrix multiplication
+template<typename T, typename Device, typename TensorIndex>
+void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+
+
+// do a contraction which is equivalent to a matrix multiplication
+template<typename T, typename Device, typename TensorIndex>
+void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2, Eigen::RowMajor>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = k_;
+  sizeA[1] = m_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(0, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+
+}
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = n_;
+  sizeB[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 1);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+
+}
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = k_;
+  sizeA[1] = m_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = n_;
+  sizeB[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(0, 1);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+int main() {
+  cl::sycl::gpu_selector selector;
+  Eigen::QueueInterface queue(selector);
+  Eigen::SyclDevice device(&queue);
+  int64_t num_iters = 20;
+  for(int64_t m = 32; m <= 4096; m *= 2)
+    for(int64_t k = 32; k <= 4096; k *= 2)
+      for(int64_t n = 32; n <= 4096; n *= 2){
+        (contraction<float>(device, num_iters, m, k, n));
+        (contractionRowMajor<float>(device, num_iters, m, k, n));
+        (contractionAT<float>(device, num_iters, m, k, n));
+        (contractionBT<float>(device, num_iters, m, k, n));
+        (contractionABT<float>(device, num_iters, m, k, n));
+      }
+  return 0;
+}
+
+#endif // EIGEN_BENCH_CONTRACT_SYCL
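
Note (not part of the patch): the five benchmark variants above differ only in how the A and B buffers are shaped (and, for contractionRowMajor, in their storage order) and in which DimPair is contracted. A minimal CPU-only sketch of that mapping follows; it assumes Eigen's unsupported CXX11 Tensor module is on the include path, the small sizes and the names C0..C3 are arbitrary illustration choices, and it runs on the default CPU evaluator so no SYCL setup is needed.

// Sketch: DimPair(1,0) on (A,B), DimPair(0,0) on (A^T,B), DimPair(1,1) on
// (A,B^T) and DimPair(0,1) on (A^T,B^T) all compute the same product A*B,
// mirroring contraction / contractionAT / contractionBT / contractionABT.
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  const Eigen::Index m = 4, k = 3, n = 5;
  Eigen::Tensor<float, 2> A(m, k), B(k, n), At(k, m), Bt(n, k);
  A.setRandom();
  B.setRandom();
  // Build explicit transposes so each variant below matches one benchmark case.
  const Eigen::array<int, 2> shuf{{1, 0}};
  At = A.shuffle(shuf);
  Bt = B.shuffle(shuf);

  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> d10 = {{DimPair(1, 0)}};  // contraction
  Eigen::array<DimPair, 1> d00 = {{DimPair(0, 0)}};  // contractionAT
  Eigen::array<DimPair, 1> d11 = {{DimPair(1, 1)}};  // contractionBT
  Eigen::array<DimPair, 1> d01 = {{DimPair(0, 1)}};  // contractionABT

  Eigen::Tensor<float, 2> C0 = A.contract(B, d10);
  Eigen::Tensor<float, 2> C1 = At.contract(B, d00);
  Eigen::Tensor<float, 2> C2 = A.contract(Bt, d11);
  Eigen::Tensor<float, 2> C3 = At.contract(Bt, d01);

  // All four results should match up to rounding; print the largest gaps.
  Eigen::Tensor<float, 0> e1 = (C1 - C0).abs().maximum();
  Eigen::Tensor<float, 0> e2 = (C2 - C0).abs().maximum();
  Eigen::Tensor<float, 0> e3 = (C3 - C0).abs().maximum();
  std::cout << e1() << " " << e2() << " " << e3() << "\n";
  return 0;
}

The same index pairs drive the SYCL kernels exercised by the patch; the benchmark merely swaps the declared sizes of A and B instead of materialising real transposes, since its inputs are memset-filled placeholders used only for timing.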