// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2016 // Mehdi Goli Codeplay Software Ltd. // Ralph Potter Codeplay Software Ltd. // Luke Iwanski Codeplay Software Ltd. // Contact: // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #ifndef EIGEN_BENCH_CONTRACT_SYCL #define EIGEN_BENCH_CONTRACT_SYCL #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #include #include #include #include #include #include using Eigen::array; using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; std::ofstream out("Result.txt"); std::chrono::time_point get_time(){ std::chrono::time_point start, end; return std::chrono::system_clock::now(); } template void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){ std::chrono::duration elapsed_seconds = end-start; std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << static_cast((static_cast(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << static_cast((static_cast(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; } // do a contraction which is equivalent to a matrix multiplication template void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { T* a_; T* b_; T* c_; a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); // Initialize the content of the memory pools to prevent asan from // complaining. device_.memset(a_, 12, m_ * k_ * sizeof(T)); device_.memset(b_, 23, k_ * n_ * sizeof(T)); device_.memset(c_, 31, m_ * n_ * sizeof(T)); Eigen::array sizeA; sizeA[0] = m_; sizeA[1] = k_; Eigen::array sizeB; sizeB[0] = k_; sizeB[1] = n_; Eigen::array sizeC; sizeC[0] = m_; sizeC[1] = n_; const TensorMap, Eigen::Aligned> A(a_, sizeA); const TensorMap, Eigen::Aligned> B(b_, sizeB); TensorMap, Eigen::Aligned> C(c_, sizeC); typedef typename Tensor::DimensionPair DimPair; Eigen::array dims; dims[0] = DimPair(1, 0); #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { C.device(device_) = A.contract(B, dims); } #endif auto start = get_time(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.contract(B, dims); } auto end = get_time(); // Record the number of FLOPs executed per second (size_ multiplications and // additions for each value in the resulting tensor) finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction"); device_.deallocate(a_); device_.deallocate(b_); device_.deallocate(c_); device_.synchronize(); } // do a contraction which is equivalent to a matrix multiplication template void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { T* a_; T* b_; T* c_; a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); // Initialize the content of the memory pools to prevent asan from // complaining. device_.memset(a_, 12, m_ * k_ * sizeof(T)); device_.memset(b_, 23, k_ * n_ * sizeof(T)); device_.memset(c_, 31, m_ * n_ * sizeof(T)); Eigen::array sizeA; sizeA[0] = m_; sizeA[1] = k_; Eigen::array sizeB; sizeB[0] = k_; sizeB[1] = n_; Eigen::array sizeC; sizeC[0] = m_; sizeC[1] = n_; const TensorMap, Eigen::Aligned> A(a_, sizeA); const TensorMap, Eigen::Aligned> B(b_, sizeB); TensorMap, Eigen::Aligned> C(c_, sizeC); typedef typename Tensor::DimensionPair DimPair; Eigen::array dims; dims[0] = DimPair(1, 0); #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { C.device(device_) = A.contract(B, dims); } #endif auto start = get_time(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.contract(B, dims); } auto end = get_time(); // Record the number of FLOPs executed per second (size_ multiplications and // additions for each value in the resulting tensor) finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor"); device_.deallocate(a_); device_.deallocate(b_); device_.deallocate(c_); device_.synchronize(); } template void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { T* a_; T* b_; T* c_; a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); // Initialize the content of the memory pools to prevent asan from // complaining. device_.memset(a_, 12, m_ * k_ * sizeof(T)); device_.memset(b_, 23, k_ * n_ * sizeof(T)); device_.memset(c_, 31, m_ * n_ * sizeof(T)); Eigen::array sizeA; sizeA[0] = k_; sizeA[1] = m_; Eigen::array sizeB; sizeB[0] = k_; sizeB[1] = n_; Eigen::array sizeC; sizeC[0] = m_; sizeC[1] = n_; const TensorMap, Eigen::Aligned> A(a_, sizeA); const TensorMap, Eigen::Aligned> B(b_, sizeB); TensorMap, Eigen::Aligned> C(c_, sizeC); typedef typename Tensor::DimensionPair DimPair; Eigen::array dims; dims[0] = DimPair(0, 0); #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { C.device(device_) = A.contract(B, dims); } #endif auto start = get_time(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.contract(B, dims); } auto end = get_time(); // Record the number of FLOPs executed per second (size_ multiplications and // additions for each value in the resulting tensor) finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT"); device_.deallocate(a_); device_.deallocate(b_); device_.deallocate(c_); device_.synchronize(); } template void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { T* a_; T* b_; T* c_; a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); // Initialize the content of the memory pools to prevent asan from // complaining. device_.memset(a_, 12, m_ * k_ * sizeof(T)); device_.memset(b_, 23, k_ * n_ * sizeof(T)); device_.memset(c_, 31, m_ * n_ * sizeof(T)); Eigen::array sizeA; sizeA[0] = m_; sizeA[1] = k_; Eigen::array sizeB; sizeB[0] = n_; sizeB[1] = k_; Eigen::array sizeC; sizeC[0] = m_; sizeC[1] = n_; const TensorMap, Eigen::Aligned> A(a_, sizeA); const TensorMap, Eigen::Aligned> B(b_, sizeB); TensorMap, Eigen::Aligned> C(c_, sizeC); typedef typename Tensor::DimensionPair DimPair; Eigen::array dims; dims[0] = DimPair(1, 1); #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { C.device(device_) = A.contract(B, dims); } #endif auto start = get_time(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.contract(B, dims); } auto end = get_time(); // Record the number of FLOPs executed per second (size_ multiplications and // additions for each value in the resulting tensor) finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT"); device_.deallocate(a_); device_.deallocate(b_); device_.deallocate(c_); device_.synchronize(); } template void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { T* a_; T* b_; T* c_; a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); // Initialize the content of the memory pools to prevent asan from // complaining. device_.memset(a_, 12, m_ * k_ * sizeof(T)); device_.memset(b_, 23, k_ * n_ * sizeof(T)); device_.memset(c_, 31, m_ * n_ * sizeof(T)); Eigen::array sizeA; sizeA[0] = k_; sizeA[1] = m_; Eigen::array sizeB; sizeB[0] = n_; sizeB[1] = k_; Eigen::array sizeC; sizeC[0] = m_; sizeC[1] = n_; const TensorMap, Eigen::Aligned> A(a_, sizeA); const TensorMap, Eigen::Aligned> B(b_, sizeB); TensorMap, Eigen::Aligned> C(c_, sizeC); typedef typename Tensor::DimensionPair DimPair; Eigen::array dims; dims[0] = DimPair(0, 1); #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { C.device(device_) = A.contract(B, dims); } #endif auto start = get_time(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.contract(B, dims); } auto end = get_time(); // Record the number of FLOPs executed per second (size_ multiplications and // additions for each value in the resulting tensor) finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT"); device_.deallocate(a_); device_.deallocate(b_); device_.deallocate(c_); device_.synchronize(); } int main() { cl::sycl::gpu_selector selector; Eigen::QueueInterface queue(selector); Eigen::SyclDevice device(&queue); int64_t num_iters =20; for(int64_t m = 32; m <= 4096; m *= 2) for(int64_t k = 32; k <= 4096; k *= 2) for(int64_t n = 32; n <= 4096; n*= 2){ (contraction(device, num_iters, m, k, n)); (contractionRowMajor(device, num_iters, m, k, n)); (contractionAT(device, num_iters, m, k, n)); (contractionBT(device, num_iters, m, k, n)); (contractionABT(device, num_iters, m, k, n)); } return 0; } #endif // EIGEN_BENCH_CONTRACT_SYCL