[SYCL] Rebasing the SYCL support branch on top of the Einge upstream master branch.

* Unifying all loadLocalTile from lhs and rhs to an extract_block function. * Adding get_tensor operation which was missing in TensorContractionMapper. * Adding the -D method missing from cmake for Disable_Skinny Contraction operation. * Wrapping all the indices in TensorScanSycl into Scan parameter struct. * Fixing typo in Device SYCL * Unifying load to private register for tall/skinny no shared * Unifying load to vector tile for tensor-vector/vector-tensor operation * Removing all the LHS/RHS class for extracting data from global * Removing Outputfunction from TensorContractionSkinnyNoshared. * Combining the local memory version of tall/skinny and normal tensor contraction into one kernel. * Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel. * Combining General Tensor-Vector and VectorTensor contraction into one kernel. * Making double buffering optional for Tensor contraction when local memory is version is used. * Modifying benchmark to accept custom Reduction Sizes * Disabling AVX optimization for SYCL backend on the host to allow SSE optimization to the host * Adding Test for SYCL * Modifying SYCL CMake
author: Mehdi Goli <mehdi.goli@codeplay.com> 2019-11-28 10:08:54 +0000
committer: Mehdi Goli <mehdi.goli@codeplay.com> 2019-11-28 10:08:54 +0000
commit: 00f32752f7d0b193c6788691c3cf0b76457a044d (patch)
tree: 792e46110f0751ea8802fa9d403d1472d5977ac3 /bench/tensors/tensor_contract_sycl_bench.cc
parent: ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8 (diff)
1 files changed, 325 insertions, 0 deletions
diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc
new file mode 100644
index 000000000..8f2defe42
--- /dev/null
+++ b/bench/tensors/tensor_contract_sycl_bench.cc
@@ -0,0 +1,325 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_BENCH_CONTRACT_SYCL
+#define EIGEN_BENCH_CONTRACT_SYCL
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#include <SYCL/sycl.hpp>
+#include <fstream>
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+std::ofstream out("Result.txt");
+
+std::chrono::time_point<std::chrono::system_clock> get_time(){
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  return std::chrono::system_clock::now();
+}
+
+template<typename Start, typename End, typename TensorIndex>
+void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
+
+  std::chrono::duration<double> elapsed_seconds = end-start;
+  std::cout <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
+  static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
+    out <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
+    static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
+}
+
+// do a contraction which is equivalent to a matrix multiplication
+template<typename T, typename Device, typename TensorIndex>
+void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+ auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+
+
+// do a contraction which is equivalent to a matrix multiplication
+template<typename T, typename Device, typename TensorIndex>
+void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = k_;
+  sizeA[1] = m_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(0, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+
+}
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = n_;
+  sizeB[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 1);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+
+}
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = k_;
+  sizeA[1] = m_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = n_;
+  sizeB[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(0, 1);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+int main() {
+  cl::sycl::gpu_selector selector;
+  Eigen::QueueInterface queue(selector);
+  Eigen::SyclDevice device(&queue);
+  int64_t num_iters =20;
+  for(int64_t m = 32; m <= 4096; m *= 2)
+    for(int64_t k = 32; k <= 4096; k *= 2)
+      for(int64_t n = 32; n <= 4096; n*= 2){
+        (contraction<float>(device, num_iters, m, k, n));
+        (contractionRowMajor<float>(device, num_iters, m, k, n));
+        (contractionAT<float>(device, num_iters, m, k, n));
+        (contractionBT<float>(device, num_iters, m, k, n));
+        (contractionABT<float>(device, num_iters, m, k, n));
+      }
+  return 0;
+  }
+
+#endif // EIGEN_BENCH_CONTRACT_SYCL
author	Mehdi Goli <mehdi.goli@codeplay.com>	2019-11-28 10:08:54 +0000
committer	Mehdi Goli <mehdi.goli@codeplay.com>	2019-11-28 10:08:54 +0000
commit	00f32752f7d0b193c6788691c3cf0b76457a044d (patch)
tree	792e46110f0751ea8802fa9d403d1472d5977ac3 /bench/tensors/tensor_contract_sycl_bench.cc
parent	ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8 (diff)