From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli <mehdi.goli@codeplay.com>
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Einge upstream
 master branch. * Unifying all loadLocalTile from lhs and rhs to an
 extract_block function. * Adding get_tensor operation which was missing in
 TensorContractionMapper. * Adding the -D method missing from cmake for
 Disable_Skinny Contraction operation. * Wrapping all the indices in
 TensorScanSycl into Scan parameter struct. * Fixing typo in Device SYCL *
 Unifying load to private register for tall/skinny no shared * Unifying load
 to vector tile for tensor-vector/vector-tensor operation * Removing all the
 LHS/RHS class for extracting data from global * Removing Outputfunction from
 TensorContractionSkinnyNoshared. * Combining the local memory version of
 tall/skinny and normal tensor contraction into one kernel. * Combining the
 no-local memory version of tall/skinny and normal tensor contraction into one
 kernel. * Combining General Tensor-Vector and VectorTensor contraction into
 one kernel. * Making double buffering optional for Tensor contraction when
 local memory is version is used. * Modifying benchmark to accept custom
 Reduction Sizes * Disabling AVX optimization for SYCL backend on the host to
 allow SSE optimization to the host * Adding Test for SYCL * Modifying SYCL
 CMake

---
 bench/tensors/README                               |  17 +-
 bench/tensors/eigen_sycl_bench.sh                  |  30 ++
 bench/tensors/eigen_sycl_bench_contract.sh         |   7 +
 bench/tensors/tensor_benchmarks.h                  | 102 ++++---
 bench/tensors/tensor_benchmarks_sycl.cc            | 133 ++++++---
 .../tensor_benchmarks_sycl_include_headers.cc      |   2 -
 bench/tensors/tensor_contract_sycl_bench.cc        | 325 +++++++++++++++++++++
 7 files changed, 537 insertions(+), 79 deletions(-)
 create mode 100755 bench/tensors/eigen_sycl_bench.sh
 create mode 100644 bench/tensors/eigen_sycl_bench_contract.sh
 delete mode 100644 bench/tensors/tensor_benchmarks_sycl_include_headers.cc
 create mode 100644 bench/tensors/tensor_contract_sycl_bench.cc

(limited to 'bench')

diff --git a/bench/tensors/README b/bench/tensors/README
index 69342cc9c..dcbf0217a 100644
--- a/bench/tensors/README
+++ b/bench/tensors/README
@@ -11,15 +11,10 @@ nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBU
 We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
 nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
 
-last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
-g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+To compile and run the benchmark for SYCL, using ComputeCpp, simply run the
+following commands:
+1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
+2. bash eigen_sycl_bench.sh
 
-To compile and run the benchmark for SYCL, using ComputeCpp you currently need following passes (only for translation units containing device code):
-1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code.
-{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1
-2. The host compilation pass that generates the final host binary.
-clang++ -O3 -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o
-clang++ -O3 tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I  {ComputeCpp_ROOT}/include/ -L  {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl
-export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib
-3. Run the benchmark
-./tensor_benchmark_sycl
+Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
+g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
diff --git a/bench/tensors/eigen_sycl_bench.sh b/bench/tensors/eigen_sycl_bench.sh
new file mode 100755
index 000000000..3f67b3d86
--- /dev/null
+++ b/bench/tensors/eigen_sycl_bench.sh
@@ -0,0 +1,30 @@
+rm -f tensor_benchmark_sycl
+: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
+echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
+${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ \
+tensor_benchmarks_sycl.cc \
+benchmark_main.cc \
+-I ../../ \
+-I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ \
+-std=c++11 \
+-march=native \
+-O3 \
+-DNDEBUG \
+-DEIGEN_MPL2_ONLY \
+-DEIGEN_USE_SYCL=1 \
+-DEIGEN_SYCL_LOCAL_MEM=1 \
+-no-serial-memop \
+-mllvm \
+-inline-threshold=10000 \
+-fsycl-ih-last \
+-sycl-driver \
+-Xclang -cl-mad-enable \
+-lOpenCL \
+-lComputeCpp \
+-lpthread \
+-o \
+tensor_benchmark_sycl\
+${@:1}
+
+export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
+./tensor_benchmark_sycl
diff --git a/bench/tensors/eigen_sycl_bench_contract.sh b/bench/tensors/eigen_sycl_bench_contract.sh
new file mode 100644
index 000000000..73fd6c4a0
--- /dev/null
+++ b/bench/tensors/eigen_sycl_bench_contract.sh
@@ -0,0 +1,7 @@
+rm -f tensor_contract_sycl_bench
+: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
+echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
+${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++  tensor_contract_sycl_bench.cc -I ../../ -I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ -std=c++11 -O3 -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -no-serial-memop -mllvm -inline-threshold=10000 -fsycl-ih-last -sycl-driver -Xclang -cl-mad-enable -lOpenCL -lComputeCpp -lpthread -o tensor_contract_sycl_bench ${@:1}
+export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
+./tensor_contract_sycl_bench
+
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index 3a640ede4..0825e1563 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -27,6 +27,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     initialize();
   }
 
+  BenchmarkSuite(const Device& device, size_t m, size_t k)
+      : m_(1), k_(k), n_(m), device_(device) {
+    initialize();
+  }
+
   ~BenchmarkSuite() {
     device_.deallocate(a_);
     device_.deallocate(b_);
@@ -79,6 +84,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     sizes[0] = m_;
     sizes[1] = m_;
     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = C.random();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = C.random();
@@ -264,6 +274,7 @@ template <typename Device, typename T> class BenchmarkSuite {
     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
   }
 
+
   void broadcasting(int num_iters) {
     Eigen::array<TensorIndex, 2> size_a;
     size_a[0] = m_;
@@ -406,8 +417,8 @@ for (int iter = 0; iter < 10; ++iter) {
         b_, input_size);
     Eigen::array<TensorIndex, 1> output_size;
     output_size[0] = k_;
-    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
-        c_, output_size);
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A(
+        a_, output_size);
 
 #ifndef EIGEN_HAS_INDEX_LIST
     Eigen::array<TensorIndex, 1> sum_along_dim;
@@ -419,12 +430,12 @@ for (int iter = 0; iter < 10; ++iter) {
 #endif
 #ifdef EIGEN_USE_SYCL // warmup for sycl
   for (int iter = 0; iter < 10; ++iter) {
-    C.device(device_) = B.sum(sum_along_dim);
+    A.device(device_) = B.sum(sum_along_dim);
   }
 #endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
-      C.device(device_) = B.sum(sum_along_dim);
+      A.device(device_) = B.sum(sum_along_dim);
     }
     // Record the number of FLOP executed per second (assuming one operation
     // per value)
@@ -455,37 +466,27 @@ for (int iter = 0; iter < 10; ++iter) {
     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
   }
 
+  
+
   // do a contraction which is equivalent to a matrix multiplication
   void contraction(int num_iters) {
-    Eigen::array<TensorIndex, 2> sizeA;
-    sizeA[0] = m_;
-    sizeA[1] = k_;
-    Eigen::array<TensorIndex, 2> sizeB;
-    sizeB[0] = k_;
-    sizeB[1] = n_;
-    Eigen::array<TensorIndex, 2> sizeC;
-    sizeC[0] = m_;
-    sizeC[1] = n_;
+      contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
+  }
 
-    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
-    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
-    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+    void contractionRowMajor(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
+  }
+    
+  void contractionRowMajorAT(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
+  }
 
-    typedef typename Tensor<T, 2>::DimensionPair DimPair;
-    Eigen::array<DimPair, 1> dims;
-    dims[0] = DimPair(1, 0);
-#ifdef EIGEN_USE_SYCL // warmup for sycl
-    for (int iter = 0; iter < 10; ++iter) {
-      C.device(device_) = A.contract(B, dims);
-     }
-#endif
-    StartBenchmarkTiming();
-    for (int iter = 0; iter < num_iters; ++iter) {
-      C.device(device_) = A.contract(B, dims);
-    }
-    // Record the number of FLOP executed per second (size_ multiplications and
-    // additions for each value in the resulting tensor)
-    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
+  void contractionRowMajorBT(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
+  }
+
+  void contractionRowMajorABT(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
   }
 
   void convolution(int num_iters, int kernel_x, int kernel_y) {
@@ -513,13 +514,49 @@ for (int iter = 0; iter < 10; ++iter) {
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.convolve(B, dims);
     }
-    // Record the number of FLOP executed per second (kernel_size
+    // Record the number of FLOPs executed per second (kernel_size
     // multiplications and additions for each value in the resulting tensor)
     finalizeBenchmark(static_cast<int64_t>(2) *
         (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
   }
 
  private:
+ // do a contraction which is equivalent to a matrix multiplication
+  template<int Layout>
+  void contraction(int num_iters, bool trans_a, bool trans_b) {
+    Eigen::array<TensorIndex, 2> sizeA;
+    sizeA[0] = (trans_a ? k_: m_);
+    sizeA[1] = (trans_a ? m_:  k_);
+    Eigen::array<TensorIndex, 2> sizeB;
+    sizeB[0] = (trans_b ? n_: k_);
+    sizeB[1] = (trans_b ? k_: n_);
+    Eigen::array<TensorIndex, 2> sizeC;
+    sizeC[0] = m_;
+    sizeC[1] = n_;
+
+    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
+    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
+    TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);
+
+    typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    TensorIndex a_contract_dim = (trans_a ? 0 : 1);
+    TensorIndex b_contract_dim = (trans_b ? 1 : 0);
+    dims[0] = DimPair(a_contract_dim, b_contract_dim);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+     }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+    }
+    // Record the number of FLOP executed per second (size_ multiplications and
+    // additions for each value in the resulting tensor)
+    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
+  }
+
   void initialize() {
     a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
     b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
@@ -531,7 +568,6 @@ for (int iter = 0; iter < 10; ++iter) {
     device_.memset(b_, 23, k_ * n_ * sizeof(T));
     device_.memset(c_, 31, m_ * n_ * sizeof(T));
 
-    //BenchmarkUseRealTime();
   }
 
   inline void finalizeBenchmark(int64_t num_items) {
diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc
index cb6daac15..b8a096684 100644
--- a/bench/tensors/tensor_benchmarks_sycl.cc
+++ b/bench/tensors/tensor_benchmarks_sycl.cc
@@ -5,19 +5,76 @@
 
 #include "tensor_benchmarks.h"
 
-#define BM_FuncGPU(FUNC)                                                       \
-  static void BM_##FUNC(int iters, int N) {                                    \
-    StopBenchmarkTiming();                                                     \
-    cl::sycl::gpu_selector selector;                                           \
-    Eigen::QueueInterface queue(selector);                                     \
-    Eigen::SyclDevice device(&queue);                                          \
-    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N);                 \
-    suite.FUNC(iters);                                                         \
-  }                                                                            \
+cl::sycl::gpu_selector selector;
+Eigen::QueueInterface queue(selector);
+#define BM_FuncWithInput2DimsGPU(FUNC, D1, D2)                      \
+  static void BM_##FUNC##_##D1##x##D2(int iters, int N) {           \
+    StopBenchmarkTiming();                                          \
+    Eigen::SyclDevice device(&queue);                               \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2); \
+    suite.FUNC(iters);                                              \
+  }                                                                 \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2, 10, 10);
+
+BM_FuncWithInput2DimsGPU(rowReduction, 256, 100352);
+BM_FuncWithInput2DimsGPU(rowReduction, 64, 100352);
+BM_FuncWithInput2DimsGPU(rowReduction, 512, 25088);
+BM_FuncWithInput2DimsGPU(rowReduction, 128, 25088);
+BM_FuncWithInput2DimsGPU(rowReduction, 102, 6272);
+BM_FuncWithInput2DimsGPU(rowReduction, 256, 6272);
+BM_FuncWithInput2DimsGPU(rowReduction, 204, 1568);
+BM_FuncWithInput2DimsGPU(rowReduction, 512, 1568);
+BM_FuncWithInput2DimsGPU(rowReduction, 1024, 1568);
+BM_FuncWithInput2DimsGPU(rowReduction, 2048, 1568);
+
+BM_FuncWithInput2DimsGPU(colReduction, 100352, 256);
+BM_FuncWithInput2DimsGPU(colReduction, 100352, 64);
+BM_FuncWithInput2DimsGPU(colReduction, 25088, 512);
+BM_FuncWithInput2DimsGPU(colReduction, 6272, 102);
+BM_FuncWithInput2DimsGPU(colReduction, 25088, 128);
+BM_FuncWithInput2DimsGPU(colReduction, 6272, 256);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 204);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 512);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 1024);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 2048);
+BM_FuncWithInput2DimsGPU(fullReduction, 1001, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2050048, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2097152, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2048, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 262144, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 256, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 589824, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 1024, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 524288, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 512, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2359296, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 1048576, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 131072, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 16384, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 9408, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 64, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 4096, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 36864, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 32768, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 128, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 147456, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 65536, 1);
+#define BM_FuncGPU(FUNC)                                       \
+  static void BM_##FUNC(int iters, int N) {                    \
+    StopBenchmarkTiming();                                     \
+    Eigen::SyclDevice device(&queue);                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
+    suite.FUNC(iters);                                         \
+  }                                                            \
   BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
 
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
+
 BM_FuncGPU(memcpy);
 BM_FuncGPU(typeCasting);
+BM_FuncGPU(random);
 BM_FuncGPU(slicing);
 BM_FuncGPU(rowChip);
 BM_FuncGPU(colChip);
@@ -28,40 +85,50 @@ BM_FuncGPU(broadcasting);
 BM_FuncGPU(coeffWiseOp);
 BM_FuncGPU(algebraicFunc);
 BM_FuncGPU(transcendentalFunc);
-BM_FuncGPU(rowReduction);
-BM_FuncGPU(colReduction);
-BM_FuncGPU(fullReduction);
-
-
 // Contractions
-#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
-  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
-    StopBenchmarkTiming();                                                     \
-    cl::sycl::gpu_selector selector;                                           \
-    Eigen::QueueInterface queue(selector);                                     \
-    Eigen::SyclDevice device(&queue);                                          \
-    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2, D3);        \
-    suite.FUNC(iters);                                                         \
-  }                                                                            \
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                       \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {        \
+    StopBenchmarkTiming();                                              \
+    Eigen::SyclDevice device(&queue);                                   \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2, D3); \
+    suite.FUNC(iters);                                                  \
+  }                                                                     \
   BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
 
-
 BM_FuncWithInputDimsGPU(contraction, N, N, N);
 BM_FuncWithInputDimsGPU(contraction, 64, N, N);
 BM_FuncWithInputDimsGPU(contraction, N, 64, N);
 BM_FuncWithInputDimsGPU(contraction, N, N, 64);
 
+BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajor, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajor, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, 64);
+
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, 64);
+
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, 64);
+
+
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, 64);
 
 // Convolutions
-#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
-  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
-    StopBenchmarkTiming();                                                     \
-    cl::sycl::gpu_selector selector;                                           \
-    Eigen::QueueInterface queue(selector);                                     \
-    Eigen::SyclDevice device(&queue);                                          \
-    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N);                 \
-    suite.FUNC(iters, DIM1, DIM2);                                             \
-  }                                                                            \
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {  \
+    StopBenchmarkTiming();                                     \
+    Eigen::SyclDevice device(&queue);                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
+    suite.FUNC(iters, DIM1, DIM2);                             \
+  }                                                            \
   BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
 
 BM_FuncWithKernelDimsGPU(convolution, 7, 1);
diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc
deleted file mode 100644
index bcc3c4c79..000000000
--- a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc
+++ /dev/null
@@ -1,2 +0,0 @@
-#include "tensor_benchmarks_sycl.cc"
-#include "tensor_benchmarks_sycl.sycl"
diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc
new file mode 100644
index 000000000..8f2defe42
--- /dev/null
+++ b/bench/tensors/tensor_contract_sycl_bench.cc
@@ -0,0 +1,325 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_BENCH_CONTRACT_SYCL
+#define EIGEN_BENCH_CONTRACT_SYCL
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#include <SYCL/sycl.hpp>
+#include <fstream>
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+std::ofstream out("Result.txt");
+
+std::chrono::time_point<std::chrono::system_clock> get_time(){
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  return std::chrono::system_clock::now();
+}
+
+template<typename Start, typename End, typename TensorIndex>
+void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
+
+  std::chrono::duration<double> elapsed_seconds = end-start;
+  std::cout <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
+  static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
+    out <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
+    static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
+}
+
+// do a contraction which is equivalent to a matrix multiplication
+template<typename T, typename Device, typename TensorIndex>
+void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+ auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+
+
+// do a contraction which is equivalent to a matrix multiplication
+template<typename T, typename Device, typename TensorIndex>
+void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = k_;
+  sizeA[1] = m_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(0, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+
+}
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = n_;
+  sizeB[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 1);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+
+}
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = k_;
+  sizeA[1] = m_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = n_;
+  sizeB[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(0, 1);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+int main() {
+  cl::sycl::gpu_selector selector;
+  Eigen::QueueInterface queue(selector);
+  Eigen::SyclDevice device(&queue);
+  int64_t num_iters =20;
+  for(int64_t m = 32; m <= 4096; m *= 2)
+    for(int64_t k = 32; k <= 4096; k *= 2)
+      for(int64_t n = 32; n <= 4096; n*= 2){
+        (contraction<float>(device, num_iters, m, k, n));
+        (contractionRowMajor<float>(device, num_iters, m, k, n));
+        (contractionAT<float>(device, num_iters, m, k, n));
+        (contractionBT<float>(device, num_iters, m, k, n));
+        (contractionABT<float>(device, num_iters, m, k, n));
+      }
+  return 0;
+  }
+
+#endif // EIGEN_BENCH_CONTRACT_SYCL
-- 
cgit v1.2.3