From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.

* Unifying all loadLocalTile from lhs and rhs to an extract_block function.
* Adding the get_tensor operation, which was missing from TensorContractionMapper.
* Adding the -D definition missing from cmake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in Device SYCL.
* Unifying the load to private registers for the tall/skinny no-shared-memory case.
* Unifying the load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining general Tensor-Vector and Vector-Tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local-memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake.
---
 CMakeLists.txt | 15 +
 Eigen/src/Core/arch/SYCL/InteropHeaders.h | 4 +
 Eigen/src/Core/util/ConfigureVectorization.h | 30 +-
 Eigen/src/Core/util/Macros.h | 2 +-
 bench/tensors/README | 17 +-
 bench/tensors/eigen_sycl_bench.sh | 30 +
 bench/tensors/eigen_sycl_bench_contract.sh | 7 +
 bench/tensors/tensor_benchmarks.h | 102 +-
 bench/tensors/tensor_benchmarks_sycl.cc | 133 +-
 .../tensor_benchmarks_sycl_include_headers.cc | 2 -
 bench/tensors/tensor_contract_sycl_bench.cc | 325 ++++
 cmake/EigenTesting.cmake | 161 +-
 cmake/FindComputeCpp.cmake | 539 ++++--
 unsupported/Eigen/CXX11/Tensor | 23 +-
 .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 152 --
 .../CXX11/src/Tensor/TensorContractionMapper.h | 4 +
 .../Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 1890 ++++++++++++++++----
 .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 689 +++----
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 245 ++-
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 150 +-
 .../CXX11/src/Tensor/TensorForwardDeclarations.h | 2 +-
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 4 +-
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +-
 .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 673 +++++--
 .../Eigen/CXX11/src/Tensor/TensorScanSycl.h | 512 ++++++
 unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h | 120 --
 .../Tensor/TensorSyclConvertToDeviceExpression.h | 205 ---
 .../CXX11/src/Tensor/TensorSyclExprConstructor.h | 514 ------
 .../CXX11/src/Tensor/TensorSyclExtractAccessor.h | 310 ----
 .../CXX11/src/Tensor/TensorSyclExtractFunctors.h | 467 -----
 .../Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 248 ---
 .../Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 213 ---
 .../CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 302 ----
 unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h | 96 -
 .../Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 239 ---
 unsupported/doc/Overview.dox | 3 +
 unsupported/doc/SYCL.dox | 9 +
 unsupported/doc/examples/CMakeLists.txt | 4 +
 unsupported/doc/examples/SYCL/CMakeLists.txt | 38 +
 unsupported/doc/examples/SYCL/CwiseMul.cpp | 63 +
 unsupported/test/CMakeLists.txt | 129 +-
 unsupported/test/cxx11_tensor_argmax_sycl.cpp | 136 +-
unsupported/test/cxx11_tensor_builtins_sycl.cpp | 497 ++--- unsupported/test/cxx11_tensor_chipping_sycl.cpp | 7 +- unsupported/test/cxx11_tensor_contract_sycl.cpp | 1010 +++++++++-- unsupported/test/cxx11_tensor_custom_op_sycl.cpp | 5 + unsupported/test/cxx11_tensor_forced_eval_sycl.cpp | 5 +- unsupported/test/cxx11_tensor_image_op_sycl.cpp | 103 ++ unsupported/test/cxx11_tensor_math_sycl.cpp | 105 ++ unsupported/test/cxx11_tensor_morphing_sycl.cpp | 138 ++ unsupported/test/cxx11_tensor_random_sycl.cpp | 100 ++ unsupported/test/cxx11_tensor_reduction_sycl.cpp | 941 +++++++++- unsupported/test/cxx11_tensor_reverse_sycl.cpp | 128 +- unsupported/test/cxx11_tensor_scan_sycl.cpp | 141 ++ unsupported/test/cxx11_tensor_shuffling_sycl.cpp | 52 +- unsupported/test/cxx11_tensor_sycl.cpp | 91 +- 56 files changed, 7321 insertions(+), 4811 deletions(-) create mode 100755 bench/tensors/eigen_sycl_bench.sh create mode 100644 bench/tensors/eigen_sycl_bench_contract.sh delete mode 100644 bench/tensors/tensor_benchmarks_sycl_include_headers.cc create mode 100644 bench/tensors/tensor_contract_sycl_bench.cc delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h mode change 100644 => 100755 unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h create mode 100644 unsupported/doc/SYCL.dox create mode 100644 unsupported/doc/examples/SYCL/CMakeLists.txt create mode 100644 unsupported/doc/examples/SYCL/CwiseMul.cpp create mode 100644 unsupported/test/cxx11_tensor_image_op_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_math_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_random_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_scan_sycl.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 619bd18f8..36a155133 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -492,6 +492,21 @@ if(EIGEN_TEST_SYCL) else() message(STATUS "Using ComputeCPP SYCL") include(FindComputeCpp) + set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF) + if (NOT MSVC) + set(COMPUTECPP_DRIVER_DEFAULT_VALUE ON) + endif() + option(COMPUTECPP_USE_COMPILER_DRIVER + "Use ComputeCpp driver instead of a 2 steps compilation" + ${COMPUTECPP_DRIVER_DEFAULT_VALUE} + ) + endif(EIGEN_SYCL_TRISYCL) + option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." 
OFF) + if(EIGEN_DONT_VECTORIZE_SYCL) + message(STATUS "Disabling SYCL vectorization in tests/examples") + # When disabling SYCL vectorization, also disable Eigen default vectorization + add_definitions(-DEIGEN_DONT_VECTORIZE=1) + add_definitions(-DEIGEN_DONT_VECTORIZE_SYCL=1) endif() endif() diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h index d76030419..5cef1a49f 100644 --- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h +++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -161,6 +161,8 @@ struct PacketWrapper { eigen_assert(false && "INDEX MUST BE BETWEEN 0 and 3"); abort(); } + __builtin_unreachable(); + } EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type( Scalar in, Scalar other) { @@ -203,6 +205,8 @@ struct PacketWrapper { eigen_assert(false && "INDEX MUST BE BETWEEN 0 and 1"); abort(); } + __builtin_unreachable(); + } EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type( Scalar in, Scalar other) { diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index fdb1627a1..d52805d32 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -240,15 +240,19 @@ #define EIGEN_VECTORIZE_SSE4_2 #endif #ifdef __AVX__ - #define EIGEN_VECTORIZE_AVX + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX + #endif #define EIGEN_VECTORIZE_SSE3 #define EIGEN_VECTORIZE_SSSE3 #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif #ifdef __AVX2__ - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif #define EIGEN_VECTORIZE_SSE3 #define EIGEN_VECTORIZE_SSSE3 #define EIGEN_VECTORIZE_SSE4_1 @@ -267,19 +271,23 @@ #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638). 
#endif #endif - #define EIGEN_VECTORIZE_AVX512 - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX512 + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif #define EIGEN_VECTORIZE_FMA #define EIGEN_VECTORIZE_SSE3 #define EIGEN_VECTORIZE_SSSE3 #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 - #ifdef __AVX512DQ__ - #define EIGEN_VECTORIZE_AVX512DQ - #endif - #ifdef __AVX512ER__ - #define EIGEN_VECTORIZE_AVX512ER + #ifndef EIGEN_USE_SYCL + #ifdef __AVX512DQ__ + #define EIGEN_VECTORIZE_AVX512DQ + #endif + #ifdef __AVX512ER__ + #define EIGEN_VECTORIZE_AVX512ER + #endif #endif #endif diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index e7bf75a81..2b40c5fd0 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -854,7 +854,7 @@ #ifndef EIGEN_DONT_VECTORIZE #define EIGEN_DONT_VECTORIZE #endif - #define EIGEN_DEVICE_FUNC __attribute__((always_inline)) + #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline)) // All functions callable from CUDA/HIP code must be qualified with __device__ #elif defined(EIGEN_GPUCC) #define EIGEN_DEVICE_FUNC __host__ __device__ diff --git a/bench/tensors/README b/bench/tensors/README index 69342cc9c..dcbf0217a 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -11,15 +11,10 @@ nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBU We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu -last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call -g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu +To compile and run the benchmark for SYCL, using ComputeCpp, simply run the +following commands: +1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY} +2. bash eigen_sycl_bench.sh -To compile and run the benchmark for SYCL, using ComputeCpp you currently need following passes (only for translation units containing device code): -1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code. -{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1 -2. The host compilation pass that generates the final host binary. 
-clang++ -O3 -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o -clang++ -O3 tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl -export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib -3. Run the benchmark -./tensor_benchmark_sycl +Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call +g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu diff --git a/bench/tensors/eigen_sycl_bench.sh b/bench/tensors/eigen_sycl_bench.sh new file mode 100755 index 000000000..3f67b3d86 --- /dev/null +++ b/bench/tensors/eigen_sycl_bench.sh @@ -0,0 +1,30 @@ +rm -f tensor_benchmark_sycl +: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}" +echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR +${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ \ +tensor_benchmarks_sycl.cc \ +benchmark_main.cc \ +-I ../../ \ +-I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ \ +-std=c++11 \ +-march=native \ +-O3 \ +-DNDEBUG \ +-DEIGEN_MPL2_ONLY \ +-DEIGEN_USE_SYCL=1 \ +-DEIGEN_SYCL_LOCAL_MEM=1 \ +-no-serial-memop \ +-mllvm \ +-inline-threshold=10000 \ +-fsycl-ih-last \ +-sycl-driver \ +-Xclang -cl-mad-enable \ +-lOpenCL \ +-lComputeCpp \ +-lpthread \ +-o \ +tensor_benchmark_sycl\ +${@:1} + +export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH +./tensor_benchmark_sycl diff --git a/bench/tensors/eigen_sycl_bench_contract.sh b/bench/tensors/eigen_sycl_bench_contract.sh new file mode 100644 index 000000000..73fd6c4a0 --- /dev/null +++ b/bench/tensors/eigen_sycl_bench_contract.sh @@ -0,0 +1,7 @@ +rm -f tensor_contract_sycl_bench +: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}" +echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR +${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ tensor_contract_sycl_bench.cc -I ../../ -I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ -std=c++11 -O3 -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -no-serial-memop -mllvm -inline-threshold=10000 -fsycl-ih-last -sycl-driver -Xclang -cl-mad-enable -lOpenCL -lComputeCpp -lpthread -o tensor_contract_sycl_bench ${@:1} +export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH +./tensor_contract_sycl_bench + diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 3a640ede4..0825e1563 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -27,6 +27,11 @@ template class BenchmarkSuite { initialize(); } + BenchmarkSuite(const Device& device, size_t m, size_t k) + : m_(1), k_(k), n_(m), device_(device) { + initialize(); + } + ~BenchmarkSuite() { device_.deallocate(a_); device_.deallocate(b_); @@ -79,6 +84,11 @@ template class BenchmarkSuite { sizes[0] = m_; sizes[1] = m_; TensorMap, Eigen::Aligned> C(c_, sizes); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = C.random(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = C.random(); @@ -264,6 +274,7 @@ template class BenchmarkSuite { finalizeBenchmark(static_cast(m_) * k_ * num_iters); } + void broadcasting(int num_iters) { 
Eigen::array size_a; size_a[0] = m_; @@ -406,8 +417,8 @@ for (int iter = 0; iter < 10; ++iter) { b_, input_size); Eigen::array output_size; output_size[0] = k_; - TensorMap, Eigen::Aligned> C( - c_, output_size); + TensorMap, Eigen::Aligned> A( + a_, output_size); #ifndef EIGEN_HAS_INDEX_LIST Eigen::array sum_along_dim; @@ -419,12 +430,12 @@ for (int iter = 0; iter < 10; ++iter) { #endif #ifdef EIGEN_USE_SYCL // warmup for sycl for (int iter = 0; iter < 10; ++iter) { - C.device(device_) = B.sum(sum_along_dim); + A.device(device_) = B.sum(sum_along_dim); } #endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - C.device(device_) = B.sum(sum_along_dim); + A.device(device_) = B.sum(sum_along_dim); } // Record the number of FLOP executed per second (assuming one operation // per value) @@ -455,37 +466,27 @@ for (int iter = 0; iter < 10; ++iter) { finalizeBenchmark(static_cast(k_) * n_ * num_iters); } + + // do a contraction which is equivalent to a matrix multiplication void contraction(int num_iters) { - Eigen::array sizeA; - sizeA[0] = m_; - sizeA[1] = k_; - Eigen::array sizeB; - sizeB[0] = k_; - sizeB[1] = n_; - Eigen::array sizeC; - sizeC[0] = m_; - sizeC[1] = n_; + contraction(Eigen::ColMajor)>(num_iters, false, false); + } - const TensorMap, Eigen::Aligned> A(a_, sizeA); - const TensorMap, Eigen::Aligned> B(b_, sizeB); - TensorMap, Eigen::Aligned> C(c_, sizeC); + void contractionRowMajor(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, false, false); + } + + void contractionRowMajorAT(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, true, false); + } - typedef typename Tensor::DimensionPair DimPair; - Eigen::array dims; - dims[0] = DimPair(1, 0); -#ifdef EIGEN_USE_SYCL // warmup for sycl - for (int iter = 0; iter < 10; ++iter) { - C.device(device_) = A.contract(B, dims); - } -#endif - StartBenchmarkTiming(); - for (int iter = 0; iter < num_iters; ++iter) { - C.device(device_) = A.contract(B, dims); - } - // Record the number of FLOP executed per second (size_ multiplications and - // additions for each value in the resulting tensor) - finalizeBenchmark(static_cast(2) * m_ * n_ * k_ * num_iters); + void contractionRowMajorBT(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, false, true); + } + + void contractionRowMajorABT(int num_iters) { + contraction(Eigen::RowMajor)>(num_iters, true, true); } void convolution(int num_iters, int kernel_x, int kernel_y) { @@ -513,13 +514,49 @@ for (int iter = 0; iter < 10; ++iter) { for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.convolve(B, dims); } - // Record the number of FLOP executed per second (kernel_size + // Record the number of FLOPs executed per second (kernel_size // multiplications and additions for each value in the resulting tensor) finalizeBenchmark(static_cast(2) * (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters); } private: + // do a contraction which is equivalent to a matrix multiplication + template + void contraction(int num_iters, bool trans_a, bool trans_b) { + Eigen::array sizeA; + sizeA[0] = (trans_a ? k_: m_); + sizeA[1] = (trans_a ? m_: k_); + Eigen::array sizeB; + sizeB[0] = (trans_b ? n_: k_); + sizeB[1] = (trans_b ? 
k_: n_); + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + TensorIndex a_contract_dim = (trans_a ? 0 : 1); + TensorIndex b_contract_dim = (trans_b ? 1 : 0); + dims[0] = DimPair(a_contract_dim, b_contract_dim); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + // Record the number of FLOP executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(static_cast(2) * m_ * n_ * k_ * num_iters); + } + void initialize() { a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); @@ -531,7 +568,6 @@ for (int iter = 0; iter < 10; ++iter) { device_.memset(b_, 23, k_ * n_ * sizeof(T)); device_.memset(c_, 31, m_ * n_ * sizeof(T)); - //BenchmarkUseRealTime(); } inline void finalizeBenchmark(int64_t num_items) { diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc index cb6daac15..b8a096684 100644 --- a/bench/tensors/tensor_benchmarks_sycl.cc +++ b/bench/tensors/tensor_benchmarks_sycl.cc @@ -5,19 +5,76 @@ #include "tensor_benchmarks.h" -#define BM_FuncGPU(FUNC) \ - static void BM_##FUNC(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ - Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters); \ - } \ +cl::sycl::gpu_selector selector; +Eigen::QueueInterface queue(selector); +#define BM_FuncWithInput2DimsGPU(FUNC, D1, D2) \ + static void BM_##FUNC##_##D1##x##D2(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, D1, D2); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2, 10, 10); + +BM_FuncWithInput2DimsGPU(rowReduction, 256, 100352); +BM_FuncWithInput2DimsGPU(rowReduction, 64, 100352); +BM_FuncWithInput2DimsGPU(rowReduction, 512, 25088); +BM_FuncWithInput2DimsGPU(rowReduction, 128, 25088); +BM_FuncWithInput2DimsGPU(rowReduction, 102, 6272); +BM_FuncWithInput2DimsGPU(rowReduction, 256, 6272); +BM_FuncWithInput2DimsGPU(rowReduction, 204, 1568); +BM_FuncWithInput2DimsGPU(rowReduction, 512, 1568); +BM_FuncWithInput2DimsGPU(rowReduction, 1024, 1568); +BM_FuncWithInput2DimsGPU(rowReduction, 2048, 1568); + +BM_FuncWithInput2DimsGPU(colReduction, 100352, 256); +BM_FuncWithInput2DimsGPU(colReduction, 100352, 64); +BM_FuncWithInput2DimsGPU(colReduction, 25088, 512); +BM_FuncWithInput2DimsGPU(colReduction, 6272, 102); +BM_FuncWithInput2DimsGPU(colReduction, 25088, 128); +BM_FuncWithInput2DimsGPU(colReduction, 6272, 256); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 204); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 512); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 1024); +BM_FuncWithInput2DimsGPU(colReduction, 1568, 2048); +BM_FuncWithInput2DimsGPU(fullReduction, 1001, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2050048, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2097152, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2048, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 262144, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 256, 1); 
+BM_FuncWithInput2DimsGPU(fullReduction, 589824, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 1024, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 524288, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 512, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 2359296, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 1048576, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 131072, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 16384, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 9408, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 64, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 4096, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 36864, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 32768, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 128, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 147456, 1); +BM_FuncWithInput2DimsGPU(fullReduction, 65536, 1); +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); + BM_FuncGPU(memcpy); BM_FuncGPU(typeCasting); +BM_FuncGPU(random); BM_FuncGPU(slicing); BM_FuncGPU(rowChip); BM_FuncGPU(colChip); @@ -28,40 +85,50 @@ BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); BM_FuncGPU(algebraicFunc); BM_FuncGPU(transcendentalFunc); -BM_FuncGPU(rowReduction); -BM_FuncGPU(colReduction); -BM_FuncGPU(fullReduction); - - // Contractions -#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ - static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ - Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, D1, D2, D3); \ - suite.FUNC(iters); \ - } \ +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); - BM_FuncWithInputDimsGPU(contraction, N, N, N); BM_FuncWithInputDimsGPU(contraction, 64, N, N); BM_FuncWithInputDimsGPU(contraction, N, 64, N); BM_FuncWithInputDimsGPU(contraction, N, N, 64); +BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajor, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajor, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, 64); + +BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorAT, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, 64); + +BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorBT, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, 64); + + +BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorABT, 64, N, N); +BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, 64, N); +BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, 64); // Convolutions -#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ - static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ 
- Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters, DIM1, DIM2); \ - } \ +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); BM_FuncWithKernelDimsGPU(convolution, 7, 1); diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc deleted file mode 100644 index bcc3c4c79..000000000 --- a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc +++ /dev/null @@ -1,2 +0,0 @@ -#include "tensor_benchmarks_sycl.cc" -#include "tensor_benchmarks_sycl.sycl" diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc new file mode 100644 index 000000000..8f2defe42 --- /dev/null +++ b/bench/tensors/tensor_contract_sycl_bench.cc @@ -0,0 +1,325 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifndef EIGEN_BENCH_CONTRACT_SYCL +#define EIGEN_BENCH_CONTRACT_SYCL +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#include +#include +#include +#include +#include + +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +std::ofstream out("Result.txt"); + +std::chrono::time_point get_time(){ + std::chrono::time_point start, end; + return std::chrono::system_clock::now(); +} + +template +void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){ + + std::chrono::duration elapsed_seconds = end-start; + std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << + static_cast((static_cast(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; + out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " << + static_cast((static_cast(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n"; +} + +// do a contraction which is equivalent to a matrix multiplication +template +void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. 
+ device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + + + +// do a contraction which is equivalent to a matrix multiplication +template +void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + + +template +void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. 
+ device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + Eigen::array sizeA; + sizeA[0] = k_; + sizeA[1] = m_; + Eigen::array sizeB; + sizeB[0] = k_; + sizeB[1] = n_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(0, 0); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); + +} + +template +void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = m_; + sizeA[1] = k_; + Eigen::array sizeB; + sizeB[0] = n_; + sizeB[1] = k_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 1); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); + +} + +template +void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) { + T* a_; + T* b_; + T* c_; + a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); + b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); + c_ = (T *) device_.allocate(m_ * n_ * sizeof(T)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. 
+ device_.memset(a_, 12, m_ * k_ * sizeof(T)); + device_.memset(b_, 23, k_ * n_ * sizeof(T)); + device_.memset(c_, 31, m_ * n_ * sizeof(T)); + + Eigen::array sizeA; + sizeA[0] = k_; + sizeA[1] = m_; + Eigen::array sizeB; + sizeB[0] = n_; + sizeB[1] = k_; + Eigen::array sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(0, 1); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + auto start = get_time(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + auto end = get_time(); + // Record the number of FLOPs executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT"); + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + device_.synchronize(); +} + +int main() { + cl::sycl::gpu_selector selector; + Eigen::QueueInterface queue(selector); + Eigen::SyclDevice device(&queue); + int64_t num_iters =20; + for(int64_t m = 32; m <= 4096; m *= 2) + for(int64_t k = 32; k <= 4096; k *= 2) + for(int64_t n = 32; n <= 4096; n*= 2){ + (contraction(device, num_iters, m, k, n)); + (contractionRowMajor(device, num_iters, m, k, n)); + (contractionAT(device, num_iters, m, k, n)); + (contractionBT(device, num_iters, m, k, n)); + (contractionABT(device, num_iters, m, k, n)); + } + return 0; + } + +#endif // EIGEN_BENCH_CONTRACT_SYCL diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index f8ffe2387..524223717 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -113,111 +113,28 @@ macro(ei_add_test_internal testname testname_with_suffix) add_dependencies("Build${current_subproject}" ${targetname}) set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}") endif() - -endmacro() - -# SYCL -macro(ei_add_test_internal_sycl testname testname_with_suffix) - set(targetname ${testname_with_suffix}) - - if(EIGEN_ADD_TEST_FILENAME_EXTENSION) - set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION}) - else() - set(filename ${testname}.cpp) - endif() - - set( include_file "${CMAKE_CURRENT_BINARY_DIR}/inc_${filename}") - set( bc_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.sycl") - set( host_file "${CMAKE_CURRENT_SOURCE_DIR}/${filename}") - - if(NOT EIGEN_SYCL_TRISYCL) - include_directories( SYSTEM ${COMPUTECPP_PACKAGE_ROOT_DIR}/include) - - add_custom_command( - OUTPUT ${include_file} - COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file} - COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}\\\"" >> ${include_file} - DEPENDS ${filename} ${bc_file} - COMMENT "Building ComputeCpp integration header file ${include_file}" - ) - - # Add a custom target for the generated integration header - add_custom_target("${testname}_integration_header_sycl" DEPENDS ${include_file}) - - add_executable(${targetname} ${include_file}) - add_dependencies(${targetname} "${testname}_integration_header_sycl") - else() - add_executable(${targetname} ${host_file}) - endif() - - add_sycl_to_target(${targetname} ${CMAKE_CURRENT_BINARY_DIR} ${filename}) - - if (targetname MATCHES "^eigen2_") - add_dependencies(eigen2_buildtests ${targetname}) - else() - 
add_dependencies(buildtests ${targetname}) - endif() - - if(EIGEN_NO_ASSERTION_CHECKING) - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1") - else() - if(EIGEN_DEBUG_ASSERTS) - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1") - endif() - endif() - - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}") - - if(MSVC AND NOT EIGEN_SPLIT_LARGE_TESTS) - ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj") - endif() - - # let the user pass flags. - if(${ARGC} GREATER 2) - ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}") - endif() - - if(EIGEN_TEST_CUSTOM_CXX_FLAGS) - ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") - endif() - - if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) - target_link_libraries(${targetname} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) - endif() - if(EXTERNAL_LIBS) - target_link_libraries(${targetname} ${EXTERNAL_LIBS}) - endif() - if(EIGEN_TEST_CUSTOM_LINKER_FLAGS) - target_link_libraries(${targetname} ${EIGEN_TEST_CUSTOM_LINKER_FLAGS}) - endif() - - if(${ARGC} GREATER 3) - set(libs_to_link ${ARGV3}) - # it could be that some cmake module provides a bad library string " " (just spaces), - # and that severely breaks target_link_libraries ("can't link to -l-lstdc++" errors). - # so we check for strings containing only spaces. - string(STRIP "${libs_to_link}" libs_to_link_stripped) - string(LENGTH "${libs_to_link_stripped}" libs_to_link_stripped_length) - if(${libs_to_link_stripped_length} GREATER 0) - # notice: no double quotes around ${libs_to_link} here. It may be a list. - target_link_libraries(${targetname} ${libs_to_link}) - endif() - endif() - - add_test(${testname_with_suffix} "${targetname}") - - # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT - get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT) - if ((current_subproject) AND (NOT (current_subproject STREQUAL ""))) - set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}") - add_dependencies("Build${current_subproject}" ${targetname}) - set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}") - endif() - - -endmacro() - - + if(EIGEN_SYCL) + # Force include of the SYCL file at the end to avoid errors. + set_property(TARGET ${targetname} PROPERTY COMPUTECPP_INCLUDE_AFTER 1) + # Set COMPILE_FLAGS to COMPILE_DEFINITIONS instead to avoid having to duplicate the flags + # to the device compiler. 
+ get_target_property(target_compile_flags ${targetname} COMPILE_FLAGS) + separate_arguments(target_compile_flags) + foreach(flag ${target_compile_flags}) + if(${flag} MATCHES "^-D.*") + string(REPLACE "-D" "" definition_flag ${flag}) + set_property(TARGET ${targetname} APPEND PROPERTY COMPILE_DEFINITIONS ${definition_flag}) + list(REMOVE_ITEM target_compile_flags ${flag}) + endif() + endforeach() + set_property(TARGET ${targetname} PROPERTY COMPILE_FLAGS ${target_compile_flags}) + # Link against pthread and add sycl to target + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + target_link_libraries(${targetname} Threads::Threads) + add_sycl_to_target(TARGET ${targetname} SOURCES ${filename}) + endif(EIGEN_SYCL) +endmacro(ei_add_test_internal) # Macro to add a test # # the unique mandatory parameter testname must correspond to a file @@ -296,40 +213,6 @@ macro(ei_add_test testname) endif() endmacro() -macro(ei_add_test_sycl testname) - get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST) - set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}${testname}\n") - set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}") - - if(EIGEN_ADD_TEST_FILENAME_EXTENSION) - set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION}) - else() - set(filename ${testname}.cpp) - endif() - - file(READ "${filename}" test_source) - set(parts 0) - string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+" - occurrences "${test_source}") - string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}") - list(REMOVE_DUPLICATES suffixes) - if(EIGEN_SPLIT_LARGE_TESTS AND suffixes) - add_custom_target(${testname}) - foreach(suffix ${suffixes}) - ei_add_test_internal_sycl(${testname} ${testname}_${suffix} - "${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}") - add_dependencies(${testname} ${testname}_${suffix}) - endforeach() - else() - set(symbols_to_enable_all_parts "") - foreach(suffix ${suffixes}) - set(symbols_to_enable_all_parts - "${symbols_to_enable_all_parts} -DEIGEN_TEST_PART_${suffix}=1") - endforeach() - ei_add_test_internal_sycl(${testname} ${testname} "${ARGV1} ${symbols_to_enable_all_parts}" "${ARGV2}") - endif() -endmacro() - # adds a failtest, i.e. a test that succeed if the program fails to compile # note that the test runner for these is CMake itself, when passed -DEIGEN_FAILTEST=ON # so here we're just running CMake commands immediately, we're not adding any targets. diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index f84a2554d..c926ee292 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -2,7 +2,7 @@ # FindComputeCpp #--------------- # -# Copyright 2016 Codeplay Software Ltd. +# Copyright 2016-2018 Codeplay Software Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use these files except in compliance with the License. @@ -23,244 +23,421 @@ # # Tools for finding and building with ComputeCpp. # -# User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp -# installation. +# User must define ComputeCpp_DIR pointing to the ComputeCpp +# installation. 
# # Latest version of this file can be found at: # https://github.com/codeplaysoftware/computecpp-sdk -# Require CMake version 3.2.2 or higher -cmake_minimum_required(VERSION 3.2.2) - -# Check that a supported host compiler can be found -if(CMAKE_COMPILER_IS_GNUCXX) - # Require at least gcc 4.8 - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) - message(FATAL_ERROR - "host compiler - Not found! (gcc version must be at least 4.8)") - else() - message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}") - endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - # Require at least clang 3.6 - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6) - message(FATAL_ERROR - "host compiler - Not found! (clang version must be at least 3.6)") - else() - message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}") - endif() -else() - message(WARNING - "host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)") -endif() - -set(COMPUTECPP_64_BIT_DEFAULT ON) -option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode" - ${COMPUTECPP_64_BIT_DEFAULT}) -mark_as_advanced(COMPUTECPP_64_BIT_CODE) - -option(COMPUTECPP_DISABLE_GCC_DUAL_ABI "Compile with pre-5.1 ABI" OFF) -mark_as_advanced(COMPUTECPP_DISABLE_GCC_DUAL_ABI) +cmake_minimum_required(VERSION 3.4.3) +include(FindPackageHandleStandardArgs) set(COMPUTECPP_USER_FLAGS "" CACHE STRING "User flags for compute++") +separate_arguments(COMPUTECPP_USER_FLAGS) mark_as_advanced(COMPUTECPP_USER_FLAGS) -# Find OpenCL package -find_package(OpenCL REQUIRED) +set(COMPUTECPP_BITCODE "spir64" CACHE STRING + "Bitcode type to use as SYCL target in compute++") +mark_as_advanced(COMPUTECPP_BITCODE) -# Find ComputeCpp packagee -if(NOT COMPUTECPP_PACKAGE_ROOT_DIR) - message(FATAL_ERROR - "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR") -else() - message(STATUS "ComputeCpp package - Found") -endif() +find_package(OpenCL REQUIRED) -# Obtain the path to compute++ -find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS - ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) -if (EXISTS ${COMPUTECPP_DEVICE_COMPILER}) - mark_as_advanced(COMPUTECPP_DEVICE_COMPILER) - message(STATUS "compute++ - Found") -else() - message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})") -endif() +# Find ComputeCpp package -# Obtain the path to computecpp_info -find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS - ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) -if (EXISTS ${COMPUTECPP_INFO_TOOL}) - mark_as_advanced(${COMPUTECPP_INFO_TOOL}) - message(STATUS "computecpp_info - Found") -else() - message(FATAL_ERROR "computecpp_info - Not found! 
(${COMPUTECPP_INFO_TOOL})") +if(DEFINED ComputeCpp_DIR) + set(computecpp_find_hint ${ComputeCpp_DIR}) +elseif(DEFINED ENV{COMPUTECPP_DIR}) + set(computecpp_find_hint $ENV{COMPUTECPP_DIR}) endif() -# Obtain the path to the ComputeCpp runtime library -find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR} - HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib - DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH) +# Used for running executables on the host +set(computecpp_host_find_hint ${computecpp_find_hint}) -if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY}) - mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY) - message(STATUS "libComputeCpp.so - Found") -else() - message(FATAL_ERROR "libComputeCpp.so - Not found!") +if(CMAKE_CROSSCOMPILING) + # ComputeCpp_HOST_DIR is used to find executables that are run on the host + if(DEFINED ComputeCpp_HOST_DIR) + set(computecpp_host_find_hint ${ComputeCpp_HOST_DIR}) + elseif(DEFINED ENV{COMPUTECPP_HOST_DIR}) + set(computecpp_host_find_hint $ENV{COMPUTECPP_HOST_DIR}) + endif() endif() -# Obtain the ComputeCpp include directory -set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/) -if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY}) - message(FATAL_ERROR "ComputeCpp includes - Not found!") +find_program(ComputeCpp_DEVICE_COMPILER_EXECUTABLE compute++ + HINTS ${computecpp_host_find_hint} + PATH_SUFFIXES bin) + +find_program(ComputeCpp_INFO_EXECUTABLE computecpp_info + HINTS ${computecpp_host_find_hint} + PATH_SUFFIXES bin) + +find_library(COMPUTECPP_RUNTIME_LIBRARY + NAMES ComputeCpp ComputeCpp_vs2015 + HINTS ${computecpp_find_hint} + PATH_SUFFIXES lib + DOC "ComputeCpp Runtime Library") + +find_library(COMPUTECPP_RUNTIME_LIBRARY_DEBUG + NAMES ComputeCpp ComputeCpp_vs2015_d + HINTS ${computecpp_find_hint} + PATH_SUFFIXES lib + DOC "ComputeCpp Debug Runtime Library") + +find_path(ComputeCpp_INCLUDE_DIRS + NAMES "CL/sycl.hpp" + HINTS ${computecpp_find_hint}/include + DOC "The ComputeCpp include directory") +get_filename_component(ComputeCpp_INCLUDE_DIRS ${ComputeCpp_INCLUDE_DIRS} ABSOLUTE) + +get_filename_component(computecpp_canonical_root_dir "${ComputeCpp_INCLUDE_DIRS}/.." 
ABSOLUTE) +set(ComputeCpp_ROOT_DIR "${computecpp_canonical_root_dir}" CACHE PATH + "The root of the ComputeCpp install") + +if(NOT ComputeCpp_INFO_EXECUTABLE) + message(WARNING "Can't find computecpp_info - check ComputeCpp_DIR") else() - message(STATUS "ComputeCpp includes - Found") -endif() + execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-version" + OUTPUT_VARIABLE ComputeCpp_VERSION + RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0") + message(WARNING "Package version - Error obtaining version!") + endif() -# Obtain the package version -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version" - OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "Package version - Error obtaining version!") -else() - mark_as_advanced(COMPUTECPP_PACKAGE_VERSION) - message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}") + execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-is-supported" + OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED + RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0") + message(WARNING "platform - Error checking platform support!") + else() + mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED) + if (COMPUTECPP_PLATFORM_IS_SUPPORTED) + message(STATUS "platform - your system can support ComputeCpp") + else() + message(WARNING "platform - your system CANNOT support ComputeCpp") + endif() + endif() endif() -# Obtain the device compiler flags -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags" - OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!") -else() - mark_as_advanced(COMPUTECPP_COMPILER_FLAGS) - message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") +find_package_handle_standard_args(ComputeCpp + REQUIRED_VARS ComputeCpp_ROOT_DIR + ComputeCpp_DEVICE_COMPILER_EXECUTABLE + ComputeCpp_INFO_EXECUTABLE + COMPUTECPP_RUNTIME_LIBRARY + COMPUTECPP_RUNTIME_LIBRARY_DEBUG + ComputeCpp_INCLUDE_DIRS + VERSION_VAR ComputeCpp_VERSION) +mark_as_advanced(ComputeCpp_ROOT_DIR + ComputeCpp_DEVICE_COMPILER_EXECUTABLE + ComputeCpp_INFO_EXECUTABLE + COMPUTECPP_RUNTIME_LIBRARY + COMPUTECPP_RUNTIME_LIBRARY_DEBUG + ComputeCpp_INCLUDE_DIRS + ComputeCpp_VERSION) + +if(NOT ComputeCpp_FOUND) + return() endif() -# Check if the platform is supported -execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported" - OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED - RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") - message(FATAL_ERROR "platform - Error checking platform support!") -else() - mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED) - if (COMPUTECPP_PLATFORM_IS_SUPPORTED) - message(STATUS "platform - your system can support ComputeCpp") - else() - message(STATUS "platform - your system CANNOT support ComputeCpp") +list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -O2 -mllvm -inline-threshold=1000 -intelspirmetadata) +mark_as_advanced(COMPUTECPP_DEVICE_COMPILER_FLAGS) + +if(CMAKE_CROSSCOMPILING) + if(NOT COMPUTECPP_DONT_USE_TOOLCHAIN) + 
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --gcc-toolchain=${COMPUTECPP_TOOLCHAIN_DIR}) endif() + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --sysroot=${COMPUTECPP_SYSROOT_DIR}) + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -target ${COMPUTECPP_TARGET_TRIPLE}) +endif() + +list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -sycl-target ${COMPUTECPP_BITCODE}) +message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") + +if(NOT TARGET OpenCL::OpenCL) + add_library(OpenCL::OpenCL UNKNOWN IMPORTED) + set_target_properties(OpenCL::OpenCL PROPERTIES + IMPORTED_LOCATION "${OpenCL_LIBRARIES}" + INTERFACE_INCLUDE_DIRECTORIES "${OpenCL_INCLUDE_DIRS}" + ) endif() -set(COMPUTECPP_USER_FLAGS - -sycl-compress-name - -Wall - -no-serial-memop - -DEIGEN_NO_ASSERTION_CHECKING=1 +if(NOT TARGET ComputeCpp::ComputeCpp) + add_library(ComputeCpp::ComputeCpp UNKNOWN IMPORTED) + set_target_properties(ComputeCpp::ComputeCpp PROPERTIES + IMPORTED_LOCATION_DEBUG "${COMPUTECPP_RUNTIME_LIBRARY_DEBUG}" + IMPORTED_LOCATION_RELWITHDEBINFO "${COMPUTECPP_RUNTIME_LIBRARY_DEBUG}" + IMPORTED_LOCATION "${COMPUTECPP_RUNTIME_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${ComputeCpp_INCLUDE_DIRS}" + INTERFACE_LINK_LIBRARIES "OpenCL::OpenCL" ) +endif() + +# This property allows targets to specify that their sources should be +# compiled with the integration header included after the user's +# sources, not before (e.g. when an enum is used in a kernel name, this +# is not technically valid SYCL code but can work with ComputeCpp) +define_property( + TARGET PROPERTY COMPUTECPP_INCLUDE_AFTER + BRIEF_DOCS "Include integration header after user source" + FULL_DOCS "Changes compiler arguments such that the source file is + actually the integration header, and the .cpp file is included on + the command line so that it is seen by the compiler first. Enables + non-standards-conformant SYCL code to compile with ComputeCpp." +) +define_property( + TARGET PROPERTY INTERFACE_COMPUTECPP_FLAGS + BRIEF_DOCS "Interface compile flags to provide compute++" + FULL_DOCS "Set additional compile flags to pass to compute++ when compiling + any target which links to this one." +) +define_property( + SOURCE PROPERTY COMPUTECPP_SOURCE_FLAGS + BRIEF_DOCS "Source file compile flags for compute++" + FULL_DOCS "Set additional compile flags for compiling the SYCL integration + header for the given source file." +) #################### -# __build_sycl +# __build_ir #################### # # Adds a custom target for running compute++ and adding a dependency for the # resulting integration header. # -# targetName : Name of the target. -# sourceFile : Source file to be compiled. -# binaryDir : Intermediate directory to output the integration header. -# fileCounter : Counter included in name of custom target. Different counter -# values prevent duplicated names of custom target when source files with the same name, -# but located in different directories, are used for the same target. +# TARGET : Name of the target. +# SOURCE : Source file to be compiled. +# COUNTER : Counter included in name of custom target. Different counter +# values prevent duplicated names of custom target when source files with +# the same name, but located in different directories, are used for the +# same target. # -function(__build_spir targetName sourceFile binaryDir fileCounter) - - # Retrieve source file name. - get_filename_component(sourceFileName ${sourceFile} NAME) - - # Set the path to the Sycl file. 
- set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl) - - # Add any user-defined include to the device compiler - set(device_compiler_includes "") - get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY - INCLUDE_DIRECTORIES) - foreach(directory ${includeDirectories}) - set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) - endforeach() - get_target_property(targetIncludeDirectories ${targetName} INCLUDE_DIRECTORIES) - foreach(directory ${targetIncludeDirectories}) - set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) - endforeach() - if (CMAKE_INCLUDE_PATH) - foreach(directory ${CMAKE_INCLUDE_PATH}) - set(device_compiler_includes "-I${directory}" - ${device_compiler_includes}) +function(__build_ir) + set(options) + set(one_value_args + TARGET + SOURCE + COUNTER + ) + set(multi_value_args) + cmake_parse_arguments(SDK_BUILD_IR + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + get_filename_component(sourceFileName ${SDK_BUILD_IR_SOURCE} NAME) + + # Set the path to the integration header. + # The .sycl filename must depend on the target so that different targets + # using the same source file will be generated with a different rule. + set(baseSyclName ${CMAKE_CURRENT_BINARY_DIR}/${SDK_BUILD_IR_TARGET}_${sourceFileName}) + set(outputSyclFile ${baseSyclName}.sycl) + set(depFileName ${baseSyclName}.sycl.d) + + set(include_directories "$") + set(compile_definitions "$") + set(generated_include_directories + $<$:-I\"$\">) + set(generated_compile_definitions + $<$:-D$>) + + # Obtain language standard of the file + set(device_compiler_cxx_standard) + get_target_property(targetCxxStandard ${SDK_BUILD_IR_TARGET} CXX_STANDARD) + if (targetCxxStandard MATCHES 17) + set(device_compiler_cxx_standard "-std=c++1z") + elseif (targetCxxStandard MATCHES 14) + set(device_compiler_cxx_standard "-std=c++14") + elseif (targetCxxStandard MATCHES 11) + set(device_compiler_cxx_standard "-std=c++11") + elseif (targetCxxStandard MATCHES 98) + message(FATAL_ERROR "SYCL applications cannot be compiled using C++98") + else () + set(device_compiler_cxx_standard "") + endif() + + get_property(source_compile_flags + SOURCE ${SDK_BUILD_IR_SOURCE} + PROPERTY COMPUTECPP_SOURCE_FLAGS + ) + separate_arguments(source_compile_flags) + if(source_compile_flags) + list(APPEND computecpp_source_flags ${source_compile_flags}) + endif() + + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS + ${device_compiler_cxx_standard} + ${COMPUTECPP_USER_FLAGS} + ${computecpp_source_flags} + ) + + set(ir_dependencies ${SDK_BUILD_IR_SOURCE}) + get_target_property(target_libraries ${SDK_BUILD_IR_TARGET} LINK_LIBRARIES) + if(target_libraries) + foreach(library ${target_libraries}) + list(APPEND ir_dependencies ${library}) endforeach() endif() - set(COMPUTECPP_DEVICE_COMPILER_FLAGS - ${COMPUTECPP_DEVICE_COMPILER_FLAGS} - ${COMPUTECPP_USER_FLAGS}) - # Convert argument list format - separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS) + # Depfile support was only added in CMake 3.7 + # CMake throws an error if it is unsupported by the generator (i. e. 
not ninja) + if((NOT CMAKE_VERSION VERSION_LESS 3.7.0) AND + CMAKE_GENERATOR MATCHES "Ninja") + file(RELATIVE_PATH relOutputFile ${CMAKE_BINARY_DIR} ${outputSyclFile}) + set(generate_depfile -MMD -MF ${depFileName} -MT ${relOutputFile}) + set(enable_depfile DEPFILE ${depFileName}) + endif() # Add custom command for running compute++ add_custom_command( OUTPUT ${outputSyclFile} - COMMAND ${COMPUTECPP_DEVICE_COMPILER} + COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE} ${COMPUTECPP_DEVICE_COMPILER_FLAGS} - -isystem ${COMPUTECPP_INCLUDE_DIRECTORY} - ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS} - ${device_compiler_includes} + ${generated_include_directories} + ${generated_compile_definitions} -o ${outputSyclFile} - -c ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} - IMPLICIT_DEPENDS CXX "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}" - WORKING_DIRECTORY ${binaryDir} + -c ${SDK_BUILD_IR_SOURCE} + ${generate_depfile} + DEPENDS ${ir_dependencies} + IMPLICIT_DEPENDS CXX ${SDK_BUILD_IR_SOURCE} + ${enable_depfile} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") - # Add a custom target for the generated integration header - add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile}) + # Name: (user-defined name)_(source file)_(counter)_ih + set(headerTargetName + ${SDK_BUILD_IR_TARGET}_${sourceFileName}_${SDK_BUILD_IR_COUNTER}_ih) + + if(NOT MSVC) + # Add a custom target for the generated integration header + add_custom_target(${headerTargetName} DEPENDS ${outputSyclFile}) + add_dependencies(${SDK_BUILD_IR_TARGET} ${headerTargetName}) + endif() + + # This property can be set on a per-target basis to indicate that the + # integration header should appear after the main source listing + get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER) + + if(includeAfter) + # Change the source file to the integration header - e.g. + # g++ -c source_file_name.cpp.sycl + get_target_property(current_sources ${SDK_BUILD_IR_TARGET} SOURCES) + # Remove absolute path to source file + list(REMOVE_ITEM current_sources ${SDK_BUILD_IR_SOURCE}) + # Remove relative path to source file + string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" + rel_source_file ${SDK_BUILD_IR_SOURCE} + ) + list(REMOVE_ITEM current_sources ${rel_source_file}) + # Add SYCL header to source list + list(APPEND current_sources ${outputSyclFile}) + set_property(TARGET ${SDK_BUILD_IR_TARGET} + PROPERTY SOURCES ${current_sources}) + # CMake/gcc don't know what language a .sycl file is, so tell them + set_property(SOURCE ${outputSyclFile} PROPERTY LANGUAGE CXX) + set(includedFile ${SDK_BUILD_IR_SOURCE}) + set(cppFile ${outputSyclFile}) + else() + set_property(SOURCE ${outputSyclFile} PROPERTY HEADER_FILE_ONLY ON) + set(includedFile ${outputSyclFile}) + set(cppFile ${SDK_BUILD_IR_SOURCE}) + endif() - # Add a dependency on the integration header - add_dependencies(${targetName} ${targetName}_integration_header) + # Force inclusion of the integration header for the host compiler + if(MSVC) + # Group SYCL files inside Visual Studio + source_group("SYCL" FILES ${outputSyclFile}) + + if(includeAfter) + # Allow the source file to be edited using Visual Studio. + # It will be added as a header file so it won't be compiled. 
+ set_property(SOURCE ${SDK_BUILD_IR_SOURCE} PROPERTY HEADER_FILE_ONLY true) + endif() - # Set the host compiler C++ standard to C++11 - set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11) + # Add both the source and the SYCL files to the VS solution. + target_sources(${SDK_BUILD_IR_TARGET} PUBLIC ${SDK_BUILD_IR_SOURCE} ${outputSyclFile}) - # Disable GCC dual ABI on GCC 5.1 and higher - if(COMPUTECPP_DISABLE_GCC_DUAL_ABI) - set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS - "_GLIBCXX_USE_CXX11_ABI=0") + set(forceIncludeFlags "/FI${includedFile} /TP") + else() + set(forceIncludeFlags "-include ${includedFile} -x c++") endif() -endfunction() + set_property( + SOURCE ${cppFile} + APPEND_STRING PROPERTY COMPILE_FLAGS "${forceIncludeFlags}" + ) + +endfunction(__build_ir) ####################### # add_sycl_to_target ####################### # # Adds a SYCL compilation custom command associated with an existing -# target and sets a dependency on that new command. +# target and sets a dependency on that new command. # -# targetName : Name of the target to add a SYCL to. -# binaryDir : Intermediate directory to output the integration header. -# sourceFiles : Source files to be compiled for SYCL. +# TARGET : Name of the target to add SYCL to. +# SOURCES : Source files to be compiled for SYCL. # -function(add_sycl_to_target targetName binaryDir sourceFiles) +function(add_sycl_to_target) + set(options) + set(one_value_args + TARGET + ) + set(multi_value_args + SOURCES + ) + cmake_parse_arguments(SDK_ADD_SYCL + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) - set(sourceFiles ${sourceFiles} ${ARGN}) - set(fileCounter 0) - # Add custom target to run compute++ and generate the integration header - foreach(sourceFile ${sourceFiles}) - __build_spir(${targetName} ${sourceFile} ${binaryDir} ${fileCounter}) - math(EXPR fileCounter "${fileCounter} + 1") - endforeach() + # If the CXX compiler is set to compute++, enable the driver.
+ get_filename_component(cmakeCxxCompilerFileName "${CMAKE_CXX_COMPILER}" NAME) + if("${cmakeCxxCompilerFileName}" STREQUAL "compute++") + if(MSVC) + message(FATAL_ERROR "The compiler driver is not supported by this system, + revert the CXX compiler to your default host compiler.") + endif() - # Link with the ComputeCpp runtime library - target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY} - PUBLIC ${OpenCL_LIBRARIES}) + get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER) + if(includeAfter) + list(APPEND COMPUTECPP_USER_FLAGS -fsycl-ih-last) + endif() + list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl-driver) + # Prepend COMPUTECPP_DEVICE_COMPILER_FLAGS and append COMPUTECPP_USER_FLAGS + foreach(prop COMPILE_OPTIONS INTERFACE_COMPILE_OPTIONS) + get_target_property(target_compile_options ${SDK_ADD_SYCL_TARGET} ${prop}) + if(NOT target_compile_options) + set(target_compile_options "") + endif() + set_property( + TARGET ${SDK_ADD_SYCL_TARGET} + PROPERTY ${prop} + ${COMPUTECPP_DEVICE_COMPILER_FLAGS} + ${target_compile_options} + ${COMPUTECPP_USER_FLAGS} + ) + endforeach() + else() + set(fileCounter 0) + list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl) + # Add custom target to run compute++ and generate the integration header + foreach(sourceFile ${SDK_ADD_SYCL_SOURCES}) + if(NOT IS_ABSOLUTE ${sourceFile}) + set(sourceFile "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}") + endif() + __build_ir( + TARGET ${SDK_ADD_SYCL_TARGET} + SOURCE ${sourceFile} + COUNTER ${fileCounter} + ) + MATH(EXPR fileCounter "${fileCounter} + 1") + endforeach() + endif() -endfunction() + set_property(TARGET ${SDK_ADD_SYCL_TARGET} + APPEND PROPERTY LINK_LIBRARIES ComputeCpp::ComputeCpp) + set_property(TARGET ${SDK_ADD_SYCL_TARGET} + APPEND PROPERTY INTERFACE_LINK_LIBRARIES ComputeCpp::ComputeCpp) +endfunction(add_sycl_to_target) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 6a8dc2cd8..f8a62253c 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -15,19 +15,6 @@ #if EIGEN_HAS_CXX11 -#if defined(EIGEN_USE_SYCL) -#undef min -#undef max -#undef isnan -#undef isinf -#undef isfinite -#include -#include -#include -#include -#include -#endif - #include "../SpecialFunctions" #include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" @@ -72,7 +59,7 @@ typedef unsigned __int64 uint64_t; #include #endif -#ifdef EIGEN_USE_THREADS +#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) #include "ThreadPool" #endif @@ -147,7 +134,13 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorScan.h" #include "src/Tensor/TensorTrace.h" -#include "src/Tensor/TensorSycl.h" +#ifdef EIGEN_USE_SYCL +#include "src/Tensor/TensorReductionSycl.h" +#include "src/Tensor/TensorConvolutionSycl.h" +#include "src/Tensor/TensorContractionSycl.h" +#include "src/Tensor/TensorScanSycl.h" +#endif + #include "src/Tensor/TensorExecutor.h" #include "src/Tensor/TensorDevice.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h deleted file mode 100644 index 2184c94b3..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ /dev/null @@ -1,152 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. 
-// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorArgMaxSycl.h - * \brief: - * TensorArgMaxSycl - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP -namespace Eigen { -namespace internal { - template - struct eval, Eigen::Dense> - { - typedef const TensorTupleReducerDeviceOp& type; - }; - - template - struct nested, 1, - typename eval >::type> - { - typedef TensorTupleReducerDeviceOp type; - }; - -template -struct traits > : public traits -{ - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef Index Scalar; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - - -}// end namespace internal -template -class TensorTupleReducerDeviceOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - typedef typename XprType::CoeffReturnType TupleType; - typedef Index CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr, - const Index return_dim, - const StrideDims strides, - const Index stride_mod, const Index stride_div) - :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - Index return_dim() const { return m_return_dim; } - - EIGEN_DEVICE_FUNC - const StrideDims& strides() const { return m_strides; } - - EIGEN_DEVICE_FUNC - const Index& stride_mod() const { return m_stride_mod; } - - EIGEN_DEVICE_FUNC - const Index& stride_div() const { return m_stride_div; } - - protected: - typename Eigen::internal::remove_all::type m_xpr; - const Index m_return_dim; - const StrideDims m_strides; - const Index m_stride_mod; - const Index m_stride_div; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, SyclKernelDevice> -{ - typedef TensorTupleReducerDeviceOp XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::TupleType TupleType; - typedef typename TensorEvaluator::Dimensions Dimensions; - - enum { - IsAligned = false, - PacketAccess = false, - BlockAccessV2 = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device) - : 
m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()), - m_stride_div(op.stride_div()){} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - const TupleType v = m_impl.coeff(index); - return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; - } -typedef typename MakeGlobalPointer::CoeffReturnType >::Type ptr_Dev_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast(m_impl.data()); } - -protected: - TensorEvaluator m_impl; - const Index m_return_dim; - const StrideDims m_strides; - const Index m_stride_mod; - const Index m_stride_div; -}; -} // end namespace Eigen -#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 50865d404..9ab900b4a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -545,6 +545,10 @@ class TensorContractionInputMapper EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { return VectorMapper(*this, i, j); } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader& get_tensor() const { + return Base::m_tensor; + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h old mode 100644 new mode 100755 index 35f931c53..a6ca1777a --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -1,136 +1,1386 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. +// This file is part of Eigen, a lightweight C++ template library for linear algebra. // // Mehdi Goli Codeplay Software Ltd. // Ralph Potter Codeplay Software Ltd. // Luke Iwanski Codeplay Software Ltd. // Contact: // -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not +// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
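For orientation, the contraction kernels defined in this file are not called directly; they are reached through the ordinary Eigen Tensor API when an expression is assigned on a SyclDevice. The following is a minimal usage sketch, in the spirit of the cxx11_tensor_contract_sycl.cpp test touched by this patch; the buffer-management calls follow the Eigen SyclDevice interface, and the sizes, selector choice, and variable names are illustrative only, not part of the patch.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  using Tensor2f = Eigen::Tensor<float, 2>;
  const Eigen::Index m = 64, k = 128, n = 32;

  // Build a SYCL device from the default selector.
  Eigen::QueueInterface queue_interface((cl::sycl::default_selector()));
  Eigen::SyclDevice sycl_device(&queue_interface);

  Tensor2f lhs(m, k), rhs(k, n), out(m, n);
  lhs.setRandom();
  rhs.setRandom();

  // Device-side buffers managed through the SyclDevice allocator.
  float* d_lhs = static_cast<float*>(sycl_device.allocate(lhs.size() * sizeof(float)));
  float* d_rhs = static_cast<float*>(sycl_device.allocate(rhs.size() * sizeof(float)));
  float* d_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));

  Eigen::TensorMap<Tensor2f> gpu_lhs(d_lhs, m, k);
  Eigen::TensorMap<Tensor2f> gpu_rhs(d_rhs, k, n);
  Eigen::TensorMap<Tensor2f> gpu_out(d_out, m, n);

  sycl_device.memcpyHostToDevice(d_lhs, lhs.data(), lhs.size() * sizeof(float));
  sycl_device.memcpyHostToDevice(d_rhs, rhs.data(), rhs.size() * sizeof(float));

  // Contract lhs dimension 1 with rhs dimension 0 (a plain matrix product);
  // evaluating on a SyclDevice dispatches to the kernels defined below.
  Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> dims = {{Eigen::IndexPair<Eigen::Index>(1, 0)}};
  gpu_out.device(sycl_device) = gpu_lhs.contract(gpu_rhs, dims);

  sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));
  sycl_device.deallocate(d_lhs);
  sycl_device.deallocate(d_rhs);
  sycl_device.deallocate(d_out);
  return 0;
}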
/***************************************************************** - * TensorTensorContractionsycl.h + * TensorContractionSycl.h * * \brief: - * TensorContractionsycl + * TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend * -*****************************************************************/ + *****************************************************************/ #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H + namespace Eigen { -template struct LaunchSyclKernels; -template -struct TensorEvaluator, const Eigen::SyclDevice> : - public TensorContractionEvaluatorBase, const Eigen::SyclDevice> > { +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_DISABLE_GEMV +/*! + * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector + * contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor/vector + * + * \tparam StorageIndex determines the Index type. + * + * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group + * + * \tparam CFactor: determines the number of contracting element to be process by each thread + * + * \tparam NCFactor: determines the number of non-contracting element to be process by each thread + */ +template +struct TVPanelSize { + // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimNC: determines the tile size for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor; + // TileSizeDimC: determines the tile size for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; + // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; + // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = false; +}; +#endif + +/*! + * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor + contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor + * + * \tparam StorageIndex: determines the Index type. + * + * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro). + * + * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro). + * + * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered + */ + +template +struct TTPanelSize { + // TileSizeDimK: determines Tile size for dimension K. 
The packet size is assumed to be accounted for. + static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; + // WorkLoadPerThreadM: determines the workload per thread for loading the M dimension. This can be varied based on the + // available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro). +#ifndef EIGEN_SYCL_REG_M + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; +#endif +// WorkLoadPerThreadN: determines the workload per thread for loading the N dimension. This can be varied based on the +// available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro). +#ifndef EIGEN_SYCL_REG_N + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; +#endif + // LocalThreadSizeM: determines the total number of threads per workgroup for the m dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeN: determines the total number of threads per workgroup for the n dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimM: determines the tile size for the m dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; + // TileSizeDimN: determines the tile size for the n dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; + // LoadPerThreadLhs: determines the workload per thread for loading the Lhs Tensor. This must be divisible by the packet size + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); + // LoadPerThreadRhs: determines the workload per thread for loading the Rhs Tensor. This must be divisible by the packet size + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = true; + // DoubleBuffer: determines if the double buffering technique should be used (this can be disabled via the + // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory) + static EIGEN_CONSTEXPR bool DoubleBuffer = +#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER + false; +#else + true; +#endif +}; + +/*! + * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to + * specialize the contraction algorithm based on device support for dedicated local memory. + */ +enum class contraction_type { local, no_local }; +/*! + * \brief data_source: an enum class determining the location of the data in a memory hierarchy (global, local, private). + */ +enum class data_source { global_mem, local_mem, private_mem }; + +/*! + * \brief read, a template function used for loading the data from global + memory. This function is used to guarantee a coalesced and vectorized load whenever possible. + * + * \tparam PacketLoad: determines if each element of this tensor block should be loaded in packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in memory can be accessed coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex is the contracting dim index + * + * \param ld: is the leading dimension of the flattened tensor + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { + const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; + const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; + return tensorMapper.get_tensor().template packet(row + (col * ld)); +} + +/*! + * \brief read, special overload of read function, when the read access is not vectorized + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex: is the contracting dim index + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { + const StorageIndex row = (IsRhs) ? CIndex : NCIndex; + const StorageIndex col = (IsRhs) ? NCIndex : CIndex; + return tensorMapper(row, col); +} + +/*! + * \brief write, a template function used for storing the data to local memory. This function is used to guarantee + * coalesced and vectorized store whenever possible. + * + * \tparam StorageIndex: determines the Index type + * + * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + * + * \param CIndex is the contracting dim index + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if
::type + write(PacketType &packet_data, DataScalar ptr) { + EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits::size; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; i++) { + *ptr = PacketWrapper::scalarize(i, packet_data); + ptr += ld; + } +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function + * is used to guarantee coalesced and vectorized store whenever possible. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits::size != 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + ::Eigen::internal::pstoreu(ptr, packet_data); +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits::size == 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + *ptr = packet_data; +} + +/*! + * \brief check_boundary: is used to check the edge condition for non-internal blocks. + * + * \tparam is_internal: determines if the block is internal + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { + return true; +} + +/*! + * \brief check_boundary: specialization of the check_boundary for non-internal blocks. + * + * \param cond: true when the data is in range. Otherwise false + */ +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool cond) { + return cond; +} + +/*! + * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed + * by each workgroup. + * + * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed + * + * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode + * + * \tparam PacketType: determines the type of packet + * + * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be + * packetType; Otherwise it will be scalar Type + * + * \param elements_per_access determines the size of each element based on OutType + * + * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and + * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. 
+ * + * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + * + * \param c_stride determines the stride of contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + */ +template +struct BlockProperties { + static EIGEN_CONSTEXPR bool packet_load = packet_load_; + typedef typename Eigen::internal::unpacket_traits::type OutScalar; + static EIGEN_CONSTEXPR bool is_rhs = is_rhs_; + typedef typename Eigen::internal::conditional::type OutType; + static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits::size; + static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs); + static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1); + static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access); +}; + +/*! + * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see + * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup, + * work-items + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \param linearLocalThreadId: determines the linearized location of a thread within a work-group + * + * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when + * tall/skinny algorithm is used + * + * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of + * the flattened tensor. + * + * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the + * flattened tensor. It will be > 1 when tall/skinny algorithm is used. + * + * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a + * flattened tensor + * + * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a + * flattened tensor + * + * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the + * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used. + * + * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or + * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be + * resolve by compiler. 
+ */ +template +struct ThreadProperties { + const StorageIndex linearLocalThreadId; + const StorageIndex kGroupId; + const StorageIndex mGroupOffset; + const StorageIndex nGroupOffset; + const StorageIndex kGroupOffset; + const StorageIndex mLocalOffset; + const StorageIndex nLocalOffset; + const StorageIndex mGlobalOffset; + const StorageIndex nGlobalOffset; + StorageIndex kSize; + const bool is_internal; + // this is used to adjust the last block + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties( + const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_, + const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_, + const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_, + StorageIndex kSize_, const bool is_internal_) + : linearLocalThreadId(linearLocalThreadId_), + kGroupId(kGroupId_), + mGroupOffset(mGroupOffset_), + nGroupOffset(nGroupOffset_), + kGroupOffset(kGroupOffset_), + mLocalOffset(mLocalOffset_), + nLocalOffset(nLocalOffset_), + mGlobalOffset(mGlobalOffset_), + nGlobalOffset(nGlobalOffset_), + kSize(kSize_), + is_internal(is_internal_) {} +}; + +/*! + * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory + access is used to guarantee that always the memory access are coalesced. + * + * \tptaram IsFinal : determine if this is the final kernel. If so, the result will be written in a final output. + Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny + contraction is used. So in this case, a final reduction step is required to compute final output. 
+ + * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of + the algorithm to be used + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param groupSizeM: a logical number determining the number of work-group for m dimension + * + * \param groupSizeN: a logical number determining the number of work-group for n dimension + * + * \param numTiles: determines total number of tiles on the k dimension + * + * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + */ +template +class TensorContractionKernel { + public: + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + static EIGEN_CONSTEXPR bool is_lhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + + typedef BlockProperties + LHSBlockProperties; + + typedef BlockProperties + RHSBlockProperties; + + static EIGEN_CONSTEXPR StorageIndex NStride = + contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; + + typedef cl::sycl::accessor Scratch; + typedef cl::sycl::multi_ptr local_ptr; + typedef OutScalar * /*cl::sycl::multi_ptr*/ private_ptr; + typedef + typename ::Eigen::internal::conditional::type + tile_ptr; + static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local + ? Properties::TileSizeDimM + Properties::BC + : Properties::WorkLoadPerThreadM; + static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local + ? Properties::TileSizeDimN + Properties::BC + : Properties::WorkLoadPerThreadN; + static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + + /** + * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not + * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to + * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting + * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out + * different type of memory needed when local/no_local memory computation is called. + * + * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation + of the algorithm to be used + * \tparam the private memory size + * \param ptr the tile memory pointer type + */ + template + struct MemHolder { + tile_ptr ptr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} + }; + /** + * \brief specialization of memHolder class when no local memory kernel is used. 
+ */ + template + struct MemHolder { + OutScalar ptr[MemSize] = {OutScalar{0}}; + }; + /** + * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from + * global memory to local/private memory when local/no_local algorithm used. + * + * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the + * same as lhs_scratch_extract for private memory. + * + * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the + * same as rhs_scratch_extract for private memory. + */ + struct TiledMemory { + MemHolder lhs_scratch_extract; + MemHolder rhs_scratch_extract; + tile_ptr lhs_scratch_ptr_compute; + tile_ptr rhs_scratch_ptr_compute; + const std::pair lhs_extract_index; + const std::pair rhs_extract_index; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties &, local_ptr, + typename ::Eigen::internal::enable_if::type * = 0) + : lhs_scratch_extract{}, + rhs_scratch_extract{}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), + lhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})), + rhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties &thread_properties, local_ptr block_start_ptr, + typename ::Eigen::internal::enable_if::type * = 0) + : lhs_scratch_extract{block_start_ptr}, + rhs_scratch_extract{lhs_scratch_extract.ptr + + ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), + lhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)), + rhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)) {} + }; + + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex groupSizeM; + const StorageIndex groupSizeN; + const StorageIndex numTiles; + const TripleDim triple_dim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex groupSizeN_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : scratch(scratch_), + lhs(lhs_), + rhs(rhs_), + out_res(out_res_), + groupSizeM(groupSizeM_), + groupSizeN(groupSizeN_), + numTiles(numTiles_), + triple_dim(triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + 
const StorageIndex groupSizeM_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; + const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; + const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM; + const StorageIndex tmp = itemID.get_group(0) / groupSizeM; + const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN; + const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN; + const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM; + const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN; + const StorageIndex mLocalOffset = PacketSize * mLocalThreadId; + const StorageIndex nLocalOffset = NStride * nLocalThreadId; + const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset; + const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset; + + const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK; + StorageIndex kGroupOffset = kGroupId * kSizePerWG; + const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM && + triple_dim.N - nGroupOffset >= Properties::TileSizeDimN && + triple_dim.K - kGroupOffset >= kSizePerWG; + // this is used to adjust the last block + StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset); + // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to + // tile + kGroupOffset += kSize; + + auto thread_properties = + ThreadProperties(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset, + mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal); + + auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); + + (thread_properties.is_internal) ? compute_panel(itemID, thread_properties, out_ptr) + : compute_panel(itemID, thread_properties, out_ptr); + } + // The compute block computes the contraction operation private block for each thread and store the resutl in the + // privateRes memory of Each computation the compute block function is independent of local and no local concepts as + // it only compute the block on each thread's private memory space + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, + PacketReturnType *privateRes) { + StorageIndex idx = 0; + EIGEN_CONSTEXPR StorageIndex lhs_stride = + contraction_tp == contraction_type::local ? 
(PacketSize * Properties::LocalThreadSizeM) : 1; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { + auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; + StorageIndex lhs_index = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType lhsPack{}; + Eigen::TensorSycl::internal::PacketWrapper::set_packet(lhsPack, + lhs_block_ptr + lhs_index); + privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); + + lhs_index += lhs_stride; + idx++; + } + } + } + // The store function writes the contraction result held in the private memory of each thread to the global + // memory. The store function is independent of the local and no-local concepts so that it can be abstracted out in the + // base class. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) { + auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { + return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); + }; + // when local memory is not used, M and N are both accessed in a coalesced way. However, when local memory is + // available, the k*N tile is transposed in local memory to N*K; therefore, each block operates on a blockId* + // WorkLoadPerThreadN slice of N + EIGEN_CONSTEXPR StorageIndex GlobalNStride = + contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { + // output leading dimension + StorageIndex outputLD = 0; + // When local memory is used, PrivateNStride is always 1 because the coalesced access on N is loaded into local + // memory and extracting from local to global is the same as the non-transposed version. However, when local memory + // is not used and RHS is transposed, we packetize the load for RHS. + EIGEN_UNROLL_LOOP + for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { + StorageIndex globalRow = mGlobalOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType privetOut = privateRes[wLPTM]; + if (check_boundary(chk_bound(globalRow, nId))) { + // Store the final results in C.
The C matrix has always M as a first StorageIndex and N as a second + // StorageIndex Therefore it is always coalesced layout + write(privetOut, out_ptr + outputLD + globalRow); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex mId = 0; mId < PacketSize; mId++) { + StorageIndex mOffset = globalRow + mId; + if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { + out_ptr[mOffset + outputLD] = + Eigen::TensorSycl::internal::PacketWrapper::scalarize(mId, privetOut); + } + } + } + globalRow += (PacketSize * Properties::LocalThreadSizeM); + } + outputLD += triple_dim.M; + privateRes += Properties::WorkLoadPerThreadM / PacketSize; + } + out_ptr += (GlobalNStride * outputLD); + + nGlobalOffset += (PrivateNStride * GlobalNStride); + } + } + // when no local memory is used the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair &, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = + InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; + EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = + InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + StorageIndex cIndex = cOffset; + + EIGEN_UNROLL_LOOP + for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { + StorageIndex ncIndex = ncOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { + if (check_boundary(chk_bound(cIndex, ncIndex))) { + auto val = + read(inpt, ncIndex, cIndex, ld); + + write(val, private_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (ncInd < NC && cInd < triple_dim.K) + ? read( + inpt, ncInd, cInd, ld) + : OutScalar(0); + write( + val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); + } + } + + // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So + // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread. + ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) + ? 
ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC + : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC); + private_ptr += InputBlockProperties::nc_stride; + } + // the previous for loop (private_ptr += (ncId * nc_stride)) has already advanced the pointer by one WorkLoadPerThreadNC + private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC; + cIndex += InputBlockProperties::c_stride; + } + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair local_id_extract( + const StorageIndex &linearLocalThreadId) { + const StorageIndex localThreadNC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + const StorageIndex localThreadC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + return std::pair(localThreadNC, localThreadC); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept { + db_offset = !db_offset; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept { + return; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_thread(const cl::sycl::nd_item<1> & +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID +#endif + ) noexcept { +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID.barrier(cl::sycl::access::fence_space::local_space); +#else + return; +#endif + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_thread(const cl::sycl::nd_item<1> &itemID) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type sync_thread( + const cl::sycl::nd_item<1> &) { + return; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + TiledMemory &tiled_input_block, + PacketReturnType *privateRes, bool &db_offset) { + // Tiling the Rhs block from global to local memory + extract_block( + rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), + tiled_input_block.rhs_extract_index, + contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + sync_thread(itemID); + + // Tiling the Lhs block from global to local memory + extract_block( + lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK), + tiled_input_block.lhs_extract_index, + contraction_tp == contraction_type::local ?
thread_properties.mGroupOffset : thread_properties.mGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + // itemID.barrier(cl::sycl::access::fence_space::local_space); + sync_thread(itemID); + // switch to compute mede + StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); + StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); + // Loop over the values of a single tile + for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { + compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, + tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); + lhs_offset += LSDL; + rhs_offset += LSDR; + } + // computing the K index for the next tile + thread_properties.kSize -= Properties::TileSizeDimK; + sync_mem(itemID, db_offset); + } + + // when local memory is available the following compute_panel will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + OutPtr out_ptr) { + auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; + // Allocate register space + PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { + PacketReturnType{0}}; + bool db_offset = 0; + + while (thread_properties.kSize >= Properties::TileSizeDimK) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + if (thread_properties.kSize > 0) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + + // Storing the final results in the output + store(1) : RHSBlockProperties::nc_stride>( + out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, + thread_properties.nGlobalOffset); + } + // When local memory is available the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + extract_block(const Input &inpt, Local local_ptr, const std::pair& local_index, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = + InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; + EIGEN_CONSTEXPR StorageIndex LoadPerThread = + InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; + EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; + static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && + (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), + " LocalOffset must be divisable by stride"); + const StorageIndex &NC = InputBlockProperties::is_rhs ? 
triple_dim.N : triple_dim.M; + StorageIndex localThreadNC = local_index.first; + StorageIndex localThreadC = local_index.second; + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + EIGEN_UNROLL_LOOP + for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { + const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); + const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + if (check_boundary(chk_bound(CIndex, NCIndex))) { + auto val = + read(inpt, NCIndex, CIndex, ld); + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::c_stride * localThreadC * LSD)); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (nCInd < NC && cInd < triple_dim.K) + ? read( + inpt, nCInd, cInd, ld) + : OutScalar(0); + + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::c_stride * localThreadC + + (InputBlockProperties::is_coalesced_layout ? 0 : i)) * + LSD)); + } + } + localThreadNC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + localThreadC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + } + } +}; + +#ifndef EIGEN_SYCL_DISABLE_GEMV + +/*! + * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special + * case of Tensor Tensor contraction. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs) + * + * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs) + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam KFactor: determines the number of elements in K dimension in a Tile + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector + * + * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output. + * Otherwise, the result of contraction will be written iin a temporary buffer. 
+ * + * \param scratch: determines the local memory containing the vector block for each work-group + * + * \param vec: determines the vector input (tensor mapper) + * + * \param mat: determines the tensor input (tensor mapper) + * + * \param out_res: determines the output vector containing the contraction result + * + * \param nonContractGroupSize: a logical number determining the number of work-groups for the non-contracting dimension + * + * \param nonContractDim: determines the size of the non-contracting dimension for the flattened tensor + * + * \param contractDim: determines the size of the contracting dimension for the flattened tensor + * + */ +template +struct GeneralVectorTensor { + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + typedef cl::sycl::accessor Scratch; + + static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = + KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + + // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make + // sure that the !^ is true. When RHS is a vector, we pass true and true to make sure that the !^ is true. + typedef BlockProperties + VecBlockProperties; + + Scratch scratch; + const VectorMapper vec; + const TensorMapper mat; + OutAccessor out_res; + const StorageIndex nonContractGroupSize; + const StorageIndex nonContractDim; + const StorageIndex contractDim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, + const TensorMapper mat_, OutAccessor out_res_, + const StorageIndex nonContractGroupSize_, + const StorageIndex nonContractDim_, + const StorageIndex contractDim_) + : scratch(scratch_), + vec(vec_), + mat(mat_), + out_res(out_res_), + nonContractGroupSize(nonContractGroupSize_), + nonContractDim(nonContractDim_), + contractDim(contractDim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto scratch_ptr = scratch.get_pointer(); + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC + : linearLocalThreadId % Properties::LocalThreadSizeNC; + StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC + : linearLocalThreadId / Properties::LocalThreadSizeNC; + const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; + const StorageIndex nonContractGroupId = + is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; + const StorageIndex contractGroupId = + is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; + auto out_ptr = out_res.get_pointer() + (IsFinal ?
0 : contractGroupId * nonContractDim); + + const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; + const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; + auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; + const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; + auto local_output = scratch_ptr + OutScratchOffset; + const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && + contractDim - contractGroupOffset >= Properties::TileSizeDimC; + is_internal + ? compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) + : compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( + const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, + OutPtr out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, +#endif + const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, + StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, + StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) { + OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; + // Reading the vector +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; + extract_block(vec, scratch_ptr, linearLocalThreadId, + vectorOffset, contractDim); + + itemID.barrier(cl::sycl::access::fence_space::local_space); + auto in_scratch_ptr = scratch_ptr + contractId; +#endif + + StorageIndex privateOffsetC = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { + StorageIndex privateOffsetNC = 0; + bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + auto vecScalar = *in_scratch_ptr; +#else + auto vecScalar = (check_boundary(contract_conds)) + ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, + is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) + : OutScalar(0); +#endif + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + auto matScalar = (check_boundary( + contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) + ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC + : globalNonContractDimOffset + privateOffsetNC, + is_lhs_vec ? 
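// The operator() of GeneralVectorTensor above splits the 1-D local id into a
// (non-contracting, contracting) coordinate pair, and which dimension varies
// fastest depends on whether the vector is the LHS or the RHS operand. A
// standalone sketch of that index math (hypothetical names, illustration only):
#include <cstddef>
#include <utility>

std::pair<std::size_t, std::size_t> split_local_id_sketch(std::size_t linear_id, bool is_lhs_vec,
                                                          std::size_t local_threads_c,
                                                          std::size_t local_threads_nc) {
  const std::size_t non_contract_id =
      is_lhs_vec ? linear_id / local_threads_c : linear_id % local_threads_nc;
  const std::size_t contract_id =
      is_lhs_vec ? linear_id % local_threads_c : linear_id / local_threads_nc;
  return {non_contract_id, contract_id};  // the same split is applied to the group id
}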
globalNonContractDimOffset + privateOffsetNC + : globalContractDimOffset + privateOffsetC) + : OutScalar(0); + + outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]); + privateOffsetNC += Properties::LocalThreadSizeNC; + } + privateOffsetC += Properties::LocalThreadSizeC; +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + in_scratch_ptr += Properties::LocalThreadSizeC; +#endif + } + + auto out_scratch_ptr = local_output + outScratchIndex; + // Each block of 16*16 element in shared memory should reduce to 16*1 + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + *out_scratch_ptr = outScalar[j]; + + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + if (is_lhs_vec) { + nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC; + contractId = linearLocalThreadId / Properties::LocalThreadSizeNC; + outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + } + + out_scratch_ptr = local_output + outScratchIndex; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + EIGEN_UNROLL_LOOP + for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (contractId < offset) { + StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset); + *out_scratch_ptr += out_scratch_ptr[myNeigbourId]; + } + } + // moving to next 16 by 16 block + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + + if (contractId == 0) { + out_scratch_ptr = local_output + nonContractId; + StorageIndex global_final_offset = nonContractGroupOffset + nonContractId; + out_ptr += global_final_offset; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + if (check_boundary(global_final_offset < nonContractDim)) { + auto res = *out_scratch_ptr; + + *out_ptr = res; + out_ptr += Properties::LocalThreadSizeNC; + } + // moving to next 16 by 16 block to ge the next 16 reduced elements + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC; + } + } + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, + const StorageIndex &linearLocalThreadId, + const StorageIndex &cOffset, const StorageIndex &C) { + local_ptr += InputBlockProperties::c_stride * linearLocalThreadId; + StorageIndex cIndex = cOffset; + for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) { + if (check_boundary(cIndex + InputBlockProperties::c_stride - 1 < C)) { + auto val = read(inpt, StorageIndex(0), + cIndex, StorageIndex(1)); + write(val, local_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + OutScalar val = + (cIndex + i < C) + ? read( + inpt, StorageIndex(0), cIndex + i, StorageIndex(1)) + : OutScalar(0); + write(val, local_ptr + i); + } + } + local_ptr += InputBlockProperties::c_stride * GroupSize; + cIndex += InputBlockProperties::c_stride * GroupSize; + } + } +}; +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + +/*! + * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction + * operation, when all the dimensions are contracting dimensions. 
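// The offset-halving loops above are the usual barrier-synchronised tree
// reduction over local memory: after log2(LocalThreadSizeC) steps, lane 0 of
// each column holds the reduced value. A serial sketch of one work-group's
// reduction (illustration only; the LocalThreadSizeNC interleaving of the real
// scratch layout is ignored and the size is assumed to be a power of two):
#include <cstddef>
#include <vector>

float tree_reduce_sketch(std::vector<float> partial) {
  for (std::size_t offset = partial.size() >> 1; offset > 0; offset >>= 1) {
    // itemID.barrier(...) would go here in the kernel; serially nothing is needed.
    for (std::size_t tid = 0; tid < offset; ++tid) {
      partial[tid] += partial[tid + offset];
    }
  }
  return partial.empty() ? 0.f : partial[0];
}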
This Kernel reduces two tensors to a scalar + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param rng: determines the total input data size + */ +template +struct GeneralScalarContraction { + typedef cl::sycl::accessor Scratch; + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex rng; + + EIGEN_DEVICE_FUNC + GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex rng_) + : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} + + EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) { + auto out_ptr = out_res.get_pointer(); + auto scratch_ptr = scratch.get_pointer().get(); + + StorageIndex globalid = itemID.get_global_id(0); + StorageIndex localid = itemID.get_local_id(0); + OutScalar accumulator = OutScalar(0); + for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) { + accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator); + } + auto out_scratch_ptr = scratch_ptr + localid; + *out_scratch_ptr = accumulator; + for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]); + } + } + if (localid == 0) { + out_ptr[itemID.get_group(0)] = accumulator; + } + } +}; +#endif + +} // namespace internal +} // namespace TensorSycl + +template +struct TensorEvaluator, + Eigen::SyclDevice> + : public TensorContractionEvaluatorBase, Eigen::SyclDevice>> { static_assert(std::is_same::value, "SYCL tensor contraction does not support output kernels."); - typedef const Eigen::SyclDevice Device; + typedef Eigen::SyclDevice Device; typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; + typedef typename XprType::Index StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - + typedef typename Base::Storage Storage; + typedef typename Base::EvaluatorPointerType EvaluatorPointerType; + struct TripleDim { + const StorageIndex M; + const StorageIndex N; + const StorageIndex K; + TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) :
M(M_), N(N_), K(K_) {} + }; enum { Layout = TensorEvaluator::Layout, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, }; - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + static EIGEN_CONSTEXPR int LDims = Base::LDims; + static EIGEN_CONSTEXPR int RDims = Base::RDims; + static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims; - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; static const int NumDims = LDims + RDims - 2 * ContractDims; - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + typedef DSizes Dimensions; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + typedef typename Eigen::internal::remove_const::type LhsScalar; + typedef typename Eigen::internal::remove_const::type RhsScalar; typedef typename LeftEvaluator::Dimensions LeftDimensions; typedef typename RightEvaluator::Dimensions RightDimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} + template + struct input_mapper_propertis { + static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_matrix = + (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {} // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { this->m_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; + if (!data) { + this->m_result = this->m_device.get( + static_cast(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); + data = this->m_result; } + evalToSycl(data); + return (this->m_result != NULL); } - const Eigen::SyclDevice& device() const {return this->m_device;} - void evalTo(Scalar* buffer) const { - // Here is the result + const Eigen::SyclDevice &device() const { return this->m_device; 
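// GeneralScalarContraction above combines a grid-stride accumulation loop with
// the same tree reduction. A serial sketch of the grid-stride part for one
// work-item (illustration only; hypothetical names, lhs/rhs flattened to 1-D):
#include <cstddef>
#include <vector>

float scalar_contraction_sketch(const std::vector<float> &lhs, const std::vector<float> &rhs,
                                std::size_t global_id, std::size_t global_range) {
  float acc = 0.f;
  // Each work-item strides over the whole range, so a fixed launch size covers any rng.
  for (std::size_t i = global_id; i < lhs.size() && i < rhs.size(); i += global_range) {
    acc += lhs[i] * rhs[i];  // cl::sycl::mad(lhs(0, i), rhs(i, 0), acc) in the kernel
  }
  return acc;  // per-work-item partial, reduced in local memory afterwards
}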
} + void evalToSycl(typename Base::EvaluatorPointerType buffer) const { if (this->m_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { evalTyped(buffer); - } - else { + } else { evalTyped(buffer); } - } - else { - if (this->m_rhs_inner_dim_reordered) { + } else { + if (this->m_rhs_inner_dim_reordered) { evalTyped(buffer); - } - else { + } else { evalTyped(buffer); } } - } - else { + } else { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { evalTyped(buffer); - } - else { + } else { evalTyped(buffer); } - } - else { - if (this->m_rhs_inner_dim_reordered) { + } else { + if (this->m_rhs_inner_dim_reordered) { evalTyped(buffer); - } - else { + } else { evalTyped(buffer); } } @@ -138,267 +1388,263 @@ struct TensorEvaluator - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - // rows in left side - const Index m = this->m_i_size; - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - LaunchSyclKernels::Run(*this, buffer, m, n, k, - this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, - this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); + void evalTyped(typename Base::EvaluatorPointerType buffer) const { + const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; + typedef internal::TensorContractionInputMapper< + LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, + PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer> + LhsMapper; + + typedef internal::TensorContractionInputMapper::size, rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer> + RhsMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + if (triple_dim.M == 1 && triple_dim.N == 1) { + launchSC(buffer, lhs, rhs, triple_dim.K); + } else +#endif +#ifndef EIGEN_SYCL_DISABLE_GEMV + if (triple_dim.M != 1 && triple_dim.N == 1) { + LaunchVT(buffer, rhs, lhs, triple_dim.M, triple_dim.K); + } else if (triple_dim.M == 1 && triple_dim.N != 1) { + LaunchVT(buffer, lhs, rhs, triple_dim.N, triple_dim.K); + } else // This is equivalent of if (m!=1 && n!=1) +#endif + { + typedef input_mapper_propertis + inpt_mapper_properties; +#ifndef EIGEN_SYCL_DISABLE_SKINNY + bool skinny = false; + auto platform_name = this->device().getPlatformName(); + // This is based on empirical calculation for AMD r9-nano and Fiji + if (platform_name.find("AMD") == 0) { + skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && + ((triple_dim.M < 1024 && triple_dim.N < 1024) || + (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); + } else { + skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || + ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) || + ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, 
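// evalToSycl above turns three runtime flags into compile-time template
// parameters by enumerating all eight combinations, so evalTyped can specialise
// the input mappers. A minimal sketch of the same dispatch pattern with a
// hypothetical stub in place of evalTyped (illustration only):
#include <cstdio>

template <bool lhs_contig, bool rhs_contig, bool rhs_reordered>
void eval_typed_stub() {
  std::printf("dispatch: %d %d %d\n", lhs_contig, rhs_contig, rhs_reordered);
}

void eval_dispatch_sketch(bool lhs_contig, bool rhs_contig, bool rhs_reordered) {
  if (lhs_contig) {
    if (rhs_contig) {
      if (rhs_reordered) eval_typed_stub<true, true, true>();
      else eval_typed_stub<true, true, false>();
    } else {
      if (rhs_reordered) eval_typed_stub<true, false, true>();
      else eval_typed_stub<true, false, false>();
    }
  } else {
    if (rhs_contig) {
      if (rhs_reordered) eval_typed_stub<false, true, true>();
      else eval_typed_stub<false, true, false>();
    } else {
      if (rhs_reordered) eval_typed_stub<false, false, true>();
      else eval_typed_stub<false, false, false>();
    }
  }
}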
triple_dim.M)) > 100)); + } + if (skinny) + adjustTT(buffer, lhs, rhs, triple_dim); + else +#endif // EIGEN_SYCL_DISABLE_SKINNY + adjustTT(buffer, lhs, rhs, triple_dim); + } } - // required by sycl to construct the expr on the device. Returns original left_impl - const TensorEvaluator& left_impl() const { - return choose(Cond(Layout) == static_cast(ColMajor)>(), this->m_leftImpl, this->m_rightImpl); + + template + void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + if (device().has_local_memory()) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF + if (!(device().has_local_memory())) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif } - // required by sycl to construct the expr on the device. Returns original right_impl - const TensorEvaluator& right_impl() const { - return choose(Cond(Layout) == static_cast(ColMajor)>(), this->m_rightImpl, this->m_leftImpl); + + template + void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { + const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); + const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); + const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; + const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; + + const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); + StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; + StorageIndex groupSizeK = + skinny + ? std::max(std::min(totalTilesK, + (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / + (groupSizeM * groupSizeN)), + StorageIndex(1)) + : StorageIndex(1); + + const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; + + const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; + + const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + const StorageIndex globalRange = totalGroupSize * localRange; + + const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) + ? 
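// evalTyped above picks one of three kernels purely from the contraction shape,
// plus a "skinny" heuristic for the general case. A compact sketch of that
// decision logic (illustration only; only the non-AMD branch of the skinny test
// is shown, and the enum/function names are hypothetical):
#include <algorithm>
#include <cstdint>

enum class ContractionKernelKind { Scalar, TensorVector, GeneralTT };

ContractionKernelKind pick_kernel_sketch(std::int64_t M, std::int64_t N) {
  if (M == 1 && N == 1) return ContractionKernelKind::Scalar;        // launchSC
  if (M == 1 || N == 1) return ContractionKernelKind::TensorVector;  // LaunchVT
  return ContractionKernelKind::GeneralTT;                           // adjustTT
}

// Any pair of dimensions differing by more than two orders of magnitude marks
// the problem as skinny, which enables splitting K across work-groups.
bool is_skinny_sketch(std::int64_t M, std::int64_t N, std::int64_t K) {
  auto ratio = [](std::int64_t a, std::int64_t b) { return std::max(a, b) / std::min(a, b); };
  return ratio(K, N) > 100 || ratio(K, M) > 100 || ratio(N, M) > 100;
}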
((Properties::DoubleBuffer + 1) * + (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + + ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * + (Properties::TileSizeDimN + Properties::BC)) + : StorageIndex(1); + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (groupSizeK == 1) { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + device().template binary_kernel_launcher( + lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim); + } else { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + CoeffReturnType *temp_pointer = static_cast( + device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device().template binary_kernel_launcher( + lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, + triple_dim); + + typedef Eigen::internal::SumReducer Op; + auto op = Op(); + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( + Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), + cl::sycl::range<1>(localRange)), + StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK); + + device().deallocate_temp(temp_pointer); + } } -}; -template struct KernelConstructor{ - typedef typename Eigen::internal::traits::_LhsNested LHSHostExpr; - typedef typename Eigen::internal::traits::_RhsNested RHSHostExpr; - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type LHSPlaceHolderExpr; - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type RHSPlaceHolderExpr; - LHSFunctorExpr lhs_functors; - RHSFunctorExpr rhs_functors; - LhsLocalAcc localLhs; - RhsLocalAcc localRhs; - OutAccessor out_res; - size_t out_offset; - Index roundUpK, M, N, K; - ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; - LeftNocontractT m_i_strides, m_left_nocontract_strides; - RightNocontractT m_j_strides, m_right_nocontract_strides; - LHSTupleType left_tuple_of_accessors; - RHSTupleType right_tuple_of_accessors; - Device dev; - - - KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_, - Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, - ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, - LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) - :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), - out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), - m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), - m_right_contracting_strides(m_right_contracting_strides_), - m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), - m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), - 
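// The launch configuration computed in launchTT above boils down to one
// round-up helper plus group/tile counting. A sketch of that arithmetic for the
// local-memory path (illustration only; max_k_groups stands in for the
// device-derived bound used above, and all names here are hypothetical):
#include <algorithm>
#include <cstdint>

std::int64_t round_up_sketch(std::int64_t x, std::int64_t y) { return ((x + y - 1) / y) * y; }

struct TTConfigSketch {
  std::int64_t groupSizeM, groupSizeN, groupSizeK, numTilesPerGroup, localRange, globalRange;
};

TTConfigSketch tt_config_sketch(std::int64_t M, std::int64_t N, std::int64_t K,
                                std::int64_t tileM, std::int64_t tileN, std::int64_t tileK,
                                std::int64_t localM, std::int64_t localN, bool skinny,
                                std::int64_t max_k_groups) {
  TTConfigSketch c{};
  c.groupSizeM = round_up_sketch(M, tileM) / tileM;
  c.groupSizeN = round_up_sketch(N, tileN) / tileN;
  const std::int64_t totalTilesK = round_up_sketch(K, tileK) / tileK;
  // Only the skinny path splits K over several work-groups; the partial outputs
  // are then summed by a second, partial-reduction kernel.
  c.groupSizeK = skinny ? std::max<std::int64_t>(std::min(totalTilesK, max_k_groups), 1) : 1;
  c.numTilesPerGroup = round_up_sketch(totalTilesK, c.groupSizeK) / c.groupSizeK;
  c.localRange = localM * localN;
  c.globalRange = c.groupSizeM * c.groupSizeN * c.groupSizeK * c.localRange;
  return c;
}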
left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){} - - void operator()(cl::sycl::nd_item<2> itemID) { - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type LHSDevExpr; - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type RHSDevExpr; - auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression(lhs_functors, left_tuple_of_accessors); - auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression(rhs_functors, right_tuple_of_accessors); - typedef decltype(lhs_dev_expr.expr) LeftArgType; - typedef decltype(rhs_dev_expr.expr) RightArgType; - typedef typename internal::conditional(Eigen::internal::traits::Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional(Eigen::internal::traits::Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - // initialize data mappers must happen inside the kernel for device eval - LhsMapper lhs(LeftEvaluator(choose(Cond(Eigen::internal::traits::Layout) == static_cast(ColMajor)>(), - lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); - RhsMapper rhs(RightEvaluator(choose(Cond(Eigen::internal::traits::Layout) == static_cast(ColMajor)>(), - rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); - auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); - // Matmul Kernel - // Thread identifiers - const Index mLocalThreadId = itemID.get_local(0); // Local ID row - const Index nLocalThreadId = itemID.get_local(1); // Local ID col - const Index mGroupId = itemID.get_group(0); // Work-group ID row - const Index nGroupId = itemID.get_group(1); // Work-group ID localCol - const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID - // Allocate register space - LhsScalar privateLhs; - RhsScalar privateRhs[WorkLoadPerThreadN]; - OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; - // Initialise the privateResumulation registers - for (Index wLPTM=0; wLPTM(0); - } - } +#ifndef EIGEN_SYCL_DISABLE_GEMV + template + void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, + StorageIndex NC, StorageIndex C) const { + const StorageIndex nonContractDim = NC; + EIGEN_CONSTEXPR StorageIndex NCFactor = 1; + EIGEN_CONSTEXPR StorageIndex CFactor = 1; + EIGEN_CONSTEXPR StorageIndex NCWindow = 16; + typedef Eigen::TensorSycl::internal::TVPanelSize + Properties; + const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); + const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); + const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); + const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); + const StorageIndex globalRange = + (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / 
(Properties::WorkLoadPerThreadC)); + const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; + const StorageIndex scratchSize = + (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (cNumGroups > 1) { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); - // Tile Lhs - for (Index lPTL=0; lPTL(0); - } - // Tile Rhs - for (Index lPTR=0; lPTR(0); + device().template binary_kernel_launcher( + vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C); - } - // Loop over all tiles - const Index numTiles = roundUpK/TileSizeDimK; - Index firstHalf=0; - do { - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - // Load the next tile of Lhs and Rhs into local memory - Index nextHalf = firstHalf + 1; - if (nextHalf < numTiles) { - // Tile A - for (Index lPTL=0; lPTL(0); - } - // Tile B - for (Index lPTR=0; lPTR(0); - } - } - // Loop over the values of a single tile - for (Index k=0; k Op; + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), + cl::sycl::range<1>(localRange)), + StorageIndex(1), Op(), nonContractDim, cNumGroups); + + device().deallocate_temp(temp_pointer); + } else { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + device().template binary_kernel_launcher( + vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C); } + } +#endif -}; -template struct LaunchSyclKernels { - -static const Index TileSizeDimM = 32ul; // Tile size for dimension M -static const Index TileSizeDimN = 32ul; // Tile size for dimension N -static const Index TileSizeDimK = 16ul; // Tile size for dimension K -static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M -static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N -static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) -static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) -static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression -static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression - -// RoundUp function to make sure that the global threadId is divisable by local threadId -static Index RoundUp(Index x, Index y) { - return ((((x) + (y) - 1) / (y))*(y)); -} +#ifndef EIGEN_SYCL_DISABLE_SCALAR + template + EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + StorageIndex K) const { + EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power 
of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; -template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT> - static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, - ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, - LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ - - typedef typename Self::XprType HostExpr; - typedef typename Eigen::internal::traits::_LhsNested LHSHostExpr; - typedef typename Eigen::internal::traits::_RhsNested RHSHostExpr; - typedef TensorEvaluator OrigLHSExpr; - typedef TensorEvaluator OrigRHSExpr; - typedef Eigen::TensorSycl::internal::FunctorExtractor LHSFunctorExpr; - typedef Eigen::TensorSycl::internal::FunctorExtractor RHSFunctorExpr; - // extract lhs functor list - LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); - // extract rhs functor list - RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl()); - - Index roundUpK = RoundUp(K, TileSizeDimK); - Index roundUpM = RoundUp(M, TileSizeDimM); - Index roundUpN = RoundUp(N, TileSizeDimN); - ptrdiff_t out_offset = self.device().get_offset(buffer); - self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { - /// work-around for gcc bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.left_impl())) LHSTupleType; - /// work-around for gcc bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.right_impl())) RHSTupleType; - // create lhs tuple of accessors - LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.left_impl()); - // create rhs tuple of accessors - RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.right_impl()); - - // Local memory for elements of Lhs - typedef cl::sycl::accessor LhsLocalAcc; - LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); - // Local memory for elements of Rhs - typedef cl::sycl::accessor RhsLocalAcc; - RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - - typedef cl::sycl::accessor OutAccessor; - //OutScalar memory - OutAccessor out_res= self.device(). template get_sycl_accessor(cgh, buffer); - // sycl parallel for - cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), - cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), - KernelConstructor(lhs_functors, rhs_functors, - localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice())); - }); - self.device().asynchronousExec(); + // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread + // reduces at least 512 elementss individually, we get better performance. + const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? 
local_range : 1); + const StorageIndex global_range = num_work_group * local_range; + + typedef Eigen::TensorSycl::internal::GeneralScalarContraction< + CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false> + ContractKernelName; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + device().template binary_kernel_launcher(lhs, rhs, tmp_global_accessor, + thread_range, local_range, K); + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()); + + device().deallocate_temp(temp_pointer); + } else { + device().template binary_kernel_launcher(lhs, rhs, buffer, thread_range, + local_range, K); + } } -}; +#endif -} // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + this->m_leftImpl.cleanup(); + this->m_rightImpl.cleanup(); + + if (this->m_result) { + this->m_device.deallocate_temp(this->m_result); + this->m_result = NULL; + } + } + // The placeholder accessors must bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + this->m_leftImpl.bind(cgh); + this->m_rightImpl.bind(cgh); + this->m_result.bind(cgh); + } +}; +} // namespace Eigen +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 5c94165d1..0218727d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -18,207 +18,252 @@ namespace Eigen { /** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. - * - * - */ -template -struct EigenConvolutionKernel1D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; -internal::IndexMapper::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize, range_x, range_y; -Buffer_accessor buffer_acc; -ptrdiff_t out_offset; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel1D(internal::IndexMapper::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, - Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), - buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} - + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
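// launchSC above sizes the scalar-contraction launch so that the second
// (full-reduction) kernel is only used when each work-item would otherwise have
// to accumulate more than roughly 512 elements. A sketch of that heuristic
// (illustration only; 512 and the power-of-two local range mirror the code above):
#include <cstdint>

std::int64_t scalar_reduction_groups_sketch(std::int64_t K, std::int64_t local_range) {
  // local_range is assumed to be a power of two so the tree reduction is exact.
  const bool needs_two_steps = (K + 512 * local_range - 1) / (512 * local_range) > 1;
  return needs_two_steps ? local_range : 1;  // number of work-groups in the first step
}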
+ * + * + */ + +enum class convolution_type { CONV1D, CONV2D, CONV3D }; +template +struct EigenConvolutionKernel; +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const size_t kernelSize; + const cl::sycl::range<2> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const size_t kernelSize_, const cl::sycl::range<2> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernelSize(kernelSize_), + input_range(input_range_) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) { + return (boolean_check[0] && boolean_check[1]); + } void operator()(cl::sycl::nd_item<2> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); //the required row to be calculated for the for each plane in shered memory - const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input; - const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1)); + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + // the required row to be calculated for the for each plane in shered memory + const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1); + const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input; + const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0]; + const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1)); /// fill the shared memory - for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { - const size_t local_index = i + plane_kernel_offset ; - const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start); - if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){ - local_acc[local_index] = device_evaluator.coeff(tensor_index); - } - else local_acc[local_index]=0.0f; + for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) { + const size_t local_index = i + plane_kernel_offset; + const size_t tensor_index = + plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset); + + local_acc[local_index] = + (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1]) + ? 
device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); } itemID.barrier(cl::sycl::access::fence_space::local_space); - // calculate the convolution - const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x - if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){ + // calculate the convolution // output start x + const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]); + if (boundary_check(itemID.get_global_id() < input_range)) { CoeffReturnType result = static_cast(0); - const size_t index = plane_kernel_offset+ itemID.get_local(0); + const size_t index = plane_kernel_offset + itemID.get_local_id(0); for (size_t k = 0; k < kernelSize; ++k) { result += (local_acc[k + index] * kernel_ptr[k]); } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1)) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start); - buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start); + buffer_ptr[tensor_index] = result; } } }; - -template -struct EigenConvolutionKernel2D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; -internal::IndexMapper::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; -Buffer_accessor buffer_acc; -ptrdiff_t out_offset; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel2D(internal::IndexMapper::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, - Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), - buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<2> kernel_size; + const cl::sycl::range<3> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } void operator()(cl::sycl::nd_item<3> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - 
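// The CONV1D kernel above stages one tile of the input plus a (kernelSize - 1)
// halo into local memory, zero-pads out-of-range reads, and then lets each
// work-item produce one output element. A serial sketch of one work-group's
// worth of that logic (illustration only; hypothetical names, no SYCL, and
// output is assumed to be sized for a "valid" convolution):
#include <cstddef>
#include <vector>

void conv1d_tile_sketch(const std::vector<float> &input, const std::vector<float> &kernel,
                        std::size_t tile_start, std::size_t tile_size, std::vector<float> &output) {
  const std::size_t kernel_size = kernel.size();
  std::vector<float> local(tile_size + kernel_size - 1, 0.f);  // tile + halo, zero padded
  for (std::size_t i = 0; i < local.size(); ++i) {
    const std::size_t g = tile_start + i;
    if (g < input.size()) local[i] = input[g];
  }
  // (the kernel issues a local barrier here before the compute phase)
  for (std::size_t x = 0; x < tile_size && tile_start + x < output.size(); ++x) {
    float result = 0.f;
    for (std::size_t k = 0; k < kernel_size; ++k) result += local[x + k] * kernel[k];
    output[tile_start + x] = result;
  }
}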
auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory - const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2)); - const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input; - - /// fill the shared memory - const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; - for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) { - const size_t local_input_offset = num_x_input * (j + plane_kernel_offset); - for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + // the required row to be calculated for the for each plane in shered memory + const auto num_input = cl::sycl::range<2>{ + (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)}; + + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2)); + const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1]; + + const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + // fill the local memory + bool in_range_dim2 = itemID.get_global_id(2) < input_range[2]; + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset); + bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { const size_t local_index = i + local_input_offset; - const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start ); - if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){ - local_acc[local_index] = device_evaluator.coeff(tensor_index); - } - else local_acc[local_index]=0.0f; + const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1]); + local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && + in_range_dim1 && in_range_dim2) + ? 
device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } } - } itemID.barrier(cl::sycl::access::fence_space::local_space); - // calculate the convolution - const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x - const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y - if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){ + // output offset start for each thread + const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + if (boundary_check(itemID.get_global_id() < input_range)) { CoeffReturnType result = static_cast(0); - for (size_t j = 0; j < kernelSize_y; j++) { - size_t kernel_offset =kernelSize_x * j; - const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0); - for (size_t i = 0; i < kernelSize_x; i++) { - result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]); + + for (size_t j = 0; j < kernel_size[1]; j++) { + size_t kernel_offset = kernel_size[0] * j; + const size_t index = + (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0); + for (size_t i = 0; i < kernel_size[0]; i++) { + result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]); } } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2)) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start); - buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0], + itemID.get_local_id(1) + output_offset[1]); + + buffer_ptr[tensor_index] = result; } } }; +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<3> kernel_size; + const cl::sycl::range<3> input_range; + const size_t numP; + + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_, + const size_t numP_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_), + numP(numP_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + void operator()(cl::sycl::nd_item<3> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; + const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; -template -struct EigenConvolutionKernel3D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression::Type 
PlaceHolderExpr; -internal::IndexMapper::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; -Buffer_accessor buffer_acc; -ptrdiff_t out_offset; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel3D(internal::IndexMapper::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , - const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, - Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), - kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), - buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + const auto output_offset = + cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()}; - void operator()(cl::sycl::nd_item<3> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); //the required row to be calculated for the for each plane in shered memory - const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; - const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2]; - for(size_t p=0; p(0); - for (size_t k = 0; k < kernelSize_z; k++) { - for (size_t j = 0; j < kernelSize_y; j++) { - for (size_t i = 0; i < kernelSize_x; i++) { - const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k); - const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2)))); + for (size_t k = 0; k < kernel_size[2]; k++) { + for (size_t j = 0; j < kernel_size[1]; j++) { + for (size_t i = 0; i < kernel_size[0]; i++) { + const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k); + const size_t local_index = + ((i + itemID.get_local_id(0)) + + num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2)))); + result += (local_acc[local_index] * kernel_ptr[kernel_index]); } } } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start ); - 
buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result; + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]); + buffer_ptr[tensor_index] = result; } itemID.barrier(cl::sycl::access::fence_space::local_space); @@ -226,25 +271,32 @@ EigenConvolutionKernel3D(internal::IndexMapper -struct TensorEvaluator, const Eigen::SyclDevice> -{ +template +struct TensorEvaluator, Eigen::SyclDevice> { typedef TensorConvolutionOp XprType; - static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumDims = + internal::array_size::Dimensions>::value; static const int NumKernelDims = internal::array_size::value; typedef typename XprType::Index Index; typedef DSizes Dimensions; - typedef typename TensorEvaluator::Dimensions KernelDimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; typedef const Eigen::SyclDevice Device; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory KernelStorage; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, PacketAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, + Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; @@ -253,13 +305,22 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device) + : m_inputImpl(op.inputExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernelImpl(op.kernelExpression(), device), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions &input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions &kernel_dims = + m_kernelImpl.dimensions(); m_dimensions = m_inputImpl.dimensions(); for (int i = 0; i < NumKernelDims; ++i) { @@ -271,21 +332,17 @@ struct TensorEvaluator::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = internal::unpacket_traits::size; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { preloadKernel(); m_inputImpl.evalSubExprsIfNeeded(NULL); if (data) { executeEval(data); return false; } else { - m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + m_buf = (EvaluatorPointerType)m_device.get( + (Scalar 
*)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); executeEval(m_buf); return true; } @@ -294,194 +351,194 @@ struct TensorEvaluator::PointerType data() const { return m_buf; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { // Don't make a local copy of the kernel unless we have to (i.e. it's an // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); + typename KernelStorage::Type in_place = m_kernelImpl.data(); if (in_place) { m_kernel = in_place; m_local_kernel = false; } else { ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz)); typedef TensorEvalToOp EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); + EvalTo evalToTmp(m_device.get(local), m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); m_kernel = local; m_local_kernel = true; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const { - typedef TensorEvaluator InputEvaluator; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const { + typedef TensorEvaluator InputEvaluator; typedef typename InputEvaluator::Dimensions InputDims; + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const auto input_dim = std::array{numX, numP}; + auto global_range = cl::sycl::range<2>{}; + auto local_range = cl::sycl::range<2>{}; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + + m_device.parallel_for_setup(input_dim, global_range, local_range); + const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]); + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[0]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + typedef EigenConvolutionKernel + ConvKernel; + + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])); + break; + } - typedef Eigen::TensorSycl::internal::FunctorExtractor InputFunctorExpr; - // extract input functor list - InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); - ptrdiff_t out_offset = m_device.get_offset(data); - - - m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { - - typedef cl::sycl::accessor InputLocalAcc; - /// work-around for gcc 4.8 auto bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, m_inputImpl)) InputTupleType; - // create input tuple of accessors - InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, m_inputImpl); - - typedef cl::sycl::accessor OutputAccessorType; - OutputAccessorType out_res= m_device. template get_sycl_accessor(cgh, data); - typedef cl::sycl::accessor KernelAccessorType; - KernelAccessorType kernel_acc= m_device. 
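For the 1D case, executeEval keeps the convolved dimension at its full length (numX) and flattens every other dimension into a plane count (numP), then sizes the staged tile as one padded row per plane and checks it against the device's local-memory budget. The helper below is a hypothetical standalone sketch of that sizing; the names and the element-based limit are assumptions, since Eigen obtains the tile sizes from parallel_for_setup and the limit from sharedMemPerBlock().

// Hypothetical helper mirroring the 1D launch sizing: the convolved dimension
// keeps its length, the rest is flattened, and the tile needs one padded row
// per plane. The local-memory limit is expressed in elements for simplicity.
#include <cassert>
#include <cstddef>
#include <iostream>

struct LaunchConfig1D {
  std::size_t numX, numP, local_memory_elems;
};

LaunchConfig1D make_conv1d_config(std::size_t total_size, std::size_t numX,
                                  std::size_t kernel_size, std::size_t tile_x,
                                  std::size_t tile_p,
                                  std::size_t local_mem_limit_elems) {
  LaunchConfig1D cfg;
  cfg.numX = numX;                                               // convolved dimension
  cfg.numP = total_size / numX;                                  // flattened remainder
  cfg.local_memory_elems = (tile_x + kernel_size - 1) * tile_p;  // padded rows
  assert(cfg.local_memory_elems <= local_mem_limit_elems);
  return cfg;
}

int main() {
  LaunchConfig1D cfg = make_conv1d_config(1024 * 64, 1024, 7, 64, 4, 48 * 1024);
  std::cout << cfg.numP << " " << cfg.local_memory_elems << "\n";  // 64 280
}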
template get_sycl_accessor(cgh, m_kernel); - - switch (NumKernelDims) { - case 1: { - const size_t numX = dimensions()[m_indices[0]]; - const size_t numP = dimensions().TotalSize() / numX; - const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y; - m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y ); - const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y); - gpu_assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range - auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - const array indices{{m_indices[0]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), - EigenConvolutionKernel1D( - indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); - break; - } - - case 2: { - const size_t idxX =static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; - const size_t idxY =static_cast(Layout) == static_cast(ColMajor) ? 1 : 0; - const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const size_t numX = dimensions()[m_indices[idxX]]; - const size_t numY = dimensions()[m_indices[idxY]]; - const size_t numP = dimensions().TotalSize() / (numX*numY); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; - m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); - const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z; - gpu_assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range - auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - const array indices {{m_indices[idxX], m_indices[idxY]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), - EigenConvolutionKernel2D( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); - break; - } + case 2: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0}; + auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]]}; + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numP = dimensions().TotalSize() / (numX * numY); + auto input_dim = std::array{numX, numY, numP}; + + auto global_range = cl::sycl::range<3>{}; + auto local_range = cl::sycl::range<3>{}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + + const size_t local_memory_size = + (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2]; + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}}; + const array kernel_dims{ + {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + typedef EigenConvolutionKernel + ConvKernel; + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}); + break; + } - case 3: { - const size_t idxX =static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; - const size_t idxY =static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; - const size_t idxZ =static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; - const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ]; - const size_t numX = dimensions()[m_indices[idxX]]; - const size_t numY = dimensions()[m_indices[idxY]]; - const size_t numZ = dimensions()[m_indices[idxZ]]; - const size_t numP = dimensions().TotalSize() / (numX*numY*numZ); - const array indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; - m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); - const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1); - gpu_assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range - auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), - EigenConvolutionKernel3D( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, - numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); - break; - } + case 3: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 2, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 
2 : 0}; + + auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]], + (size_t)m_kernelImpl.dimensions()[kernel_index[2]]}; + + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numZ = dimensions()[m_indices[kernel_index[2]]]; + auto input_dim = std::array{numX, numY, numZ}; + const size_t numP = dimensions().TotalSize() / (numX * numY * numZ); + + const array indices{ + {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]], + m_kernelImpl.dimensions()[kernel_index[1]], + m_kernelImpl.dimensions()[kernel_index[2]]}}; + + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + auto global_range = cl::sycl::range<3>{}; + auto local_range = cl::sycl::range<3>{}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + auto local_memory_range = (local_range + kernel_size - 1); + const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2]; + + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + typedef EigenConvolutionKernel + ConvKernel; + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP); + break; + } - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); } - }); - m_device.asynchronousExec(); + } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(m_buf); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf != NULL); eigen_assert(index < m_dimensions.TotalSize()); return m_buf[index]; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const - { - eigen_assert(m_buf); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const { + eigen_assert(m_buf != NULL); eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt(m_buf+index); + return internal::ploadt(m_buf + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost // model. const double kernel_size = m_kernelImpl.dimensions().TotalSize(); // We ignore the use of fused multiply-add. 
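The removed 3D path just above sized its z halo with kernel_size_y (visible in the deleted expression (tileSize_z + kernel_size_y - 1)), whereas the rewritten 2D/3D paths form local_range + kernel_size - 1 component-wise, so every dimension carries its own kernel extent. A host-only sketch of that component-wise sizing follows; the function name and the example numbers are assumptions for illustration.

// Sketch of the component-wise local-memory sizing used by the new 2D/3D paths:
// each dimension of the staged tile is local_range[d] + kernel_size[d] - 1.
#include <array>
#include <cstddef>
#include <iostream>

template <std::size_t N>
std::size_t local_memory_elems(const std::array<std::size_t, N> &local_range,
                               const std::array<std::size_t, N> &kernel_size) {
  std::size_t elems = 1;
  for (std::size_t d = 0; d < N; ++d)
    elems *= local_range[d] + kernel_size[d] - 1;  // halo in every dimension
  return elems;
}

int main() {
  std::array<std::size_t, 3> local{{8, 8, 4}}, kernel{{3, 5, 3}};
  std::cout << local_memory_elems(local, kernel) << "\n";  // 10 * 12 * 6 = 720
}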
- const double convolve_compute_cost = - TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); const double firstIndex_compute_cost = NumDims * - (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_kernelImpl.bind(cgh); + m_inputImpl.bind(cgh); + m_buf.bind(cgh); + m_kernel.bind(cgh); } private: // No assignment (copies are needed by the kernels) - TensorEvaluator& operator = (const TensorEvaluator&); - TensorEvaluator m_inputImpl; + TensorEvaluator &operator=(const TensorEvaluator &); + TensorEvaluator m_inputImpl; KernelArgType m_kernelArg; - TensorEvaluator m_kernelImpl; + TensorEvaluator m_kernelImpl; Indices m_indices; Dimensions m_dimensions; - Scalar* m_buf; - const Scalar* m_kernel; + EvaluatorPointerType m_buf; + typename KernelStorage::Type m_kernel; bool m_local_kernel; - const Eigen::SyclDevice& m_device; -}; + const Eigen::SyclDevice EIGEN_DEVICE_REF m_device; +}; // namespace Eigen -} // end namespace Eigen +} // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 6f8b6f193..df591c21d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -16,7 +16,6 @@ #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H #include - namespace Eigen { namespace TensorSycl { @@ -70,9 +69,9 @@ struct SyclDeviceInfo { } // end namespace TensorSycl typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; -// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and -// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently -// TensorFlow via the Eigen SYCL Backend. +// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. EIGEN_STRONG_INLINE auto get_sycl_supported_devices() -> decltype(cl::sycl::device::get_devices()) { #ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR @@ -421,6 +420,91 @@ class QueueInterface { return pMapper.get_offset(ptr); } + template + EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs, + const Rhs &rhs, OutPtr outptr, + Range thread_range, + Index scratchSize, + T... 
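The binary_kernel_launcher added to QueueInterface packages one fixed recipe: bind the placeholder accessors, allocate a local scratch accessor of scratchSize elements, and submit a parallel_for whose kernel functor is constructed from (scratch, lhs, rhs, output, trailing arguments...). The host-only C++ sketch below shows only that shape; the launcher and kernel names are illustrative assumptions, a std::vector stands in for the local accessor, and a serial loop stands in for parallel_for.

// Host-only sketch of the variadic launcher shape: the kernel functor owns the
// scratch, the operands and any trailing arguments, and is invoked per item.
#include <cstddef>
#include <iostream>
#include <vector>

template <typename Kernel, typename Lhs, typename Rhs, typename Out, typename... T>
void binary_kernel_launcher(const Lhs &lhs, const Rhs &rhs, Out &out,
                            std::size_t range, std::size_t scratch_size, T... var) {
  std::vector<float> scratch(scratch_size);  // stands in for the local accessor
  Kernel k{scratch, lhs, rhs, out, var...};  // kernel bundles everything it needs
  for (std::size_t i = 0; i < range; ++i) k(i);  // stands in for parallel_for
}

struct AxpyKernel {  // hypothetical kernel: out[i] = alpha * lhs[i] + rhs[i]
  std::vector<float> &scratch;
  const std::vector<float> &lhs, &rhs;
  std::vector<float> &out;
  float alpha;
  void operator()(std::size_t i) const { out[i] = alpha * lhs[i] + rhs[i]; }
};

int main() {
  std::vector<float> a(4, 1.f), b(4, 2.f), c(4, 0.f);
  binary_kernel_launcher<AxpyKernel>(a, b, c, c.size(), /*scratch*/ 0, 3.f);
  std::cout << c[0] << "\n";  // 5
}

The unary and nullary launchers follow the same pattern with fewer bound operands, which is why the evaluators only need to expose a bind(cgh) member.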
var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + lhs.bind(cgh); + rhs.bind(cgh); + outptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + template + EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr, + OutPtr &outptr, + Range thread_range, + Index scratchSize, + T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + inptr.bind(cgh); + outptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, inptr, outptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + template + EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr, + Range thread_range, + Index scratchSize, + T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + inptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, inptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + EIGEN_STRONG_INLINE void synchronize() const { #ifdef EIGEN_EXCEPTIONS m_queue.wait_and_throw(); @@ -429,6 +513,7 @@ class QueueInterface { #endif } + EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { set_latest_event(e); #ifndef EIGEN_SYCL_ASYNC_EXECUTION @@ -457,11 +542,10 @@ class QueueInterface { /// This is used to prepare the number of threads and also the number of /// threads per block for sycl kernels template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, - Index &tileSize0, - Index &tileSize1, Index &rng0, - Index &rng1, Index &GRange0, - Index &GRange1) const { + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + std::array input_range = input_dim; Index max_workgroup_Size = static_cast(getNearestPowerOfTwoWorkGroupSize()); max_workgroup_Size = @@ -469,26 +553,28 @@ class QueueInterface { EIGEN_SYCL_LOCAL_THREAD_DIM1), static_cast(max_workgroup_Size)); Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 = + local_range[1] = static_cast(std::pow(2, static_cast(pow_of_2 / 2))); - rng1 = dim1; - if (rng1 == 0) rng1 = static_cast(1); - GRange1 = rng1; - if (tileSize1 > GRange1) - tileSize1 = GRange1; - else if (GRange1 > tileSize1) { - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > 
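parallel_for_setup now fills cl::sycl range objects directly, but the per-dimension policy is unchanged: start from the nearest power-of-two work-group size, split it across the dimensions, and then either clamp the local range to the problem size or pad the global range up to the next multiple of the local range. The sketch below shows only that clamp-or-round-up rule for one dimension; the function name is an assumption.

// Host-only sketch of the rounding policy applied per dimension: clamp the tile
// to the problem size, or pad the global range to a multiple of the tile.
#include <cstddef>
#include <iostream>

void fit_range(std::size_t dim, std::size_t &local, std::size_t &global) {
  if (dim == 0) dim = 1;               // degenerate dimensions become 1
  global = dim;
  if (local > global) {
    local = global;                    // clamp the tile to the problem size
  } else if (global % local != 0) {
    global += local - global % local;  // round up to a multiple of the tile
  }
}

int main() {
  std::size_t local = 16, global = 0;
  fit_range(100, local, global);
  std::cout << local << " " << global << "\n";  // 16 112
}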
global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) + global_range[1] += static_cast(local_range[1] - xMode); } - tileSize0 = static_cast(max_workgroup_Size / tileSize1); - rng0 = dim0; - if (rng0 == 0) rng0 = static_cast(1); - GRange0 = rng0; - if (tileSize0 > GRange0) - tileSize0 = GRange0; - else if (GRange0 > tileSize0) { - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + local_range[0] = static_cast(max_workgroup_Size / local_range[1]); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) + global_range[0] += static_cast(local_range[0] - xMode); } } @@ -496,9 +582,9 @@ class QueueInterface { /// threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup( - Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, - Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, - Index &GRange1, Index &GRange2) const { + const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + std::array input_range = input_dim; Index max_workgroup_Size = static_cast(getNearestPowerOfTwoWorkGroupSize()); max_workgroup_Size = @@ -506,45 +592,48 @@ class QueueInterface { EIGEN_SYCL_LOCAL_THREAD_DIM1), static_cast(max_workgroup_Size)); Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 = + local_range[2] = static_cast(std::pow(2, static_cast(pow_of_2 / 3))); - rng2 = dim2; - if (rng2 == 0) rng1 = static_cast(1); - GRange2 = rng2; - if (tileSize2 > GRange2) - tileSize2 = GRange2; - else if (GRange2 > tileSize2) { - Index xMode = static_cast(GRange2 % tileSize2); - if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); + input_range[2] = input_dim[2]; + if (input_range[2] == 0) input_range[1] = static_cast(1); + global_range[2] = input_range[2]; + if (local_range[2] > global_range[2]) + local_range[2] = global_range[2]; + else if (global_range[2] > local_range[2]) { + Index xMode = static_cast(global_range[2] % local_range[2]); + if (xMode != 0) + global_range[2] += static_cast(local_range[2] - xMode); } pow_of_2 = static_cast( - std::log2(static_cast(max_workgroup_Size / tileSize2))); - tileSize1 = + std::log2(static_cast(max_workgroup_Size / local_range[2]))); + local_range[1] = static_cast(std::pow(2, static_cast(pow_of_2 / 2))); - rng1 = dim1; - if (rng1 == 0) rng1 = static_cast(1); - GRange1 = rng1; - if (tileSize1 > GRange1) - tileSize1 = GRange1; - else if (GRange1 > tileSize1) { - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) + global_range[1] += static_cast(local_range[1] - xMode); } - tileSize0 = - static_cast(max_workgroup_Size / (tileSize1 * tileSize2)); - rng0 = dim0; - if (rng0 == 0) rng0 = static_cast(1); - GRange0 
= rng0; - if (tileSize0 > GRange0) - tileSize0 = GRange0; - else if (GRange0 > tileSize0) { - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + local_range[0] = static_cast(max_workgroup_Size / + (local_range[1] * local_range[2])); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) + global_range[0] += static_cast(local_range[0] - xMode); } } EIGEN_STRONG_INLINE bool has_local_memory() const { -#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) +#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) return false; #elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) return true; @@ -768,25 +857,19 @@ struct SyclDevice : public SyclDeviceBase { /// This is used to prepare the number of threads and also the number of /// threads per block for sycl kernels template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, - Index &tileSize0, - Index &tileSize1, Index &rng0, - Index &rng1, Index &GRange0, - Index &GRange1) const { - queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, - rng1, GRange0, GRange1); + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); } /// This is used to prepare the number of threads and also the number of /// threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup( - Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, - Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, - Index &GRange1, Index &GRange2) const { - queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, - tileSize2, rng0, rng1, rng2, GRange0, - GRange1, GRange2); + const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); } /// allocate device memory @@ -943,6 +1026,22 @@ struct SyclDevice : public SyclDeviceBase { EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return queue_stream()->getDeviceVendor(); } + template + EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const { + queue_stream()->template binary_kernel_launcher( + var...); + } + template + EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const { + queue_stream()->template unary_kernel_launcher( + var...); + } + + template + EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... 
var) const { + queue_stream()->template nullary_kernel_launcher( + var...); + } }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 9926046b9..b83174ab7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -649,131 +649,75 @@ EIGEN_STRONG_INLINE void TensorExecutor -struct ExecExprFunctorKernel_impl { +template +struct ExecExprFunctorKernel { typedef typename Evaluator::Index Index; - const Index range; - const Index vectorizable_threads; Evaluator evaluator; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( - const Index range_, const Index vectorizable_threads_, - Evaluator evaluator_) - : range(range_), vectorizable_threads(vectorizable_threads_), - evaluator(evaluator_) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void - operator()(cl::sycl::nd_item<1> itemID) { + const Index range; + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel( + const Scratch, Evaluator evaluator_, const Index range_) + : evaluator(evaluator_), range(range_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()( + cl::sycl::nd_item<1> itemID) { + compute(itemID); + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { Index gId = static_cast(itemID.get_global_linear_id()); Index total_threads = itemID.get_global_range(0); - EIGEN_UNROLL_LOOP + for (Index i = gId; i < range; i += total_threads) { evaluator.evalScalar(i); } } -}; - -template -struct ExecExprFunctorKernel_impl { - typedef typename Evaluator::Index Index; - const Index range; - const Index vectorizable_threads; - Evaluator evaluator; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( - const Index range_, const Index vectorizable_threads_, - Evaluator evaluator_) - : range(range_), vectorizable_threads(vectorizable_threads_), - evaluator(evaluator_) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void - operator()(cl::sycl::nd_item<1> itemID) { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { + const Index vectorizedRange = + (range / Evaluator::PacketSize) * Evaluator::PacketSize; Index gId = static_cast(itemID.get_global_linear_id()); - if (gId < vectorizable_threads) { - const Index PacketSize = Eigen::internal::unpacket_traits< - typename Evaluator::PacketReturnType>::size; - evaluator.evalPacket(gId * PacketSize); - gId += (vectorizable_threads * PacketSize); - EIGEN_UNROLL_LOOP - for (Index i = gId; i < range; i += vectorizable_threads) { - evaluator.evalScalar(i); - } + const Index step = Evaluator::PacketSize * itemID.get_global_range(0); + const Index start = Evaluator::PacketSize * gId; + for (Index i = start; i < vectorizedRange; i += step) { + evaluator.evalPacket(i); + } + gId += vectorizedRange; + for (Index i = gId; i < range; i += itemID.get_global_range(0)) { + evaluator.evalScalar(i); } } }; -template -struct ExecExprFunctorKernel - : ExecExprFunctorKernel_impl< - ::Eigen::internal::IsVectorizable::value, - Evaluator> { - ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, - const Evaluator &evaluator) - : ExecExprFunctorKernel_impl< - ::Eigen::internal::IsVectorizable::value, - Evaluator>(range_, vectorizable_threads_, evaluator) {} -}; - -template -struct ExecExprFunctorKernel - : ExecExprFunctorKernel_impl { - 
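The rewritten ExecExprFunctorKernel splits the index range into a packet-aligned prefix, walked in strides of PacketSize times the global range, followed by a scalar tail, instead of dispatching to separate vectorized and non-vectorized kernel types. The plain C++ analogue below demonstrates that loop structure; the names are assumptions and a fixed packet of four floats stands in for Evaluator::PacketSize.

// Plain C++ analogue of the vectorised compute() path: process a packet-aligned
// prefix in PacketSize steps, then finish the remainder one element at a time.
#include <cstddef>
#include <iostream>
#include <vector>

static const std::size_t kPacketSize = 4;  // stands in for Evaluator::PacketSize

void eval_range(std::vector<float> &data, std::size_t thread_id,
                std::size_t num_threads) {
  const std::size_t range = data.size();
  const std::size_t vectorized_range = (range / kPacketSize) * kPacketSize;
  const std::size_t step = kPacketSize * num_threads;
  for (std::size_t i = kPacketSize * thread_id; i < vectorized_range; i += step)
    for (std::size_t k = 0; k < kPacketSize; ++k)  // stands in for evalPacket(i)
      data[i + k] += 1.f;
  for (std::size_t i = vectorized_range + thread_id; i < range; i += num_threads)
    data[i] += 1.f;                                // stands in for evalScalar(i)
}

int main() {
  std::vector<float> v(10, 0.f);
  for (std::size_t t = 0; t < 2; ++t) eval_range(v, t, 2);
  std::cout << v[9] << "\n";  // every element touched exactly once: prints 1
}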
ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_, - const Evaluator &evaluator) - : ExecExprFunctorKernel_impl( - range_, vectorizable_threads_, evaluator) {} -}; - template class TensorExecutor { - public: + public: typedef typename Expression::Index Index; - static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { - Eigen::TensorEvaluator evaluator(expr, dev); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const Eigen::SyclDevice& dev) { + typedef Eigen::TensorEvaluator Evaluator; + Evaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { Index range, GRange, tileSize; Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); total_size = (total_size == 0) ? 1 : total_size; - const int PacketSize = Eigen::PacketType< - typename Eigen::TensorEvaluator::CoeffReturnType, - Eigen::SyclDevice>::size; - Index vectorizable_threads = - static_cast(total_size / PacketSize); + const int PacketSize = + Eigen::PacketType::size; + Index vectorizable_threads = static_cast(total_size / PacketSize); dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); range = total_size; - auto f = [&](cl::sycl::handler &cgh) { - evaluator.bind(cgh); - typedef ExecExprFunctorKernel> - conditional_vectorized_kernel; - - typedef ExecExprFunctorKernel> - non_vectorized_kernel; -// This is to make sure that an expression with a size less than vectorized size -// will not call the vectorized kernel. -// The reason for having this kernel is that the vectorisable parameter is a -// compile-time parameter, -// however, the size of a tensor is a run-time parameter - (vectorizable_threads) - ? 
cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - dev.program().template get_kernel(), -#endif - cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), - cl::sycl::range<1>(tileSize)), - conditional_vectorized_kernel(range, vectorizable_threads, - evaluator)) - : cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - dev.program().template get_kernel(), -#endif - cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), - cl::sycl::range<1>(tileSize)), - non_vectorized_kernel(range, vectorizable_threads, - evaluator)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f)); - dev.async_synchronize(e); + + dev.template nullary_kernel_launcher< + typename Evaluator::CoeffReturnType, + ExecExprFunctorKernel >( + evaluator, + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + Index(1), range); } evaluator.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 389d5d906..b115e502b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -123,7 +123,7 @@ struct StorageMemory : StorageMemory {}; namespace TensorSycl { namespace internal{ -template class ReductionFunctor; +template class GenericNondeterministicReducer; } } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 700337539..d3628f94e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -421,7 +421,7 @@ template struct MemcpyTrigge #ifdef EIGEN_USE_GPU template struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { return contiguous > 4*1024*1024; } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } }; #endif @@ -430,7 +430,7 @@ template struct MemcpyTriggerForSlicing struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { return contiguous > 4*1024*1024; } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 84604cf41..0bb1e643e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -946,7 +946,7 @@ struct TensorReductionEvaluatorBase friend class TensorSycl::internal::ReductionFunctor; + template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer; // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer template friend struct internal::GenericReducer; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index a379f5a94..387c3edf4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -11,167 +11,576 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
/***************************************************************** - * TensorSyclPlaceHolderExpr.h + * TensorReductionSycl.h * * \brief: - * This is the specialisation of the placeholder expression based on the - * operation type + * This is the specialization of the reduction operation. Two phase reduction approach + * is used since the GPU does not have Global Synchronization for global memory among + * different work-group/thread block. To solve the problem, we need to create two kernels + * to reduce the data, where the first kernel reduce the data locally and each local + * workgroup/thread-block save the input data into global memory. In the second phase (global reduction) + * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element. + * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU: + * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf * -*****************************************************************/ + *****************************************************************/ #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP - namespace Eigen { +namespace TensorSycl { namespace internal { -template struct syclGenericBufferReducer{ -template -static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ - do { - auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable { - cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, - cl::sycl::range<1>{std::min(length, local)}}; - /* Two accessors are used: one to the buffer that is being reduced, - * and a second to local memory, used to store intermediate data. */ - auto aI =bufI.template get_access(h); - auto aOut =bufOut.template get_access(h); - typedef decltype(aI) InputAccessor; - typedef decltype(aOut) OutputAccessor; - typedef cl::sycl::accessor LocalAccessor; - LocalAccessor scratch(cl::sycl::range<1>(local), h); - - /* The parallel_for invocation chosen is the variant with an nd_item - * parameter, since the code requires barriers for correctness. */ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer(op, aOut, out_offset, aI, scratch, length, local)); - }; - dev.sycl_queue().submit(f); - dev.asynchronousExec(); - - /* At this point, you could queue::wait_and_throw() to ensure that - * errors are caught quickly. However, this would likely impact - * performance negatively. 
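The new header comment describes the core constraint: work-groups cannot synchronize with each other through global memory, so the reduction runs in two phases, with each work-group writing one partial result in phase one and a single work-group folding those partials in phase two. The host-only sketch below illustrates that phase split for a sum; the function name and the serial loops standing in for work-groups are assumptions.

// Host-only sketch of the two-phase reduction described above: phase one
// produces one partial result per "work-group", phase two folds the partials.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

float two_phase_sum(const std::vector<float> &in, std::size_t num_groups) {
  std::vector<float> partials(num_groups, 0.f);
  const std::size_t chunk = (in.size() + num_groups - 1) / num_groups;
  for (std::size_t g = 0; g < num_groups; ++g) {  // phase 1: one slice per group
    const std::size_t begin = g * chunk;
    const std::size_t end = std::min(in.size(), begin + chunk);
    for (std::size_t i = begin; i < end; ++i) partials[g] += in[i];
  }
  // phase 2: a single group reduces the intermediate buffer to one element
  return std::accumulate(partials.begin(), partials.end(), 0.f);
}

int main() {
  std::vector<float> v(1000, 1.f);
  std::cout << two_phase_sum(v, 8) << "\n";  // 1000
}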
*/ - length = length / local; - - } while (length > 1); -} +template +struct OpDefiner { + typedef typename Vectorise::PacketReturnType PacketReturnType; + typedef Op type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, + const Index &) { + return accumulator; + } }; -template struct syclGenericBufferReducer, CoeffReturnType>{ -template -static void run(Eigen::internal::MeanReducer, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ - syclGenericBufferReducer, CoeffReturnType>::run(Eigen::internal::SumReducer(), - bufOut, out_offset, bufI, dev, length, local); -} +template +struct OpDefiner, CoeffReturnType, Index, false> { + typedef Eigen::internal::SumReducer type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer &) { + return type(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator, + const Index &scale) { + ::Eigen::internal::scalar_quotient_op quotient_op; + return quotient_op(accumulator, CoeffReturnType(scale)); + } }; -/// Self is useless here because in expression construction we are going to treat reduction as a leafnode. -/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the -/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as -// a leafNode. +template +struct OpDefiner, CoeffReturnType, Index, true> { + typedef typename Vectorise::PacketReturnType PacketReturnType; + typedef Eigen::internal::SumReducer type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer &) { + return type(); + } -template -struct FullReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, + const Index &scale) { + return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1(CoeffReturnType(scale))); + } +}; - typedef typename Self::CoeffReturnType CoeffReturnType; - static const bool HasOptimizedImplementation = false; - - static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { - typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef Eigen::TensorSycl::internal::FunctorExtractor > FunctorExpr; - FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); - int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread. - size_t inputSize =self.impl().dimensions().TotalSize(); - size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input - size_t remaining = inputSize% red_factor; - if(rng ==0) { - red_factor=1; - }; - size_t tileSize =dev.sycl_queue().get_device(). 
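OpDefiner maps MeanReducer onto SumReducer and defers the division to finalise_op, which scales the accumulator by the number of reduced coefficients once at the end. A minimal sketch of that sum-then-divide split is below; the SumOp and finalise_mean names are illustrative assumptions, not the Eigen reducer types.

// Minimal sketch of mean realised as sum plus a single final divide, matching
// the OpDefiner specialisation for MeanReducer (illustrative names only).
#include <cstddef>
#include <iostream>
#include <vector>

struct SumOp {
  float initialize() const { return 0.f; }
  void reduce(float v, float *acc) const { *acc += v; }
  float finalize(float acc) const { return acc; }
};

// finalise_op: the only mean-specific step, applied once after the reduction.
float finalise_mean(float accumulator, std::size_t scale) {
  return accumulator / static_cast<float>(scale);
}

int main() {
  std::vector<float> v{1.f, 2.f, 3.f, 4.f};
  SumOp op;
  float acc = op.initialize();
  for (float x : v) op.reduce(x, &acc);
  std::cout << finalise_mean(op.finalize(acc), v.size()) << "\n";  // 2.5
}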
template get_info()/2; - size_t GRange=std::max((size_t )1, rng); - - // convert global range to power of 2 for redecution - GRange--; - GRange |= GRange >> 1; - GRange |= GRange >> 2; - GRange |= GRange >> 4; - GRange |= GRange >> 8; - GRange |= GRange >> 16; -#if __x86_64__ || __ppc64__ || _WIN64 - GRange |= GRange >> 32; -#endif - GRange++; - size_t outTileSize = tileSize; - /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one. - if (GRange < outTileSize) outTileSize=GRange; - /// creating the shared memory for calculating reduction. - /// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can - /// recursively apply reduction on it in order to reduce the whole. - auto temp_global_buffer =cl::sycl::buffer(cl::sycl::range<1>(GRange)); - typedef typename Eigen::internal::remove_all::type Dims; - // Dims dims= self.xprDims(); - //Op functor = reducer; - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is a workaround for gcc 4.8 bug - typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType; - // create a tuple of accessors from Evaluator - TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto tmp_global_accessor = temp_global_buffer. template get_access(cgh); - typedef decltype(tmp_global_accessor) OutAccessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), - TensorSycl::internal::FullReductionKernelFunctor - (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors)); - }); - dev.asynchronousExec(); - - // getting final out buffer at the moment the created buffer is true because there is no need for assign - auto out_buffer =dev.get_sycl_buffer(output); - ptrdiff_t out_offset = dev.get_offset(output); - /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize); +template +struct SecondStepFullReducer { + typedef cl::sycl::accessor + LocalAccessor; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + LocalAccessor scratch; + InputAccessor aI; + OutputAccessor outAcc; + Op op; + SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_) + : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) { + // Our empirical research shows that the best performance will be achieved + // when there is only one element per thread to reduce in the second step. + // in this step the second step reduction time is almost negligible. + // Hence, in the second step of reduction the input size is fixed to the + // local size, thus, there is only one element read per thread. The + // algorithm must be changed if the number of reduce per thread in the + // second step is greater than 1. Otherwise, the result will be wrong. 
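Within a work-group the partials sit in local scratch and are folded by halving the active range each round, with a barrier between rounds, which is the pattern SecondStepFullReducer uses for the second phase. The serial analogue below shows only the halving fold; since one loop simulates all work-items, the barrier ordering is trivially satisfied, and the function name is an assumption.

// Serial analogue of the in-work-group tree reduction: halve the stride each
// round, folding scratch[i + offset] into scratch[i]; scratch[0] ends up with
// the result. (On the device a barrier separates the rounds.)
#include <cstddef>
#include <iostream>
#include <vector>

float tree_reduce(std::vector<float> scratch) {  // size must be a power of two
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2)
    for (std::size_t i = 0; i < offset; ++i)     // the "active" work-items
      scratch[i] += scratch[i + offset];
  return scratch[0];
}

int main() {
  std::vector<float> s{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
  std::cout << tree_reduce(s) << "\n";  // 36
}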
+ const Index localid = itemID.get_local_id(0); + auto aInPtr = aI.get_pointer() + localid; + auto aOutPtr = outAcc.get_pointer(); + CoeffReturnType *scratchptr = scratch.get_pointer(); + CoeffReturnType accumulator = *aInPtr; + + scratchptr[localid] = op.finalize(accumulator); +#pragma unroll 8 + for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratchptr[localid + offset], &accumulator); + scratchptr[localid] = op.finalize(accumulator); + } + } + if (localid == 0) *aOutPtr = op.finalize(accumulator); + } +}; + +// Full reduction first phase. In this version the vectorization is true and the reduction accept +// any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ). +template +class FullReductionKernelFunctor { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::Index Index; + typedef OpDefiner + OpDef; + + typedef typename OpDef::type Op; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::PacketReturnType PacketReturnType; + typedef + typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), + PacketReturnType, CoeffReturnType>::type OutType; + typedef cl::sycl::accessor + LocalAccessor; + LocalAccessor scratch; + Evaluator evaluator; + EvaluatorPointerType final_output; + Index rng; + Op op; + + FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, + Index rng_, OpType op_) + : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type compute_reduction( + const cl::sycl::nd_item<1> &itemID) { + auto output_ptr = final_output.get_pointer(); + Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize; + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + Index step = Evaluator::PacketSize * itemID.get_global_range(0); + Index start = Evaluator::PacketSize * globalid; + // vectorizable parts + PacketReturnType packetAccumulator = op.template initializePacket(); +#pragma unroll(8 / Evaluator::PacketSize) + for (Index i = start; i < VectorizedRange; i += step) { + op.template reducePacket(evaluator.impl().template packet(i), &packetAccumulator); + } + globalid += VectorizedRange; + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.template reducePacket( + ::Eigen::TensorSycl::internal::PacketWrapper::convert_to_packet_type( + evaluator.impl().coeff(i), op.initialize()), + &packetAccumulator); + } + scratch[localid] = packetAccumulator = + OpDef::finalise_op(op.template finalizePacket(packetAccumulator), rng); + // reduction parts // Local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.template reducePacket(scratch[localid + offset], &packetAccumulator); + scratch[localid] = op.template finalizePacket(packetAccumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = + op.finalizeBoth(op.initialize(), op.template 
finalizePacket(packetAccumulator)); + } } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type compute_reduction( + const cl::sycl::nd_item<1> &itemID) { + auto output_ptr = final_output.get_pointer(); + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + // vectorizable parts + CoeffReturnType accumulator = op.initialize(); + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.reduce(evaluator.impl().coeff(i), &accumulator); + } + scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng); + + // reduction parts. the local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratch[localid + offset], &accumulator); + scratch[localid] = op.finalize(accumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = op.finalize(accumulator); + } + } }; +template +class GenericNondeterministicReducer { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + template + GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, + Index range_, Index num_values_to_reduce_) + : evaluator(evaluator_), + output_accessor(output_accessor_), + functor(OpDef::get_op(functor_)), + range(range_), + num_values_to_reduce(num_values_to_reduce_) {} -template -struct InnerReducer { + void operator()(cl::sycl::nd_item<1> itemID) { + auto output_accessor_ptr = output_accessor.get_pointer(); + /// const cast added as a naive solution to solve the qualifier drop error + Index globalid = static_cast(itemID.get_global_linear_id()); + if (globalid < range) { + CoeffReturnType accum = functor.initialize(); + Eigen::internal::GenericDimReducer::reduce( + evaluator, evaluator.firstInput(globalid), functor, &accum); + output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce); + } + } + + private: + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op functor; + Index range; + Index num_values_to_reduce; +}; + +enum class reduction_dim { inner_most, outer_most }; +// default is preserver +template +struct PartialReductionKernel { + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAcc; + ScratchAcc scratch; + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op op; + const Index preserve_elements_num_groups; + const Index reduce_elements_num_groups; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, + const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, + const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_) + : scratch(scratch_), + evaluator(evaluator_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + 
preserve_elements_num_groups(preserve_elements_num_groups_), + reduce_elements_num_groups(reduce_elements_num_groups_), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, + CoeffReturnType &accumulator) { + if (globalPId >= num_coeffs_to_preserve) { + return; + } + Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve) + : globalRId + (globalPId * num_coeffs_to_reduce); + Index localOffset = globalRId; + + const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups; + const Index per_thread_global_stride = + rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride; +#pragma unroll 8 + for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) { + op.reduce(evaluator.impl().coeff(global_offset), &accumulator); + localOffset += per_thread_local_stride; + global_offset += per_thread_global_stride; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const Index linearLocalThreadId = itemID.get_local_id(0); + Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP + : linearLocalThreadId / PannelParameters::LocalThreadSizeR; + Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP + : linearLocalThreadId % PannelParameters::LocalThreadSizeR; + const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups + : itemID.get_group(0) / reduce_elements_num_groups; + const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups + : itemID.get_group(0) % reduce_elements_num_groups; + + Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId; + auto scratchPtr = scratch.get_pointer().get(); + auto outPtr = + output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0); + CoeffReturnType accumulator = op.initialize(); + + element_wise_reduce(globalRId, globalPId, accumulator); + + accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce); + scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] = + accumulator; + if (rt == reduction_dim::inner_most) { + pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP; + rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP; + globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + } + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. 
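The partial reducer addresses the input as a (num_coeffs_to_preserve x num_coeffs_to_reduce) matrix: when the reduced dimension is the outer one, the element for (p, r) lives at p + r * num_coeffs_to_preserve, otherwise at r + p * num_coeffs_to_reduce, and each thread then strides along the reduce dimension of its preserved coefficient. The host-only sketch below reproduces just those two addressing modes; the function name, the single-threaded loop, and the example sizes are assumptions.

// Host-only sketch of the two addressing modes used by the partial reducer:
// the input is viewed as a (num_preserve x num_reduce) matrix and each thread
// walks the reduce dimension of one preserved coefficient.
#include <cstddef>
#include <iostream>
#include <vector>

enum class reduction_dim { inner_most, outer_most };

float reduce_one_preserved(const std::vector<float> &in, std::size_t p,
                           std::size_t num_preserve, std::size_t num_reduce,
                           reduction_dim rt) {
  float acc = 0.f;
  for (std::size_t r = 0; r < num_reduce; ++r) {
    const std::size_t offset = rt == reduction_dim::outer_most
                                   ? p + r * num_preserve  // strided column walk
                                   : r + p * num_reduce;   // contiguous row walk
    acc += in[offset];
  }
  return acc;
}

int main() {
  std::vector<float> in(6);  // 2 preserved x 3 reduced coefficients
  for (std::size_t i = 0; i < in.size(); ++i) in[i] = float(i);
  std::cout << reduce_one_preserved(in, 1, 2, 3, reduction_dim::outer_most)
            << "\n";  // 1 + 3 + 5 = 9
}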
*/ + auto out_scratch_ptr = + scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC))); + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (rt == reduction_dim::inner_most) { + accumulator = *out_scratch_ptr; + } + // The Local LocalThreadSizeR is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) { + if (rLocalThreadId < offset) { + op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator); + // The result has already been divided for mean reducer in the + // previous reduction so no need to divide furthermore + *out_scratch_ptr = op.finalize(accumulator); + } + /* All threads collectively read from global memory into local. + * The barrier ensures all threads' IO is resolved before + * execution continues (strictly speaking, all threads within + * a single work-group - there is no co-ordination between + * work-groups, only work-items). */ + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) { + outPtr[globalPId] = op.finalize(accumulator); + } + } +}; + +template +struct SecondStepPartialReduction { + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAccessor; + InputAccessor input_accessor; + OutputAccessor output_accessor; + Op op; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, + OutputAccessor output_accessor_, OpType op_, + const Index num_coeffs_to_preserve_, + const Index num_coeffs_to_reduce_) + : input_accessor(input_accessor_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const Index globalId = itemID.get_global_id(0); + + if (globalId >= num_coeffs_to_preserve) return; + + auto in_ptr = input_accessor.get_pointer() + globalId; + + OutScalar accumulator = op.initialize(); +// num_coeffs_to_reduce is not bigger that 256 +#pragma unroll 8 + for (Index i = 0; i < num_coeffs_to_reduce; i++) { + op.reduce(*in_ptr, &accumulator); + in_ptr += num_coeffs_to_preserve; + } + output_accessor.get_pointer()[globalId] = op.finalize(accumulator); + } +}; // namespace internal + +template +struct ReductionPannel { + static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP; + static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR; + static EIGEN_CONSTEXPR bool BC = BC_; +}; + +template +struct PartialReducerLauncher { + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; typedef typename Self::CoeffReturnType CoeffReturnType; - static const bool HasOptimizedImplementation = false; + typedef typename Self::Storage Storage; + typedef typename Self::Index Index; + typedef ReductionPannel + PannelParameters; + + typedef PartialReductionKernel SyclReducerKerneType; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, + Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) { + Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP); + + // getPowerOfTwo makes sure local range is power of 2 and <= + // maxSyclThreadPerBlock this will 
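SecondStepPartialReduction gathers, for every preserved coefficient, the partial results written by the first kernel, reading with stride num_coeffs_to_preserve. A serial sketch of that second step, assuming a plain sum accumulator:

#include <vector>

// `temp` holds `groups` partial results per preserved coefficient, laid out
// as `groups` rows of length `preserved` (matching the stride in the kernel).
std::vector<float> second_step(const std::vector<float>& temp,
                               int preserved, int groups) {
  std::vector<float> out(preserved, 0.f);  // op.initialize()
  for (int p = 0; p < preserved; ++p) {
    const float* in = temp.data() + p;     // in_ptr = input + globalId
    for (int g = 0; g < groups; ++g) {
      out[p] += *in;                       // op.reduce(*in_ptr, &accum)
      in += preserved;                     // stride num_coeffs_to_preserve
    }
  }
  return out;
}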
help us to avoid extra check on the + // kernel + static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) & + (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + + EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR; + // In this step we force the code to be at most a 2-step reduction: our + // empirical research shows that if each thread reduces at least 64 + // elements individually, we get better performance. However, this can change + // on different platforms. For the inner_most dim reducer, it is better to + // have 8 groups in the reduce dimension for sizes > 1024 to achieve the best + // performance. + const Index reductionPerThread = 64; + Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true); + const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP; + Index rGroups = (cu + pNumGroups - 1) / pNumGroups; + const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1; + const Index globalRange = pNumGroups * rNumGroups * localRange; + + EIGEN_CONSTEXPR Index scratchSize = + PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC); + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (rNumGroups > 1) { + CoeffReturnType *temp_pointer = static_cast( + dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType temp_accessor = dev.get(temp_pointer); + dev.template unary_kernel_launcher( + self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, + num_coeffs_to_reduce); + + typedef SecondStepPartialReduction + SecondStepPartialReductionKernel; + + dev.template unary_kernel_launcher( + temp_accessor, output, + cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1), + reducer, num_coeffs_to_preserve, rNumGroups); + + self.device().deallocate_temp(temp_pointer); + } else { + dev.template unary_kernel_launcher( + self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, + num_coeffs_to_reduce); + } + return false; + } +}; +} // namespace internal +} // namespace TensorSycl + +namespace internal { - static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) { - typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef Eigen::TensorSycl::internal::FunctorExtractor > FunctorExpr; - FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); +template +struct FullReducer { + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ?
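The launch geometry computed above reduces to a handful of integer divisions. A standalone sketch of how pNumGroups and rNumGroups are derived (round_up is a stand-in for roundUp, and the compute-unit count is passed in instead of being queried from the device):

#include <algorithm>
#include <cstdint>

using Index = std::int64_t;

Index round_up(Index x, Index m) { return ((x + m - 1) / m) * m; }

// Number of work-groups along the preserved and reduced dimensions,
// mirroring PartialReducerLauncher::run (LTP/LTR are the local thread
// sizes along the preserved and reduced dimensions).
void partial_reduction_groups(Index preserved, Index reduced,
                              Index LTP, Index LTR, Index compute_units,
                              Index& pNumGroups, Index& rNumGroups) {
  const Index localRange = LTP * LTR;
  const Index reductionPerThread = 64;
  pNumGroups = round_up(preserved, LTP) / LTP;
  Index rGroups = (compute_units + pNumGroups - 1) / pNumGroups;
  rNumGroups = reduced > reductionPerThread * localRange
                   ? std::min(rGroups, localRange) : Index(1);
}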
Self::PacketSize : 1; + static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) { + typedef typename conditional::type OutType; + static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + typename Self::Index inputSize = self.impl().dimensions().TotalSize(); + // In this step we force the code to be at most a 2-step reduction: our + // empirical research shows that if each thread reduces at least 512 + // elements individually, we get better performance. + const Index reductionPerThread = 2048; + Index reductionGroup = dev.getPowerOfTwo( + (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true); + const Index num_work_group = std::min(reductionGroup, local_range); + const Index global_range = num_work_group * local_range; + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + typedef TensorSycl::internal::FullReductionKernelFunctor reduction_kernel_t; + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType))); + typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + dev.template unary_kernel_launcher(self, tmp_global_accessor, thread_range, + local_range, inputSize, reducer); + + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + dev.template unary_kernel_launcher( + tmp_global_accessor, data, + cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group, + reducer); + + dev.deallocate_temp(temp_pointer); + } else { + dev.template unary_kernel_launcher(self, data, thread_range, local_range, inputSize, + reducer); + } + } +}; +// vectorizable inner_most dim preserver +// col reduction +template +struct OuterReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + return ::Eigen::TensorSycl::internal::PartialReducerLauncher< + Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output, + num_coeffs_to_reduce, + num_coeffs_to_preserve); + } +}; +// row reduction +template +struct InnerReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + return ::Eigen::TensorSycl::internal::PartialReducerLauncher< + Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output, + num_coeffs_to_reduce, + num_coeffs_to_preserve); + } +}; + +// ArgMax uses this kernel for partial reduction. +// TODO(@mehdi.goli) come up with a better kernel +// generic partial reduction +template +struct GenericReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false; + static
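FullReducer picks between a single launch and a two-stage launch purely from the input size. A sketch of that decision, with power_of_two_ceil standing in for dev.getPowerOfTwo(x, true) and the EIGEN_SYCL_LOCAL_THREAD_DIM0/1 product passed in as local_range:

#include <algorithm>
#include <cstdint>

using Index = std::int64_t;

// Smallest power of two >= x (stand-in for dev.getPowerOfTwo(x, true)).
Index power_of_two_ceil(Index x) {
  Index p = 1;
  while (p < x) p <<= 1;
  return p;
}

// Number of work-groups for the first full-reduction kernel. A value > 1
// means a temporary buffer plus a SecondStepFullReducer launch are needed
// to combine the per-group results.
Index full_reduction_groups(Index input_size, Index local_range) {
  const Index reductionPerThread = 2048;
  const Index groups = power_of_two_ceil(
      (input_size + reductionPerThread * local_range - 1) /
      (reductionPerThread * local_range));
  return std::min(groups, local_range);
}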
bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, + typename Self::Index num_coeffs_to_preserve) { typename Self::Index range, GRange, tileSize; - typedef typename Eigen::internal::remove_all::type Dims; - - // getting final out buffer at the moment the created buffer is true because there is no need for assign - /// creating the shared memory for calculating reduction. - /// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can - /// recursively apply reduction on it in order to reduce the whole. - dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is workaround for gcc 4.8 bug. - typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; - // create a tuple of accessors from Evaluator - Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor(cgh, output); - ptrdiff_t out_offset = dev.get_offset(output); - Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast(1); - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), - TensorSycl::internal::ReductionFunctor - (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); - - }); - dev.asynchronousExec(); + dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); + + dev.template unary_kernel_launcher>( + self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), + reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast(1)); return false; } }; -} // end namespace internal +} // namespace internal } // namespace Eigen #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h new file mode 100644 index 000000000..0078692cd --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h @@ -0,0 +1,512 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorScanSycl.h + * + * \brief: + * Tensor Scan Sycl implements an extended version of + * "Efficient parallel scan algorithms for GPUs" for Tensor operations. + * The algorithm requires up to 3 stages (consequently 3 kernels) depending on + * the size of the tensor. In the first kernel (ScanKernelFunctor), each + * thread within the work-group individually reduces its allocated elements in + * order to reduce the total number of blocks. In the next step all + * threads within the work-group reduce the associated blocks into the + * temporary buffers.
In the next kernel (ScanBlockKernelFunctor), the temporary + * buffer is given as an input and all the threads within a work-group scan and + * reduce the boundaries between the blocks (generated by the previous + * kernel) and write the data back to the temporary buffer. If the second kernel is + * required, the third and final kernel (ScanAdjustmentKernelFunctor) + * adjusts the final result in the output buffer. + * The original algorithm for the parallel prefix sum can be found here: + * + * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel + * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003 + *1, no. 1 (2008): 1-17. + *****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE +#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4) +#endif + +template +struct ScanParameters { + // must be power of 2 + static EIGEN_CONSTEXPR index_t ScanPerThread = 8; + const index_t total_size; + const index_t non_scan_size; + const index_t scan_size; + const index_t non_scan_stride; + const index_t scan_stride; + const index_t panel_threads; + const index_t group_threads; + const index_t block_threads; + const index_t elements_per_group; + const index_t elements_per_block; + const index_t loop_range; + + ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, + index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, + index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_) + : total_size(total_size_), + non_scan_size(non_scan_size_), + scan_size(scan_size_), + non_scan_stride(non_scan_stride_), + scan_stride(scan_stride_), + panel_threads(panel_threads_), + group_threads(group_threads_), + block_threads(block_threads_), + elements_per_group(elements_per_group_), + elements_per_block(elements_per_block_), + loop_range(loop_range_) {} +}; + +enum class scan_step { first, second }; +template +struct ScanKernelFunctor { + typedef cl::sycl::accessor + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2; + + LocalAccessor scratch; + Evaluator dev_eval; + OutAccessor out_accessor; + OutAccessor temp_accessor; + const ScanParameters scanParameters; + Op accumulator; + const bool inclusive; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, + OutAccessor out_accessor_, OutAccessor temp_accessor_, + const ScanParameters scanParameters_, Op accumulator_, + const bool inclusive_) + : scratch(scratch_), + dev_eval(dev_eval_), + out_accessor(out_accessor_), + temp_accessor(temp_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_), + inclusive(inclusive_) {} + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE + read(const Input &inpt, Index global_id) { + return inpt.coeff(global_id); + } + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE + read(const Input &inpt, Index global_id) { + return inpt[global_id]; + } + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + first_step_inclusive_Operation(InclusiveOp
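As a reference for what the three-kernel pipeline computes, the following serial sketch scans one line of scan_size elements spaced scan_stride apart, in either inclusive or exclusive mode (plain C++, summation standing in for the accumulator):

#include <vector>

// Reference prefix scan of one line: data[offset + i * stride] for
// i in [0, scan_size). For a sum accumulator this is the result the
// multi-kernel pipeline produces for that line.
void scan_line(std::vector<float>& data, int offset, int scan_size,
               int stride, bool inclusive) {
  float running = 0.f;  // accumulator.initialize()
  for (int i = 0; i < scan_size; ++i) {
    const float v = data[offset + i * stride];
    if (inclusive) {
      running += v;
      data[offset + i * stride] = running;
    } else {
      data[offset + i * stride] = running;  // value before adding v
      running += v;
    }
  }
}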
inclusive_op) { + inclusive_op(); + } + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + first_step_inclusive_Operation(InclusiveOp) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto out_ptr = out_accessor.get_pointer(); + auto tmp_ptr = temp_accessor.get_pointer(); + auto scratch_ptr = scratch.get_pointer().get(); + + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + // we put one element per packet in scratch_mem + const Index scratch_stride = scanParameters.elements_per_block / PacketSize; + const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride; + CoeffReturnType private_scan[ScanParameters::ScanPerThread]; + CoeffReturnType inclusive_scan; + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = (ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride); + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + private_scan[i] = ((((block_id * scanParameters.elements_per_block) + + (ScanParameters::ScanPerThread * local_id) + i) < scanParameters.scan_size) && + (global_id < scanParameters.total_size)) + ? 
read(dev_eval, global_id) + : accumulator.initialize(); + next_elements += scanParameters.scan_stride; + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + inclusive_scan = private_scan[ScanParameters::ScanPerThread - 1]; + } + }); + // This for loop must be 2 + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters::ScanPerThread; packetIndex += PacketSize) { + Index private_offset = 1; + // build sum in place up the tree + EIGEN_UNROLL_LOOP + for (Index d = PacketSize >> 1; d > 0; d >>= 1) { + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[bi] = accumulator.finalize(accum); + } + private_offset *= 2; + } + scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] = + private_scan[PacketSize - 1 + packetIndex]; + private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize(); + // traverse down tree & build scan + EIGEN_UNROLL_LOOP + for (Index d = 1; d < PacketSize; d *= 2) { + private_offset >>= 1; + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[ai] = private_scan[bi]; + private_scan[bi] = accumulator.finalize(accum); + } + } + } + + Index offset = 1; + // build sum in place up the tree + for (Index d = scratch_stride >> 1; d > 0; d >>= 1) { + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(scratch_ptr[ai], &accum); + accumulator.reduce(scratch_ptr[bi], &accum); + scratch_ptr[bi] = accumulator.finalize(accum); + } + offset *= 2; + } + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + // next step optimisation + if (local_id == 0) { + if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) { + const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) * + scanParameters.non_scan_size + + group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) + + block_id; + tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset]; + } + // clear the last element + scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize(); + } + // traverse down tree & build scan + for (Index d = 1; d < scratch_stride; d *= 2) { + offset >>= 1; + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(scratch_ptr[ai], &accum); + accumulator.reduce(scratch_ptr[bi], &accum); + scratch_ptr[ai] = scratch_ptr[bi]; + scratch_ptr[bi] = accumulator.finalize(accum); + } + } + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); 
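The paired "build sum in place up the tree" / "traverse down tree & build scan" loops above are the classic Blelloch work-efficient scan, applied first to each thread's private packet and then to the work-group's scratch memory. A compact serial sketch of both sweeps on a power-of-two array, again with a sum accumulator:

#include <vector>

// Exclusive Blelloch scan in place; n = data.size() must be a power of 2.
void blelloch_exclusive_scan(std::vector<float>& data) {
  const int n = static_cast<int>(data.size());
  int offset = 1;
  for (int d = n >> 1; d > 0; d >>= 1) {   // up-sweep: build partial sums
    for (int l = 0; l < d; ++l) {
      const int ai = offset * (2 * l + 1) - 1;
      const int bi = offset * (2 * l + 2) - 1;
      data[bi] += data[ai];
    }
    offset <<= 1;
  }
  data[n - 1] = 0.f;                       // clear the last element
  for (int d = 1; d < n; d <<= 1) {        // down-sweep: distribute sums
    offset >>= 1;
    for (int l = 0; l < d; ++l) {
      const int ai = offset * (2 * l + 1) - 1;
      const int bi = offset * (2 * l + 2) - 1;
      const float t = data[ai];
      data[ai] = data[bi];
      data[bi] += t;
    }
  }
}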
+ // This for loop must be 2 + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters::ScanPerThread; packetIndex += PacketSize) { + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; i++) { + CoeffReturnType accum = private_scan[packetIndex + i]; + accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum); + private_scan[packetIndex + i] = accumulator.finalize(accum); + } + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + accumulator.reduce(private_scan[ScanParameters::ScanPerThread - 1], &inclusive_scan); + private_scan[0] = accumulator.finalize(inclusive_scan); + } + }); + next_elements = 0; + // right the first set of private param + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters::ScanPerThread) * (inclusive)); + out_ptr[global_id] = private_scan[private_id]; + } + next_elements += scanParameters.scan_stride; + } + } // end for loop + } +}; + +template +struct ScanAdjustmentKernelFunctor { + typedef cl::sycl::accessor + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2; + InAccessor in_accessor; + OutAccessor out_accessor; + const ScanParameters scanParameters; + Op accumulator; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_, + OutAccessor out_accessor_, + const ScanParameters scanParameters_, + Op accumulator_) + : in_accessor(in_accessor_), + out_accessor(out_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto in_ptr = in_accessor.get_pointer(); + auto out_ptr = out_accessor.get_pointer(); + + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + + // the actual panel size is scan_size * non_scan_size. 
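Both scan kernels recover their coordinates from a flat global id through the same chain of divisions and modulos against the thread counts held in ScanParameters. That decomposition in isolation:

#include <cstdint>

using Index = std::int64_t;

struct ScanCoords {
  Index panel_id, group_id, block_id, local_id;
};

// Split a flat thread index into (panel, group, block, local) coordinates,
// matching the / and % chain at the top of the kernel loops.
ScanCoords decompose(Index data_offset, Index panel_threads,
                     Index group_threads, Index block_threads) {
  ScanCoords c;
  Index tmp = data_offset % panel_threads;
  c.panel_id = data_offset / panel_threads;
  c.group_id = tmp / group_threads;
  tmp = tmp % group_threads;
  c.block_id = tmp / block_threads;
  c.local_id = tmp % block_threads;
  return c;
}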
+ // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride; + + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block; + const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id; + CoeffReturnType adjust_val = in_ptr[in_id]; + + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + CoeffReturnType accum = adjust_val; + accumulator.reduce(out_ptr[global_id], &accum); + out_ptr[global_id] = accumulator.finalize(accum); + } + next_elements += scanParameters.scan_stride; + } + } + } +}; + +template +struct ScanInfo { + const Index &total_size; + const Index &scan_size; + const Index &panel_size; + const Index &non_scan_size; + const Index &scan_stride; + const Index &non_scan_stride; + + Index max_elements_per_block; + Index block_size; + Index panel_threads; + Index group_threads; + Index block_threads; + Index elements_per_group; + Index elements_per_block; + Index loop_range; + Index global_range; + Index local_range; + const Eigen::SyclDevice &dev; + EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, + const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, + const Eigen::SyclDevice &dev_) + : total_size(total_size_), + scan_size(scan_size_), + panel_size(panel_size_), + non_scan_size(non_scan_size_), + scan_stride(scan_stride_), + non_scan_stride(non_scan_stride_), + dev(dev_) { + // must be power of 2 + local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()), + Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1)); + + max_elements_per_block = local_range * ScanParameters::ScanPerThread; + + elements_per_group = + dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters::ScanPerThread)), true); + const Index elements_per_panel = elements_per_group * non_scan_size; + elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block)); + panel_threads = elements_per_panel / ScanParameters::ScanPerThread; + group_threads = elements_per_group / ScanParameters::ScanPerThread; + block_threads = elements_per_block / ScanParameters::ScanPerThread; + block_size = elements_per_group / elements_per_block; +#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE + const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE)); +#else + const Index max_threads = panel_threads * panel_size; +#endif + global_range = roundUp(max_threads, local_range); + loop_range = Index( + std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters::ScanPerThread))); + } + inline ScanParameters get_scan_parameter() { + 
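ScanInfo derives the whole launch geometry from scan_size and the chosen work-group size. A sketch of the core quantities, with pow2_ceil and round_up_to standing in for dev.getPowerOfTwo and roundUp, and ScanPerThread fixed to 8 as in ScanParameters:

#include <algorithm>
#include <cstdint>

using Index = std::int64_t;

Index pow2_ceil(Index x) { Index p = 1; while (p < x) p <<= 1; return p; }
Index round_up_to(Index x, Index m) { return ((x + m - 1) / m) * m; }

// Per-group / per-block element counts and thread counts, as computed in
// the ScanInfo constructor (local_range is the chosen work-group size).
void scan_geometry(Index scan_size, Index local_range,
                   Index& elements_per_group, Index& elements_per_block,
                   Index& group_threads, Index& block_threads, Index& block_size) {
  const Index ScanPerThread = 8;
  const Index max_elements_per_block = local_range * ScanPerThread;
  elements_per_group = pow2_ceil(round_up_to(scan_size, ScanPerThread));
  elements_per_block = std::min(elements_per_group, max_elements_per_block);
  group_threads = elements_per_group / ScanPerThread;
  block_threads = elements_per_block / ScanPerThread;
  block_size = elements_per_group / elements_per_block;  // blocks per group
}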
return ScanParameters(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads, + group_threads, block_threads, elements_per_group, elements_per_block, loop_range); + } + inline cl::sycl::nd_range<1> get_thread_range() { + return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + } +}; + +template +struct SYCLAdjustBlockOffset { + EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, + Reducer &accumulator, const Index total_size, + const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + + typedef ScanAdjustmentKernelFunctor + AdjustFuctor; + dev.template unary_kernel_launcher(in_ptr, out_ptr, scan_info.get_thread_range(), + scan_info.max_elements_per_block, + scan_info.get_scan_parameter(), accumulator); + } +}; + +template +struct ScanLauncher_impl { + template + EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, + const Index total_size, const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const bool inclusive, + const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size; + const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters::ScanPerThread / 2); + CoeffReturnType *temp_pointer = + static_cast(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + + typedef ScanKernelFunctor ScanFunctor; + dev.template binary_kernel_launcher( + in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size, + scan_info.get_scan_parameter(), accumulator, inclusive); + + if (scan_info.block_size > 1) { + ScanLauncher_impl::scan_block( + tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size, + non_scan_size, Index(1), scan_info.block_size, false, dev); + + SYCLAdjustBlockOffset::adjust_scan_block_offset( + tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, + non_scan_stride, dev); + } + dev.deallocate_temp(temp_pointer); + } +}; + +} // namespace internal +} // namespace TensorSycl + +template +struct ScanLauncher { + typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::Storage Storage; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + void operator()(Self &self, EvaluatorPointerType data) { + const Index total_size = internal::array_prod(self.dimensions()); + const Index scan_size = self.size(); + const Index scan_stride = self.stride(); + // this is the scan op (can be sum or ...) 
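ScanLauncher_impl applies the same pattern recursively: scan each block, recursively scan the buffer of block totals, then add the scanned totals back (the adjustment step). A self-contained serial version of that structure for a one-dimensional inclusive sum:

#include <algorithm>
#include <cstddef>
#include <vector>

// Inclusive sum scan of `data`, processed in blocks of `block` elements:
// 1) scan each block locally, 2) recursively scan the block totals,
// 3) add the total of all preceding blocks to every element of a block.
void block_scan(std::vector<float>& data, std::size_t block) {
  const std::size_t n = data.size();
  const std::size_t num_blocks = (n + block - 1) / block;
  std::vector<float> totals(num_blocks, 0.f);
  for (std::size_t b = 0; b < num_blocks; ++b) {        // first kernel
    float running = 0.f;
    for (std::size_t i = b * block; i < std::min(n, (b + 1) * block); ++i) {
      running += data[i];
      data[i] = running;
    }
    totals[b] = running;
  }
  if (num_blocks > 1) {
    block_scan(totals, block);                           // recursive step
    for (std::size_t b = 1; b < num_blocks; ++b) {       // adjustment step
      for (std::size_t i = b * block; i < std::min(n, (b + 1) * block); ++i)
        data[i] += totals[b - 1];
    }
  }
}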
+ auto accumulator = self.accumulator(); + auto inclusive = !self.exclusive(); + auto consume_dim = self.consume_dim(); + auto dev = self.device(); + + auto dims = self.inner().dimensions(); + + Index non_scan_size = 1; + Index panel_size = 1; + if (static_cast(Self::Layout) == static_cast(ColMajor)) { + for (int i = 0; i < consume_dim; i++) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim + 1; i < Self::NumDims; i++) { + panel_size *= dims[i]; + } + } else { + for (int i = Self::NumDims - 1; i > consume_dim; i--) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim - 1; i >= 0; i--) { + panel_size *= dims[i]; + } + } + const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size; + auto eval_impl = self.inner(); + TensorSycl::internal::ScanLauncher_impl::scan_block( + eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, + inclusive, dev); + } +}; +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h deleted file mode 100644 index 7b8bd2df7..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ /dev/null @@ -1,120 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: eigen@codeplay.com -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// General include header of SYCL target for Tensor Module -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H - -#ifdef EIGEN_USE_SYCL - -// global pointer to set different attribute state for a class -template -struct MakeGlobalPointer { - typedef typename cl::sycl::global_ptr::pointer_t Type; - typedef typename cl::sycl::global_ptr::reference_t RefType; -}; - -// global pointer to set different attribute state for a class -template -struct MakeLocalPointer { - typedef typename cl::sycl::local_ptr::pointer_t Type; - typedef typename cl::sycl::local_ptr::reference_t RefType; -}; - - -namespace Eigen { - template class TensorTupleReducerDeviceOp; - template struct TensorEvaluator, SyclKernelDevice>; -namespace internal { - -#ifdef __SYCL_DEVICE_ONLY__ -template struct TypeConversion { - template - static typename MakeGlobalPointer::Type get_address_space_pointer(typename MakeGlobalPointer::Type p); - template - static typename MakeLocalPointer::Type get_address_space_pointer(typename MakeLocalPointer::Type p); - - template - static A* get_address_space_pointer(T* p); - typedef decltype(get_address_space_pointer(B())) type; -}; - -#endif -} -namespace TensorSycl { -namespace internal { - - template struct GenericKernelReducer; -/// This struct is used for special expression nodes with no operations (for example assign and selectOP). 
- struct NoOP; - -template struct GetType{ - typedef const T Type; -}; -template struct GetType{ - typedef T Type; -}; - -template struct ValueCondition { - static constexpr size_t Res =X; -}; -template struct ValueCondition { - static constexpr size_t Res =Y; -}; - -} -} -} - -// tuple construction -#include "TensorSyclTuple.h" - -// counting number of leaf at compile time -#include "TensorSyclLeafCount.h" - -// The index PlaceHolder takes the actual expression and replaces the actual -// data on it with the place holder. It uses the same pre-order expression tree -// traverse as the leaf count in order to give the right access number to each -// node in the expression -#include "TensorSyclPlaceHolderExpr.h" - -// creation of an accessor tuple from a tuple of SYCL buffers -#include "TensorSyclExtractAccessor.h" - -// this is used to change the address space type in tensor map for GPU -#include "TensorSyclConvertToDeviceExpression.h" - -// this is used to extract the functors -#include "TensorSyclExtractFunctors.h" - -// this is used to create tensormap on the device -// this is used to construct the expression on the device -#include "TensorSyclExprConstructor.h" - -/// this is used for extracting tensor reduction -#include "TensorReductionSycl.h" - -// TensorArgMaxSycl.h -#include "TensorArgMaxSycl.h" - -/// this is used for extracting tensor convolution -#include "TensorConvolutionSycl.h" - -// kernel execution using fusion -#include "TensorSyclRun.h" -//sycl functors -#include "TensorSyclFunctors.h" - -#include "TensorContractionSycl.h" - -#endif // end of EIGEN_USE_SYCL -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h deleted file mode 100644 index d6ac7b91f..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ /dev/null @@ -1,205 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclConvertToDeviceExpression.h - * - * \brief: - * Conversion from host pointer to device pointer - * inside leaf nodes of the expression. - * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -/// \struct ConvertToDeviceExpression -/// \brief This struct is used to convert the MakePointer in the host expression -/// to the MakeGlobalPointer for the device expression. For the leafNodes -/// containing the pointer. This is due to the fact that the address space of -/// the pointer T* is different on the host and the device. -template -struct ConvertToDeviceExpression; - -template class NonOpCategory, bool IsConst, typename... 
Args> -struct NonOpConversion{ - typedef typename GetType::Type...> >::Type Type; -}; - - -template class > class NonOpCategory, bool IsConst, typename Args> -struct DeviceConvertor{ - typedef typename GetType::Type, MakeGlobalPointer> >::Type Type; -}; - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorMap -#define TENSORMAPCONVERT(CVQual)\ -template class MakePointer_>\ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorMap Type;\ -}; - -TENSORMAPCONVERT(const) -TENSORMAPCONVERT() -#undef TENSORMAPCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp -#define CATEGORYCONVERT(CVQual)\ -template class Category, typename OP, typename... subExprs>\ -struct ConvertToDeviceExpression > {\ - typedef CVQual Category::Type... > Type;\ -}; -CATEGORYCONVERT(const) -CATEGORYCONVERT() -#undef CATEGORYCONVERT - - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorCwiseSelectOp -#define SELECTOPCONVERT(CVQual, Res)\ -template \ -struct ConvertToDeviceExpression >\ -: NonOpConversion {}; -SELECTOPCONVERT(const, true) -SELECTOPCONVERT(, false) -#undef SELECTOPCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is const AssingOP -#define ASSIGNCONVERT(CVQual, Res)\ -template \ -struct ConvertToDeviceExpression >\ -: NonOpConversion{}; - -ASSIGNCONVERT(const, true) -ASSIGNCONVERT(, false) -#undef ASSIGNCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is TensorEvalToOp -#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\ -template \ -struct ConvertToDeviceExpression > \ -: DeviceConvertor{}; - - -KERNELBROKERCONVERT(const, true, TensorEvalToOp) -KERNELBROKERCONVERT(, false, TensorEvalToOp) -#undef KERNELBROKERCONVERT - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp -#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\ -template \ -struct ConvertToDeviceExpression > {\ - typedef CVQual ExprNode< typename ConvertToDeviceExpression::Type> Type;\ -}; - - -// TensorForcedEvalOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp) - -// TensorLayoutSwapOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp) - -//TensorIndexTupleOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp) -#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp -#define KERNELBROKERCONVERTREDUCTION(CVQual)\ -template class MakePointer_>\ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorReductionOp::Type, MakeGlobalPointer> Type;\ -}; - -KERNELBROKERCONVERTREDUCTION(const) -KERNELBROKERCONVERTREDUCTION() -#undef KERNELBROKERCONVERTREDUCTION - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp -#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\ -template \ -struct ConvertToDeviceExpression > {\ - typedef CVQual 
TensorTupleReducerOp::Type> Type;\ -}; - -KERNELBROKERCONVERTTUPLEREDUCTION(const) -KERNELBROKERCONVERTTUPLEREDUCTION() -#undef KERNELBROKERCONVERTTUPLEREDUCTION - -//TensorSlicingOp -#define KERNELBROKERCONVERTSLICEOP(CVQual)\ -template\ -struct ConvertToDeviceExpression >{\ - typedef CVQual TensorSlicingOp::Type> Type;\ -}; - -KERNELBROKERCONVERTSLICEOP(const) -KERNELBROKERCONVERTSLICEOP() -#undef KERNELBROKERCONVERTSLICEOP - -//TensorStridingSlicingOp -#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\ -template\ -struct ConvertToDeviceExpression >{\ - typedef CVQual TensorStridingSlicingOp::Type> Type;\ -}; - -KERNELBROKERCONVERTERSLICESTRIDEOP(const) -KERNELBROKERCONVERTERSLICESTRIDEOP() -#undef KERNELBROKERCONVERTERSLICESTRIDEOP - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp -#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ -template \ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorChippingOp::Type> Type;\ -}; -KERNELBROKERCONVERTCHIPPINGOP(const) -KERNELBROKERCONVERTCHIPPINGOP() -#undef KERNELBROKERCONVERTCHIPPINGOP - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp -#define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\ -template\ -struct ConvertToDeviceExpression >{\ - typedef CVQual TensorImagePatchOp::Type> Type;\ -}; -KERNELBROKERCONVERTIMAGEPATCHOP(const) -KERNELBROKERCONVERTIMAGEPATCHOP() -#undef KERNELBROKERCONVERTIMAGEPATCHOP - - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorVolumePatchOp -#define KERNELBROKERCONVERTVOLUMEPATCHOP(CVQual)\ -template\ -struct ConvertToDeviceExpression >{\ - typedef CVQual TensorVolumePatchOp::Type> Type;\ -}; -KERNELBROKERCONVERTVOLUMEPATCHOP(const) -KERNELBROKERCONVERTVOLUMEPATCHOP() -#undef KERNELBROKERCONVERTVOLUMEPATCHOP - -} // namespace internal -} // namespace TensorSycl -} // namespace Eigen - -#endif // UNSUPPORTED_EIGEN_CXX1 diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h deleted file mode 100644 index 67003daf5..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ /dev/null @@ -1,514 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclExprConstructor.h - * - * \brief: - * This file re-create an expression on the SYCL device in order - * to use the original tensor evaluator. 
- * -*****************************************************************/ - -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -template -struct DeviceFixedSizeTensor; - -template -struct DeviceFixedSizeTensor>{ - template - static EIGEN_ALWAYS_INLINE Expr instantiate(Data& dt) {return Expr(ConvertToActualTypeSycl(typename Expr::Scalar, dt), Indices...);} -}; -/// this class is used by EvalToOp in order to create an lhs expression which is -/// a pointer from an accessor on device-only buffer -template -struct EvalToLHSConstructor { - PtrType expr; - EvalToLHSConstructor(const utility::tuple::Tuple &t) : expr(ConvertToActualTypeSycl(typename Eigen::internal::remove_all::type, utility::tuple::get(t))) {} -}; - -/// struct ExprConstructor is used to reconstruct the expression on the device and -/// recreate the expression with MakeGlobalPointer containing the device address -/// space for the TensorMap pointers used in eval function. -/// It receives the original expression type, the functor of the node, the tuple -/// of accessors, and the device expression type to re-instantiate the -/// expression tree for the device -template -struct ExprConstructor; - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorMap -#define TENSORMAP(CVQual)\ -template class MakePointer_, size_t N, typename... Params>\ -struct ExprConstructor< CVQual TensorMap,\ -CVQual PlaceHolder, N>, Params...>{\ - typedef CVQual TensorMap Type;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ - : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())){}\ -}; - -TENSORMAP(const) -TENSORMAP() -#undef TENSORMAP - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorMap -#define TENSORMAPFIXEDSIZE(CVQual)\ -template class MakePointer_, size_t N, typename... Params>\ -struct ExprConstructor< CVQual TensorMap, Options_, MakeGlobalPointer>,\ -CVQual PlaceHolder, Options_, MakePointer_>, N>, Params...>{\ - typedef CVQual TensorMap, Options_, MakeGlobalPointer> Type;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &, const utility::tuple::Tuple &t)\ - : expr(DeviceFixedSizeTensor::instantiate(utility::tuple::get(t))){}\ -}; - -TENSORMAPFIXEDSIZE(const) -TENSORMAPFIXEDSIZE() -#undef TENSORMAPFIXEDSIZE - -#define UNARYCATEGORY(CVQual)\ -template class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\ -struct ExprConstructor, CVQual UnaryCategory, Params...> {\ - typedef ExprConstructor my_type;\ - my_type rhsExpr;\ - typedef CVQual UnaryCategory Type;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\ -}; - -UNARYCATEGORY(const) -UNARYCATEGORY() -#undef UNARYCATEGORY - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorBinaryOp -#define BINARYCATEGORY(CVQual)\ -template class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\ -typename RHSExpr, typename... 
Params>\ -struct ExprConstructor, CVQual BinaryCategory, Params...> {\ - typedef ExprConstructor my_left_type;\ - typedef ExprConstructor my_right_type;\ - typedef CVQual BinaryCategory Type;\ - my_left_type lhsExpr;\ - my_right_type rhsExpr;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\ -}; - -BINARYCATEGORY(const) -BINARYCATEGORY() -#undef BINARYCATEGORY - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorCwiseTernaryOp -#define TERNARYCATEGORY(CVQual)\ -template