From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.

* Unifying all loadLocalTile from lhs and rhs to an extract_block function.
* Adding get_tensor operation which was missing in TensorContractionMapper.
* Adding the -D flag missing from cmake for the Disable_Skinny Contraction operation.
* Wrapping all the indices in TensorScanSycl into Scan parameter struct.
* Fixing typo in Device SYCL
* Unifying load to private register for tall/skinny no shared
* Unifying load to vector tile for tensor-vector/vector-tensor operation
* Removing all the LHS/RHS classes for extracting data from global memory
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining General Tensor-Vector and VectorTensor contraction into one kernel.
* Making double buffering optional for Tensor contraction when the local memory version is used.
* Modifying benchmark to accept custom Reduction Sizes
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host
* Adding tests for SYCL
* Modifying SYCL CMake
---
 unsupported/test/CMakeLists.txt | 129 ++-
 unsupported/test/cxx11_tensor_argmax_sycl.cpp | 136 +--
 unsupported/test/cxx11_tensor_builtins_sycl.cpp | 497 ++++++----
 unsupported/test/cxx11_tensor_chipping_sycl.cpp | 7 +-
 unsupported/test/cxx11_tensor_contract_sycl.cpp | 1010 +++++++++++++++++---
 unsupported/test/cxx11_tensor_custom_op_sycl.cpp | 5 +
 unsupported/test/cxx11_tensor_forced_eval_sycl.cpp | 5 +-
 unsupported/test/cxx11_tensor_image_op_sycl.cpp | 103 ++
 unsupported/test/cxx11_tensor_math_sycl.cpp | 105 ++
 unsupported/test/cxx11_tensor_morphing_sycl.cpp | 138 +++
 unsupported/test/cxx11_tensor_random_sycl.cpp | 100 ++
 unsupported/test/cxx11_tensor_reduction_sycl.cpp | 941 ++++++++++++++++--
 unsupported/test/cxx11_tensor_reverse_sycl.cpp | 128 ++-
 unsupported/test/cxx11_tensor_scan_sycl.cpp | 141 +++
 unsupported/test/cxx11_tensor_shuffling_sycl.cpp | 52 +-
 unsupported/test/cxx11_tensor_sycl.cpp | 91 +-
 16 files changed, 3019 insertions(+), 569 deletions(-)
 create mode 100644 unsupported/test/cxx11_tensor_image_op_sycl.cpp
 create mode 100644 unsupported/test/cxx11_tensor_math_sycl.cpp
 create mode 100644 unsupported/test/cxx11_tensor_random_sycl.cpp
 create mode 100644 unsupported/test/cxx11_tensor_scan_sycl.cpp

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 3d9ac9263..9db965ad8 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -111,40 +111,113 @@ ei_add_test(special_functions)
 if(EIGEN_TEST_CXX11)
 if(EIGEN_TEST_SYCL)
+ set(EIGEN_SYCL ON)
+ # Forward CMake options as preprocessor definitions
+ if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
+ add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
+ endif()
+ if(EIGEN_SYCL_NO_LOCAL_MEM)
+ add_definitions(-DEIGEN_SYCL_NO_LOCAL_MEM=${EIGEN_SYCL_NO_LOCAL_MEM})
+ endif()
+ if(EIGEN_SYCL_LOCAL_MEM)
+ add_definitions(-DEIGEN_SYCL_LOCAL_MEM=${EIGEN_SYCL_LOCAL_MEM})
+ endif()
+ if(EIGEN_SYCL_MAX_GLOBAL_RANGE)
+ add_definitions(-DEIGEN_SYCL_MAX_GLOBAL_RANGE=${EIGEN_SYCL_MAX_GLOBAL_RANGE})
+ endif()
+ if(EIGEN_SYCL_LOCAL_THREAD_DIM0)
+
add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM0=${EIGEN_SYCL_LOCAL_THREAD_DIM0}) + endif() + if(EIGEN_SYCL_LOCAL_THREAD_DIM1) + add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM1=${EIGEN_SYCL_LOCAL_THREAD_DIM1}) + endif() + if(EIGEN_SYCL_REG_M) + add_definitions(-DEIGEN_SYCL_REG_M=${EIGEN_SYCL_REG_M}) + endif() + if(EIGEN_SYCL_REG_N) + add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N}) + endif() + if(EIGEN_SYCL_USE_PROGRAM_CLASS) + add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS}) + endif() + if(EIGEN_SYCL_ASYNC_EXECUTION) + add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION}) + endif() + if(EIGEN_SYCL_DISABLE_SKINNY) + add_definitions(-DEIGEN_SYCL_DISABLE_SKINNY=${EIGEN_SYCL_DISABLE_SKINNY}) + endif() + if(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER) + add_definitions(-DEIGEN_SYCL_DISABLE_DOUBLE_BUFFER=${EIGEN_SYCL_DISABLE_DOUBLE_BUFFER}) + endif() + if(EIGEN_SYCL_DISABLE_RANK1) + add_definitions(-DEIGEN_SYCL_DISABLE_RANK1=${EIGEN_SYCL_DISABLE_RANK1}) + endif() + if(EIGEN_SYCL_DISABLE_SCALAR) + add_definitions(-DEIGEN_SYCL_DISABLE_SCALAR=${EIGEN_SYCL_DISABLE_SCALAR}) + endif() + if(EIGEN_SYCL_DISABLE_GEMV) + add_definitions(-DEIGEN_SYCL_DISABLE_GEMV=${EIGEN_SYCL_DISABLE_GEMV}) + endif() + if(EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION) + add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION}) + endif() + if(EIGEN_SYCL_TRISYCL) set(CMAKE_CXX_STANDARD 14) set(STD_CXX_FLAG "-std=c++1z") else() - # It should be safe to always run these tests as there is some fallback code for - # older compiler that don't support cxx11. - # This is already set if EIGEN_TEST_CXX11 is enabled: - # set(CMAKE_CXX_STANDARD 11) - # set(STD_CXX_FLAG "-std=c++11") + if(MSVC) + # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11 + # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers. + set(CMAKE_CXX_STANDARD 14) + list(APPEND COMPUTECPP_USER_FLAGS -DWIN32) + else() + set(CMAKE_CXX_STANDARD 11) + list(APPEND COMPUTECPP_USER_FLAGS -Wall) + endif() + # The following flags are not supported by Clang and can cause warnings + # if used with -Werror so they are removed here. 
+ if(COMPUTECPP_USE_COMPILER_DRIVER) + set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}) + string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + endif() + list(APPEND COMPUTECPP_USER_FLAGS + -DEIGEN_NO_ASSERTION_CHECKING=1 + -no-serial-memop + -Xclang + -cl-mad-enable) endif() - ei_add_test_sycl(cxx11_tensor_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_device_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_padding_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_contract_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_striding_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_generator_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_patch_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_image_op_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_math_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_device_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_padding_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_contract_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_striding_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_random_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_generator_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_patch_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_scan_sycl ${STD_CXX_FLAG}) + set(EIGEN_SYCL OFF) endif() ei_add_test(cxx11_eventcount "-pthread" 
"${CMAKE_THREAD_LIBS_INIT}") diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp index 0bbb0f6dc..41ea3cf7b 100644 --- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -18,6 +18,7 @@ #define EIGEN_USE_SYCL #include "main.h" + #include using Eigen::array; @@ -26,9 +27,8 @@ using Eigen::Tensor; using Eigen::TensorMap; template -static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ - - Tensor in(Eigen::array{{2,2,2}}); +static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) { + Tensor in(Eigen::array{{2, 2, 2}}); Tensor out_max; Tensor out_min; in.setRandom(); @@ -39,14 +39,15 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ std::size_t in_bytes = in.size() * sizeof(DataType); std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); - DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); + DataType* d_in = static_cast(sycl_device.allocate(in_bytes)); DenseIndex* d_out_max = static_cast(sycl_device.allocate(out_bytes)); DenseIndex* d_out_min = static_cast(sycl_device.allocate(out_bytes)); - Eigen::TensorMap > gpu_in(d_in, Eigen::array{{2,2,2}}); + Eigen::TensorMap > gpu_in(d_in, + Eigen::array{{2, 2, 2}}); Eigen::TensorMap > gpu_out_max(d_out_max); Eigen::TensorMap > gpu_out_min(d_out_min); - sycl_device.memcpyHostToDevice(d_in, in.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, in.data(), in_bytes); gpu_out_max.device(sycl_device) = gpu_in.argmax(); gpu_out_min.device(sycl_device) = gpu_in.argmin(); @@ -54,7 +55,7 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes); sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes); - VERIFY_IS_EQUAL(out_max(), 2*2*2 - 1); + VERIFY_IS_EQUAL(out_max(), 2 * 2 * 2 - 1); VERIFY_IS_EQUAL(out_min(), 0); sycl_device.deallocate(d_in); @@ -62,22 +63,22 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ sycl_device.deallocate(d_out_min); } - template -static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) -{ - DenseIndex sizeDim0=9; - DenseIndex sizeDim1=3; - DenseIndex sizeDim2=5; - DenseIndex sizeDim3=7; - Tensor tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); +static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) { + DenseIndex sizeDim0 = 9; + DenseIndex sizeDim1 = 3; + DenseIndex sizeDim2 = 5; + DenseIndex sizeDim3 = 7; + Tensor tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3); std::vector dims; - dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + dims.push_back(sizeDim0); + dims.push_back(sizeDim1); + dims.push_back(sizeDim2); + dims.push_back(sizeDim3); for (DenseIndex dim = 0; dim < 4; ++dim) { - array out_shape; - for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d + 1]; Tensor tensor_arg(out_shape); @@ -86,9 +87,13 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 - tensor(ix)=(ix[dim] != 0)?-1.0:10.0; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) + // = 10.0 + tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0; } } } @@ -97,23 +102,23 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) std::size_t in_bytes = tensor.size() * sizeof(DataType); std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + DataType* d_in = static_cast(sycl_device.allocate(in_bytes)); + DenseIndex* d_out = static_cast(sycl_device.allocate(out_bytes)); - DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); - DenseIndex* d_out= static_cast(sycl_device.allocate(out_bytes)); - - Eigen::TensorMap > gpu_in(d_in, Eigen::array{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap > gpu_in( + d_in, Eigen::array{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}}); Eigen::TensorMap > gpu_out(d_out, out_shape); - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmax(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); VERIFY_IS_EQUAL(static_cast(tensor_arg.size()), - size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim))); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the first index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); } sycl_device.synchronize(); @@ -122,15 +127,18 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 - tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?-1.0:20.0; + tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 
-1.0 : 20.0; } } } } - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmax(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); @@ -144,20 +152,21 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) } template -static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) -{ - DenseIndex sizeDim0=9; - DenseIndex sizeDim1=3; - DenseIndex sizeDim2=5; - DenseIndex sizeDim3=7; - Tensor tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); +static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) { + DenseIndex sizeDim0 = 9; + DenseIndex sizeDim1 = 3; + DenseIndex sizeDim2 = 5; + DenseIndex sizeDim3 = 7; + Tensor tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3); std::vector dims; - dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + dims.push_back(sizeDim0); + dims.push_back(sizeDim1); + dims.push_back(sizeDim2); + dims.push_back(sizeDim3); for (DenseIndex dim = 0; dim < 4; ++dim) { - array out_shape; - for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1]; Tensor tensor_arg(out_shape); @@ -166,9 +175,12 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 - tensor(ix)=(ix[dim] != 0)?1.0:-10.0; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0 + tensor(ix) = (ix[dim] != 0) ? 
1.0 : -10.0; } } } @@ -177,23 +189,23 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) std::size_t in_bytes = tensor.size() * sizeof(DataType); std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + DataType* d_in = static_cast(sycl_device.allocate(in_bytes)); + DenseIndex* d_out = static_cast(sycl_device.allocate(out_bytes)); - DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); - DenseIndex* d_out= static_cast(sycl_device.allocate(out_bytes)); - - Eigen::TensorMap > gpu_in(d_in, Eigen::array{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap > gpu_in( + d_in, Eigen::array{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}}); Eigen::TensorMap > gpu_out(d_out, out_shape); - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmin(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); VERIFY_IS_EQUAL(static_cast(tensor_arg.size()), - size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim))); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the first index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); } sycl_device.synchronize(); @@ -202,15 +214,18 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 - tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?1.0:-20.0; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0 + tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 
1.0 : -20.0; } } } } - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmin(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); @@ -223,10 +238,8 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) } } - - - -template void sycl_argmax_test_per_device(const Device_Selector& d){ +template +void sycl_argmax_test_per_device(const Device_Selector& d) { QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); test_sycl_simple_argmax(sycl_device); @@ -238,8 +251,7 @@ template void sycl_argmax_test_per_ } EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_argmax_test_per_device(device)); + for (const auto& device : Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_argmax_test_per_device(device)); } - } diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp index db2975783..72cb62fd5 100644 --- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -25,243 +25,330 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -namespace std { -template T rsqrt(T x) { return 1 / std::sqrt(x); } +// Functions used to compare the TensorMap implementation on the device with +// the equivalent on the host +namespace cl { +namespace sycl { +template T abs(T x) { return cl::sycl::fabs(x); } template T square(T x) { return x * x; } template T cube(T x) { return x * x * x; } -template T inverse(T x) { return 1 / x; } +template T inverse(T x) { return T(1) / x; } +template T cwiseMax(T x, T y) { return cl::sycl::max(x, y); } +template T cwiseMin(T x, T y) { return cl::sycl::min(x, y); } } +} + +struct EqualAssignement { + template + void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; } +}; + +struct PlusEqualAssignement { + template + void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; } +}; -#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \ - { \ - /* out OPERATOR in.FUNC() */ \ - Tensor in(tensorRange); \ - Tensor out(tensorRange); \ - in = in.random() + static_cast(0.01); \ - out = out.random() + static_cast(0.01); \ - Tensor reference(out); \ - SCALAR *gpu_data = static_cast( \ - sycl_device.allocate(in.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu(gpu_data, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ - (in.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ - (out.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver OPERATOR std::FUNC(in(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data); \ - sycl_device.deallocate(gpu_data_out); \ - } \ - { \ - /* out OPERATOR out.FUNC() */ \ - Tensor out(tensorRange); \ - out = out.random() + static_cast(0.01); \ - Tensor reference(out); \ - SCALAR *gpu_data_out = static_cast( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - 
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ - (out.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver OPERATOR std::FUNC(reference(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data_out); \ +template +void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + Operator op; + Assignement asgn; + { + /* Assignement(out, Operator(in)) */ + Tensor in(tensor_range); + Tensor out(tensor_range); + in = in.random() + DataType(0.01); + out = out.random() + DataType(0.01); + Tensor reference(out); + DataType *gpu_data = static_cast( + sycl_device.allocate(in.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap> gpu(gpu_data, tensor_range); + TensorMap> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data, in.data(), + (in.size()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), + (out.size()) * sizeof(DataType)); + auto device_expr = gpu_out.device(sycl_device); + asgn(device_expr, op(gpu)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + DataType ver = reference(i); + asgn(ver, op(in(i))); + VERIFY_IS_APPROX(out(i), ver); + } + sycl_device.deallocate(gpu_data); + sycl_device.deallocate(gpu_data_out); } + { + /* Assignement(out, Operator(out)) */ + Tensor out(tensor_range); + out = out.random() + DataType(0.01); + Tensor reference(out); + DataType *gpu_data_out = static_cast( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), + (out.size()) * sizeof(DataType)); + auto device_expr = gpu_out.device(sycl_device); + asgn(device_expr, op(gpu_out)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + DataType ver = reference(i); + asgn(ver, op(reference(i))); + VERIFY_IS_APPROX(out(i), ver); + } + sycl_device.deallocate(gpu_data_out); + } +} + +#define DECLARE_UNARY_STRUCT(FUNC) \ + struct op_##FUNC { \ + template \ + auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \ + return cl::sycl::FUNC(x); \ + } \ + template \ + auto operator()(const TensorMap& x) -> decltype(x.FUNC()) { \ + return x.FUNC(); \ + } \ + }; -#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, 
OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout) +DECLARE_UNARY_STRUCT(abs) +DECLARE_UNARY_STRUCT(sqrt) +DECLARE_UNARY_STRUCT(rsqrt) +DECLARE_UNARY_STRUCT(square) +DECLARE_UNARY_STRUCT(cube) +DECLARE_UNARY_STRUCT(inverse) +DECLARE_UNARY_STRUCT(tanh) +DECLARE_UNARY_STRUCT(exp) +DECLARE_UNARY_STRUCT(expm1) +DECLARE_UNARY_STRUCT(log) +DECLARE_UNARY_STRUCT(ceil) +DECLARE_UNARY_STRUCT(floor) +DECLARE_UNARY_STRUCT(round) +DECLARE_UNARY_STRUCT(log1p) +DECLARE_UNARY_STRUCT(sign) +DECLARE_UNARY_STRUCT(isnan) +DECLARE_UNARY_STRUCT(isfinite) +DECLARE_UNARY_STRUCT(isinf) -#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \ - { \ - /* out = in.FUNC() */ \ - Tensor in(tensorRange); \ - Tensor out(tensorRange); \ - in = in.random() + static_cast(0.01); \ - SCALAR *gpu_data = static_cast( \ - sycl_device.allocate(in.size() * sizeof(SCALAR))); \ - bool *gpu_data_out = \ - static_cast(sycl_device.allocate(out.size() * sizeof(bool))); \ - TensorMap> gpu(gpu_data, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ - (in.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(bool)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \ - } \ - sycl_device.deallocate(gpu_data); \ - sycl_device.deallocate(gpu_data_out); \ +template +void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { +#define RUN_UNARY_TEST(FUNC) \ + test_unary_builtins_for_scalar(sycl_device, tensor_range) + RUN_UNARY_TEST(abs); + RUN_UNARY_TEST(sqrt); + RUN_UNARY_TEST(rsqrt); + RUN_UNARY_TEST(square); + RUN_UNARY_TEST(cube); + RUN_UNARY_TEST(inverse); + RUN_UNARY_TEST(tanh); + RUN_UNARY_TEST(exp); + RUN_UNARY_TEST(expm1); + RUN_UNARY_TEST(log); + RUN_UNARY_TEST(ceil); + RUN_UNARY_TEST(floor); + RUN_UNARY_TEST(round); + RUN_UNARY_TEST(log1p); + RUN_UNARY_TEST(sign); +} + +template +void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + /* out = op(in) */ + Operator op; + Tensor in(tensor_range); + Tensor out(tensor_range); + in = in.random() + DataType(0.01); + DataType *gpu_data = static_cast( + sycl_device.allocate(in.size() * sizeof(DataType))); + bool *gpu_data_out = + static_cast(sycl_device.allocate(out.size() * sizeof(bool))); + TensorMap> gpu(gpu_data, tensor_range); + TensorMap> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data, in.data(), + (in.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(bool)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_EQUAL(out(i), op(in(i))); } + sycl_device.deallocate(gpu_data); + sycl_device.deallocate(gpu_data_out); +} -#define TEST_UNARY_BUILTINS(SCALAR, Layout) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout) +template +void test_unary_builtins(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + 
test_unary_builtins_for_assignement(sycl_device, tensor_range); + test_unary_builtins_for_assignement(sycl_device, tensor_range); + test_unary_builtins_return_bool(sycl_device, tensor_range); + test_unary_builtins_return_bool(sycl_device, tensor_range); + test_unary_builtins_return_bool(sycl_device, tensor_range); +} +template static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { int64_t sizeDim1 = 10; int64_t sizeDim2 = 10; int64_t sizeDim3 = 10; - array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_UNARY_BUILTINS(float, RowMajor) - TEST_UNARY_BUILTINS(float, ColMajor) + test_unary_builtins(sycl_device, tensor_range); + test_unary_builtins(sycl_device, tensor_range); } -namespace std { -template T cwiseMax(T x, T y) { return std::max(x, y); } -template T cwiseMin(T x, T y) { return std::min(x, y); } +template +void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + /* out = op(in_1, in_2) */ + Operator op; + Tensor in_1(tensor_range); + Tensor in_2(tensor_range); + Tensor out(tensor_range); + in_1 = in_1.random() + DataType(0.01); + in_2 = in_2.random() + DataType(0.01); + Tensor reference(out); + DataType *gpu_data_1 = static_cast( + sycl_device.allocate(in_1.size() * sizeof(DataType))); + DataType *gpu_data_2 = static_cast( + sycl_device.allocate(in_2.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap> gpu_1(gpu_data_1, tensor_range); + TensorMap> gpu_2(gpu_data_2, tensor_range); + TensorMap> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), + (in_1.size()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), + (in_2.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu_1, gpu_2); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i))); + } + sycl_device.deallocate(gpu_data_1); + sycl_device.deallocate(gpu_data_2); + sycl_device.deallocate(gpu_data_out); } -#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \ - { \ - /* out = in_1.FUNC(in_2) */ \ - Tensor in_1(tensorRange); \ - Tensor in_2(tensorRange); \ - Tensor out(tensorRange); \ - in_1 = in_1.random() + static_cast(0.01); \ - in_2 = in_2.random() + static_cast(0.01); \ - Tensor reference(out); \ - SCALAR *gpu_data_1 = static_cast( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_2 = static_cast( \ - sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_1(gpu_data_1, tensorRange); \ - TensorMap> gpu_2(gpu_data_2, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ - (in_2.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver = std::FUNC(in_1(i), in_2(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_2); \ - 
sycl_device.deallocate(gpu_data_out); \ +template +void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + /* out = op(in_1, 2) */ + Operator op; + const DataType arg2(2); + Tensor in_1(tensor_range); + Tensor out(tensor_range); + in_1 = in_1.random(); + Tensor reference(out); + DataType *gpu_data_1 = static_cast( + sycl_device.allocate(in_1.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap> gpu_1(gpu_data_1, tensor_range); + TensorMap> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), + (in_1.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu_1, arg2); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_APPROX(out(i), op(in_1(i), arg2)); } + sycl_device.deallocate(gpu_data_1); + sycl_device.deallocate(gpu_data_out); +} -#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \ - { \ - /* out = in_1 OPERATOR in_2 */ \ - Tensor in_1(tensorRange); \ - Tensor in_2(tensorRange); \ - Tensor out(tensorRange); \ - in_1 = in_1.random() + static_cast(0.01); \ - in_2 = in_2.random() + static_cast(0.01); \ - Tensor reference(out); \ - SCALAR *gpu_data_1 = static_cast( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_2 = static_cast( \ - sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_1(gpu_data_1, tensorRange); \ - TensorMap> gpu_2(gpu_data_2, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ - (in_2.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_2); \ - sycl_device.deallocate(gpu_data_out); \ - } +#define DECLARE_BINARY_STRUCT(FUNC) \ + struct op_##FUNC { \ + template \ + auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) { \ + return cl::sycl::FUNC(x, y); \ + } \ + template \ + auto operator()(const TensorMap& x, const TensorMap& y) -> decltype(x.FUNC(y)) { \ + return x.FUNC(y); \ + } \ + }; -#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \ - { \ - /* out = in_1 OPERATOR 2 */ \ - Tensor in_1(tensorRange); \ - Tensor out(tensorRange); \ - in_1 = in_1.random() + static_cast(0.01); \ - Tensor reference(out); \ - SCALAR *gpu_data_1 = static_cast( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_1(gpu_data_1, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_APPROX(out(i), 
in_1(i) OPERATOR 2); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_out); \ - } +DECLARE_BINARY_STRUCT(cwiseMax) +DECLARE_BINARY_STRUCT(cwiseMin) -#define TEST_BINARY_BUILTINS(SCALAR, Layout) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout) +#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \ + struct op_##NAME { \ + template \ + auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \ + return x OPERATOR y; \ + } \ + }; + +DECLARE_BINARY_STRUCT_OP(plus, +) +DECLARE_BINARY_STRUCT_OP(minus, -) +DECLARE_BINARY_STRUCT_OP(times, *) +DECLARE_BINARY_STRUCT_OP(divide, /) +DECLARE_BINARY_STRUCT_OP(modulo, %) + +template +void test_binary_builtins(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + test_binary_builtins_func(sycl_device, tensor_range); + test_binary_builtins_func(sycl_device, tensor_range); + test_binary_builtins_func(sycl_device, tensor_range); + test_binary_builtins_func(sycl_device, tensor_range); + test_binary_builtins_func(sycl_device, tensor_range); + test_binary_builtins_func(sycl_device, tensor_range); +} + +template +static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + test_binary_builtins(sycl_device, tensor_range); + test_binary_builtins(sycl_device, tensor_range); +} -static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { +template +static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { int64_t sizeDim1 = 10; int64_t sizeDim2 = 10; int64_t sizeDim3 = 10; - array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_BINARY_BUILTINS(float, RowMajor) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor) - TEST_BINARY_BUILTINS(float, ColMajor) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor) + array tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + test_binary_builtins_fixed_arg2(sycl_device, tensor_range); + test_binary_builtins_fixed_arg2(sycl_device, tensor_range); } EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) { for (const auto& device :Eigen::get_sycl_supported_devices()) { QueueInterface queueInterface(device); Eigen::SyclDevice sycl_device(&queueInterface); - CALL_SUBTEST(test_builtin_unary_sycl(sycl_device)); - CALL_SUBTEST(test_builtin_binary_sycl(sycl_device)); + CALL_SUBTEST_1(test_builtin_unary_sycl(sycl_device)); + CALL_SUBTEST_2(test_floating_builtin_binary_sycl(sycl_device)); + CALL_SUBTEST_3(test_integer_builtin_binary_sycl(sycl_device)); } } diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp index a91efe00c..1e7093104 100644 --- a/unsupported/test/cxx11_tensor_chipping_sycl.cpp +++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp @@ -419,6 +419,7 @@ static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) const size_t tensorBuffSize =tensor.size()*sizeof(DataType); const size_t input2TensorBuffSize =input2.size()*sizeof(DataType); + std::cout << tensorBuffSize << " , "<< input2TensorBuffSize << std::endl; DataType* gpu_data_tensor = 
static_cast(sycl_device.allocate(tensorBuffSize)); DataType* gpu_data_input1 = static_cast(sycl_device.allocate(tensorBuffSize)); DataType* gpu_data_input2 = static_cast(sycl_device.allocate(input2TensorBuffSize)); @@ -605,14 +606,14 @@ static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) template void sycl_chipping_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_static_chip_sycl(sycl_device); + /* test_static_chip_sycl(sycl_device); test_static_chip_sycl(sycl_device); test_dynamic_chip_sycl(sycl_device); test_dynamic_chip_sycl(sycl_device); test_chip_in_expr(sycl_device); - test_chip_in_expr(sycl_device); + test_chip_in_expr(sycl_device);*/ test_chip_as_lvalue_sycl(sycl_device); - test_chip_as_lvalue_sycl(sycl_device); + // test_chip_as_lvalue_sycl(sycl_device); } EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl) { diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp index c8e86e69f..fbcc29358 100644 --- a/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -17,23 +17,27 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL -#include +#include #include #include +#include #include "main.h" + #include using Eigen::array; using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template -void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) -{ - typedef typename Tensor::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; -// std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + +template +void static test_sycl_contraction(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU @@ -41,7 +45,6 @@ void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, I Tensor t_right(k_size, n_size); Tensor t_result(m_size, n_size); Tensor t_result_gpu(m_size, n_size); -// Eigen::array dims(DimPair(1, 0)); Eigen::array dims = {{DimPair(1, 0)}}; Eigen::array left_dims = {{m_size, k_size}}; Eigen::array right_dims = {{k_size, n_size}}; @@ -50,117 +53,217 @@ void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, I t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); std::size_t t_right_bytes = t_right.size() * sizeof(DataType); std::size_t t_result_bytes = t_result.size() * sizeof(DataType); - DataType * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); - Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap > gpu_t_result(d_t_result, result_dims); 
+ Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result, result_dims); - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); t_result = t_left.contract(t_right, dims); for (IndexType i = 0; i < t_result.size(); i++) { - if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { + if (static_cast(std::fabs(static_cast( + t_result(i) - t_result_gpu(i)))) < error_threshold) { continue; } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { continue; } - std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) - << " vs " << t_result_gpu(i) << std::endl; - assert(false); + + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } sycl_device.deallocate(d_t_left); sycl_device.deallocate(d_t_right); sycl_device.deallocate(d_t_result); } -template -void test_TF(const Device& sycl_device) -{ - typedef typename Tensor::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; - Eigen::array left_dims = {{2, 3}}; - Eigen::array right_dims = {{3, 1}}; - Eigen::array res_dims = {{2, 1}}; - Eigen::array dims = {{DimPair(1, 0)}}; +template +void test_sycl_contraction_m(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction(sycl_device, k, 128, + 128); + } +} +template +void test_sycl_contraction_k(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction(sycl_device, 128, k, + 128); + } +} - Tensor t_left(left_dims); - Tensor t_right(right_dims); - Tensor t_result_gpu(res_dims); - Tensor t_result(res_dims); +template +void test_sycl_contraction_n(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction(sycl_device, 128, + 128, k); + } +} - t_left.data()[0] = 1.0f; - t_left.data()[1] = 2.0f; - t_left.data()[2] = 3.0f; - t_left.data()[3] = 4.0f; - t_left.data()[4] = 5.0f; - t_left.data()[5] = 6.0f; +template +void test_sycl_contraction_sizes(const Device &sycl_device) { + IndexType m_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, + 257, 511, 512, 513, 1023, 1024, 1025}; - t_right.data()[0] = -1.0f; - t_right.data()[1] = 0.5f; - t_right.data()[2] = 2.0f; + IndexType n_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, + 257, 511, 512, 513, 1023, 1024, 1025}; - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); - std::size_t t_right_bytes = t_right.size() * sizeof(DataType); - std::size_t t_result_bytes = t_result.size()*sizeof(DataType); + IndexType k_sizes[] = {31, 39, 63, 64, 65, 95, 96, 127, 129, + 255, 257, 511, 512, 513, 1023, 1024, 1025}; + + for (IndexType i = 0; i < 15; i++) { + for (IndexType j = 0; j < 15; j++) { + for (IndexType k = 0; k < 17; k++) { + 
test_sycl_contraction( + sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); + } + } + } +} + +template +void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Tensor t_left(m_size, k_size); + Tensor t_right(k_size, n_size); + Tensor t_result(m_size, n_size); + + Eigen::array dims = {{DimPair(1, 0)}}; + Eigen::array left_dims = {{m_size, k_size}}; + Eigen::array right_dims = {{k_size, n_size}}; + Eigen::array result_dims = {{m_size, n_size}}; + t_left.setRandom(); + t_right.setRandom(); - DataType * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); + // Allocate buffers twice as big to check for invalid read and write + auto padded_left_size = 2 * t_left.size(); + auto padded_right_size = 2 * t_right.size(); + auto padded_result_size = 2 * t_result.size(); - Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap > gpu_t_result(d_t_result, res_dims); + std::size_t t_left_bytes = padded_left_size * sizeof(DataType); + std::size_t t_right_bytes = padded_right_size * sizeof(DataType); + std::size_t t_result_bytes = padded_result_size * sizeof(DataType); - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); + + // TensorMaps are still of the same size than the Tensors + Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result, result_dims); + + // Write nan after the actual buffer to propagate nans everywhere in case of + // invalid reads + DataType nan = std::numeric_limits::quiet_NaN(); + auto host_left_data = new DataType[padded_left_size]; + std::copy_n(t_left.data(), t_left.size(), host_left_data); + std::fill_n(host_left_data + t_left.size(), t_left.size(), nan); + auto host_right_data = new DataType[padded_right_size]; + std::copy_n(t_right.data(), t_right.size(), host_right_data); + std::fill_n(host_right_data + t_right.size(), t_right.size(), nan); + auto host_result_data = new DataType[padded_result_size]; + std::fill_n(host_result_data, padded_result_size, nan); + + sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes); + sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes); t_result = t_left.contract(t_right, dims); for (IndexType i = 0; i < t_result.size(); i++) { - if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { + if (static_cast(std::fabs(static_cast( + t_result(i) - host_result_data[i]))) < error_threshold) { continue; } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 
error_threshold)) { + if (Eigen::internal::isApprox(t_result(i), host_result_data[i], + error_threshold)) { continue; } - std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) - << " vs " << t_result_gpu(i) << std::endl; - assert(false); + if (std::isnan(host_result_data[i])) { + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", invalid read detected at IndexType " << i << ": " + << t_result(i) << " vs " << host_result_data[i] << std::endl; + } else { + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " + << t_result(i) << " vs " << host_result_data[i] << std::endl; + } + VERIFY_IS_APPROX(host_result_data[i], t_result(i)); + } + // Make sure that the rest of the result is still nans + for (IndexType i = t_result.size(); i < padded_result_size; i++) { + if (std::isnan(host_result_data[i])) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", invalid write detected at IndexType " << i << ": " + << host_result_data[i] << std::endl; + VERIFY_IS_APPROX(host_result_data[i], t_result(i)); } sycl_device.deallocate(d_t_left); sycl_device.deallocate(d_t_right); sycl_device.deallocate(d_t_result); - + delete[] host_left_data; + delete[] host_right_data; + delete[] host_result_data; } -template -void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) -{ - //std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; +template +void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size, + IndexType n_size) { + // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << + // ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - typedef typename Tensor::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); Tensor t_left(m_size, k_size); Tensor t_right(k_size, n_size); Tensor t_result; @@ -171,32 +274,40 @@ void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); std::size_t t_right_bytes = t_right.size() * sizeof(DataType); std::size_t t_result_bytes = sizeof(DataType); + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); - DataType * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); + Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result); - Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap > gpu_t_result(d_t_result); - - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); + 
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); t_result = t_left.contract(t_right, dims); - if (static_cast(fabs(t_result() - t_result_gpu())) > error_threshold && + if (static_cast(std::fabs(static_cast( + t_result() - t_result_gpu()))) > error_threshold && !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { - std::cout << "mismatch detected: " << t_result() - << " vs " << t_result_gpu() << std::endl; - assert(false); + std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size + << " : mismatch detected: " << t_result() << " vs " + << t_result_gpu() << std::endl; + VERIFY_IS_APPROX(t_result_gpu(), t_result()); } sycl_device.deallocate(d_t_left); @@ -204,87 +315,712 @@ void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, sycl_device.deallocate(d_t_result); } +template +void contraction_batch(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size, IndexType m_batch, + IndexType start, IndexType limit) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + typedef Eigen::array TensorDim; + typedef Eigen::Tensor TensorType; + TensorDim left_dims = {{m_batch, k_size, m_size}}; + TensorDim right_dims = {{m_batch, n_size, k_size}}; + TensorDim res_dims = {{m_batch, m_size, n_size}}; + Eigen::array contract_pairs = {{DimPair(0, 1)}}; -template -void test_sycl_contraction_m(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction(sycl_device, k, 128, 128); + TensorType t_left(left_dims); + TensorType t_right(right_dims); + TensorType t_result_gpu(res_dims); + TensorType t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + for (int i = start; i < limit; ++i) { + auto x = gpu_t_left.template chip<0>(i); + auto y = gpu_t_right.template chip<0>(i); + auto z = gpu_t_result.template chip<0>(i); + z.device(sycl_device) = x.contract(y, contract_pairs); + } + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + for (int i = start; i < limit; ++i) { + auto x = t_left.template chip<0>(i); + auto y = t_right.template chip<0>(i); + auto z = t_result.template chip<0>(i); + z = x.contract(y, contract_pairs); + } + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast(std::fabs(static_cast( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if 
(Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); } -template -void test_sycl_contraction_k(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction(sycl_device, 128, k, 128); +template +void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array left_dims = {{m_size, k_size}}; + Eigen::array right_dims = {{n_size, k_size}}; + Eigen::array res_dims = {{m_size, n_size}}; + Eigen::array dims = {{DimPair(1, 1)}}; + + Tensor t_left(left_dims); + Tensor t_right(right_dims); + Tensor t_result_gpu(res_dims); + Tensor t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType j = 0; j < m_size; j++) { + for (IndexType i = 0; i < n_size; i++) { + if (static_cast(std::fabs(static_cast( + t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType m: " << j << " n: " << i + << " CPU : " << t_result(j, i) + << " vs SYCL:" << t_result_gpu(j, i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i)); + } } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); } -template -void test_sycl_contraction_n(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction(sycl_device, 128, 128, k); +template +void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array left_dims = {{k_size, m_size}}; + Eigen::array right_dims = {{k_size, n_size}}; + Eigen::array res_dims = {{m_size, n_size}}; + Eigen::array dims = {{DimPair(0, 0)}}; + + Tensor t_left(left_dims); + Tensor t_right(right_dims); + Tensor t_result_gpu(res_dims); + Tensor t_result(res_dims); + + t_left.setRandom(); + 
t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast(std::fabs(static_cast( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); } +template +void contraction_both_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array left_dims = {{k_size, m_size}}; + Eigen::array right_dims = {{n_size, k_size}}; + Eigen::array res_dims = {{m_size, n_size}}; + Eigen::array dims = {{DimPair(0, 1)}}; -template -void test_sycl_contraction_sizes(const Device& sycl_device) { - IndexType m_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257 , 511, - 512, 513, 1023, 1024, 1025}; + Tensor t_left(left_dims); + Tensor t_right(right_dims); + Tensor t_result_gpu(res_dims); + Tensor t_result(res_dims); - IndexType n_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; + t_left.setRandom(); + t_right.setRandom(); - IndexType k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); - for (IndexType i = 0; i < 15; i++) { - for (IndexType j = 0; j < 15; j++) { - for (IndexType k = 0; k < 17; k++) { - test_sycl_contraction(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); - } + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + 
gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast(std::fabs(static_cast( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template +void inline tensorOutofBound(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Test out of bound for Tensor-Tensor + test_no_out_of_bounds(sycl_device, 10, 1024, + 1024); + test_no_out_of_bounds(sycl_device, 1024, 1024, + 4096); + test_no_out_of_bounds(sycl_device, 4096, 1024, + 2048); + test_no_out_of_bounds(sycl_device, 784, 2048, + 1024); + test_no_out_of_bounds(sycl_device, 2048, 1024, + 784); + test_no_out_of_bounds(sycl_device, 10, 1024, + 10); + test_no_out_of_bounds(sycl_device, 513, 4096, + 513); + test_no_out_of_bounds(sycl_device, 783, 1024, + 783); + test_no_out_of_bounds(sycl_device, 784, 2048, + 784); + test_no_out_of_bounds(sycl_device, 11, 1024, + 11); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor out of bound tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensorTensor(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction(sycl_device, 128, 128, + 128); + test_sycl_contraction(sycl_device, 128, 128, + 128); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensorTensor_m(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_m(sycl_device); + test_sycl_contraction_m(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensorTensor_n(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_n(sycl_device); + 
test_sycl_contraction_n(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; } -template void tensorContractionPerDevice(Dev_selector& s){ - QueueInterface queueInterface(s); - auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_sycl_contraction(sycl_device, 32, 32, 32); - test_sycl_contraction(sycl_device, 32, 32, 32); - test_scalar(sycl_device, 32, 32, 32); - test_scalar(sycl_device, 32, 32, 32); +template +void inline tensorTensor_k(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; std::chrono::time_point start, end; start = std::chrono::system_clock::now(); - test_sycl_contraction(sycl_device, 128, 128, 128); - test_sycl_contraction(sycl_device, 128, 128, 128); - test_scalar(sycl_device, 128, 128, 128); - test_scalar(sycl_device, 128, 128, 128); - test_sycl_contraction_m(sycl_device); - test_sycl_contraction_m(sycl_device); - test_sycl_contraction_n(sycl_device); - test_sycl_contraction_n(sycl_device); - test_sycl_contraction_k(sycl_device); - test_sycl_contraction_k(sycl_device); - test_sycl_contraction_sizes(sycl_device); - test_sycl_contraction_sizes(sycl_device); - test_TF(sycl_device); - test_TF(sycl_device); + test_sycl_contraction_k(sycl_device); + test_sycl_contraction_k(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensorTensor_sizes(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_sizes(sycl_device); + test_sycl_contraction_sizes(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} +template +void inline vectorVector(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // VECTOR-VECTOR + test_sycl_contraction(sycl_device, 1025, 1, + 1025); + test_sycl_contraction(sycl_device, 1025, 1, + 1025); + test_sycl_contraction(sycl_device, 1024, 1, + 1024); + test_sycl_contraction(sycl_device, 1024, 1, + 1024); + test_sycl_contraction(sycl_device, 1023, 1, + 1023); + test_sycl_contraction(sycl_device, 1023, 1, + 1023); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "contracted tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline vectorTensor(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = 
std::chrono::system_clock::now();
+  // Vector-Tensor
+  test_sycl_contraction(sycl_device, 1, 1025,
+                        1025);
+  test_sycl_contraction(sycl_device, 1, 1025,
+                        1025);
+  test_sycl_contraction(sycl_device, 1, 1024,
+                        1024);
+  test_sycl_contraction(sycl_device, 1, 1024,
+                        1024);
+  test_sycl_contraction(sycl_device, 1, 1023,
+                        1023);
+  test_sycl_contraction(sycl_device, 1, 1023,
+                        1023);
+
+  test_sycl_contraction(sycl_device, 1, 4097,
+                        4097);
+  test_sycl_contraction(sycl_device, 1, 4097,
+                        4097);
+  test_sycl_contraction(sycl_device, 1, 4096,
+                        4096);
+  test_sycl_contraction(sycl_device, 1, 4096,
+                        4096);
+  test_sycl_contraction(sycl_device, 1, 4095,
+                        4095);
+  test_sycl_contraction(sycl_device, 1, 4095,
+                        4095);
+  test_sycl_contraction(sycl_device, 1, 802816,
+                        32);
   end = std::chrono::system_clock::now();
-  std::chrono::duration elapsed_seconds = end-start;
+  std::chrono::duration elapsed_seconds = end - start;
   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
   std::cout << "finished computation at " << std::ctime(&end_time)
             << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template
+void inline tensorVector(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point start, end;
+  start = std::chrono::system_clock::now();
+  // Matrix-Vector
+  test_sycl_contraction(sycl_device, 1025, 1025,
+                        1);
+  test_sycl_contraction(sycl_device, 1125, 1025,
+                        1);
+  test_sycl_contraction(sycl_device, 1224, 1024,
+                        1);
+  test_sycl_contraction(sycl_device, 1024, 1024,
+                        1);
+  test_sycl_contraction(sycl_device, 1023, 1023,
+                        1);
+  test_sycl_contraction(sycl_device, 1023, 1023,
+                        1);
+  test_sycl_contraction(sycl_device, 4097, 4197,
+                        1);
+  test_sycl_contraction(sycl_device, 4097, 4097,
+                        1);
+  test_sycl_contraction(sycl_device, 4096, 4096,
+                        1);
+  test_sycl_contraction(sycl_device, 4096, 8196,
+                        1);
+  test_sycl_contraction(sycl_device, 4095, 4095,
+                        1);
+  test_sycl_contraction(sycl_device, 4095, 4095,
+                        1);
+// If GEMV is disabled, a single kernel computes the whole contraction, so the
+// accumulated floating-point error of the float sum exceeds the precision
+// threshold and the test fails. With GEMV enabled, the contraction is split
+// across multiple kernels, each accumulating only part of the sum, which
+// keeps the error within the threshold.
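+// Rough estimate (editorial, not measured from the kernel): each output
+// element of the guarded (32, 802032, 1) case accumulates about 8e5 float
+// products in one kernel, and the rounding error of such a long
+// single-precision sum can easily exceed the 1e-4 tolerance used by
+// test_sycl_contraction, whereas the GEMV path only accumulates much shorter
+// per-kernel partial sums.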
+#ifndef EIGEN_SYCL_DISABLE_GEMV + test_sycl_contraction(sycl_device, 32, 802032, + 1); +#endif + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensorScalar(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // SCALAR Contraction + test_scalar(sycl_device, 127, 127, 127); + test_scalar(sycl_device, 127, 127, 127); + test_scalar(sycl_device, 128, 128, 128); + test_scalar(sycl_device, 128, 128, 128); + test_scalar(sycl_device, 129, 129, 129); + test_scalar(sycl_device, 129, 129, 129); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline skinnyTensor_row(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction(sycl_device, 16, 4, 16); + test_sycl_contraction(sycl_device, 257, 131073, + 257); + test_sycl_contraction(sycl_device, 256, 131072, + 256); + test_sycl_contraction(sycl_device, 16, 131073, + 16); + test_sycl_contraction(sycl_device, 17, 131072, + 17); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline skinnyTensor_col(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction(sycl_device, 16, 4, 16); + test_sycl_contraction(sycl_device, 257, 131073, + 257); + test_sycl_contraction(sycl_device, 256, 131072, + 256); + test_sycl_contraction(sycl_device, 16, 131073, + 16); + test_sycl_contraction(sycl_device, 17, 131072, + 17); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} +template +void inline tensor_contraction_batch_per_device(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + contraction_batch(sycl_device, 64, 75, 30, 4, + 0, 4); + contraction_batch(sycl_device, 64, 75, 30, 4, + 0, 4); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensor_contraction_lhs_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef 
int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + contraction_lhs_transposed(sycl_device, 8, 4, + 8); + contraction_lhs_transposed(sycl_device, 32, 8, + 32); + contraction_lhs_transposed(sycl_device, 64, 16, + 64); + contraction_lhs_transposed(sycl_device, 784, + 2048, 1024); + contraction_lhs_transposed(sycl_device, 1024, + 10, 1024); + contraction_lhs_transposed(sycl_device, 4096, + 1024, 1024); + contraction_lhs_transposed(sycl_device, 2048, + 4096, 1024); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensor_contraction_rhs_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + contraction_rhs_transposed(sycl_device, 16, 4, + 16); + contraction_rhs_transposed(sycl_device, 17, 5, + 17); + contraction_rhs_transposed(sycl_device, 32, 8, + 32); + contraction_rhs_transposed(sycl_device, 64, 16, + 64); + contraction_rhs_transposed(sycl_device, 10, + 1024, 1024); + contraction_rhs_transposed(sycl_device, 1024, + 1024, 4096); + contraction_rhs_transposed(sycl_device, 4096, + 1024, 2048); + contraction_rhs_transposed(sycl_device, 2048, + 1024, 784); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensor_contraction_both_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + contraction_both_transposed(sycl_device, 17, 5, + 17); + contraction_both_transposed(sycl_device, 32, 8, + 32); + contraction_both_transposed(sycl_device, 64, + 16, 64); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; } EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(tensorContractionPerDevice(device)); + for (const auto &device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1(tensorOutofBound(sycl_device)); + CALL_SUBTEST_2(tensorTensor(sycl_device)); + CALL_SUBTEST_2(tensorTensor_m(sycl_device)); + CALL_SUBTEST_2(tensorTensor_n(sycl_device)); + CALL_SUBTEST_2(tensorTensor_k(sycl_device)); + CALL_SUBTEST_2(tensorTensor_sizes(sycl_device)); + CALL_SUBTEST_3(vectorVector(sycl_device)); + CALL_SUBTEST_4(vectorTensor(sycl_device)); + CALL_SUBTEST_5(tensorVector(sycl_device)); + CALL_SUBTEST_6(tensorScalar(sycl_device)); + CALL_SUBTEST_7(skinnyTensor_row(sycl_device)); + CALL_SUBTEST_7(skinnyTensor_col(sycl_device)); + 
CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device)); + CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device)); + CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device)); + CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device)); } } diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp index cc3b02448..d947ead83 100644 --- a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp +++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp @@ -80,6 +80,8 @@ static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device) VERIFY_IS_EQUAL(out(i, j), 0); } } + sycl_device.deallocate(gpu_in1_data); +sycl_device.deallocate(gpu_out_data); } template @@ -147,6 +149,9 @@ static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device) } } } + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); + sycl_device.deallocate(gpu_out_data); } template void custom_op_perDevice(Dev_selector s){ diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index 74d38a644..a55a5ad8a 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -36,8 +36,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { DataType * gpu_in2_data = static_cast(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); DataType * gpu_out_data = static_cast(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); - in1 = in1.random() + in1.constant(10.0f); - in2 = in2.random() + in2.constant(10.0f); + in1 = in1.random() + in1.constant(static_cast(10.0f)); + in2 = in2.random() + in2.constant(static_cast(10.0f)); // creating TensorMap from tensor Eigen::TensorMap> gpu_in1(gpu_in1_data, tensorRange); @@ -72,5 +72,6 @@ template void tensorForced_evalperDev EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) { for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(tensorForced_evalperDevice(device)); + CALL_SUBTEST(tensorForced_evalperDevice(device)); } } diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp new file mode 100644 index 000000000..db1c0206e --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
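+//
+// Exercises slice() and stridedSlice() as the right-hand side of a device
+// expression: two 3-D tensors are sliced and subtracted on the SYCL device
+// and the result is compared element-wise against the host computation.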
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +using Eigen::Tensor; +using Eigen::RowMajor; +template +static void test_image_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 245; + IndexType sizeDim2 = 343; + IndexType sizeDim3 = 577; + + array input_range ={{sizeDim1, sizeDim2, sizeDim3}}; + array slice_range ={{sizeDim1-1, sizeDim2, sizeDim3}}; + + Tensor tensor1(input_range); + Tensor tensor2(input_range); + Tensor tensor3(slice_range); + Tensor tensor3_cpu(slice_range); + + + + typedef Eigen::DSizes Index3; + Index3 strides1(1L,1L, 1L); + Index3 indicesStart1(1L, 0L, 0L); + Index3 indicesStop1(sizeDim1, sizeDim2, sizeDim3); + + Index3 strides2(1L,1L, 1L); + Index3 indicesStart2(0L, 0L, 0L); + Index3 indicesStop2(sizeDim1-1, sizeDim2, sizeDim3); + Eigen::DSizes sizes(sizeDim1-1,sizeDim2,sizeDim3); + + tensor1.setRandom(); + tensor2.setRandom(); + + + DataType* gpu_data1 = static_cast(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast(sycl_device.allocate(tensor3.size()*sizeof(DataType))); + + TensorMap> gpu1(gpu_data1, input_range); + TensorMap> gpu2(gpu_data2, input_range); + TensorMap> gpu3(gpu_data3, slice_range); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data2, tensor2.data(),(tensor2.size())*sizeof(DataType)); + gpu3.device(sycl_device)= gpu1.slice(indicesStart1, sizes) - gpu2.slice(indicesStart2, sizes); + sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType)); + + tensor3_cpu = tensor1.stridedSlice(indicesStart1,indicesStop1,strides1) - tensor2.stridedSlice(indicesStart2,indicesStop2,strides2); + + + for (IndexType i = 0; i void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_image_op_sycl(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST(sycl_computing_test_per_device(device)); +#endif + } +} diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp new file mode 100644 index 000000000..029653e27 --- /dev/null +++ b/unsupported/test/cxx11_tensor_math_sycl.cpp @@ -0,0 +1,105 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
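+//
+// Covers element-wise math functions (tanh and sigmoid) on the SYCL device,
+// comparing the device output against the host evaluator.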
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +using Eigen::Tensor; +using Eigen::RowMajor; +template +static void test_tanh_sycl(const Eigen::SyclDevice &sycl_device) +{ + + IndexType sizeDim1 = 4; + IndexType sizeDim2 = 4; + IndexType sizeDim3 = 1; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor in(tensorRange); + Tensor out(tensorRange); + Tensor out_cpu(tensorRange); + + in = in.random(); + + DataType* gpu_data1 = static_cast(sycl_device.allocate(in.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(out.size()*sizeof(DataType))); + + TensorMap> gpu1(gpu_data1, tensorRange); + TensorMap> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType)); + gpu2.device(sycl_device) = gpu1.tanh(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType)); + + out_cpu=in.tanh(); + + for (int i = 0; i < in.size(); ++i) { + VERIFY_IS_APPROX(out(i), out_cpu(i)); + } +} +template +static void test_sigmoid_sycl(const Eigen::SyclDevice &sycl_device) +{ + + IndexType sizeDim1 = 4; + IndexType sizeDim2 = 4; + IndexType sizeDim3 = 1; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor in(tensorRange); + Tensor out(tensorRange); + Tensor out_cpu(tensorRange); + + in = in.random(); + + DataType* gpu_data1 = static_cast(sycl_device.allocate(in.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(out.size()*sizeof(DataType))); + + TensorMap> gpu1(gpu_data1, tensorRange); + TensorMap> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType)); + gpu2.device(sycl_device) = gpu1.sigmoid(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType)); + + out_cpu=in.sigmoid(); + + for (int i = 0; i < in.size(); ++i) { + VERIFY_IS_APPROX(out(i), out_cpu(i)); + } +} + + +template void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_tanh_sycl(sycl_device); + test_tanh_sycl(sycl_device); + test_sigmoid_sycl(sycl_device); + test_sigmoid_sycl(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp index 93dabe3ec..bf001b40f 100644 --- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp +++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp @@ -180,6 +180,82 @@ static void test_simple_slice(const Eigen::SyclDevice &sycl_device) sycl_device.deallocate(gpu_data3); } + +template +static void test_strided_slice_as_rhs_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + typedef Eigen::DSizes Index5; + Index5 strides(1L,1L,1L,1L,1L); + Index5 indicesStart(1L,2L,3L,4L,5L); + Index5 indicesStop(2L,3L,4L,5L,6L); + Index5 lengths(1L,1L,1L,1L,1L); + + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + Tensor tensor(tensorRange); + tensor.setRandom(); + + array 
slice1_range ={{1, 1, 1, 1, 1}}; + Tensor slice1(slice1_range); + Tensor slice_stride1(slice1_range); + + DataType* gpu_data1 = static_cast(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(slice1.size()*sizeof(DataType))); + DataType* gpu_data_stride2 = static_cast(sycl_device.allocate(slice_stride1.size()*sizeof(DataType))); + + TensorMap> gpu1(gpu_data1, tensorRange); + TensorMap> gpu2(gpu_data2, slice1_range); + TensorMap> gpu_stride2(gpu_data_stride2, slice1_range); + + Eigen::DSizes indices(1,2,3,4,5); + Eigen::DSizes sizes(1,1,1,1,1); + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.slice(indices, sizes); + sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); + + gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides); + sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2,(slice_stride1.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + VERIFY_IS_EQUAL(slice_stride1(0,0,0,0,0), tensor(1,2,3,4,5)); + + array slice2_range ={{1,1,2,2,3}}; + Tensor slice2(slice2_range); + Tensor strideSlice2(slice2_range); + + DataType* gpu_data3 = static_cast(sycl_device.allocate(slice2.size()*sizeof(DataType))); + DataType* gpu_data_stride3 = static_cast(sycl_device.allocate(strideSlice2.size()*sizeof(DataType))); + TensorMap> gpu3(gpu_data3, slice2_range); + TensorMap> gpu_stride3(gpu_data_stride3, slice2_range); + Eigen::DSizes indices2(1,1,3,4,5); + Eigen::DSizes sizes2(1,1,2,2,3); + Index5 strides2(1L,1L,1L,1L,1L); + Index5 indicesStart2(1L,1L,3L,4L,5L); + Index5 indicesStop2(2L,2L,5L,6L,8L); + + gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); + sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); + + gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2); + sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3,(strideSlice2.size())*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + VERIFY_IS_EQUAL(strideSlice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + template static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) { @@ -228,6 +304,65 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) sycl_device.deallocate(gpu_data3); } +template +Eigen::array To32BitDims(const DSizes& in) { + Eigen::array out; + for (int i = 0; i < DSizes::count; ++i) { + out[i] = in[i]; + } + return out; +} + +template +int run_eigen(const SyclDevice& sycl_device) { + using TensorI64 = Tensor; + using TensorI32 = Tensor; + using TensorMI64 = TensorMap; + using TensorMI32 = TensorMap; + Eigen::array tensor_range{{4, 1, 1, 1, 6}}; + Eigen::array slice_range{{4, 1, 1, 1, 3}}; + + TensorI64 out_tensor_gpu(tensor_range); + TensorI64 out_tensor_cpu(tensor_range); + out_tensor_cpu.setRandom(); + + TensorI64 sub_tensor(slice_range); + sub_tensor.setRandom(); + + DataType* out_gpu_data = static_cast(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType))); + DataType* sub_gpu_data = static_cast(sycl_device.allocate(sub_tensor.size() * sizeof(DataType))); + 
TensorMI64 out_gpu(out_gpu_data, tensor_range); + TensorMI64 sub_gpu(sub_gpu_data, slice_range); + + sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType)); + sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType)); + + Eigen::array slice_offset_32{{0, 0, 0, 0, 3}}; + Eigen::array slice_range_32{{4, 1, 1, 1, 3}}; + TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims(out_tensor_cpu.dimensions())); + TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims(sub_tensor.dimensions())); + TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims(out_gpu.dimensions())); + TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims(sub_gpu.dimensions())); + + out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32; + + out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32; + + sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType)); + int has_err = 0; + for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) { + auto exp = out_tensor_cpu(i); + auto val = out_tensor_gpu(i); + if (val != exp) { + std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl; + has_err = 1; + } + } + sycl_device.deallocate(out_gpu_data); + sycl_device.deallocate(sub_gpu_data); + return has_err; +} + template void sycl_morphing_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); @@ -239,6 +374,9 @@ template void sycl_morphing_test_per_d test_reshape_as_lvalue(sycl_device); test_strided_slice_write_sycl(sycl_device); test_strided_slice_write_sycl(sycl_device); + test_strided_slice_as_rhs_sycl(sycl_device); + test_strided_slice_as_rhs_sycl(sycl_device); + run_eigen(sycl_device); } EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl) { diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp new file mode 100644 index 000000000..6c83894a3 --- /dev/null +++ b/unsupported/test/cxx11_tensor_random_sycl.cpp @@ -0,0 +1,100 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
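+//
+// Covers the uniform and normal random number generators on the SYCL device.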
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +template +static void test_sycl_random_uniform(const Eigen::SyclDevice& sycl_device) +{ + Tensor out(72,97); + out.setZero(); + + std::size_t out_bytes = out.size() * sizeof(DataType); + + IndexType sizeDim0 = 72; + IndexType sizeDim1 = 97; + + array tensorRange = {{sizeDim0, sizeDim1}}; + + DataType* d_out = static_cast(sycl_device.allocate(out_bytes)); + TensorMap> gpu_out(d_out, tensorRange); + + gpu_out.device(sycl_device)=gpu_out.random(); + sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes); + for(IndexType i=1; i +void test_sycl_random_normal(const Eigen::SyclDevice& sycl_device) +{ + Tensor out(72,97); + out.setZero(); + std::size_t out_bytes = out.size() * sizeof(DataType); + + IndexType sizeDim0 = 72; + IndexType sizeDim1 = 97; + + array tensorRange = {{sizeDim0, sizeDim1}}; + + DataType* d_out = static_cast(sycl_device.allocate(out_bytes)); + TensorMap> gpu_out(d_out, tensorRange); + Eigen::internal::NormalRandomGenerator gen(true); + gpu_out.device(sycl_device)=gpu_out.random(gen); + sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes); + for(IndexType i=1; i void sycl_random_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_random_uniform(sycl_device); + test_sycl_random_uniform(sycl_device); + test_sycl_random_normal(sycl_device); + test_sycl_random_normal(sycl_device); + +} +EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_random_test_per_device(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST(sycl_random_test_per_device(device)); +#endif + } +} diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index f526299c6..a297716e4 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -16,16 +16,99 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL +#define EIGEN_HAS_CONSTEXPR 1 #include "main.h" + #include +template +static void test_full_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 753; + const IndexType num_cols = 537; + array tensorRange = {{num_rows, num_cols}}; + + array outRange = {{1, 1}}; + + Tensor in(tensorRange); + Tensor full_redux(outRange); + Tensor full_redux_gpu(outRange); + + in.setRandom(); + auto dim = DSizes(1, 1); + full_redux = in.sum().reshape(dim); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate( + sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize())); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu(gpu_out_data, + outRange); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim); + sycl_device.memcpyDeviceToHost( + full_redux_gpu.data(), gpu_out_data, + (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ std::cout << "SYCL FULL :" << full_redux_gpu(0, 0) + << ", CPU FULL: " << full_redux(0, 0) << "\n"; + VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0)); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template -static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_sum_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.sum(); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - const IndexType num_rows = 452; - const IndexType num_cols = 765; + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_full_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; array tensorRange = {{num_rows, num_cols}}; Tensor in(tensorRange); @@ -34,27 +117,250 @@ static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device in.setRandom(); - full_redux = in.mean(); + full_redux = in.maximum(); - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_full_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + 
in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the max. + // As we don't include this in the reduction the result should not be 2. + in(0) = static_cast(2); + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.maximum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.mean(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } +template +static void test_full_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; + array tensorRange = {{num_rows, num_cols}}; + array argRange = {{num_cols}}; + Eigen::array red_axis; + red_axis[0] = 0; + // red_axis[1]=1; + Tensor in(tensorRange); + Tensor in_arg1(tensorRange); + Tensor in_arg2(tensorRange); + Tensor out_arg_cpu(argRange); + Tensor out_arg_gpu(argRange); + Tensor out_arg_gpu_helper(argRange); + Tensor full_redux; + Tensor full_redux_gpu; + + in.setRandom(); + in_arg1.setRandom(); + in_arg2.setRandom(); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg1_data = static_cast(sycl_device.allocate( + in_arg1.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg2_data = static_cast(sycl_device.allocate( + in_arg2.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg__gpu_helper_data = static_cast(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg_data = static_cast(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> in_Arg1_gpu( + gpu_in_arg1_data, tensorRange); + TensorMap> in_Arg2_gpu( + gpu_in_arg2_data, tensorRange); + TensorMap> out_Argout_gpu( + gpu_out_arg_data, argRange); + TensorMap> out_Argout_gpu_helper( + gpu_out_arg__gpu_helper_data, argRange); + TensorMap> out_gpu(gpu_out_data); + + // CPU VERSION + out_arg_cpu = + (in_arg1.argmax(1) == in_arg2.argmax(1)) + .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false)); + full_redux = (out_arg_cpu.template cast()) + .reduce(red_axis, Eigen::internal::MeanReducer()); + + // GPU VERSION + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_arg1_data, in_arg1.data(), + (in_arg1.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + 
gpu_in_arg2_data, in_arg2.data(), + (in_arg2.dimensions().TotalSize()) * sizeof(DataType)); + out_Argout_gpu_helper.device(sycl_device) = + (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1)); + out_Argout_gpu.device(sycl_device) = + (out_Argout_gpu_helper) + .select(out_Argout_gpu.constant(true), + out_Argout_gpu.constant(false)); + out_gpu.device(sycl_device) = + (out_Argout_gpu.template cast()) + .reduce(red_axis, Eigen::internal::MeanReducer()); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. + std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux() + << '\n'; + VERIFY_IS_EQUAL(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_in_arg1_data); + sycl_device.deallocate(gpu_in_arg2_data); + sycl_device.deallocate(gpu_out_arg__gpu_helper_data); + sycl_device.deallocate(gpu_out_arg_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_full_reductions_mean_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.mean(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template -static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_mean_with_odd_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + // This is a particular case which illustrates a possible problem when the + // number of local threads in a workgroup is even, but is not a power of two. + using data_tensor = Tensor; + using scalar_tensor = Tensor; + // 2177 = (17 * 128) + 1 gives rise to 18 local threads. + // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads. 
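+  // Spelled out: at 128 elements per local thread, ceil(2177 / 128) = 18
+  // threads, and with 4-wide vectorisation ceil(8708 / (4 * 128)) is also
+  // 18, i.e. an even work-group size that is not a power of two, which is
+  // the case this test is meant to exercise.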
+ const IndexType n_elems = 8707; + array tensor_range = {{n_elems}}; + + data_tensor in(tensor_range); + DataType full_redux; + DataType full_redux_gpu; + TensorMap red_cpu(&full_redux); + TensorMap red_gpu(&full_redux_gpu); + + const DataType const_val = static_cast(0.6391); + in = in.constant(const_val); + + Eigen::IndexList> red_axis; + red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer()); + VERIFY_IS_APPROX(const_val, red_cpu()); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data, tensor_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = + in_gpu.reduce(red_axis, Eigen::internal::MeanReducer()); + sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu, full_redux); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template +static void test_full_reductions_min_sycl( + const Eigen::SyclDevice& sycl_device) { const IndexType num_rows = 876; const IndexType num_cols = 953; array tensorRange = {{num_rows, num_cols}}; @@ -67,25 +373,73 @@ static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) full_redux = in.minimum(); - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu(gpu_out_data); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.minimum(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } - template -static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_min_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the min. + // As we don't include this in the reduction the result should not be -2. 
+ in(0) = static_cast(-2); + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.minimum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.minimum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template +static void test_first_dim_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { IndexType dim_x = 145; IndexType dim_y = 1; IndexType dim_z = 67; @@ -101,33 +455,293 @@ static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_dev in.setRandom(); - redux= in.maximum(red_axis); + redux = in.maximum(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_first_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using reduced_tensor = Tensor; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + array reduced_range = {{num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = num_cols; + + data_tensor in(tensor_range); + reduced_tensor redux; + reduced_tensor redux_gpu(reduced_range); + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. 
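+  // These writes only touch the first 64 linear elements, i.e. everything
+  // before in.data() + offset, so a correct reduction over the offset
+  // TensorMap below never reads them; a reduced value equal to 2 would
+  // indicate an out-of-range read.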
+ for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast(2); + } + + Eigen::array red_axis; + red_axis[0] = 0; + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + redux = in_offset.maximum(red_axis); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(redux(i), in(i)); + } + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast( + sycl_device.allocate(n_reduced * sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, + n_reduced * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_APPROX(redux_gpu(i), redux(i)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_last_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using reduced_tensor = Tensor; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + array full_reduced_range = {{num_rows}}; + array reduced_range = {{num_rows - 1}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = reduced_range[0]; + + data_tensor in(tensor_range); + reduced_tensor redux(full_reduced_range); + reduced_tensor redux_gpu(reduced_range); + + in.setRandom(); + redux.setZero(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. + for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast(2); + } - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data = static_cast(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); + Eigen::array red_axis; + red_axis[0] = 1; + + const IndexType offset = 64; + // Introduce an offset in both the input and the output. + TensorMap in_offset(in.data() + offset, tensor_offset_range); + TensorMap red_offset(redux.data() + 1, reduced_range); + red_offset = in_offset.maximum(red_axis); + + // Check that the first value hasn't been changed and that the reduced values + // are not equal to the previously set maximum in the input outside the range. 
+ VERIFY_IS_EQUAL(redux(0), static_cast(0)); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(red_offset(i), in(i)); + } - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast( + sycl_device.allocate((n_reduced + 1) * sizeof(DataType))); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data + 1, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(), + n_reduced * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(IndexType j=0; j +static void test_first_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) { + array tensorRange = {{dim_x, dim_y}}; + Eigen::array red_axis; + red_axis[0] = 0; + array reduced_tensorRange = {{dim_y}}; + + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); + + in.setRandom(); + redux = in.sum(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType i = 0; i < redux.size(); i++) { + VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]); + } sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } template -static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) { +static void test_first_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; + + array tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array red_axis; + red_axis[0] = 0; + array reduced_tensorRange = {{dim_y, dim_z}}; + + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_last_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; + IndexType dim_y = 1; + IndexType dim_z = 32; + + array tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array red_axis; + red_axis[0] = 2; + array reduced_tensorRange = {{dim_x, dim_y}}; + + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); - IndexType dim_x = 567; + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_last_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; IndexType dim_y = 1; - IndexType dim_z = 47; + IndexType dim_z = 32; array tensorRange = {{dim_x, dim_y, dim_z}}; Eigen::array red_axis; @@ -140,42 +754,261 @@ static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_devi in.setRandom(); - redux= in.sum(red_axis); + redux = in.sum(red_axis); - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data = static_cast(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(IndexType j=0; j +static void test_last_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + // auto red_axis = Sizes<0,1>(0,1); + Eigen::IndexList> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize, DataLayout> in_fix; + TensorFixedSize, DataLayout> redux_fix; + TensorFixedSize, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + + redux_fix = in_fix.sum(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { + VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); } -template void sycl_reduction_test_per_device(const cl::sycl::device& d){ - std::cout << "Running on " << d.template get_info() << std::endl; - QueueInterface queueInterface(d); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_full_reductions_mean_sycl(sycl_device); +template +static void test_last_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + Eigen::IndexList> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize, DataLayout> in_fix; + TensorFixedSize, DataLayout> redux_fix; + TensorFixedSize, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + redux_fix = in_fix.mean(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + sycl_device.synchronize(); + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { + VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +// SYCL supports a generic case of reduction where the accumulator is a +// different type than the input data This is an example on how to get if a +// Tensor contains nan and/or inf in one reduction +template +struct CustomReducer { + static const bool PacketAccess = false; + static const bool IsStateful = false; + + static constexpr OutT InfBit = 1; + static constexpr OutT NanBit = 2; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x, + OutT* accum) const { + if (Eigen::numext::isinf(x)) + *accum |= InfBit; + else if (Eigen::numext::isnan(x)) + *accum |= NanBit; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x, + OutT* accum) const { + *accum |= x; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const { + return OutT(0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const { + return accum; + } +}; + +template +static void test_full_reductions_custom_sycl( + const Eigen::SyclDevice& sycl_device) { + constexpr IndexType InSize = 64; + auto tensorRange = Sizes(InSize); + Eigen::IndexList> dims; + auto reduced_tensorRange = Sizes<>(); + TensorFixedSize, DataLayout> in_fix; + TensorFixedSize, DataLayout> redux_gpu_fix; + + CustomReducer reducer; + + in_fix.setRandom(); + + size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType); + DataType* gpu_in_data = + static_cast(sycl_device.allocate(in_size_bytes)); + AccumType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(AccumType))); + + TensorMap, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + 
sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes); + out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer); + sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data, + sizeof(AccumType)); + VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +void sycl_reduction_test_full_per_device(const Dev& sycl_device) { + test_full_reductions_sum_sycl(sycl_device); + test_full_reductions_sum_sycl(sycl_device); + test_full_reductions_min_sycl(sycl_device); test_full_reductions_min_sycl(sycl_device); + test_full_reductions_max_sycl(sycl_device); + test_full_reductions_max_sycl(sycl_device); + + test_full_reductions_mean_sycl(sycl_device); + test_full_reductions_mean_sycl(sycl_device); + test_full_reductions_custom_sycl( + sycl_device); + test_full_reductions_custom_sycl( + sycl_device); + sycl_device.synchronize(); +} + +template +void sycl_reduction_full_offset_per_device(const Dev& sycl_device) { + test_full_reductions_sum_with_offset_sycl( + sycl_device); + test_full_reductions_sum_with_offset_sycl( + sycl_device); + test_full_reductions_min_with_offset_sycl( + sycl_device); + test_full_reductions_min_with_offset_sycl( + sycl_device); + test_full_reductions_max_with_offset_sycl( + sycl_device); + test_full_reductions_max_with_offset_sycl( + sycl_device); + test_full_reductions_mean_with_offset_sycl( + sycl_device); + test_full_reductions_mean_with_offset_sycl( + sycl_device); + test_full_reductions_mean_with_odd_offset_sycl( + sycl_device); + sycl_device.synchronize(); +} + +template +void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) { + test_first_dim_reductions_sum_sycl(sycl_device, + 4197, 4097); + test_first_dim_reductions_sum_sycl(sycl_device, + 4197, 4097); + test_first_dim_reductions_sum_sycl(sycl_device, + 129, 8); test_first_dim_reductions_max_sycl(sycl_device); + test_first_dim_reductions_max_with_offset_sycl( + sycl_device); + sycl_device.synchronize(); +} + +template +void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) { test_last_dim_reductions_sum_sycl(sycl_device); - test_full_reductions_mean_sycl(sycl_device); - test_full_reductions_min_sycl(sycl_device); - test_first_dim_reductions_max_sycl(sycl_device); - test_last_dim_reductions_sum_sycl(sycl_device); + test_last_dim_reductions_max_with_offset_sycl( + sycl_device); + test_last_reductions_sum_sycl(sycl_device); + test_last_reductions_sum_sycl(sycl_device); + test_last_reductions_mean_sycl(sycl_device); + test_last_reductions_mean_sycl(sycl_device); + sycl_device.synchronize(); } + EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_reduction_test_per_device(device)); + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1(sycl_reduction_test_full_per_device(sycl_device)); + CALL_SUBTEST_2(sycl_reduction_full_offset_per_device(sycl_device)); + CALL_SUBTEST_3( + sycl_reduction_test_first_dim_per_device(sycl_device)); + CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device(sycl_device)); } } diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp index 77c2235d1..dd30c235d 100644 --- 
a/unsupported/test/cxx11_tensor_reverse_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp @@ -20,10 +20,8 @@ #include "main.h" #include - template -static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { - +static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { IndexType dim1 = 2; IndexType dim2 = 3; IndexType dim3 = 5; @@ -40,21 +38,30 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { dim_rev[2] = true; dim_rev[3] = false; - DataType* gpu_in_data = static_cast(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =static_cast(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + reversed_tensor.dimensions().TotalSize() * sizeof(DataType))); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, tensorRange); + TensorMap > in_gpu(gpu_in_data, + tensorRange); + TensorMap > out_gpu(gpu_out_data, + tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, tensor.data(), + (tensor.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); - sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
for (IndexType i = 0; i < 2; ++i) { for (IndexType j = 0; j < 3; ++j) { for (IndexType k = 0; k < 5; ++k) { for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); + VERIFY_IS_EQUAL(tensor(i, j, k, l), + reversed_tensor(i, 2 - j, 4 - k, l)); } } } @@ -65,13 +72,15 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { dim_rev[3] = false; out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); - sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); for (IndexType i = 0; i < 2; ++i) { for (IndexType j = 0; j < 3; ++j) { for (IndexType k = 0; k < 5; ++k) { for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); + VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l)); } } } @@ -82,13 +91,16 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { dim_rev[2] = false; dim_rev[3] = true; out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); - sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); for (IndexType i = 0; i < 2; ++i) { for (IndexType j = 0; j < 3; ++j) { for (IndexType k = 0; k < 5; ++k) { for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); + VERIFY_IS_EQUAL(tensor(i, j, k, l), + reversed_tensor(1 - i, j, k, 6 - l)); } } } @@ -98,11 +110,9 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { sycl_device.deallocate(gpu_out_data); } - - template -static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue) -{ +static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, + bool LValue) { IndexType dim1 = 2; IndexType dim2 = 3; IndexType dim3 = 5; @@ -120,24 +130,32 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue dim_rev[2] = false; dim_rev[3] = true; - DataType* gpu_in_data = static_cast(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data_expected =static_cast(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data_result =static_cast(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); - - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu_expected(gpu_out_data_expected, tensorRange); - TensorMap > out_gpu_result(gpu_out_data_result, tensorRange); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data_expected = static_cast(sycl_device.allocate( + expected.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data_result = static_cast( + sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType))); + TensorMap > in_gpu(gpu_in_data, + tensorRange); + TensorMap > out_gpu_expected( + gpu_out_data_expected, tensorRange); + TensorMap > out_gpu_result( + gpu_out_data_result, tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, tensor.data(), + 
(tensor.dimensions().TotalSize()) * sizeof(DataType)); if (LValue) { out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu; } else { out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev); } - sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType)); - + sycl_device.memcpyDeviceToHost( + expected.data(), gpu_out_data_expected, + expected.dimensions().TotalSize() * sizeof(DataType)); array src_slice_dim; src_slice_dim[0] = 2; @@ -154,8 +172,9 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue for (IndexType i = 0; i < 5; ++i) { if (LValue) { - out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = - in_gpu.slice(src_slice_start, src_slice_dim); + out_gpu_result.slice(dst_slice_start, dst_slice_dim) + .reverse(dim_rev) + .device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim); } else { out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev); @@ -163,13 +182,15 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue src_slice_start[2] += 1; dst_slice_start[2] += 1; } - sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + result.data(), gpu_out_data_result, + result.dimensions().TotalSize() * sizeof(DataType)); for (IndexType i = 0; i < expected.dimension(0); ++i) { for (IndexType j = 0; j < expected.dimension(1); ++j) { for (IndexType k = 0; k < expected.dimension(2); ++k) { for (IndexType l = 0; l < expected.dimension(3); ++l) { - VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l)); } } } @@ -177,34 +198,37 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue dst_slice_start[2] = 0; result.setRandom(); - sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_out_data_result, result.data(), + (result.dimensions().TotalSize()) * sizeof(DataType)); for (IndexType i = 0; i < 5; ++i) { - if (LValue) { - out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = - in_gpu.slice(dst_slice_start, dst_slice_dim); - } else { - out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = - in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); - } + if (LValue) { + out_gpu_result.slice(dst_slice_start, dst_slice_dim) + .reverse(dim_rev) + .device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim); + } else { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = + in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + } dst_slice_start[2] += 1; } - sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + result.data(), gpu_out_data_result, + result.dimensions().TotalSize() * sizeof(DataType)); for (IndexType i = 0; i < expected.dimension(0); ++i) { for (IndexType j = 0; j < expected.dimension(1); ++j) { for (IndexType k = 0; k < expected.dimension(2); ++k) { for (IndexType l = 0; l < expected.dimension(3); ++l) { - VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l)); } } } } } - - -template void 
sycl_reverse_test_per_device(const cl::sycl::device& d){ - std::cout << "Running on " << d.template get_info() << std::endl; +template +void sycl_reverse_test_per_device(const cl::sycl::device& d) { QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); test_simple_reverse(sycl_device); @@ -215,7 +239,15 @@ template void sycl_reverse_test_per_device(const cl::sycl::de test_expr_reverse(sycl_device, true); } EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_reverse_test_per_device(device)); + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.get_info() << std::endl; + CALL_SUBTEST_1(sycl_reverse_test_per_device(device)); + CALL_SUBTEST_2(sycl_reverse_test_per_device(device)); + CALL_SUBTEST_3(sycl_reverse_test_per_device(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST_4(sycl_reverse_test_per_device(device)); +#endif + CALL_SUBTEST_5(sycl_reverse_test_per_device(device)); } } diff --git a/unsupported/test/cxx11_tensor_scan_sycl.cpp b/unsupported/test/cxx11_tensor_scan_sycl.cpp new file mode 100644 index 000000000..09c45fce5 --- /dev/null +++ b/unsupported/test/cxx11_tensor_scan_sycl.cpp @@ -0,0 +1,141 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; +typedef Tensor::DimensionPair DimPair; + +template +void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size, int consume_dim, + bool exclusive) { + static const DataType error_threshold = 1e-4f; + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size + << " consume_dim : " << consume_dim << ")" << std::endl; + Tensor t_input(m_size, k_size, n_size); + Tensor t_result(m_size, k_size, n_size); + Tensor t_result_gpu(m_size, k_size, + n_size); + + t_input.setRandom(); + std::size_t t_input_bytes = t_input.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType* gpu_data_in = + static_cast(sycl_device.allocate(t_input_bytes)); + DataType* gpu_data_out = + static_cast(sycl_device.allocate(t_result_bytes)); + + array tensorRange = {{m_size, k_size, n_size}}; + TensorMap> gpu_t_input( + gpu_data_in, tensorRange); + TensorMap> gpu_t_result( + gpu_data_out, tensorRange); + sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes); + sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive); + + t_result = t_input.cumsum(consume_dim, exclusive); + + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out, + t_result_bytes); + sycl_device.synchronize(); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast(std::fabs(static_cast( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if 
(Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i) + << " vs SYCL : " << t_result_gpu(i) << std::endl; + assert(false); + } + sycl_device.deallocate(gpu_data_in); + sycl_device.deallocate(gpu_data_out); +} + +template +void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 2049, 1023, 127, 0, + true); + test_sycl_cumsum(sycl_device, 2049, 1023, 127, 0, + true); +} +template +void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 1023, 2049, 127, 1, + true); + test_sycl_cumsum(sycl_device, 1023, 2049, 127, 1, + true); +} +template +void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 1023, 127, 2049, 2, + true); + test_sycl_cumsum(sycl_device, 1023, 127, 2049, 2, + true); +} +template +void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 2049, 1023, 127, 0, + false); + test_sycl_cumsum(sycl_device, 2049, 1023, 127, 0, + false); +} +template +void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 1023, 2049, 127, 1, + false); + test_sycl_cumsum(sycl_device, 1023, 2049, 127, 1, + false); +} +template +void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 1023, 127, 2049, 2, + false); + test_sycl_cumsum(sycl_device, 1023, 127, 2049, 2, + false); +} +EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1( + sycl_scan_test_exclusive_dim0_per_device(sycl_device)); + CALL_SUBTEST_2( + sycl_scan_test_exclusive_dim1_per_device(sycl_device)); + CALL_SUBTEST_3( + sycl_scan_test_exclusive_dim2_per_device(sycl_device)); + CALL_SUBTEST_4( + sycl_scan_test_inclusive_dim0_per_device(sycl_device)); + CALL_SUBTEST_5( + sycl_scan_test_inclusive_dim1_per_device(sycl_device)); + CALL_SUBTEST_6( + sycl_scan_test_inclusive_dim2_per_device(sycl_device)); + } +} diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp index 0e8cc3bd2..ca4e8b5ef 100644 --- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp +++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp @@ -12,14 +12,12 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL - #include "main.h" #include @@ -29,33 +27,33 @@ using Eigen::Tensor; using Eigen::TensorMap; template -static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) -{ +static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) { IndexType sizeDim1 = 2; IndexType sizeDim2 = 3; IndexType sizeDim3 = 5; IndexType sizeDim4 = 7; array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - Tensor tensor(tensorRange); - Tensor no_shuffle(tensorRange); + Tensor tensor(tensorRange); + Tensor no_shuffle(tensorRange); tensor.setRandom(); - const size_t buffSize =tensor.size()*sizeof(DataType); + const size_t buffSize = tensor.size() * sizeof(DataType); array shuffles; shuffles[0] = 0; shuffles[1] = 1; shuffles[2] = 2; shuffles[3] = 3; - DataType* gpu_data1 = static_cast(sycl_device.allocate(buffSize)); - DataType* gpu_data2 = static_cast(sycl_device.allocate(buffSize)); - + DataType* gpu_data1 = static_cast(sycl_device.allocate(buffSize)); + DataType* gpu_data2 = static_cast(sycl_device.allocate(buffSize)); - TensorMap> gpu1(gpu_data1, tensorRange); - TensorMap> gpu2(gpu_data2, tensorRange); + TensorMap> gpu1(gpu_data1, + tensorRange); + TensorMap> gpu2(gpu_data2, + tensorRange); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize); - gpu2.device(sycl_device)=gpu1.shuffle(shuffles); + gpu2.device(sycl_device) = gpu1.shuffle(shuffles); sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize); sycl_device.synchronize(); @@ -68,7 +66,7 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType j = 0; j < sizeDim2; ++j) { for (IndexType k = 0; k < sizeDim3; ++k) { for (IndexType l = 0; l < sizeDim4; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + VERIFY_IS_EQUAL(tensor(i, j, k, l), no_shuffle(i, j, k, l)); } } } @@ -78,12 +76,14 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) shuffles[1] = 3; shuffles[2] = 1; shuffles[3] = 0; - array tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; - Tensor shuffle(tensorrangeShuffle); - DataType* gpu_data3 = static_cast(sycl_device.allocate(buffSize)); - TensorMap> gpu3(gpu_data3, tensorrangeShuffle); - - gpu3.device(sycl_device)=gpu1.shuffle(shuffles); + array tensorrangeShuffle = { + {sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; + Tensor shuffle(tensorrangeShuffle); + DataType* gpu_data3 = static_cast(sycl_device.allocate(buffSize)); + TensorMap> gpu3( + gpu_data3, tensorrangeShuffle); + + gpu3.device(sycl_device) = gpu1.shuffle(shuffles); sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize); sycl_device.synchronize(); @@ -96,24 +96,22 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType j = 0; j < sizeDim2; ++j) { for (IndexType k = 0; k < sizeDim3; ++k) { for (IndexType l = 0; l < sizeDim4; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + VERIFY_IS_EQUAL(tensor(i, j, k, l), shuffle(k, l, j, i)); } } } } } - -template void sycl_shuffling_test_per_device(dev_Selector s){ +template +void sycl_shuffling_test_per_device(dev_Selector s) { QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); test_simple_shuffling_sycl(sycl_device); test_simple_shuffling_sycl(sycl_device); - } -EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) -{ - for (const auto& device 
:Eigen::get_sycl_supported_devices()) { +EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(sycl_shuffling_test_per_device(device)); } } diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index 9357bed02..e6c5e2378 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ b/unsupported/test/cxx11_tensor_sycl.cpp @@ -29,9 +29,9 @@ using Eigen::TensorMap; template void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { - IndexType sizeDim1 = 100; - IndexType sizeDim2 = 10; - IndexType sizeDim3 = 20; + IndexType sizeDim1 = 5; + IndexType sizeDim2 = 5; + IndexType sizeDim3 = 1; array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; Tensor in1(tensorRange); Tensor out1(tensorRange); @@ -56,6 +56,7 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { sycl_device.synchronize(); for (IndexType i = 0; i < in1.size(); ++i) { + // std::cout << "SYCL DATA : " << out1(i) << " vs CPU DATA : " << in1(i) * 3.14f << "\n"; VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f); VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f); VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f); @@ -93,6 +94,88 @@ void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_data); } +template +void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) { + using tensor_type = Tensor; + IndexType full_size = 32; + IndexType half_size = full_size / 2; + array tensorRange = {{full_size}}; + tensor_type in1(tensorRange); + tensor_type out(tensorRange); + + DataType* gpu_data = static_cast(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap gpu1(gpu_data, tensorRange); + + in1 = in1.random(); + // Copy all data to device, then permute on copy back to host + sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + in1 = in1.random(); + out.setZero(); + // Permute copies to device, then copy all back to host + sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + in1 = in1.random(); + out.setZero(); + DataType* gpu_data_out = static_cast(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap gpu2(gpu_data_out, tensorRange); + // Copy all to device, permute copies on device, then copy all back to host + sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType)); + sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType)); + sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + 
sycl_device.deallocate(gpu_data_out); + sycl_device.deallocate(gpu_data); +} + +template +void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) { + using tensor_type = Tensor; + IndexType full_size = 32; + IndexType half_size = full_size / 2; + array tensorRange = {{full_size}}; + tensor_type cpu_out(tensorRange); + tensor_type out(tensorRange); + + cpu_out.setZero(); + + std::memset(cpu_out.data(), 0, half_size * sizeof(DataType)); + std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType)); + + DataType* gpu_data = static_cast(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap gpu1(gpu_data, tensorRange); + + sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType)); + sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < full_size; ++i) { + VERIFY_IS_APPROX(out(i), cpu_out(i)); + } + + sycl_device.deallocate(gpu_data); +} + template void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { @@ -262,6 +345,8 @@ template void sycl_computing_test_per_ test_sycl_mem_transfers(sycl_device); test_sycl_computations(sycl_device); test_sycl_mem_sync(sycl_device); + test_sycl_mem_sync_offsets(sycl_device); + test_sycl_memset_offsets(sycl_device); test_sycl_mem_transfers(sycl_device); test_sycl_computations(sycl_device); test_sycl_mem_sync(sycl_device); -- cgit v1.2.3
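The *_with_offset_sycl reduction tests and the memcpy/memset offset tests added by this patch all exercise the same idiom: re-mapping a sub-range of an already allocated buffer by constructing a TensorMap from data() + offset with a shrunken dimension array, so the reduction (or copy) never touches the skipped elements. Below is a minimal host-only sketch of that idiom, assuming only the unsupported Eigen Tensor headers on the include path and no SYCL device; the 64x64 shape and the offset of 64 mirror the values used in cxx11_tensor_reduction_sycl.cpp, and the variable names are illustrative rather than taken from the patch.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  using Eigen::Index;
  using Eigen::Tensor;
  using Eigen::TensorMap;

  // 64x64 tensor, column-major by default (same shape as the
  // *_with_offset_sycl tests above).
  Tensor<float, 2> in(64, 64);
  in.setRandom();
  in(0, 0) = -2.0f;  // sentinel value that the offset view must not see

  // Re-map the buffer starting 64 elements in. The mapped shape drops one
  // row (63 x 64) so that offset + view size still fits inside the original
  // 64 x 64 allocation: 64 + 63 * 64 == 64 * 64.
  const Index offset = 64;
  TensorMap<Tensor<float, 2>> in_offset(in.data() + offset, 63, 64);

  // Full reduction over the offset view; the sentinel at in(0, 0) sits in
  // the skipped leading 64 elements and is therefore excluded.
  Tensor<float, 0> min_val = in_offset.minimum();
  std::cout << "minimum over the offset view: " << min_val() << "\n";
  return 0;
}

This is also why the tests shrink tensor_offset_range[0] by one before building the offset TensorMap: without dropping a row, the offset view would read past the end of the original allocation. The device-side versions in the patch follow the same pattern, simply replacing in.data() + offset with gpu_in_data + offset on a buffer obtained from sycl_device.allocate().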