From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.

* Unifying all loadLocalTile calls from lhs and rhs into an extract_block function.
* Adding the get_tensor operation, which was missing in TensorContractionMapper.
* Adding the -D definition missing from CMake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in the SYCL device.
* Unifying the load to private registers for the tall/skinny no-shared-memory contraction.
* Unifying the load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining general tensor-vector and vector-tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local-memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake configuration.
---
 unsupported/test/cxx11_tensor_builtins_sycl.cpp | 497 ++++++++++++++----------
 1 file changed, 292 insertions(+), 205 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index db2975783..72cb62fd5 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -25,243 +25,330 @@ using Eigen::SyclDevice;
 using Eigen::Tensor;
 using Eigen::TensorMap;
-namespace std {
-template <typename T> T rsqrt(T x) { return 1 / std::sqrt(x); }
+// Functions used to compare the TensorMap implementation on the device with
+// the equivalent on the host
+namespace cl {
+namespace sycl {
+template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
 template <typename T> T square(T x) { return x * x; }
 template <typename T> T cube(T x) { return x * x * x; }
-template <typename T> T inverse(T x) { return 1 / x; }
+template <typename T> T inverse(T x) { return T(1) / x; }
+template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
+template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
 }
+}
+
+struct EqualAssignement {
+  template <typename Lhs, typename Rhs>
+  void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
+};
+
+struct PlusEqualAssignement {
+  template <typename Lhs, typename Rhs>
+  void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; }
+};
 
-#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \
-  { \
-    /* out OPERATOR in.FUNC() */ \
-    Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
-    in = in.random() + static_cast<SCALAR>(0.01); \
-    out = out.random() + static_cast<SCALAR>(0.01); \
-    Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
-    SCALAR *gpu_data = static_cast<SCALAR *>( \
-        sycl_device.allocate(in.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_out = static_cast<SCALAR *>( \
-        sycl_device.allocate(out.size() * sizeof(SCALAR))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
-                                   (in.size()) * sizeof(SCALAR)); \
-    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
-                                   (out.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(SCALAR)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      SCALAR ver = reference(i); \
-      ver OPERATOR std::FUNC(in(i)); \
-      VERIFY_IS_APPROX(out(i), ver); \
-    } \
-    sycl_device.deallocate(gpu_data); \
-    sycl_device.deallocate(gpu_data_out); \
-  } \
-  { \
-    /* out OPERATOR out.FUNC() */ \
-    Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
-    out = out.random() + static_cast<SCALAR>(0.01); \
-    Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
-    SCALAR *gpu_data_out = static_cast<SCALAR *>( \
-        sycl_device.allocate(out.size() * sizeof(SCALAR))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
-                                   (out.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(SCALAR)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      SCALAR ver = reference(i); \
-      ver OPERATOR std::FUNC(reference(i)); \
-      VERIFY_IS_APPROX(out(i), ver); \
-    } \
-    sycl_device.deallocate(gpu_data_out); \
+template <typename DataType, int DataLayout, typename Assignement,
+          typename Operator>
+void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
+                                    const array<int64_t, 3>& tensor_range) {
+  Operator op;
+  Assignement asgn;
+  {
+    /* Assignement(out, Operator(in)) */
+    Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+    in = in.random() + DataType(0.01);
+    out = out.random() + DataType(0.01);
+    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+    DataType *gpu_data = static_cast<DataType *>(
+        sycl_device.allocate(in.size() * sizeof(DataType)));
+    DataType *gpu_data_out = static_cast<DataType *>(
+        sycl_device.allocate(out.size() * sizeof(DataType)));
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+    sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+                                   (in.size()) * sizeof(DataType));
+    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+                                   (out.size()) * sizeof(DataType));
+    auto device_expr = gpu_out.device(sycl_device);
+    asgn(device_expr, op(gpu));
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                   (out.size()) * sizeof(DataType));
+    for (int64_t i = 0; i < out.size(); ++i) {
+      DataType ver = reference(i);
+      asgn(ver, op(in(i)));
+      VERIFY_IS_APPROX(out(i), ver);
+    }
+    sycl_device.deallocate(gpu_data);
+    sycl_device.deallocate(gpu_data_out);
   }
+  {
+    /* Assignement(out, Operator(out)) */
+    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+    out = out.random() + DataType(0.01);
+    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+    DataType *gpu_data_out = static_cast<DataType *>(
+        sycl_device.allocate(out.size() * sizeof(DataType)));
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+                                   (out.size()) * sizeof(DataType));
+    auto device_expr = gpu_out.device(sycl_device);
+    asgn(device_expr, op(gpu_out));
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                   (out.size()) * sizeof(DataType));
+    for (int64_t i = 0; i < out.size(); ++i) {
+      DataType ver = reference(i);
+      asgn(ver, op(reference(i)));
+      VERIFY_IS_APPROX(out(i), ver);
+    }
+    sycl_device.deallocate(gpu_data_out);
+  }
+}
+
+#define DECLARE_UNARY_STRUCT(FUNC) \
+  struct op_##FUNC { \
+    template <typename T> \
+    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \
+      return cl::sycl::FUNC(x); \
+    } \
+    template <typename T> \
+    auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
+      return x.FUNC(); \
+    } \
+  };
 
-#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout)
+DECLARE_UNARY_STRUCT(abs)
+DECLARE_UNARY_STRUCT(sqrt)
+DECLARE_UNARY_STRUCT(rsqrt)
+DECLARE_UNARY_STRUCT(square)
+DECLARE_UNARY_STRUCT(cube)
+DECLARE_UNARY_STRUCT(inverse)
+DECLARE_UNARY_STRUCT(tanh)
+DECLARE_UNARY_STRUCT(exp)
+DECLARE_UNARY_STRUCT(expm1)
+DECLARE_UNARY_STRUCT(log)
+DECLARE_UNARY_STRUCT(ceil)
+DECLARE_UNARY_STRUCT(floor)
+DECLARE_UNARY_STRUCT(round)
+DECLARE_UNARY_STRUCT(log1p)
+DECLARE_UNARY_STRUCT(sign)
+DECLARE_UNARY_STRUCT(isnan)
+DECLARE_UNARY_STRUCT(isfinite)
+DECLARE_UNARY_STRUCT(isinf)
 
-#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \
-  { \
-    /* out = in.FUNC() */ \
-    Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
-    Tensor<bool, 3, Layout, int64_t> out(tensorRange); \
-    in = in.random() + static_cast<SCALAR>(0.01); \
-    SCALAR *gpu_data = static_cast<SCALAR *>( \
-        sycl_device.allocate(in.size() * sizeof(SCALAR))); \
-    bool *gpu_data_out = \
-        static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
-    TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
-                                   (in.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) = gpu.FUNC(); \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(bool)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \
-    } \
-    sycl_device.deallocate(gpu_data); \
-    sycl_device.deallocate(gpu_data_out); \
+template <typename DataType, int DataLayout, typename Assignement>
+void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device,
+                                         const array<int64_t, 3>& tensor_range) {
+#define RUN_UNARY_TEST(FUNC) \
+  test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \
+                                 op_##FUNC>(sycl_device, tensor_range)
+  RUN_UNARY_TEST(abs);
+  RUN_UNARY_TEST(sqrt);
+  RUN_UNARY_TEST(rsqrt);
+  RUN_UNARY_TEST(square);
+  RUN_UNARY_TEST(cube);
+  RUN_UNARY_TEST(inverse);
+  RUN_UNARY_TEST(tanh);
+  RUN_UNARY_TEST(exp);
+  RUN_UNARY_TEST(expm1);
+  RUN_UNARY_TEST(log);
+  RUN_UNARY_TEST(ceil);
+  RUN_UNARY_TEST(floor);
+  RUN_UNARY_TEST(round);
+  RUN_UNARY_TEST(log1p);
+  RUN_UNARY_TEST(sign);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device,
+                                     const array<int64_t, 3>& tensor_range) {
+  /* out = op(in) */
+  Operator op;
+  Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+  Tensor<bool, 3, DataLayout, int64_t> out(tensor_range);
+  in = in.random() + DataType(0.01);
+  DataType *gpu_data = static_cast<DataType *>(
+      sycl_device.allocate(in.size() * sizeof(DataType)));
+  bool *gpu_data_out =
+      static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+  TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+                                 (in.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(bool));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_EQUAL(out(i), op(in(i)));
   }
+  sycl_device.deallocate(gpu_data);
+  sycl_device.deallocate(gpu_data_out);
+}
 
-#define TEST_UNARY_BUILTINS(SCALAR, Layout) \
-  TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \
-  TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \
-  TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \
-  TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \
-  TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout)
+template <typename DataType, int DataLayout>
+void test_unary_builtins(const Eigen::SyclDevice& sycl_device,
+                         const array<int64_t, 3>& tensor_range) {
+  test_unary_builtins_for_assignement<DataType, DataLayout,
+                                      PlusEqualAssignement>(sycl_device, tensor_range);
+  test_unary_builtins_for_assignement<DataType, DataLayout,
+                                      EqualAssignement>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout, op_isnan>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout, op_isfinite>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout, op_isinf>(sycl_device, tensor_range);
+}
 
+template <typename DataType>
 static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
   int64_t sizeDim1 = 10;
   int64_t sizeDim2 = 10;
   int64_t sizeDim3 = 10;
-  array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
 
-  TEST_UNARY_BUILTINS(float, RowMajor)
-  TEST_UNARY_BUILTINS(float, ColMajor)
+  test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+  test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
 }
 
-namespace std {
-template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); }
-template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device,
+                               const array<int64_t, 3>& tensor_range) {
+  /* out = op(in_1, in_2) */
+  Operator op;
+  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+  in_1 = in_1.random() + DataType(0.01);
+  in_2 = in_2.random() + DataType(0.01);
+  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+  DataType *gpu_data_1 = static_cast<DataType *>(
+      sycl_device.allocate(in_1.size() * sizeof(DataType)));
+  DataType *gpu_data_2 = static_cast<DataType *>(
+      sycl_device.allocate(in_2.size() * sizeof(DataType)));
+  DataType *gpu_data_out = static_cast<DataType *>(
+      sycl_device.allocate(out.size() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+                                 (in_1.size()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(),
+                                 (in_2.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu_1, gpu_2);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(DataType));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i)));
+  }
+  sycl_device.deallocate(gpu_data_1);
+  sycl_device.deallocate(gpu_data_2);
+  sycl_device.deallocate(gpu_data_out);
 }
 
-#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \
-  { \
-    /* out = in_1.FUNC(in_2) */ \
-    Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
-    in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
-    in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
-    Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
-    SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
-        sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
-        sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_out = static_cast<SCALAR *>( \
-        sycl_device.allocate(out.size() * sizeof(SCALAR))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
-                                   (in_1.size()) * sizeof(SCALAR)); \
-    sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
-                                   (in_2.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(SCALAR)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      SCALAR ver = reference(i); \
-      ver = std::FUNC(in_1(i), in_2(i)); \
-      VERIFY_IS_APPROX(out(i), ver); \
-    } \
-    sycl_device.deallocate(gpu_data_1); \
-    sycl_device.deallocate(gpu_data_2); \
-    sycl_device.deallocate(gpu_data_out); \
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
+                                     const array<int64_t, 3>& tensor_range) {
+  /* out = op(in_1, 2) */
+  Operator op;
+  const DataType arg2(2);
+  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+  in_1 = in_1.random();
+  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+  DataType *gpu_data_1 = static_cast<DataType *>(
+      sycl_device.allocate(in_1.size() * sizeof(DataType)));
+  DataType *gpu_data_out = static_cast<DataType *>(
+      sycl_device.allocate(out.size() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+                                 (in_1.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu_1, arg2);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(DataType));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), op(in_1(i), arg2));
   }
+  sycl_device.deallocate(gpu_data_1);
+  sycl_device.deallocate(gpu_data_out);
+}
 
-#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \
-  { \
-    /* out = in_1 OPERATOR in_2 */ \
-    Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
-    in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
-    in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
-    Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
-    SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
-        sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
-        sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_out = static_cast<SCALAR *>( \
-        sycl_device.allocate(out.size() * sizeof(SCALAR))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
-                                   (in_1.size()) * sizeof(SCALAR)); \
-    sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
-                                   (in_2.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(SCALAR)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \
-    } \
-    sycl_device.deallocate(gpu_data_1); \
-    sycl_device.deallocate(gpu_data_2); \
-    sycl_device.deallocate(gpu_data_out); \
-  }
+#define DECLARE_BINARY_STRUCT(FUNC) \
+  struct op_##FUNC { \
+    template <typename T1, typename T2> \
+    auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) { \
+      return cl::sycl::FUNC(x, y); \
+    } \
+    template <typename T1, typename T2> \
+    auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \
+      return x.FUNC(y); \
+    } \
+  };
 
-#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \
-  { \
-    /* out = in_1 OPERATOR 2 */ \
-    Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
-    in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
-    Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
-    SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
-        sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_out = static_cast<SCALAR *>( \
-        sycl_device.allocate(out.size() * sizeof(SCALAR))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
-                                   (in_1.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(SCALAR)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \
-    } \
-    sycl_device.deallocate(gpu_data_1); \
-    sycl_device.deallocate(gpu_data_out); \
-  }
+DECLARE_BINARY_STRUCT(cwiseMax)
+DECLARE_BINARY_STRUCT(cwiseMin)
 
-#define TEST_BINARY_BUILTINS(SCALAR, Layout) \
-  TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \
-  TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \
-  TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \
-  TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \
-  TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \
-  TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout)
+#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \
+  struct op_##NAME { \
+    template <typename T1, typename T2> \
+    auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \
+      return x OPERATOR y; \
+    } \
+  };
+
+DECLARE_BINARY_STRUCT_OP(plus, +)
+DECLARE_BINARY_STRUCT_OP(minus, -)
+DECLARE_BINARY_STRUCT_OP(times, *)
+DECLARE_BINARY_STRUCT_OP(divide, /)
+DECLARE_BINARY_STRUCT_OP(modulo, %)
+
+template <typename DataType, int DataLayout>
+void test_binary_builtins(const Eigen::SyclDevice& sycl_device,
+                          const array<int64_t, 3>& tensor_range) {
+  test_binary_builtins_func<DataType, DataLayout, op_cwiseMax>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout, op_cwiseMin>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout, op_plus>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout, op_minus>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout, op_times>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout, op_divide>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+  test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+  test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
 
-static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+template <typename DataType>
+static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
   int64_t sizeDim1 = 10;
   int64_t sizeDim2 = 10;
   int64_t sizeDim3 = 10;
-  array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
-  TEST_BINARY_BUILTINS(float, RowMajor)
-  TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor)
-  TEST_BINARY_BUILTINS(float, ColMajor)
-  TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor)
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+  test_binary_builtins_fixed_arg2<DataType, RowMajor, op_modulo>(sycl_device, tensor_range);
+  test_binary_builtins_fixed_arg2<DataType, ColMajor, op_modulo>(sycl_device, tensor_range);
 }
 
 EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     QueueInterface queueInterface(device);
     Eigen::SyclDevice sycl_device(&queueInterface);
-    CALL_SUBTEST(test_builtin_unary_sycl(sycl_device));
-    CALL_SUBTEST(test_builtin_binary_sycl(sycl_device));
+    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
+    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
+    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
  }
}
--
cgit v1.2.3
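
Note on the pattern introduced by this patch: the per-builtin test macros are replaced by small functor structs (generated by DECLARE_UNARY_STRUCT / DECLARE_BINARY_STRUCT) whose operator() is overloaded once for plain scalars, used to compute the host-side reference, and once for tensor expressions, used to build the device-side expression. The sketch below is a host-only, SYCL-free illustration of that dual-dispatch idea under stated assumptions; the names op_sqrt, check_unary and the file name are illustrative and are not part of the patch.

// builtin_functor_sketch.cpp -- illustrative only; assumes an Eigen checkout
// on the include path, e.g.
//   g++ -std=c++14 -I<path-to-eigen> builtin_functor_sketch.cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>
#include <cmath>
#include <iostream>

// One functor, two overloads: the scalar overload computes the host
// reference, the tensor overload builds the Eigen expression that the
// SYCL version of the test would evaluate on the device.
struct op_sqrt {
  template <typename T>
  T operator()(const T& x) const { return std::sqrt(x); }
  template <typename Scalar, int Rank>
  auto operator()(const Eigen::Tensor<Scalar, Rank>& x) const
      -> decltype(x.sqrt()) {
    return x.sqrt();
  }
};

// Generic checker: evaluate the tensor expression, then compare every
// element against the scalar overload (the host reference).
template <typename Operator>
void check_unary(Operator op) {
  Eigen::Tensor<float, 3> in(4, 4, 4);
  in.setRandom();                  // values roughly in [-1, 1]
  in = in + in.constant(1.5f);     // shift to keep sqrt's argument positive
  Eigen::Tensor<float, 3> out = op(in);
  for (Eigen::Index i = 0; i < out.size(); ++i) {
    assert(std::abs(out.data()[i] - op(in.data()[i])) < 1e-5f);
  }
}

int main() {
  check_unary(op_sqrt{});
  std::cout << "unary functor check passed\n";
  return 0;
}

Because the functor carries both overloads, one templated test routine can be instantiated per operation and per layout, which is what lets the patch collapse the old macro lists into plain function calls.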