From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.

* Unifying all loadLocalTile calls from lhs and rhs into an extract_block function.
* Adding the get_tensor operation, which was missing in TensorContractionMapper.
* Adding the -D definition missing from CMake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in the SYCL device.
* Unifying the load to private registers for the tall/skinny no-shared-memory contraction.
* Unifying the load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining general tensor-vector and vector-tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local-memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake configuration.
---
 unsupported/test/cxx11_tensor_builtins_sycl.cpp | 497 ++++++++++++++----------
 1 file changed, 292 insertions(+), 205 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index db2975783..72cb62fd5 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -25,243 +25,330 @@ using Eigen::SyclDevice;
 using Eigen::Tensor;
 using Eigen::TensorMap;
-namespace std {
-template <typename T> T rsqrt(T x) { return 1 / std::sqrt(x); }
+// Functions used to compare the TensorMap implementation on the device with
+// the equivalent on the host
+namespace cl {
+namespace sycl {
+template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
 template <typename T> T square(T x) { return x * x; }
 template <typename T> T cube(T x) { return x * x * x; }
-template <typename T> T inverse(T x) { return 1 / x; }
+template <typename T> T inverse(T x) { return T(1) / x; }
+template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
+template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
 }
+}
+
+struct EqualAssignement {
+  template <typename Lhs, typename Rhs>
+  void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
+};
+
+struct PlusEqualAssignement {
+  template <typename Lhs, typename Rhs>
+  void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; }
+};
 
-#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \
-  { \
-    /* out OPERATOR in.FUNC() */ \
-    Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
-    in = in.random() + static_cast<SCALAR>(0.01); \
-    out = out.random() + static_cast<SCALAR>(0.01); \
-    Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
-    SCALAR *gpu_data = static_cast<SCALAR *>( \
-        sycl_device.allocate(in.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_out = static_cast<SCALAR *>( \
-        sycl_device.allocate(out.size() * sizeof(SCALAR))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
-                                   (in.size()) * sizeof(SCALAR)); \
-    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
-                                   (out.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(SCALAR)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      SCALAR ver = reference(i); \
-      ver OPERATOR std::FUNC(in(i)); \
-      VERIFY_IS_APPROX(out(i), ver); \
-    } \
-    sycl_device.deallocate(gpu_data); \
-    sycl_device.deallocate(gpu_data_out); \
-  } \
-  { \
-    /* out OPERATOR out.FUNC() */ \
-    Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
-    out = out.random() + static_cast<SCALAR>(0.01); \
-    Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
-    SCALAR *gpu_data_out = static_cast<SCALAR *>( \
-        sycl_device.allocate(out.size() * sizeof(SCALAR))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
-                                   (out.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(SCALAR)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      SCALAR ver = reference(i); \
-      ver OPERATOR std::FUNC(reference(i)); \
-      VERIFY_IS_APPROX(out(i), ver); \
-    } \
-    sycl_device.deallocate(gpu_data_out); \
+template <typename DataType, int DataLayout, typename Assignement,
+          typename Operator>
+void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
+                                    const array<int64_t, 3>& tensor_range) {
+  Operator op;
+  Assignement asgn;
+  {
+    /* Assignement(out, Operator(in)) */
+    Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+    in = in.random() + DataType(0.01);
+    out = out.random() + DataType(0.01);
+    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+    DataType *gpu_data = static_cast<DataType *>(
+        sycl_device.allocate(in.size() * sizeof(DataType)));
+    DataType *gpu_data_out = static_cast<DataType *>(
+        sycl_device.allocate(out.size() * sizeof(DataType)));
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+    sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+                                   (in.size()) * sizeof(DataType));
+    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+                                   (out.size()) * sizeof(DataType));
+    auto device_expr = gpu_out.device(sycl_device);
+    asgn(device_expr, op(gpu));
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                   (out.size()) * sizeof(DataType));
+    for (int64_t i = 0; i < out.size(); ++i) {
+      DataType ver = reference(i);
+      asgn(ver, op(in(i)));
+      VERIFY_IS_APPROX(out(i), ver);
+    }
+    sycl_device.deallocate(gpu_data);
+    sycl_device.deallocate(gpu_data_out);
   }
+  {
+    /* Assignement(out, Operator(out)) */
+    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+    out = out.random() + DataType(0.01);
+    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+    DataType *gpu_data_out = static_cast<DataType *>(
+        sycl_device.allocate(out.size() * sizeof(DataType)));
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+                                   (out.size()) * sizeof(DataType));
+    auto device_expr = gpu_out.device(sycl_device);
+    asgn(device_expr, op(gpu_out));
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                   (out.size()) * sizeof(DataType));
+    for (int64_t i = 0; i < out.size(); ++i) {
+      DataType ver = reference(i);
+      asgn(ver, op(reference(i)));
+      VERIFY_IS_APPROX(out(i), ver);
+    }
+    sycl_device.deallocate(gpu_data_out);
+  }
+}
+
+#define DECLARE_UNARY_STRUCT(FUNC) \
+  struct op_##FUNC { \
+    template <typename T> \
+    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \
+      return cl::sycl::FUNC(x); \
+    } \
+    template <typename T> \
+    auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
+      return x.FUNC(); \
+    } \
+  };
 
-#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \
-  TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout)
+DECLARE_UNARY_STRUCT(abs)
+DECLARE_UNARY_STRUCT(sqrt)
+DECLARE_UNARY_STRUCT(rsqrt)
+DECLARE_UNARY_STRUCT(square)
+DECLARE_UNARY_STRUCT(cube)
+DECLARE_UNARY_STRUCT(inverse)
+DECLARE_UNARY_STRUCT(tanh)
+DECLARE_UNARY_STRUCT(exp)
+DECLARE_UNARY_STRUCT(expm1)
+DECLARE_UNARY_STRUCT(log)
+DECLARE_UNARY_STRUCT(ceil)
+DECLARE_UNARY_STRUCT(floor)
+DECLARE_UNARY_STRUCT(round)
+DECLARE_UNARY_STRUCT(log1p)
+DECLARE_UNARY_STRUCT(sign)
+DECLARE_UNARY_STRUCT(isnan)
+DECLARE_UNARY_STRUCT(isfinite)
+DECLARE_UNARY_STRUCT(isinf)
 
-#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \
-  { \
-    /* out = in.FUNC() */ \
-    Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
-    Tensor<bool, 3, Layout, int64_t> out(tensorRange); \
-    in = in.random() + static_cast<SCALAR>(0.01); \
-    SCALAR *gpu_data = static_cast<SCALAR *>( \
-        sycl_device.allocate(in.size() * sizeof(SCALAR))); \
-    bool *gpu_data_out = \
-        static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
-    TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
-                                   (in.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) = gpu.FUNC(); \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(bool)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \
-    } \
-    sycl_device.deallocate(gpu_data); \
-    sycl_device.deallocate(gpu_data_out); \
+template <typename DataType, int DataLayout, typename Assignement>
+void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device,
+                                         const array<int64_t, 3>& tensor_range) {
+#define RUN_UNARY_TEST(FUNC) \
+  test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \
+                                 op_##FUNC>(sycl_device, tensor_range)
+  RUN_UNARY_TEST(abs);
+  RUN_UNARY_TEST(sqrt);
+  RUN_UNARY_TEST(rsqrt);
+  RUN_UNARY_TEST(square);
+  RUN_UNARY_TEST(cube);
+  RUN_UNARY_TEST(inverse);
+  RUN_UNARY_TEST(tanh);
+  RUN_UNARY_TEST(exp);
+  RUN_UNARY_TEST(expm1);
+  RUN_UNARY_TEST(log);
+  RUN_UNARY_TEST(ceil);
+  RUN_UNARY_TEST(floor);
+  RUN_UNARY_TEST(round);
+  RUN_UNARY_TEST(log1p);
+  RUN_UNARY_TEST(sign);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device,
+                                     const array<int64_t, 3>& tensor_range) {
+  /* out = op(in) */
+  Operator op;
+  Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+  Tensor<bool, 3, DataLayout, int64_t> out(tensor_range);
+  in = in.random() + DataType(0.01);
+  DataType *gpu_data = static_cast<DataType *>(
+      sycl_device.allocate(in.size() * sizeof(DataType)));
+  bool *gpu_data_out =
+      static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+  TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+                                 (in.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(bool));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_EQUAL(out(i), op(in(i)));
   }
+  sycl_device.deallocate(gpu_data);
+  sycl_device.deallocate(gpu_data_out);
+}
 
-#define TEST_UNARY_BUILTINS(SCALAR, Layout) \
-  TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \
-  TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \
-  TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \
-  TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \
-  TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout)
+template <typename DataType, int DataLayout>
+void test_unary_builtins(const Eigen::SyclDevice& sycl_device,
+                         const array<int64_t, 3>& tensor_range) {
+  test_unary_builtins_for_assignement<DataType, DataLayout,
+                                      PlusEqualAssignement>(sycl_device, tensor_range);
+  test_unary_builtins_for_assignement<DataType, DataLayout,
+                                      EqualAssignement>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout, op_isnan>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout, op_isfinite>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout, op_isinf>(sycl_device, tensor_range);
+}
 
+template <typename DataType>
 static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
   int64_t sizeDim1 = 10;
   int64_t sizeDim2 = 10;
   int64_t sizeDim3 = 10;
-  array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
 
-  TEST_UNARY_BUILTINS(float, RowMajor)
-  TEST_UNARY_BUILTINS(float, ColMajor)
+  test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+  test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
 }
 
-namespace std {
-template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); }
-template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device,
+                               const array<int64_t, 3>& tensor_range) {
+  /* out = op(in_1, in_2) */
+  Operator op;
+  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+  in_1 = in_1.random() + DataType(0.01);
+  in_2 = in_2.random() + DataType(0.01);
+  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+  DataType *gpu_data_1 = static_cast<DataType *>(
+      sycl_device.allocate(in_1.size() * sizeof(DataType)));
+  DataType *gpu_data_2 = static_cast<DataType *>(
+      sycl_device.allocate(in_2.size() * sizeof(DataType)));
+  DataType *gpu_data_out = static_cast<DataType *>(
+      sycl_device.allocate(out.size() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+                                 (in_1.size()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(),
+                                 (in_2.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu_1, gpu_2);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(DataType));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i)));
+  }
+  sycl_device.deallocate(gpu_data_1);
+  sycl_device.deallocate(gpu_data_2);
+  sycl_device.deallocate(gpu_data_out);
 }
 
-#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \
-  { \
-    /* out = in_1.FUNC(in_2) */ \
-    Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
-    in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
-    in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
-    Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
-    SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
-        sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
-        sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_out = static_cast<SCALAR *>( \
-        sycl_device.allocate(out.size() * sizeof(SCALAR))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
-                                   (in_1.size()) * sizeof(SCALAR)); \
-    sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
-                                   (in_2.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(SCALAR)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      SCALAR ver = reference(i); \
-      ver = std::FUNC(in_1(i), in_2(i)); \
-      VERIFY_IS_APPROX(out(i), ver); \
-    } \
-    sycl_device.deallocate(gpu_data_1); \
-    sycl_device.deallocate(gpu_data_2); \
-    sycl_device.deallocate(gpu_data_out); \
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
+                                     const array<int64_t, 3>& tensor_range) {
+  /* out = op(in_1, 2) */
+  Operator op;
+  const DataType arg2(2);
+  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+  in_1 = in_1.random();
+  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+  DataType *gpu_data_1 = static_cast<DataType *>(
+      sycl_device.allocate(in_1.size() * sizeof(DataType)));
+  DataType *gpu_data_out = static_cast<DataType *>(
+      sycl_device.allocate(out.size() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+                                 (in_1.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu_1, arg2);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(DataType));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), op(in_1(i), arg2));
   }
+  sycl_device.deallocate(gpu_data_1);
+  sycl_device.deallocate(gpu_data_out);
+}
 
-#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \
-  { \
-    /* out = in_1 OPERATOR in_2 */ \
-    Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
-    in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
-    in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
-    Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
-    SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
-        sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
-        sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_out = static_cast<SCALAR *>( \
-        sycl_device.allocate(out.size() * sizeof(SCALAR))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
-                                   (in_1.size()) * sizeof(SCALAR)); \
-    sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
-                                   (in_2.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(SCALAR)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \
-    } \
-    sycl_device.deallocate(gpu_data_1); \
-    sycl_device.deallocate(gpu_data_2); \
-    sycl_device.deallocate(gpu_data_out); \
-  }
+#define DECLARE_BINARY_STRUCT(FUNC) \
+  struct op_##FUNC { \
+    template <typename T1, typename T2> \
+    auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) { \
+      return cl::sycl::FUNC(x, y); \
+    } \
+    template <typename T1, typename T2> \
+    auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \
+      return x.FUNC(y); \
+    } \
+  };
 
-#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \
-  { \
-    /* out = in_1 OPERATOR 2 */ \
-    Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
-    Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
-    in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
-    Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
-    SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
-        sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
-    SCALAR *gpu_data_out = static_cast<SCALAR *>( \
-        sycl_device.allocate(out.size() * sizeof(SCALAR))); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
-    TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
-    sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
-                                   (in_1.size()) * sizeof(SCALAR)); \
-    gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \
-    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
-                                   (out.size()) * sizeof(SCALAR)); \
-    for (int64_t i = 0; i < out.size(); ++i) { \
-      VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \
-    } \
-    sycl_device.deallocate(gpu_data_1); \
-    sycl_device.deallocate(gpu_data_out); \
-  }
+DECLARE_BINARY_STRUCT(cwiseMax)
+DECLARE_BINARY_STRUCT(cwiseMin)
 
-#define TEST_BINARY_BUILTINS(SCALAR, Layout) \
-  TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \
-  TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \
-  TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \
-  TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \
-  TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \
-  TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout)
+#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \
+  struct op_##NAME { \
+    template <typename T1, typename T2> \
+    auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \
+      return x OPERATOR y; \
+    } \
+  };
+
+DECLARE_BINARY_STRUCT_OP(plus, +)
+DECLARE_BINARY_STRUCT_OP(minus, -)
+DECLARE_BINARY_STRUCT_OP(times, *)
+DECLARE_BINARY_STRUCT_OP(divide, /)
+DECLARE_BINARY_STRUCT_OP(modulo, %)
+
+template <typename DataType, int DataLayout>
+void test_binary_builtins(const Eigen::SyclDevice& sycl_device,
+                          const array<int64_t, 3>& tensor_range) {
+  test_binary_builtins_func<DataType, DataLayout, op_cwiseMax>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout, op_cwiseMin>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout, op_plus>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout, op_minus>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout, op_times>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout, op_divide>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+  test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+  test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
 
-static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+template <typename DataType>
+static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
   int64_t sizeDim1 = 10;
   int64_t sizeDim2 = 10;
   int64_t sizeDim3 = 10;
-  array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
-  TEST_BINARY_BUILTINS(float, RowMajor)
-  TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor)
-  TEST_BINARY_BUILTINS(float, ColMajor)
-  TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor)
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+  test_binary_builtins_fixed_arg2<DataType, RowMajor, op_modulo>(sycl_device, tensor_range);
+  test_binary_builtins_fixed_arg2<DataType, ColMajor, op_modulo>(sycl_device, tensor_range);
 }
 
 EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     QueueInterface queueInterface(device);
     Eigen::SyclDevice sycl_device(&queueInterface);
-    CALL_SUBTEST(test_builtin_unary_sycl(sycl_device));
-    CALL_SUBTEST(test_builtin_binary_sycl(sycl_device));
+    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
+    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
+    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
  }
}
--
cgit v1.2.3
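
Note on the pattern introduced by this patch: the per-builtin test macros are replaced by small functor structs (generated by DECLARE_UNARY_STRUCT / DECLARE_BINARY_STRUCT) whose operator() is overloaded once for plain scalars, used to compute the host-side reference, and once for tensor expressions, used to build the device-side expression. The sketch below is a host-only, SYCL-free illustration of that dual-dispatch idea under stated assumptions; the names op_sqrt, check_unary and the file name are illustrative and are not part of the patch.

// builtin_functor_sketch.cpp -- illustrative only; assumes an Eigen checkout
// on the include path, e.g.
//   g++ -std=c++14 -I<path-to-eigen> builtin_functor_sketch.cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>
#include <cmath>
#include <iostream>

// One functor, two overloads: the scalar overload computes the host
// reference, the tensor overload builds the Eigen expression that the
// SYCL version of the test would evaluate on the device.
struct op_sqrt {
  template <typename T>
  T operator()(const T& x) const { return std::sqrt(x); }
  template <typename Scalar, int Rank>
  auto operator()(const Eigen::Tensor<Scalar, Rank>& x) const
      -> decltype(x.sqrt()) {
    return x.sqrt();
  }
};

// Generic checker: evaluate the tensor expression, then compare every
// element against the scalar overload (the host reference).
template <typename Operator>
void check_unary(Operator op) {
  Eigen::Tensor<float, 3> in(4, 4, 4);
  in.setRandom();                  // values roughly in [-1, 1]
  in = in + in.constant(1.5f);     // shift to keep sqrt's argument positive
  Eigen::Tensor<float, 3> out = op(in);
  for (Eigen::Index i = 0; i < out.size(); ++i) {
    assert(std::abs(out.data()[i] - op(in.data()[i])) < 1e-5f);
  }
}

int main() {
  check_unary(op_sqrt{});
  std::cout << "unary functor check passed\n";
  return 0;
}

Because the functor carries both overloads, one templated test routine can be instantiated per operation and per layout, which is what lets the patch collapse the old macro lists into plain function calls.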