author    Mehdi Goli <mehdi.goli@codeplay.com>  2019-11-28 10:08:54 +0000
committer Mehdi Goli <mehdi.goli@codeplay.com>  2019-11-28 10:08:54 +0000
commit    00f32752f7d0b193c6788691c3cf0b76457a044d
tree      792e46110f0751ea8802fa9d403d1472d5977ac3
parent    ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8
[SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.
* Unifying all loadLocalTile calls from lhs and rhs into an extract_block function.
* Adding the get_tensor operation, which was missing from TensorContractionMapper.
* Adding the -D option missing from CMake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in the SYCL device.
* Unifying the load to private registers for the tall/skinny no-shared-memory case.
* Unifying the load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing the Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining general tensor-vector and vector-tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local-memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake configuration.
Diffstat (limited to 'unsupported/test/cxx11_tensor_builtins_sycl.cpp')
-rw-r--r--  unsupported/test/cxx11_tensor_builtins_sycl.cpp  497
1 file changed, 292 insertions(+), 205 deletions(-)
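
The heart of the change, visible in the diff below, is replacing the old macro-generated test bodies with templated test functions parameterized by small functor types: one functor selects the operation under test, another selects the assignment operator (= versus +=). As a minimal sketch of how the pieces compose (names taken from the diff; device and range setup elided), a RUN_UNARY_TEST(sqrt) invocation boils down to:

    // Run the sqrt test with plain assignment, row-major layout, float data.
    // op_sqrt is generated by DECLARE_UNARY_STRUCT(sqrt) in the diff below.
    test_unary_builtins_for_scalar<float, RowMajor, EqualAssignement, op_sqrt>(
        sycl_device, tensor_range);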
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index db2975783..72cb62fd5 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -25,243 +25,330 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
-namespace std {
-template <typename T> T rsqrt(T x) { return 1 / std::sqrt(x); }
+// Functions used to compare the TensorMap implementation on the device with
+// the equivalent on the host
+namespace cl {
+namespace sycl {
+template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
template <typename T> T square(T x) { return x * x; }
template <typename T> T cube(T x) { return x * x * x; }
-template <typename T> T inverse(T x) { return 1 / x; }
+template <typename T> T inverse(T x) { return T(1) / x; }
+template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
+template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
}
+}
+
+struct EqualAssignement {
+ template <typename Lhs, typename Rhs>
+ void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
+};
+
+struct PlusEqualAssignement {
+ template <typename Lhs, typename Rhs>
+ void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; }
+};
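+
+// These assignment functors replace the OPERATOR parameter of the old
+// TEST_UNARY_BUILTINS_FOR_SCALAR macro: the same templated test body can now
+// exercise both "lhs = rhs" and "lhs += rhs" without token pasting.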
-#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \
- { \
- /* out OPERATOR in.FUNC() */ \
- Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
- in = in.random() + static_cast<SCALAR>(0.01); \
- out = out.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
- SCALAR *gpu_data = static_cast<SCALAR *>( \
- sycl_device.allocate(in.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_out = static_cast<SCALAR *>( \
- sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
- (in.size()) * sizeof(SCALAR)); \
- sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
- (out.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(SCALAR)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- SCALAR ver = reference(i); \
- ver OPERATOR std::FUNC(in(i)); \
- VERIFY_IS_APPROX(out(i), ver); \
- } \
- sycl_device.deallocate(gpu_data); \
- sycl_device.deallocate(gpu_data_out); \
- } \
- { \
- /* out OPERATOR out.FUNC() */ \
- Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
- out = out.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
- SCALAR *gpu_data_out = static_cast<SCALAR *>( \
- sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
- (out.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(SCALAR)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- SCALAR ver = reference(i); \
- ver OPERATOR std::FUNC(reference(i)); \
- VERIFY_IS_APPROX(out(i), ver); \
- } \
- sycl_device.deallocate(gpu_data_out); \
+template <typename DataType, int DataLayout,
+ typename Assignement, typename Operator>
+void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ Operator op;
+ Assignement asgn;
+ {
+ /* Assignement(out, Operator(in)) */
+ Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+ Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+ in = in.random() + DataType(0.01);
+ out = out.random() + DataType(0.01);
+ Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+ DataType *gpu_data = static_cast<DataType *>(
+ sycl_device.allocate(in.size() * sizeof(DataType)));
+ DataType *gpu_data_out = static_cast<DataType *>(
+ sycl_device.allocate(out.size() * sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+ sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+ (in.size()) * sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+ (out.size()) * sizeof(DataType));
+ auto device_expr = gpu_out.device(sycl_device);
+ asgn(device_expr, op(gpu));
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+ (out.size()) * sizeof(DataType));
+ for (int64_t i = 0; i < out.size(); ++i) {
+ DataType ver = reference(i);
+ asgn(ver, op(in(i)));
+ VERIFY_IS_APPROX(out(i), ver);
+ }
+ sycl_device.deallocate(gpu_data);
+ sycl_device.deallocate(gpu_data_out);
}
+ {
+ /* Assignement(out, Operator(out)) */
+ Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+ out = out.random() + DataType(0.01);
+ Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+ DataType *gpu_data_out = static_cast<DataType *>(
+ sycl_device.allocate(out.size() * sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+ sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+ (out.size()) * sizeof(DataType));
+ auto device_expr = gpu_out.device(sycl_device);
+ asgn(device_expr, op(gpu_out));
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+ (out.size()) * sizeof(DataType));
+ for (int64_t i = 0; i < out.size(); ++i) {
+ DataType ver = reference(i);
+ asgn(ver, op(reference(i)));
+ VERIFY_IS_APPROX(out(i), ver);
+ }
+ sycl_device.deallocate(gpu_data_out);
+ }
+}
+
+#define DECLARE_UNARY_STRUCT(FUNC) \
+ struct op_##FUNC { \
+ template <typename T> \
+ auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \
+ return cl::sycl::FUNC(x); \
+ } \
+ template <typename T> \
+ auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
+ return x.FUNC(); \
+ } \
+ };
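+
+// For reference, DECLARE_UNARY_STRUCT(sqrt) expands to:
+//
+//   struct op_sqrt {
+//     // host-side reference value
+//     template <typename T>
+//     auto operator()(const T& x) -> decltype(cl::sycl::sqrt(x)) { return cl::sycl::sqrt(x); }
+//     // device-side tensor expression
+//     template <typename T>
+//     auto operator()(const TensorMap<T>& x) -> decltype(x.sqrt()) { return x.sqrt(); }
+//   };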
-#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout)
+DECLARE_UNARY_STRUCT(abs)
+DECLARE_UNARY_STRUCT(sqrt)
+DECLARE_UNARY_STRUCT(rsqrt)
+DECLARE_UNARY_STRUCT(square)
+DECLARE_UNARY_STRUCT(cube)
+DECLARE_UNARY_STRUCT(inverse)
+DECLARE_UNARY_STRUCT(tanh)
+DECLARE_UNARY_STRUCT(exp)
+DECLARE_UNARY_STRUCT(expm1)
+DECLARE_UNARY_STRUCT(log)
+DECLARE_UNARY_STRUCT(ceil)
+DECLARE_UNARY_STRUCT(floor)
+DECLARE_UNARY_STRUCT(round)
+DECLARE_UNARY_STRUCT(log1p)
+DECLARE_UNARY_STRUCT(sign)
+DECLARE_UNARY_STRUCT(isnan)
+DECLARE_UNARY_STRUCT(isfinite)
+DECLARE_UNARY_STRUCT(isinf)
-#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \
- { \
- /* out = in.FUNC() */ \
- Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
- Tensor<bool, 3, Layout, int64_t> out(tensorRange); \
- in = in.random() + static_cast<SCALAR>(0.01); \
- SCALAR *gpu_data = static_cast<SCALAR *>( \
- sycl_device.allocate(in.size() * sizeof(SCALAR))); \
- bool *gpu_data_out = \
- static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
- TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
- (in.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) = gpu.FUNC(); \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(bool)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \
- } \
- sycl_device.deallocate(gpu_data); \
- sycl_device.deallocate(gpu_data_out); \
+template <typename DataType, int DataLayout, typename Assignement>
+void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+#define RUN_UNARY_TEST(FUNC) \
+ test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \
+ op_##FUNC>(sycl_device, tensor_range)
+ RUN_UNARY_TEST(abs);
+ RUN_UNARY_TEST(sqrt);
+ RUN_UNARY_TEST(rsqrt);
+ RUN_UNARY_TEST(square);
+ RUN_UNARY_TEST(cube);
+ RUN_UNARY_TEST(inverse);
+ RUN_UNARY_TEST(tanh);
+ RUN_UNARY_TEST(exp);
+ RUN_UNARY_TEST(expm1);
+ RUN_UNARY_TEST(log);
+ RUN_UNARY_TEST(ceil);
+ RUN_UNARY_TEST(floor);
+ RUN_UNARY_TEST(round);
+ RUN_UNARY_TEST(log1p);
+ RUN_UNARY_TEST(sign);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ /* out = op(in) */
+ Operator op;
+ Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+ Tensor<bool, 3, DataLayout, int64_t> out(tensor_range);
+ in = in.random() + DataType(0.01);
+ DataType *gpu_data = static_cast<DataType *>(
+ sycl_device.allocate(in.size() * sizeof(DataType)));
+ bool *gpu_data_out =
+ static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool)));
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+ TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+ sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+ (in.size()) * sizeof(DataType));
+ gpu_out.device(sycl_device) = op(gpu);
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+ (out.size()) * sizeof(bool));
+ for (int64_t i = 0; i < out.size(); ++i) {
+ VERIFY_IS_EQUAL(out(i), op(in(i)));
}
+ sycl_device.deallocate(gpu_data);
+ sycl_device.deallocate(gpu_data_out);
+}
-#define TEST_UNARY_BUILTINS(SCALAR, Layout) \
- TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \
- TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \
- TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \
- TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \
- TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout)
+template <typename DataType, int DataLayout>
+void test_unary_builtins(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ test_unary_builtins_for_assignement<DataType, DataLayout,
+ PlusEqualAssignement>(sycl_device, tensor_range);
+ test_unary_builtins_for_assignement<DataType, DataLayout,
+ EqualAssignement>(sycl_device, tensor_range);
+ test_unary_builtins_return_bool<DataType, DataLayout,
+ op_isnan>(sycl_device, tensor_range);
+ test_unary_builtins_return_bool<DataType, DataLayout,
+ op_isfinite>(sycl_device, tensor_range);
+ test_unary_builtins_return_bool<DataType, DataLayout,
+ op_isinf>(sycl_device, tensor_range);
+}
+template <typename DataType>
static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
int64_t sizeDim1 = 10;
int64_t sizeDim2 = 10;
int64_t sizeDim3 = 10;
- array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
- TEST_UNARY_BUILTINS(float, RowMajor)
- TEST_UNARY_BUILTINS(float, ColMajor)
+ test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+ test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
}
-namespace std {
-template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); }
-template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ /* out = op(in_1, in_2) */
+ Operator op;
+ Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+ Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range);
+ Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+ in_1 = in_1.random() + DataType(0.01);
+ in_2 = in_2.random() + DataType(0.01);
+ Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+ DataType *gpu_data_1 = static_cast<DataType *>(
+ sycl_device.allocate(in_1.size() * sizeof(DataType)));
+ DataType *gpu_data_2 = static_cast<DataType *>(
+ sycl_device.allocate(in_2.size() * sizeof(DataType)));
+ DataType *gpu_data_out = static_cast<DataType *>(
+ sycl_device.allocate(out.size() * sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range);
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+ (in_1.size()) * sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(),
+ (in_2.size()) * sizeof(DataType));
+ gpu_out.device(sycl_device) = op(gpu_1, gpu_2);
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+ (out.size()) * sizeof(DataType));
+ for (int64_t i = 0; i < out.size(); ++i) {
+ VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i)));
+ }
+ sycl_device.deallocate(gpu_data_1);
+ sycl_device.deallocate(gpu_data_2);
+ sycl_device.deallocate(gpu_data_out);
}
-#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \
- { \
- /* out = in_1.FUNC(in_2) */ \
- Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
- in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
- in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
- SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
- sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
- sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_out = static_cast<SCALAR *>( \
- sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
- (in_1.size()) * sizeof(SCALAR)); \
- sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
- (in_2.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(SCALAR)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- SCALAR ver = reference(i); \
- ver = std::FUNC(in_1(i), in_2(i)); \
- VERIFY_IS_APPROX(out(i), ver); \
- } \
- sycl_device.deallocate(gpu_data_1); \
- sycl_device.deallocate(gpu_data_2); \
- sycl_device.deallocate(gpu_data_out); \
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ /* out = op(in_1, 2) */
+ Operator op;
+ const DataType arg2(2);
+ Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+ Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+ in_1 = in_1.random();
+ Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+ DataType *gpu_data_1 = static_cast<DataType *>(
+ sycl_device.allocate(in_1.size() * sizeof(DataType)));
+ DataType *gpu_data_out = static_cast<DataType *>(
+ sycl_device.allocate(out.size() * sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+ (in_1.size()) * sizeof(DataType));
+ gpu_out.device(sycl_device) = op(gpu_1, arg2);
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+ (out.size()) * sizeof(DataType));
+ for (int64_t i = 0; i < out.size(); ++i) {
+ VERIFY_IS_APPROX(out(i), op(in_1(i), arg2));
}
+ sycl_device.deallocate(gpu_data_1);
+ sycl_device.deallocate(gpu_data_out);
+}
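+
+// This fixed-second-argument variant is used below with op_modulo on integer
+// data, where the second operand is the scalar 2 rather than a second tensor.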
-#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \
- { \
- /* out = in_1 OPERATOR in_2 */ \
- Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
- in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
- in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
- SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
- sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
- sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_out = static_cast<SCALAR *>( \
- sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
- (in_1.size()) * sizeof(SCALAR)); \
- sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
- (in_2.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(SCALAR)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \
- } \
- sycl_device.deallocate(gpu_data_1); \
- sycl_device.deallocate(gpu_data_2); \
- sycl_device.deallocate(gpu_data_out); \
- }
+#define DECLARE_BINARY_STRUCT(FUNC) \
+ struct op_##FUNC { \
+ template <typename T1, typename T2> \
+ auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) { \
+ return cl::sycl::FUNC(x, y); \
+ } \
+ template <typename T1, typename T2> \
+ auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \
+ return x.FUNC(y); \
+ } \
+ };
-#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \
- { \
- /* out = in_1 OPERATOR 2 */ \
- Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
- in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
- SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
- sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_out = static_cast<SCALAR *>( \
- sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
- (in_1.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(SCALAR)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \
- } \
- sycl_device.deallocate(gpu_data_1); \
- sycl_device.deallocate(gpu_data_out); \
- }
+DECLARE_BINARY_STRUCT(cwiseMax)
+DECLARE_BINARY_STRUCT(cwiseMin)
-#define TEST_BINARY_BUILTINS(SCALAR, Layout) \
- TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \
- TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout)
+#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \
+ struct op_##NAME { \
+ template <typename T1, typename T2> \
+ auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \
+ return x OPERATOR y; \
+ } \
+ };
+
+DECLARE_BINARY_STRUCT_OP(plus, +)
+DECLARE_BINARY_STRUCT_OP(minus, -)
+DECLARE_BINARY_STRUCT_OP(times, *)
+DECLARE_BINARY_STRUCT_OP(divide, /)
+DECLARE_BINARY_STRUCT_OP(modulo, %)
+
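+// For reference, DECLARE_BINARY_STRUCT_OP(plus, +) expands to:
+//
+//   struct op_plus {
+//     template <typename T1, typename T2>
+//     auto operator()(const T1& x, const T2& y) -> decltype(x + y) { return x + y; }
+//   };
+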
+template <typename DataType, int DataLayout>
+void test_binary_builtins(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ test_binary_builtins_func<DataType, DataLayout,
+ op_cwiseMax>(sycl_device, tensor_range);
+ test_binary_builtins_func<DataType, DataLayout,
+ op_cwiseMin>(sycl_device, tensor_range);
+ test_binary_builtins_func<DataType, DataLayout,
+ op_plus>(sycl_device, tensor_range);
+ test_binary_builtins_func<DataType, DataLayout,
+ op_minus>(sycl_device, tensor_range);
+ test_binary_builtins_func<DataType, DataLayout,
+ op_times>(sycl_device, tensor_range);
+ test_binary_builtins_func<DataType, DataLayout,
+ op_divide>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+ int64_t sizeDim1 = 10;
+ int64_t sizeDim2 = 10;
+ int64_t sizeDim3 = 10;
+ array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+ test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+ test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
-static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+template <typename DataType>
+static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
int64_t sizeDim1 = 10;
int64_t sizeDim2 = 10;
int64_t sizeDim3 = 10;
- array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- TEST_BINARY_BUILTINS(float, RowMajor)
- TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor)
- TEST_BINARY_BUILTINS(float, ColMajor)
- TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor)
+ array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+ test_binary_builtins_fixed_arg2<DataType, RowMajor,
+ op_modulo>(sycl_device, tensor_range);
+ test_binary_builtins_fixed_arg2<DataType, ColMajor,
+ op_modulo>(sycl_device, tensor_range);
}
EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
QueueInterface queueInterface(device);
Eigen::SyclDevice sycl_device(&queueInterface);
- CALL_SUBTEST(test_builtin_unary_sycl(sycl_device));
- CALL_SUBTEST(test_builtin_binary_sycl(sycl_device));
+ CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
+ CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
+ CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
}
}