Diffstat (limited to 'unsupported/test/cxx11_tensor_reduction_sycl.cpp')
-rw-r--r-- | unsupported/test/cxx11_tensor_reduction_sycl.cpp | 941
1 file changed, 887 insertions, 54 deletions
diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index f526299c6..a297716e4 100644
--- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -16,16 +16,99 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
+#define EIGEN_HAS_CONSTEXPR 1
 
 #include "main.h"
+
 #include <unsupported/Eigen/CXX11/Tensor>
 
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 753;
+  const IndexType num_cols = 537;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+  array<IndexType, 2> outRange = {{1, 1}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
+  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);
+
+  in.setRandom();
+  auto dim = DSizes<IndexType, 2>(1, 1);
+  full_redux = in.sum().reshape(dim);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(
+      sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data,
+                                                                outRange);
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
+  sycl_device.memcpyDeviceToHost(
+      full_redux_gpu.data(), gpu_out_data,
+      (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+ std::cout << "SYCL FULL :" << full_redux_gpu(0, 0) + << ", CPU FULL: " << full_redux(0, 0) << "\n"; + VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0)); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template <typename DataType, int DataLayout, typename IndexType> -static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_sum_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.sum(); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - const IndexType num_rows = 452; - const IndexType num_cols = 765; + // Check that the CPU and GPU reductions return the same result. 
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_max_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 4096;
+  const IndexType num_cols = 4096;
   array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
 
   Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
@@ -34,27 +117,250 @@ static void test_full_reductions_mean_sycl(const Eigen::SyclDevice
   in.setRandom();
 
-  full_redux = in.mean();
+  full_redux = in.maximum();
 
-  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
-  DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
 
-  TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set the initial value to be the max.
+  // As we don't include this in the reduction the result should not be 2.
+  in(0) = static_cast<DataType>(2);
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.maximum();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
-  out_gpu.device(sycl_device) = in_gpu.mean();
-  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
   // Check that the CPU and GPU reductions return the same result.
   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
 }
 
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 4096;
+  const IndexType num_cols = 4096;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+  array<IndexType, 1> argRange = {{num_cols}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  // red_axis[1]=1;
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+  in.setRandom();
+  in_arg1.setRandom();
+  in_arg2.setRandom();
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
+      in_arg1.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
+      in_arg2.dimensions().TotalSize() * sizeof(DataType)));
+  bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
+      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
+  bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
+      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(
+      gpu_in_arg1_data, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(
+      gpu_in_arg2_data, tensorRange);
+  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(
+      gpu_out_arg_data, argRange);
+  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(
+      gpu_out_arg__gpu_helper_data, argRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+
+  // CPU VERSION
+  out_arg_cpu =
+      (in_arg1.argmax(1) == in_arg2.argmax(1))
+          .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
+  full_redux = (out_arg_cpu.template cast<float>())
+                   .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+
+  // GPU VERSION
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_arg1_data, in_arg1.data(),
+      (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_arg2_data, in_arg2.data(),
+      (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
+  out_Argout_gpu_helper.device(sycl_device) =
+      (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
+  out_Argout_gpu.device(sycl_device) =
+      (out_Argout_gpu_helper)
+          .select(out_Argout_gpu.constant(true),
+                  out_Argout_gpu.constant(false));
+  out_gpu.device(sycl_device) =
+      (out_Argout_gpu.template cast<float>())
+          .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
+            << '\n';
+  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_in_arg1_data);
+  sycl_device.deallocate(gpu_in_arg2_data);
+  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
+  sycl_device.deallocate(gpu_out_arg_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.mean();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
 template <typename DataType, int DataLayout, typename IndexType>
-static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) {
+static void test_full_reductions_mean_with_odd_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  // This is a particular case which illustrates a possible problem when the
+  // number of local threads in a workgroup is even, but is not a power of two.
+  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
+  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
+  const IndexType n_elems = 8707;
+  array<IndexType, 1> tensor_range = {{n_elems}};
+
+  data_tensor in(tensor_range);
+  DataType full_redux;
+  DataType full_redux_gpu;
+  TensorMap<scalar_tensor> red_cpu(&full_redux);
+  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);
+
+  const DataType const_val = static_cast<DataType>(0.6391);
+  in = in.constant(const_val);
+
+  Eigen::IndexList<Eigen::type2index<0>> red_axis;
+  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  VERIFY_IS_APPROX(const_val, red_cpu());
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) =
+      in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu, full_redux);
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_min_sycl(
+    const Eigen::SyclDevice& sycl_device) {
   const IndexType num_rows = 876;
   const IndexType num_cols = 953;
   array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
@@ -67,25 +373,73 @@ static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device)
   full_redux = in.minimum();
 
-  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
-  DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
 
-  TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
   out_gpu.device(sycl_device) = in_gpu.minimum();
-  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
   // Check that the CPU and GPU reductions return the same result.
   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
 }
-
 template <typename DataType, int DataLayout, typename IndexType>
-static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) {
+static void test_full_reductions_min_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set the initial value to be the min.
+  // As we don't include this in the reduction the result should not be -2.
+  in(0) = static_cast<DataType>(-2);
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.minimum();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.minimum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_sycl(
+    const Eigen::SyclDevice& sycl_device) {
   IndexType dim_x = 145;
   IndexType dim_y = 1;
   IndexType dim_z = 67;
@@ -101,33 +455,293 @@ static void test_first_dim_reductions_max_sycl(const Eigen::SyclDev
   in.setRandom();
 
-  redux= in.maximum(red_axis);
+  redux = in.maximum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  array<IndexType, 1> reduced_range = {{num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+  const IndexType n_reduced = num_cols;
+
+  data_tensor in(tensor_range);
+  reduced_tensor redux;
+  reduced_tensor redux_gpu(reduced_range);
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set maximum value outside of the considered range.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    in(i) = static_cast<DataType>(2);
+  }
+
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  redux = in_offset.maximum(red_axis);
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_NOT_EQUAL(redux(i), in(i));
+  }
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(
+      sycl_device.allocate(n_reduced * sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
+                                 n_reduced * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_APPROX(redux_gpu(i), redux(i));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  array<IndexType, 1> full_reduced_range = {{num_rows}};
+  array<IndexType, 1> reduced_range = {{num_rows - 1}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+  const IndexType n_reduced = reduced_range[0];
+
+  data_tensor in(tensor_range);
+  reduced_tensor redux(full_reduced_range);
+  reduced_tensor redux_gpu(reduced_range);
+
+  in.setRandom();
+  redux.setZero();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set maximum value outside of the considered range.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    in(i) = static_cast<DataType>(2);
+  }
 
-  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
-  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 1;
+
+  const IndexType offset = 64;
+  // Introduce an offset in both the input and the output.
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
+  red_offset = in_offset.maximum(red_axis);
+
+  // Check that the first value hasn't been changed and that the reduced values
+  // are not equal to the previously set maximum in the input outside the range.
+  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
+  }
 
-  TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(
+      sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
   out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
-  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
+                                 n_reduced * sizeof(DataType));
 
   // Check that the CPU and GPU reductions return the same result.
-  for(IndexType j=0; j<reduced_tensorRange[0]; j++ )
-    for(IndexType k=0; k<reduced_tensorRange[1]; k++ )
-      VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
+  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 1> reduced_tensorRange = {{dim_y}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+  redux = in.sum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < redux.size(); i++) {
+    VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
+  }
   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
 }
 
 template <typename DataType, int DataLayout, typename IndexType>
-static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) {
+static void test_first_dim_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 145;
+  IndexType dim_y = 1;
+  IndexType dim_z = 67;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 64;
+  IndexType dim_y = 1;
+  IndexType dim_z = 32;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 2;
+  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
 
-  IndexType dim_x = 567;
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 64;
   IndexType dim_y = 1;
-  IndexType dim_z = 47;
+  IndexType dim_z = 32;
 
   array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
   Eigen::array<IndexType, 1> red_axis;
@@ -140,42 +754,261 @@ static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevi
   in.setRandom();
 
-  redux= in.sum(red_axis);
+  redux = in.sum(red_axis);
 
-  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
-  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
 
-  TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
   out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
-  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
 
   // Check that the CPU and GPU reductions return the same result.
-  for(IndexType j=0; j<reduced_tensorRange[0]; j++ )
-    for(IndexType k=0; k<reduced_tensorRange[1]; k++ )
-      VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
 
   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  auto tensorRange = Sizes<64, 32>(64, 32);
+  // auto red_axis = Sizes<0,1>(0,1);
+  Eigen::IndexList<Eigen::type2index<1>> red_axis;
+  auto reduced_tensorRange = Sizes<64>(64);
+  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+  in_fix.setRandom();
+
+  redux_fix = in_fix.sum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in_fix.data(),
+      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu_fix.data(), gpu_out_data,
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
 
 }
-template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::device& d){
-  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
-  QueueInterface queueInterface(d);
-  auto sycl_device = Eigen::SyclDevice(&queueInterface);
-  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  auto tensorRange = Sizes<64, 32>(64, 32);
+  Eigen::IndexList<Eigen::type2index<1>> red_axis;
+  auto reduced_tensorRange = Sizes<64>(64);
+  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+  in_fix.setRandom();
+  redux_fix = in_fix.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in_fix.data(),
+      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu_fix.data(), gpu_out_data,
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+  sycl_device.synchronize();
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+// SYCL supports a generic case of reduction where the accumulator is a
+// different type than the input data This is an example on how to get if a
+// Tensor contains nan and/or inf in one reduction
+template <typename InT, typename OutT>
+struct CustomReducer {
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
+  static constexpr OutT InfBit = 1;
+  static constexpr OutT NanBit = 2;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
+                                                    OutT* accum) const {
+    if (Eigen::numext::isinf(x))
+      *accum |= InfBit;
+    else if (Eigen::numext::isnan(x))
+      *accum |= NanBit;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
+                                                    OutT* accum) const {
+    *accum |= x;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
+    return OutT(0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
+    return accum;
+  }
+};
+
+template <typename DataType, typename AccumType, int DataLayout,
+          typename IndexType>
+static void test_full_reductions_custom_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  constexpr IndexType InSize = 64;
+  auto tensorRange = Sizes<InSize>(InSize);
+  Eigen::IndexList<Eigen::type2index<0>> dims;
+  auto reduced_tensorRange = Sizes<>();
+  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
+  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;
+
+  CustomReducer<DataType, AccumType> reducer;
+
+  in_fix.setRandom();
+
+  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
+  AccumType* gpu_out_data =
+      static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
+  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
+                                 sizeof(AccumType));
+  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
+  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
   test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
+  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
+  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
+                                                                  4197, 4097);
+  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+                                                                  4197, 4097);
+  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+                                                                  129, 8);
   test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
   test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
-  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
-  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
-  test_first_dim_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
-  test_last_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  sycl_device.synchronize();
 }
+
 EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
-  for (const auto& device :Eigen::get_sycl_supported_devices()) {
-    CALL_SUBTEST(sycl_reduction_test_per_device<float>(device));
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
+    CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
+    CALL_SUBTEST_3(
+        sycl_reduction_test_first_dim_per_device<float>(sycl_device));
+    CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
   }
 }