From d57430dd73ab2f88aa5e45c370f6ab91103ff18a Mon Sep 17 00:00:00 2001
From: Mehdi Goli <mehdi.goli@codeplay.com>
Date: Tue, 8 Nov 2016 17:08:02 +0000
Subject: Converting all sycl buffers to uninitialised device only buffers;
 adding memcpyHostToDevice and memcpyDeviceToHost on syclDevice; modifying all
 examples to obey the new rules; moving sycl queue creation to the device based
 on Benoit's suggestion; removing the sycl specific condition for returning
 m_result in TensorReduction.h according to Benoit's suggestion.

---
 unsupported/test/cxx11_tensor_sycl.cpp | 67 ++++++++++++++++------------------
 1 file changed, 32 insertions(+), 35 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
index 0f66cd8f0..6a9c33422 100644
--- a/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_sycl.cpp
@@ -27,42 +27,33 @@
 using Eigen::SyclDevice;
 using Eigen::Tensor;
 using Eigen::TensorMap;
 
-// Types used in tests:
-using TestTensor = Tensor<float, 3>;
-using TestTensorMap = TensorMap<Tensor<float, 3>>;
-
-void test_sycl_cpu() {
-  cl::sycl::gpu_selector s;
-  cl::sycl::queue q(s, [=](cl::sycl::exception_list l) {
-    for (const auto& e : l) {
-      try {
-        std::rethrow_exception(e);
-      } catch (cl::sycl::exception e) {
-        std::cout << e.what() << std::endl;
-      }
-    }
-  });
-  SyclDevice sycl_device(q);
+void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
   int sizeDim1 = 100;
   int sizeDim2 = 100;
   int sizeDim3 = 100;
   array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
-  TestTensor in1(tensorRange);
-  TestTensor in2(tensorRange);
-  TestTensor in3(tensorRange);
-  TestTensor out(tensorRange);
-  in1 = in1.random();
+  Tensor<float, 3> in1(tensorRange);
+  Tensor<float, 3> in2(tensorRange);
+  Tensor<float, 3> in3(tensorRange);
+  Tensor<float, 3> out(tensorRange);
+
   in2 = in2.random();
   in3 = in3.random();
-  TestTensorMap gpu_in1(in1.data(), tensorRange);
-  TestTensorMap gpu_in2(in2.data(), tensorRange);
-  TestTensorMap gpu_in3(in3.data(), tensorRange);
-  TestTensorMap gpu_out(out.data(), tensorRange);
+
+  float * gpu_in1_data  = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_in2_data  = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_in3_data  = static_cast<float*>(sycl_device.allocate(in3.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_out_data  = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+
+  TensorMap<Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
+  TensorMap<Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
+  TensorMap<Tensor<float, 3>> gpu_in3(gpu_in3_data, tensorRange);
+  TensorMap<Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
 
   /// a=1.2f
   gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
-  sycl_device.deallocate(in1.data());
+  sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.dimensions().TotalSize())*sizeof(float));
   for (int i = 0; i < sizeDim1; ++i) {
     for (int j = 0; j < sizeDim2; ++j) {
       for (int k = 0; k < sizeDim3; ++k) {
@@ -74,7 +65,7 @@ void test_sycl_cpu() {
 
   /// a=b*1.2f
   gpu_out.device(sycl_device) = gpu_in1 * 1.2f;
-  sycl_device.deallocate(out.data());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.dimensions().TotalSize())*sizeof(float));
   for (int i = 0; i < sizeDim1; ++i) {
     for (int j = 0; j < sizeDim2; ++j) {
       for (int k = 0; k < sizeDim3; ++k) {
@@ -86,8 +77,9 @@ void test_sycl_cpu() {
   printf("a=b*1.2f Test Passed\n");
 
   /// c=a*b
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float));
   gpu_out.device(sycl_device) =
      gpu_in1 * gpu_in2;
-  sycl_device.deallocate(out.data());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
   for (int i = 0; i < sizeDim1; ++i) {
     for (int j = 0; j < sizeDim2; ++j) {
       for (int k = 0; k < sizeDim3; ++k) {
@@ -101,7 +93,7 @@ void test_sycl_cpu() {
 
   /// c=a+b
   gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
-  sycl_device.deallocate(out.data());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
   for (int i = 0; i < sizeDim1; ++i) {
     for (int j = 0; j < sizeDim2; ++j) {
       for (int k = 0; k < sizeDim3; ++k) {
@@ -115,7 +107,7 @@ void test_sycl_cpu() {
 
   /// c=a*a
   gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
-  sycl_device.deallocate(out.data());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
   for (int i = 0; i < sizeDim1; ++i) {
     for (int j = 0; j < sizeDim2; ++j) {
       for (int k = 0; k < sizeDim3; ++k) {
@@ -125,12 +117,11 @@ void test_sycl_cpu() {
       }
     }
   }
-  printf("c= a*a Test Passed\n");
 
   //a*3.14f + b*2.7f
   gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
-  sycl_device.deallocate(out.data());
+  sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
   for (int i = 0; i < sizeDim1; ++i) {
     for (int j = 0; j < sizeDim2; ++j) {
       for (int k = 0; k < sizeDim3; ++k) {
@@ -143,8 +134,9 @@ void test_sycl_cpu() {
   printf("a*3.14f + b*2.7f Test Passed\n");
 
   ///d= (a>0.5? b:c)
+  sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.dimensions().TotalSize())*sizeof(float));
   gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
-  sycl_device.deallocate(out.data());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
   for (int i = 0; i < sizeDim1; ++i) {
     for (int j = 0; j < sizeDim2; ++j) {
       for (int k = 0; k < sizeDim3; ++k) {
@@ -155,8 +147,13 @@ void test_sycl_cpu() {
       }
     }
   }
   printf("d= (a>0.5? b:c) Test Passed\n");
-
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_in3_data);
+  sycl_device.deallocate(gpu_out_data);
 }
 void test_cxx11_tensor_sycl() {
-  CALL_SUBTEST(test_sycl_cpu());
+  cl::sycl::gpu_selector s;
+  Eigen::SyclDevice sycl_device(s);
+  CALL_SUBTEST(test_sycl_cpu(sycl_device));
 }
-- 
cgit v1.2.3
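
For readers unfamiliar with the workflow this patch introduces, the sketch below condenses the pattern the updated test follows: device buffers are allocated uninitialised with SyclDevice::allocate, host data is pushed with memcpyHostToDevice, expressions assigned through .device(sycl_device) run on the SYCL device, and results come back with memcpyDeviceToHost before the buffers are released with deallocate. It is a minimal illustration only, not part of the patch: it assumes an Eigen checkout with SYCL support (EIGEN_USE_SYCL defined and a SYCL implementation available), and the main() wrapper, tensor names, and sizes are made up for the example.

// Minimal sketch of the device-buffer workflow used by the test above
// (illustrative only; mirrors the "a=b*1.2f" case).
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // The SYCL queue is now created inside the device from a selector,
  // so callers no longer build a cl::sycl::queue themselves.
  cl::sycl::gpu_selector s;
  Eigen::SyclDevice sycl_device(s);

  Eigen::array<int, 3> range = {{100, 100, 100}};
  Eigen::Tensor<float, 3> in(range);
  Eigen::Tensor<float, 3> out(range);
  in = in.random();

  std::size_t bytes = in.dimensions().TotalSize() * sizeof(float);

  // Device-only, uninitialised buffers.
  float* gpu_in_data  = static_cast<float*>(sycl_device.allocate(bytes));
  float* gpu_out_data = static_cast<float*>(sycl_device.allocate(bytes));

  // Explicit host-to-device copy before the input is read on the device.
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), bytes);

  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in(gpu_in_data, range);
  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_data, range);

  // The expression runs on the SYCL device.
  gpu_out.device(sycl_device) = gpu_in * 1.2f;

  // Explicit device-to-host copy of the result, then release the buffers.
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, bytes);
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
  return 0;
}

The design point of the change is visible in the diff itself: host tensors and device buffers are now distinct allocations, so every transfer between them is an explicit memcpyHostToDevice/memcpyDeviceToHost call rather than an implicit copy tied to a host-backed buffer.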