From 622805a0c5d216141eca3090e80d58c159e175ee Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Fri, 18 Nov 2016 16:20:42 +0000 Subject: Modifying TensorDeviceSycl.h to always create buffer of type uint8_t and convert them to the actual type at the execution on the device; adding the queue interface class to separate the lifespan of sycl queue and buffers,created for that queue, from Eigen::SyclDevice; modifying sycl tests to support the evaluation of the results for both row major and column major data layout on all different devices that are supported by Sycl{CPU; GPU; and Host}. --- unsupported/test/cxx11_tensor_sycl.cpp | 105 ++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 49 deletions(-) (limited to 'unsupported/test/cxx11_tensor_sycl.cpp') diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index 05fbf9e62..bf115d652 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ b/unsupported/test/cxx11_tensor_sycl.cpp @@ -26,35 +26,32 @@ using Eigen::array; using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; - +template void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { int sizeDim1 = 100; - int sizeDim2 = 100; - int sizeDim3 = 100; + int sizeDim2 = 10; + int sizeDim3 = 20; array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor in1(tensorRange); - Tensor out1(tensorRange); - Tensor out2(tensorRange); - Tensor out3(tensorRange); + Tensor in1(tensorRange); + Tensor out1(tensorRange); + Tensor out2(tensorRange); + Tensor out3(tensorRange); in1 = in1.random(); - float* gpu_data1 = static_cast(sycl_device.allocate(in1.size()*sizeof(float))); - float* gpu_data2 = static_cast(sycl_device.allocate(out1.size()*sizeof(float))); - //float* gpu_data = static_cast(sycl_device.allocate(out2.size()*sizeof(float))); + DataType* gpu_data1 = static_cast(sycl_device.allocate(in1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(out1.size()*sizeof(DataType))); + + TensorMap> gpu1(gpu_data1, tensorRange); + TensorMap> gpu2(gpu_data2, tensorRange); - TensorMap> gpu1(gpu_data1, tensorRange); - TensorMap> gpu2(gpu_data2, tensorRange); - //TensorMap> gpu_out2(gpu_out2_data, tensorRange); - - sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(float)); - sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(float)); + sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType)); gpu1.device(sycl_device) = gpu1 * 3.14f; gpu2.device(sycl_device) = gpu2 * 2.7f; - sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(float)); - sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(float)); - sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(float)); - // sycl_device.Synchronize(); + sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType)); for (int i = 0; i < in1.size(); ++i) { VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f); @@ -65,34 +62,34 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_data1); sycl_device.deallocate(gpu_data2); } - +template void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { int sizeDim1 = 100; - int sizeDim2 = 100; - int sizeDim3 = 100; + int sizeDim2 = 10; + int sizeDim3 = 20; array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor in1(tensorRange); - Tensor in2(tensorRange); - Tensor in3(tensorRange); - Tensor out(tensorRange); + Tensor in1(tensorRange); + Tensor in2(tensorRange); + Tensor in3(tensorRange); + Tensor out(tensorRange); in2 = in2.random(); in3 = in3.random(); - float * gpu_in1_data = static_cast(sycl_device.allocate(in1.size()*sizeof(float))); - float * gpu_in2_data = static_cast(sycl_device.allocate(in2.size()*sizeof(float))); - float * gpu_in3_data = static_cast(sycl_device.allocate(in3.size()*sizeof(float))); - float * gpu_out_data = static_cast(sycl_device.allocate(out.size()*sizeof(float))); + DataType * gpu_in1_data = static_cast(sycl_device.allocate(in1.size()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast(sycl_device.allocate(in2.size()*sizeof(DataType))); + DataType * gpu_in3_data = static_cast(sycl_device.allocate(in3.size()*sizeof(DataType))); + DataType * gpu_out_data = static_cast(sycl_device.allocate(out.size()*sizeof(DataType))); - TensorMap> gpu_in1(gpu_in1_data, tensorRange); - TensorMap> gpu_in2(gpu_in2_data, tensorRange); - TensorMap> gpu_in3(gpu_in3_data, tensorRange); - TensorMap> gpu_out(gpu_out_data, tensorRange); + TensorMap> gpu_in1(gpu_in1_data, tensorRange); + TensorMap> gpu_in2(gpu_in2_data, tensorRange); + TensorMap> gpu_in3(gpu_in3_data, tensorRange); + TensorMap> gpu_out(gpu_out_data, tensorRange); /// a=1.2f gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f); - sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(float)); + sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType)); for (int i = 0; i < sizeDim1; ++i) { for (int j = 0; j < sizeDim2; ++j) { for (int k = 0; k < sizeDim3; ++k) { @@ -104,7 +101,7 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { /// a=b*1.2f gpu_out.device(sycl_device) = gpu_in1 * 1.2f; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(float)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType)); for (int i = 0; i < sizeDim1; ++i) { for (int j = 0; j < sizeDim2; ++j) { for (int k = 0; k < sizeDim3; ++k) { @@ -116,9 +113,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { printf("a=b*1.2f Test Passed\n"); /// c=a*b - sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(float)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType)); gpu_out.device(sycl_device) = gpu_in1 * gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(float)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); for (int i = 0; i < sizeDim1; ++i) { for (int j = 0; j < sizeDim2; ++j) { for (int k = 0; k < sizeDim3; ++k) { @@ -132,7 +129,7 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { /// c=a+b gpu_out.device(sycl_device) = gpu_in1 + gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(float)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); for (int i = 0; i < sizeDim1; ++i) { for (int j = 0; j < sizeDim2; ++j) { for (int k = 0; k < sizeDim3; ++k) { @@ -146,7 +143,7 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { /// c=a*a gpu_out.device(sycl_device) = gpu_in1 * gpu_in1; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(float)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); for (int i = 0; i < sizeDim1; ++i) { for (int j = 0; j < sizeDim2; ++j) { for (int k = 0; k < sizeDim3; ++k) { @@ -160,7 +157,7 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { //a*3.14f + b*2.7f gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f); - sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(float)); + sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType)); for (int i = 0; i < sizeDim1; ++i) { for (int j = 0; j < sizeDim2; ++j) { for (int k = 0; k < sizeDim3; ++k) { @@ -173,9 +170,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { printf("a*3.14f + b*2.7f Test Passed\n"); ///d= (a>0.5? b:c) - sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(float)); + sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(DataType)); gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3); - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(float)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); for (int i = 0; i < sizeDim1; ++i) { for (int j = 0; j < sizeDim2; ++j) { for (int k = 0; k < sizeDim3; ++k) { @@ -191,10 +188,20 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_in3_data); sycl_device.deallocate(gpu_out_data); } - +template void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_mem_transfers(sycl_device); + test_sycl_computations(sycl_device); + test_sycl_mem_transfers(sycl_device); + test_sycl_computations(sycl_device); +} void test_cxx11_tensor_sycl() { - cl::sycl::gpu_selector s; - Eigen::SyclDevice sycl_device(s); - CALL_SUBTEST(test_sycl_mem_transfers(sycl_device)); - CALL_SUBTEST(test_sycl_computations(sycl_device)); + printf("Test on GPU: OpenCL\n"); + CALL_SUBTEST(sycl_computing_test_per_device((cl::sycl::gpu_selector()))); + printf("repeating the test on CPU: OpenCL\n"); + CALL_SUBTEST(sycl_computing_test_per_device((cl::sycl::cpu_selector()))); + printf("repeating the test on CPU: HOST\n"); + CALL_SUBTEST(sycl_computing_test_per_device((cl::sycl::host_selector()))); + printf("Test Passed******************\n" ); } -- cgit v1.2.3