From 730eb9fe1c0e0daa81aebbc4dbce52e185dda3dd Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Wed, 14 Dec 2016 17:38:53 +0000
Subject: Adding asynchronous execution as it improves the performance.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 48c5f9a47..d5bc7b71b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -81,7 +81,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
  });
  };
  dev.sycl_queue().submit(f);
- dev.synchronize();
+ dev.asynchronousExec();
 
  /* At this point, you could queue::wait_and_throw() to ensure that
  * errors are caught quickly. However, this would likely impact
@@ -173,7 +173,7 @@ struct FullReducer {
  tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer::reduce(device_self_evaluator, static_cast(red_factor*(rng)), static_cast(remaining), const_cast(functor));
  });
  });
- dev.synchronize();
+ dev.asynchronousExec();
 
  /// This is used to recursively reduce the tmp value to an element of 1;
  syclGenericBufferReducer::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
@@ -212,7 +212,7 @@ struct InnerReducer {
  (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
 
  });
- dev.synchronize();
+ dev.asynchronousExec();
  return false;
  }
 };
-- 
cgit v1.2.3