From 730eb9fe1c0e0daa81aebbc4dbce52e185dda3dd Mon Sep 17 00:00:00 2001
From: Mehdi Goli <mehdi.goli@codeplay.com>
Date: Wed, 14 Dec 2016 17:38:53 +0000
Subject: Adding asynchronous execution as it improves performance.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h')
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 48c5f9a47..d5bc7b71b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -81,7 +81,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
                 });
           };
             dev.sycl_queue().submit(f);
-            dev.synchronize();
+            dev.asynchronousExec();
 
           /* At this point, you could queue::wait_and_throw() to ensure that
            * errors are caught quickly. However, this would likely impact
@@ -173,7 +173,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
           tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(functor));
       });
     });
-    dev.synchronize();
+    dev.asynchronousExec();
 
 /// This is used to recursively reduce the tmp value to an element of 1;
   syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange,  outTileSize);
@@ -212,7 +212,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
       (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
 
     });
-    dev.synchronize();
+    dev.asynchronousExec();
     return false;
   }
 };
-- 
cgit v1.2.3