author    Mehdi Goli <mehdi.goli@codeplay.com> 2017-02-06 18:05:23 +0000
committer Mehdi Goli <mehdi.goli@codeplay.com> 2017-02-06 18:05:23 +0000
commit    42bd5c4e7b8f4b5875ae256e7ac20310161d8470 (patch)
tree      61b825272a2a08f782ff453271e8488565e1738c
parent    bc128f9f3beff5a2a3225f03ec2e5eb111a15b87 (diff)
Fixing TensorReductionSycl for min and max.
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 10
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h          |  2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h  | 30
-rw-r--r-- unsupported/test/cxx11_tensor_reduction_sycl.cpp         |  9
4 files changed, 33 insertions(+), 18 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 8ecef59a8..9dcb42904 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -26,10 +26,10 @@ namespace Eigen {
namespace internal {
template<typename CoeffReturnType> struct syclGenericBufferReducer{
-template<typename BufferTOut, typename BufferTIn>
-static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+template<typename OP, typename BufferTOut, typename BufferTIn>
+static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
do {
- auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
+ auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
@@ -43,7 +43,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
- h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local));
+ h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
};
dev.sycl_queue().submit(f);
dev.asynchronousExec();
@@ -123,7 +123,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
// getting the final output buffer; at this point the buffer has already been created, so there is no need for an assign
auto out_buffer =dev.get_sycl_buffer(output);
/// This is used to recursively reduce the temporary values down to a single element;
- syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+ syclGenericBufferReducer<CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
}
};
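
For illustration only (not part of the patch): the change above threads the reducer functor through syclGenericBufferReducer so the tree reduction combines partial results with the reducer's own operation instead of a hard-coded sum. A minimal host-only C++ sketch of that pattern, using hypothetical SumReducer/MinReducer functors that mirror Eigen's initialize/reduce/finalize interface:

#include <algorithm>
#include <iostream>
#include <limits>
#include <vector>

// Hypothetical reducers mirroring Eigen's reducer interface.
struct SumReducer {
  float initialize() const { return 0.0f; }
  void reduce(float v, float* accum) const { *accum += v; }
  float finalize(float accum) const { return accum; }
};

struct MinReducer {
  float initialize() const { return std::numeric_limits<float>::max(); }
  void reduce(float v, float* accum) const { *accum = std::min(*accum, v); }
  float finalize(float accum) const { return accum; }
};

// Tree-style reduction: each pass halves the active range, combining pairs
// through the op instead of the '+=' the old code hard-coded.
template <typename Op>
float tree_reduce(std::vector<float> data, Op op) {
  for (size_t active = data.size(); active > 1; active = (active + 1) / 2) {
    size_t half = active / 2;
    for (size_t i = 0; i < half; ++i) {
      float accum = op.initialize();
      op.reduce(data[i], &accum);
      op.reduce(data[i + active - half], &accum);  // partner element
      data[i] = op.finalize(accum);
    }
  }
  return data[0];
}

int main() {
  std::vector<float> v = {3.f, 1.f, 4.f, 1.f, 5.f, 9.f, 2.f, 6.f};
  std::cout << tree_reduce(v, SumReducer{}) << "\n";  // 31: sum still works
  std::cout << tree_reduce(v, MinReducer{}) << "\n";  // 1: min now works too
}

With the old '+=' combine, the MinReducer call would have produced a sum of partial minima rather than the minimum itself.
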
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
index 84f660597..9d5a6d4c1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -35,7 +35,7 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
- template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
+ template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
index 710e22474..a77f408de 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
@@ -18,13 +18,14 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
- template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
+ template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
+ OP op;
OutputAccessor aOut;
InputAccessor aI;
LocalAccessor scratch;
size_t length, local;
- GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
- : aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
+ GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
+ : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
void operator()(cl::sycl::nd_item<1> itemID) {
size_t globalid = itemID.get_global(0);
size_t localid = itemID.get_local(0);
@@ -44,7 +45,12 @@ namespace internal {
auto min = (length < local) ? length : local;
for (size_t offset = min / 2; offset > 0; offset /= 2) {
if (localid < offset) {
- scratch[localid] += scratch[localid + offset];
+ auto accum = op.initialize();
+ op.reduce(scratch[localid], &accum);
+ op.reduce(scratch[localid + offset], &accum);
+ op.finalize(accum);
+ scratch[localid]=accum;
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
}
@@ -131,11 +137,21 @@ public:
if(globalid<rng)
tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op));
else
- tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+ tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(op.initialize());
- if(remaining!=0 && globalid==0 )
+ if(remaining!=0 && globalid==0 ){
// this will add the rest of the input buffer when the input size is not divisible by red_factor.
- tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+ auto remaining_reduce = Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
+ reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
+ auto accum = op.initialize();
+ op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
+ op.reduce(remaining_reduce, &accum);
+ op.finalize(accum);
+ tmp_global_accessor.get_pointer()[0]=accum;
+ }
}
};
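
For illustration only (not part of the patch): the hunks above make two fixes that matter for min/max. Out-of-range slots are seeded with op.initialize() -- the reduction's identity -- instead of a literal 0, and the leftover tail (when the input size is not divisible by red_factor) is folded into slot 0 through the op rather than with '+='. A standalone sketch of why identity padding matters, using a hypothetical MaxReducer with the same interface:

#include <algorithm>
#include <iostream>
#include <limits>
#include <vector>

// Hypothetical MaxReducer following Eigen's initialize/reduce/finalize shape.
struct MaxReducer {
  float initialize() const { return std::numeric_limits<float>::lowest(); }
  void reduce(float v, float* accum) const { *accum = std::max(*accum, v); }
  float finalize(float accum) const { return accum; }
};

int main() {
  MaxReducer op;
  std::vector<float> input = {-3.f, -7.f, -1.f, -9.f, -4.f};
  const size_t padded_size = 8;  // e.g. input rounded up to the work-group size

  // Pad with the op's identity; a literal 0 would silently win a max
  // reduction over all-negative input.
  std::vector<float> slots(padded_size);
  for (size_t i = 0; i < padded_size; ++i)
    slots[i] = (i < input.size()) ? input[i] : op.initialize();

  float accum = op.initialize();
  for (float v : slots) op.reduce(v, &accum);
  std::cout << op.finalize(accum) << "\n";  // prints -1; 0-padding would print 0
}
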
diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index 98a59a14c..251091f5b 100644
--- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -34,7 +34,7 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
in.setRandom();
- full_redux = in.sum();
+ full_redux = in.minimum();
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
@@ -43,11 +43,10 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
- out_gpu.device(sycl_device) = in_gpu.sum();
+ out_gpu.device(sycl_device) = in_gpu.minimum();
sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
-
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
@@ -69,7 +68,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device)
in.setRandom();
- redux= in.sum(red_axis);
+ redux= in.maximum(red_axis);
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
@@ -78,7 +77,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device)
TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
- out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+ out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
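
For reference (not part of the patch): the tests now exercise minimum() and maximum(red_axis) in place of sum(). A minimal host-only sketch of the same expressions with Eigen's unsupported Tensor module, assuming a standard Eigen checkout; the SYCL device path is expected to produce matching results:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> in(3, 4);
  in.setRandom();

  // Full reduction to a rank-0 tensor, as in test_full_reductions_sycl.
  Eigen::Tensor<float, 0> full_min = in.minimum();

  // Reduction over the first dimension, as in test_first_dim_reductions_sycl.
  Eigen::array<Eigen::Index, 1> red_axis;
  red_axis[0] = 0;
  Eigen::Tensor<float, 1> col_max = in.maximum(red_axis);

  std::cout << "min over all elements: " << full_min() << "\n";
  for (Eigen::Index i = 0; i < col_max.dimension(0); ++i)
    std::cout << "max over dim 0, column " << i << ": " << col_max(i) << "\n";
}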