Converting all parallel for lambda to functor in order to prevent kernel duplication name error; adding tensorConcatinationOp backend for sycl.

author: Mehdi Goli <mehdi.goli@codeplay.com> 2016-12-16 19:46:45 +0000
committer: Mehdi Goli <mehdi.goli@codeplay.com> 2016-12-16 19:46:45 +0000
commit: 35bae513a0094f986c810c3f839e5a954caabd4b (patch)
tree: e0e89ec837c10509923e04fbeab7fd1f90563279 /unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
parent: 7949849ebcca49ce2730e767552eadfae0eb6e1a (diff)
1 files changed, 22 insertions, 77 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index d5bc7b71b..c9912d9d4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -25,8 +25,7 @@
 namespace Eigen {
 namespace internal {
 
-
-template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
+template<typename CoeffReturnType> struct syclGenericBufferReducer{
 template<typename BufferTOut, typename BufferTIn>
 static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
   do {
@@ -35,50 +34,16 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
                                     cl::sycl::range<1>{std::min(length, local)}};
             /* Two accessors are used: one to the buffer that is being reduced,
              * and a second to local memory, used to store intermediate data. */
-            auto aI =
-                bufI.template get_access<cl::sycl::access::mode::read_write>(h);
-            auto aOut =
-                bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
-            cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
-                               cl::sycl::access::target::local>
-                scratch(cl::sycl::range<1>(local), h);
+            auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
+            auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
+            typedef decltype(aI) InputAccessor;
+            typedef decltype(aOut) OutputAccessor;
+            typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
+            LocalAccessor scratch(cl::sycl::range<1>(local), h);
 
             /* The parallel_for invocation chosen is the variant with an nd_item
              * parameter, since the code requires barriers for correctness. */
-            h.parallel_for<KernelName>(
-                r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
-                  size_t globalid = id.get_global(0);
-                  size_t localid = id.get_local(0);
-                  /* All threads collectively read from global memory into local.
-                   * The barrier ensures all threads' IO is resolved before
-                   * execution continues (strictly speaking, all threads within
-                   * a single work-group - there is no co-ordination between
-                   * work-groups, only work-items). */
-                  if (globalid < length) {
-                    scratch[localid] = aI[globalid];
-                  }
-                  id.barrier(cl::sycl::access::fence_space::local_space);
-
-                  /* Apply the reduction operation between the current local
-                   * id and the one on the other half of the vector. */
-                  if (globalid < length) {
-                    auto min = (length < local) ? length : local;
-                    for (size_t offset = min / 2; offset > 0; offset /= 2) {
-                      if (localid < offset) {
-                        scratch[localid] += scratch[localid + offset];
-                      }
-                      id.barrier(cl::sycl::access::fence_space::local_space);
-                    }
-                    /* The final result will be stored in local id 0. */
-                    if (localid == 0) {
-                      aI[id.get_group(0)] = scratch[localid];
-                      if((length<=local) && globalid ==0){
-                        auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
-                        aOutPtr[0]=scratch[0];
-                      }
-                    }
-                  }
-                });
+            h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch,  length, local));
           };
             dev.sycl_queue().submit(f);
             dev.asynchronousExec();
@@ -96,11 +61,11 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
 
 };
 
-/// For now let's start with a full reducer
 /// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
 /// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
 /// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
 // a leafNode.
+
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
 
@@ -109,8 +74,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
 
   static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
     typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
-    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
     auto functors = TensorSycl::internal::extractFunctors(self.impl());
+    typedef decltype(functors) FunctorExpr;
     int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
     size_t inputSize =self.impl().dimensions().TotalSize();
     size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
@@ -135,48 +100,29 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
     size_t  outTileSize = tileSize;
     /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
     if (GRange < outTileSize) outTileSize=GRange;
-    // getting final out buffer at the moment the created buffer is true because there is no need for assign
-      auto out_buffer =dev.get_sycl_buffer(output);
-
     /// creating the shared memory for calculating reduction.
     /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
     /// recursively apply reduction on it in order to reduce the whole.
     auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
     typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
-    Dims dims= self.xprDims();
-    Op functor = reducer;
+  //  Dims dims= self.xprDims();
+    //Op functor = reducer;
     dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
       // create a tuple of accessors from Evaluator
       auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      typedef decltype(tuple_of_accessors) TupleType;
       auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
-
-      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
-        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
-        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
-        /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
-        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
-        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
-        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
-        /// the device_evaluator is detectable and recognisable on the device.
-        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
-        /// const cast added as a naive solution to solve the qualifier drop error
-        auto globalid=itemID.get_global_linear_id();
-
-        if(globalid<rng)
-          tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(functor));
-        else
-          tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
-
-        if(remaining!=0 && globalid==0 )
-          // this will add the rest of input buffer when the input size is not devidable to red_factor.
-          tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(functor));
-      });
+      typedef decltype(tmp_global_accessor) OutAccessor;
+      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
+        TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType>
+       (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors,  tuple_of_accessors));
     });
     dev.asynchronousExec();
 
-/// This is used to recursively reduce the tmp value to an element of 1;
-  syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange,  outTileSize);
+    // getting final out buffer at the moment the created buffer is true because there is no need for assign
+    auto out_buffer =dev.get_sycl_buffer(output);
+    /// This is used to recursively reduce the tmp value to an element of 1;
+    syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange,  outTileSize);
   }
 
 };
@@ -190,7 +136,6 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
 
   static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
     typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
-    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
     auto functors = TensorSycl::internal::extractFunctors(self.impl());
     typedef decltype(functors) FunctorExpr;
     typename Self::Index range, GRange, tileSize;
@@ -208,7 +153,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
       auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
 
       cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
-      TensorSycl::internal::ReductionFunctor<HostExpr, PlaceHolderExpr,  FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
+      TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
       (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
 
     });
author	Mehdi Goli <mehdi.goli@codeplay.com>	2016-12-16 19:46:45 +0000
committer	Mehdi Goli <mehdi.goli@codeplay.com>	2016-12-16 19:46:45 +0000
commit	35bae513a0094f986c810c3f839e5a954caabd4b (patch)
tree	e0e89ec837c10509923e04fbeab7fd1f90563279 /unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
parent	7949849ebcca49ce2730e767552eadfae0eb6e1a (diff)