aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
blob: e5b892f2e917e394314e7da18e4954c264abc531 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: eigen@codeplay.com
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// General include header of SYCL target for Tensor Module
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H

namespace Eigen {
namespace TensorSycl {
namespace internal {

  template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
    OP op;
    OutputAccessor aOut;
    ptrdiff_t out_offset;
    InputAccessor aI;
    LocalAccessor scratch;
    size_t length, local;
    GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
    : op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
    void operator()(cl::sycl::nd_item<1> itemID) {
      size_t globalid = itemID.get_global(0);
      size_t localid = itemID.get_local(0);
      /* All threads collectively read from global memory into local.
       * The barrier ensures all threads' IO is resolved before
       * execution continues (strictly speaking, all threads within
       * a single work-group - there is no co-ordination between
       * work-groups, only work-items). */
      if (globalid < length) {
        scratch[localid] = aI[globalid];
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      /* Apply the reduction operation between the current local
       * id and the one on the other half of the vector. */
      if (globalid < length) {
        auto min = (length < local) ? length : local;
        for (size_t offset = min / 2; offset > 0; offset /= 2) {
          if (localid < offset) {
            auto accum = op.initialize();
            op.reduce(scratch[localid], &accum);
            op.reduce(scratch[localid + offset], &accum);
            op.finalize(accum);
            scratch[localid]=accum;
            //scratch[localid] += scratch[localid + offset];
          }
          itemID.barrier(cl::sycl::access::fence_space::local_space);
        }
        /* The final result will be stored in local id 0. */
        if (localid == 0) {
          aI[itemID.get_group(0)] = scratch[localid];
          if((length<=local) && globalid ==0){
            auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
            aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0];
          }
        }
      }
    }

  };

/// ReductionFunctor
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
 public:
  typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
  ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
  :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
  void operator()(cl::sycl::nd_item<1> itemID) {

    typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
    auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
    /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
    /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
    /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
    const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
    /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
    /// the device_evaluator is detectable and recognisable on the device.
    typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
    auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
    /// const cast added as a naive solution to solve the qualifier drop error
    auto globalid=static_cast<Index>(itemID.get_global_linear_id());
    if (globalid< range) {
      typename DeviceSelf::CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
      functor.finalize(accum);
      output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum;
    }
  }
 private:
  write_accessor output_accessor;
  ptrdiff_t out_offset;
  FunctorExpr functors;
  Tuple_of_Acc tuple_of_accessors;
  Dims dims;
  Op functor;
  Index range;
};

template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
 public:
  typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
  typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
  ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
    Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>,  Index range_, Index num_values_to_reduce_)
  :output_accessor(output_accessor_),  out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
  void operator()(cl::sycl::nd_item<1> itemID) {

    typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
    auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
    /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
    /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
    /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
    const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
    /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
    /// the device_evaluator is detectable and recognisable on the device.
    typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
    auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
    /// const cast added as a naive solution to solve the qualifier drop error
    auto globalid=static_cast<Index>(itemID.get_global_linear_id());
    if (globalid< range) {
      typename DeviceSelf::CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
      functor.finalize(accum);
      output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce;
    }
  }
 private:
  write_accessor output_accessor;
  ptrdiff_t out_offset;
  FunctorExpr functors;
  Tuple_of_Acc tuple_of_accessors;
  Dims dims;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};

template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
class FullReductionKernelFunctor{
public:
  typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
  OutAccessor tmp_global_accessor;
  Index rng , remaining, red_factor;
  Op op;
  Dims dims;
  FunctorExpr functors;
  TupleType tuple_of_accessors;

  FullReductionKernelFunctor(OutAccessor acc,   Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc)
  :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}

  void operator()(cl::sycl::nd_item<1> itemID) {

    typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
    auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
    /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
    /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
    /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
    const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
    /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
    /// the device_evaluator is detectable and recognisable on the device.
    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
    /// const cast added as a naive solution to solve the qualifier drop error
    auto globalid=itemID.get_global_linear_id();

    tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
    : static_cast<CoeffReturnType>(op.initialize());

    if(remaining!=0 && globalid==0 ){
      // this will add the rest of input buffer when the input size is not devidable to red_factor.
      auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
      reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
      auto accum = op.initialize();
      op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
      op.reduce(remaining_reduce, &accum);
      op.finalize(accum);
      tmp_global_accessor.get_pointer()[0]=accum;

    }
  }
};

template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr,  typename Dims, typename Index, typename TupleType>
class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
public:
  typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
  typedef Eigen::internal::SumReducer<CoeffReturnType> Op;

  OutAccessor tmp_global_accessor;
  Index rng , remaining, red_factor;
  Op op;
  Dims dims;
  FunctorExpr functors;
  TupleType tuple_of_accessors;

  FullReductionKernelFunctor(OutAccessor acc,   Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
  :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}

  void operator()(cl::sycl::nd_item<1> itemID) {

    typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
    auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
    /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
    /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
    /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
    const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
    /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
    /// the device_evaluator is detectable and recognisable on the device.
    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
    /// const cast added as a naive solution to solve the qualifier drop error
    auto globalid=itemID.get_global_linear_id();
    auto scale = (rng*red_factor) + remaining;

    tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
    :static_cast<CoeffReturnType>(op.initialize())/scale;

    if(remaining!=0 && globalid==0 ){
      // this will add the rest of input buffer when the input size is not devidable to red_factor.
      auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
      auto accum = op.initialize();
      tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale;
      op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
      op.reduce(remaining_reduce, &accum);
      op.finalize(accum);
      tmp_global_accessor.get_pointer()[0]=accum/scale;

    }
  }
};

}
}
}
#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H