// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorScanSycl.h
 *
 * \brief:
 *  Tensor Scan SYCL implements an extended version of
 * "Efficient parallel scan algorithms for GPUs" for Tensor operations.
 * The algorithm requires up to three stages (and consequently up to
 * three kernel launches), depending on the size of the tensor. In the
 * first stage (ScanKernelFunctor, instantiated with scan_step::first),
 * each thread within a work-group scans its own allocated elements,
 * which reduces the total number of blocks; the threads of the
 * work-group then reduce the per-block totals into a temporary buffer.
 * In the second stage the same kernel, instantiated with
 * scan_step::second, takes the temporary buffer as input, and the
 * threads of each work-group scan the block boundaries generated by
 * the first stage, writing the result back to the temporary buffer.
 * If the second stage was required, a third and final kernel
 * (ScanAdjustmentKernelFunctor) folds the scanned block offsets into
 * the output buffer.
 * The original algorithm for the parallel prefix sum can be found here:
 *
 * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient
 * parallel scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech.
 * Rep. NVR-2008-003 1, no. 1 (2008): 1-17.
 *****************************************************************/
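
// A minimal usage sketch (kept as a comment), assuming a SYCL-enabled Eigen
// build: an expression like the one below is what ultimately dispatches to
// the kernels in this file. The device setup follows the pattern of the
// Eigen SYCL tests; the variable names are illustrative.
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::QueueInterface queue_interface((cl::sycl::default_selector()));
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::Tensor<float, 2> in(4, 8), out(4, 8);
//   in.setRandom();
//   float* gpu_in = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
//   float* gpu_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));
//   sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> tin(gpu_in, 4, 8);
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> tout(gpu_out, 4, 8);
//   tout.device(sycl_device) = tin.cumsum(0);  // inclusive scan along dimension 0
//   sycl_device.memcpyDeviceToHost(out.data(), gpu_out, out.size() * sizeof(float));
//   sycl_device.deallocate(gpu_in);
//   sycl_device.deallocate(gpu_out);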

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP

namespace Eigen {
namespace TensorSycl {
namespace internal {

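// Upper bound on the global launch range; each kernel iterates loop_range
// times so that tensors larger than the launched range are still fully
// covered.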
#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
#endif

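// Describes how the flattened tensor is decomposed for the scan: each panel
// holds non_scan_size groups, each group is split into blocks, and each
// thread handles ScanPerThread (strided) elements of a block.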
template <typename index_t>
struct ScanParameters {
  // must be a power of 2
  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
  const index_t total_size;
  const index_t non_scan_size;
  const index_t scan_size;
  const index_t non_scan_stride;
  const index_t scan_stride;
  const index_t panel_threads;
  const index_t group_threads;
  const index_t block_threads;
  const index_t elements_per_group;
  const index_t elements_per_block;
  const index_t loop_range;

  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
      : total_size(total_size_),
        non_scan_size(non_scan_size_),
        scan_size(scan_size_),
        non_scan_stride(non_scan_stride_),
        scan_stride(scan_stride_),
        panel_threads(panel_threads_),
        group_threads(group_threads_),
        block_threads(block_threads_),
        elements_per_group(elements_per_group_),
        elements_per_block(elements_per_block_),
        loop_range(loop_range_) {}
};

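// scan_step::first reads its input through the tensor evaluator;
// scan_step::second re-scans the per-block partial results already stored in
// the temporary buffer.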
enum class scan_step { first, second };
template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
          scan_step stp>
struct ScanKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
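  // Each thread's ScanPerThread elements are scanned as two packets of
  // ScanPerThread / 2 elements; one partial result per packet is staged in
  // local memory.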
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;

  LocalAccessor scratch;
  Evaluator dev_eval;
  OutAccessor out_accessor;
  OutAccessor temp_accessor;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  const bool inclusive;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
                                                          const bool inclusive_)
      : scratch(scratch_),
        dev_eval(dev_eval_),
        out_accessor(out_accessor_),
        temp_accessor(temp_accessor_),
        scanParameters(scanParameters_),
        accumulator(accumulator_),
        inclusive(inclusive_) {}

  template <scan_step sst = stp, typename Input>
  typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
      EIGEN_STRONG_INLINE
      read(const Input &inpt, Index global_id) {
    return inpt.coeff(global_id);
  }

  template <scan_step sst = stp, typename Input>
  typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
      EIGEN_STRONG_INLINE
      read(const Input &inpt, Index global_id) {
    return inpt[global_id];
  }

  template <scan_step sst = stp, typename InclusiveOp>
  typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  first_step_inclusive_Operation(InclusiveOp inclusive_op) {
    inclusive_op();
  }

  template <scan_step sst = stp, typename InclusiveOp>
  typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  first_step_inclusive_Operation(InclusiveOp) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    auto out_ptr = out_accessor.get_pointer();
    auto tmp_ptr = temp_accessor.get_pointer();
    auto scratch_ptr = scratch.get_pointer().get();

    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;
      // we put one element per packet in scratch_mem
      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
      CoeffReturnType inclusive_scan;
      // the actual panel size is scan_size * non_scan_size;
      // elements_per_panel is rounded up to a power of 2 for the binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // this offset is only nonzero when the scan size exceeds elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
                           (global_id < scanParameters.total_size))
                              ? read(dev_eval, global_id)
                              : accumulator.initialize();
        next_elements += scanParameters.scan_stride;
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
        }
      });
      // This loop executes exactly twice (ScanPerThread / PacketSize == 2)
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        Index private_offset = 1;
        // build sum in place up the tree
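        // Blelloch scan on one packet, e.g. [a, b, c, d]: the up-sweep leaves
        // [a, a+b, c, a+b+c+d]; after the last slot is saved to scratch and
        // cleared, the down-sweep yields the exclusive scan [id, a, a+b, a+b+c].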
        EIGEN_UNROLL_LOOP
        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[bi] = accumulator.finalize(accum);
          }
          private_offset *= 2;
        }
        scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
            private_scan[PacketSize - 1 + packetIndex];
        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
        // traverse down tree & build scan
        EIGEN_UNROLL_LOOP
        for (Index d = 1; d < PacketSize; d *= 2) {
          private_offset >>= 1;
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[ai] = private_scan[bi];
            private_scan[bi] = accumulator.finalize(accum);
          }
        }
      }

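      // Same up-sweep/down-sweep, now across the per-packet totals of the
      // whole work-group held in local memory; the barriers keep the tree
      // levels in lockstep.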
      Index offset = 1;
      // build sum in place up the tree
      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch_ptr[ai], &accum);
          accumulator.reduce(scratch_ptr[bi], &accum);
          scratch_ptr[bi] = accumulator.finalize(accum);
        }
        offset *= 2;
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // stage this block's total for the next kernel (only needed when a
      // group spans more than one block)
      if (local_id == 0) {
        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
                                    scanParameters.non_scan_size +
                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
                                block_id;
          tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
        }
        // clear the last element
        scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
      }
      // traverse down tree & build scan
      for (Index d = 1; d < scratch_stride; d *= 2) {
        offset >>= 1;
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch_ptr[ai], &accum);
          accumulator.reduce(scratch_ptr[bi], &accum);
          scratch_ptr[ai] = scratch_ptr[bi];
          scratch_ptr[bi] = accumulator.finalize(accum);
        }
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // This loop executes exactly twice (ScanPerThread / PacketSize == 2)
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        EIGEN_UNROLL_LOOP
        for (Index i = 0; i < PacketSize; i++) {
          CoeffReturnType accum = private_scan[packetIndex + i];
          accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
          private_scan[packetIndex + i] = accumulator.finalize(accum);
        }
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
          private_scan[0] = accumulator.finalize(inclusive_scan);
        }
      });
      next_elements = 0;
      // write out the first set of private results
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
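          // For an exclusive scan element i's result sits in slot i. For an
          // inclusive scan the results were rotated by one (slot 0 holds the
          // full inclusive total), so element i's result sits in slot
          // (i + 1) % ScanPerThread.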
          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
          out_ptr[global_id] = private_scan[private_id];
        }
        next_elements += scanParameters.scan_stride;
      }
    }  // end for loop
  }
};

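// Third stage: every thread reads the scanned offset of its block from the
// temporary buffer and folds it into each element of that block in the
// output buffer.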
template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
struct ScanAdjustmentKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
  InAccessor in_accessor;
  OutAccessor out_accessor;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
                                                                    OutAccessor out_accessor_,
                                                                    const ScanParameters<Index> scanParameters_,
                                                                    Op accumulator_)
      : in_accessor(in_accessor_),
        out_accessor(out_accessor_),
        scanParameters(scanParameters_),
        accumulator(accumulator_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    auto in_ptr = in_accessor.get_pointer();
    auto out_ptr = out_accessor.get_pointer();

    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;

      // the actual panel size is scan_size * non_scan_size;
      // elements_per_panel is rounded up to a power of 2 for the binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // this offset is only nonzero when the scan size exceeds elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;

      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
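      // index of this block's scanned offset in the temporary buffer: one
      // entry per block, laid out by panel, then group, then block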
      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
      CoeffReturnType adjust_val = in_ptr[in_id];

      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          CoeffReturnType accum = adjust_val;
          accumulator.reduce(out_ptr[global_id], &accum);
          out_ptr[global_id] = accumulator.finalize(accum);
        }
        next_elements += scanParameters.scan_stride;
      }
    }
  }
};

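// Host-side helper deriving the launch geometry (work-group size, elements
// per block/group/panel, loop range) from the scan shape and device limits.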
template <typename Index>
struct ScanInfo {
  const Index &total_size;
  const Index &scan_size;
  const Index &panel_size;
  const Index &non_scan_size;
  const Index &scan_stride;
  const Index &non_scan_stride;

  Index max_elements_per_block;
  Index block_size;
  Index panel_threads;
  Index group_threads;
  Index block_threads;
  Index elements_per_group;
  Index elements_per_block;
  Index loop_range;
  Index global_range;
  Index local_range;
  const Eigen::SyclDevice &dev;
  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
                               const Eigen::SyclDevice &dev_)
      : total_size(total_size_),
        scan_size(scan_size_),
        panel_size(panel_size_),
        non_scan_size(non_scan_size_),
        scan_stride(scan_stride_),
        non_scan_stride(non_scan_stride_),
        dev(dev_) {
    // must be a power of 2
    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));

    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;

    elements_per_group =
        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
    const Index elements_per_panel = elements_per_group * non_scan_size;
    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
    block_size = elements_per_group / elements_per_block;
#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
#else
    const Index max_threads = panel_threads * panel_size;
#endif
    global_range = roundUp(max_threads, local_range);
    loop_range = Index(
        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
  }
  inline ScanParameters<Index> get_scan_parameter() {
    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
  }
  inline cl::sycl::nd_range<1> get_thread_range() {
    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
  }
};

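// Host-side wrapper that launches ScanAdjustmentKernelFunctor over the same
// geometry as the main scan kernel.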
template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
struct SYCLAdjustBlockOffset {
  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
                                                           Reducer &accumulator, const Index total_size,
                                                           const Index scan_size, const Index panel_size,
                                                           const Index non_scan_size, const Index scan_stride,
                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);

    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
        AdjustFunctor;
    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFunctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
                                                                       scan_info.max_elements_per_block,
                                                                       scan_info.get_scan_parameter(), accumulator);
  }
};

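// Launches the first-stage scan kernel. When a group spans more than one
// block, the per-block totals are themselves scanned recursively
// (scan_step::second) and then folded back into the output by the
// adjustment kernel.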
template <typename CoeffReturnType, scan_step stp>
struct ScanLauncher_impl {
  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
                                             const Index total_size, const Index scan_size, const Index panel_size,
                                             const Index non_scan_size, const Index scan_stride,
                                             const Index non_scan_stride, const bool inclusive,
                                             const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
    CoeffReturnType *temp_pointer =
        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);

    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
        in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
        scan_info.get_scan_parameter(), accumulator, inclusive);

    if (scan_info.block_size > 1) {
      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
          non_scan_size, Index(1), scan_info.block_size, false, dev);

      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
          non_scan_stride, dev);
    }
    dev.deallocate_temp(temp_pointer);
  }
};

}  // namespace internal
}  // namespace TensorSycl
namespace internal {
template <typename Self, typename Reducer, bool vectorize>
struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
  typedef typename Self::Index Index;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  void operator()(Self &self, EvaluatorPointerType data) {
    const Index total_size = internal::array_prod(self.dimensions());
    const Index scan_size = self.size();
    const Index scan_stride = self.stride();
    // the scan accumulator (e.g. a sum reducer for cumsum)
    auto accumulator = self.accumulator();
    auto inclusive = !self.exclusive();
    auto consume_dim = self.consume_dim();
    auto dev = self.device();

    auto dims = self.inner().dimensions();

    Index non_scan_size = 1;
    Index panel_size = 1;
    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < consume_dim; i++) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
        panel_size *= dims[i];
      }
    } else {
      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim - 1; i >= 0; i--) {
        panel_size *= dims[i];
      }
    }
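    // Example (ColMajor, dims (d0, d1, d2), scanning along dim 1):
    // non_scan_size = d0, panel_size = d2 and scan_stride = d0, so each
    // panel is a contiguous d0 x d1 slab.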
    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
    auto eval_impl = self.inner();
    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
        inclusive, dev);
  }
};
}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP