aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
blob: e9accde30758904f5dcae00264c1e483be413b1b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTRK KEY.  IF THE TTRK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//

#include "tile.h"
#include "raster_builder_cl_12.h" // need meta_in structure
#include "device_cl_12.h"

//
// A "slab" is the unit of work sorted by the HotSort kernels:
// HS_KEYS_PER_LANE keys held in registers by each of the
// HS_LANES_PER_WARP lanes of one subgroup.
//

#define HS_KEYS_PER_SLAB  (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK      (HS_LANES_PER_WARP - 1)

//
// THE BEST TYPE TO ZERO SMEM
//
// Local memory is cleared through a ulong (64-bit) view so each
// store zeroes two 32-bit meta words at once.
//

#define SKC_ZERO_TYPE  ulong
#define SKC_ZERO_WORDS 2

//
// THE ORDER OF COMPONENTS IS:
//
// 0: blocks
// 1: offset
// 2: pk
// 3: rk
//

// Pick the narrowest meta layout that can hold per-cohort counts for
// one slab: with fewer than 256 keys per slab each packed component
// fits in a byte (uint), otherwise 16-bit fields (uint2) are needed.
#if (HS_KEYS_PER_SLAB < 256)

#define SKC_META_TYPE       uint
#define SKC_META_WORDS      1

#define SKC_COMPONENT_TYPE  uchar

#else

#define SKC_META_TYPE       uint2
#define SKC_META_WORDS      2

#define SKC_COMPONENT_TYPE  ushort

#endif

//
// Narrowest unsigned type that can hold a cohort id.
//

#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
#define SKC_COHORT_TYPE uchar
#else
#define SKC_COHORT_TYPE ushort
#endif

//
// Extract the cohort id from the high 32 bits of a TTRK key — the
// cohort occupies the topmost bits of the hi word.
//

#define SKC_COHORT_ID(row)                      \
  as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT

//
// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
//

// A key marks the start of a block when its subblock bits (low word)
// are all zero.
#define SKC_IS_BLOCK(row)                                               \
  ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)

// XOR of the hi words of two adjacent keys: zero bits wherever the
// y/x (and cohort) fields agree.
#define SKC_YX(row,prev)                        \
  (as_uint2(r##row).hi ^ as_uint2(r##prev).hi)

// A key is a "pk" when it differs from its predecessor only in x and
// by a nonzero amount — i.e. (yx_xor - 1) < X mask. Relies on the
// unsigned wrap of (0 - 1) to reject identical yx. TODO confirm
// against TTRK key layout docs.
#define SKC_IS_PK(row,prev)                             \
  ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)

//
// COHORT   SIZE IS ALWAYS A POWER-OF-TWO
// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
//
// COHORT SIZE >= SUBGROUP SIZE
//

#define SKC_COHORT_SIZE           (1<<SKC_TTRK_HI_BITS_COHORT)

// Counts for the SMEM-zeroing loop: how many SKC_ZERO_TYPE stores
// cover the whole meta table, and the remainder modulo subgroup size.
#define SKC_ZERO_RATIO            (SKC_ZERO_WORDS / SKC_META_WORDS)
#define SKC_META_ZERO_COUNT       (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
#define SKC_META_ZERO_REM         (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))

// Four packed components per meta entry (blocks/offset/pk/rk) and the
// total component count across the cohort table.
#define SKC_META_COMPONENTS       4
#define SKC_META_COMPONENT_COUNT  (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))

//
//
//

//
// Segment a sorted extent of TTRK keys.
//
// Each subgroup owns one slab of HS_KEYS_PER_SLAB keys. The kernel:
//
//   1. loads the slab and the last key of the previous slab,
//   2. extracts cohort ids and flags per-key conditions
//      (cohort diff / block start / pk / valid),
//   3. transposes the slab back to global memory,
//   4. accumulates per-cohort meta counts in local memory,
//   5. atomically adds the local meta table into the global
//      `metas` table.
//
// vout  : sorted TTRK keys (read, then rewritten transposed)
// metas : global per-cohort meta table of packed uints
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
                        __global uint        * SKC_RESTRICT const metas)
{
  // one meta table per work-group, viewed three ways: as meta words
  // for atomics, as 64-bit words for fast zeroing, and as packed
  // components for the final global scatter
  __local union
  {
    SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
    SKC_ZERO_TYPE          z[SKC_META_ZERO_COUNT];
    SKC_COMPONENT_TYPE     c[SKC_META_COMPONENT_COUNT];
  } shared;

  uint const global_id = get_global_id(0);
  uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
  uint const gmem_idx  = gmem_base + (global_id & HS_LANE_MASK);
  uint const gmem_off  = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;

  //
  // LOAD ALL THE ROWS
  //
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                                           \
  HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];

  HS_SLAB_ROWS();

  //
  // LOAD LAST REGISTER FROM COLUMN TO LEFT
  //
  uint  diffs = 0;
  uint2 r0    = 0;

  if (gmem_base > 0) {
    // if this is the first key in any slab but the first then it
    // broadcast loads the last key in previous slab
    r0.hi = as_uint2(vout[gmem_base - 1]).hi;
  } else {
    // otherwise broadcast the first key in the first slab
    r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
    // and mark it as an implicit diff
    if (get_sub_group_local_id() == 0)
      diffs = 1;
  }

  // now shuffle in the last key from the column to the left
  r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);

  // shift away y/x
  SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;

  //
  // EXTRACT ALL COHORT IDS EARLY...
  //
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                           \
  SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);

  HS_SLAB_ROWS();

  //
  // DEBUG
  //
#if 0
  if (gmem_base == HS_KEYS_PER_SLAB * 7)
    {
      if (get_sub_group_local_id() == 0)
        printf("\n%llX ",as_ulong(r0));
      else
        printf("%llX ",as_ulong(r0));
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
      if (get_sub_group_local_id() == 0)        \
        printf("\n%llX ",r##row);               \
      else                                      \
        printf("%llX ",r##row);

      HS_SLAB_ROWS();
    }
#endif

  //
  // CAPTURE ALL CONDITIONS WE CARE ABOUT
  //
  // Diffs must be captured before cohorts
  //
  uint            valid  = 0;  // key != SKC_ULONG_MAX sentinel
  uint            blocks = 0;  // key starts a block
  uint            pks    = 0;  // key is a "pk" relative to its predecessor
  SKC_COHORT_TYPE c_max  = 0;  // max cohort id seen in this lane

  //
  // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
  // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
  //
#if 0

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  diffs |= ((c##row != c##prev) << prev);

  HS_SLAB_ROWS();

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  blocks |= (SKC_IS_BLOCK(row) << prev);

  HS_SLAB_ROWS();

  // BUGFIX(review): this disabled variant previously read
  // `pks |= SKC_IS_PK(row,prev) << prev);` — unbalanced parens that
  // would not compile if the branch were enabled.
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  pks |= (SKC_IS_PK(row,prev) << prev);

  HS_SLAB_ROWS();

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  valid |= ((r##row != SKC_ULONG_MAX) << prev);

  HS_SLAB_ROWS();

#else

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  if (c##row != c##prev)                        \
    diffs |= 1<<prev;

  HS_SLAB_ROWS();

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  if (SKC_IS_BLOCK(row))                        \
    blocks |= 1<<prev;

  HS_SLAB_ROWS();

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  if (SKC_IS_PK(row,prev))                      \
    pks |= 1<<prev;

  HS_SLAB_ROWS();

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  if (r##row != SKC_ULONG_MAX) {                \
    valid |= 1<<prev;                           \
    c_max  = max(c_max,c##row);                 \
  }

  HS_SLAB_ROWS();

#endif

  //
  // TRANSPOSE THE SLAB AND STORE IT
  //
  HS_TRANSPOSE_SLAB();

  // the min cohort is the first key in the slab
  uint const c_min = sub_group_broadcast(c1,0);

  // the max cohort is the max across all lanes
  c_max = sub_group_reduce_max(c_max);

#if 0 // REMOVE ME LATER
  if (get_sub_group_local_id() == 0)
    printf("%3u : ( %3u , %3u )\n",
           get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
#endif

  //
  // ZERO SMEM
  //
  // zero only the meta info for the cohort ids found in this slab
  //
#if   (SKC_ZERO_WORDS >= SKC_META_WORDS)
  uint       zz     = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
  uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;

  for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
    shared.z[zz] = 0;
#else
  // ERROR -- it's highly unlikely that the zero type is smaller than
  // the meta type
#error("Unsupported right now...")
#endif

  //
  // ACCUMULATE AND STORE META INFO
  //
  uint const    valid_blocks = valid & blocks;
  uint const    valid_pks    = valid & pks & ~diffs;
  SKC_META_TYPE meta         = ( 0 );

  // carry-out: add this lane's running meta into the last cohort it saw
#define SKC_META_LOCAL_ADD(meta)                \
  atomic_add(shared.m+HS_REG_LAST(c),meta);

  // a cohort boundary was crossed: flush the running meta to SMEM
#define SKC_META_LOCAL_STORE(meta,prev)         \
  shared.m[c##prev] = meta;

  // note this is purposefully off by +1
#define SKC_META_RESET(meta,curr)               \
  meta = ((gmem_off + curr) << 8);

#if 0

  // FIXME -- this can be tweaked to shift directly
#define SKC_META_ADD(meta,prev,blocks,pks,rks)  \
  meta += ((((blocks >> prev) & 1)      ) |     \
           (((pks    >> prev) & 1) << 16) |     \
           (((rks    >> prev) & 1) << 24));

#else

  // bump the packed counters: blocks in bits [7:0], pks in [23:16],
  // rks in [31:24] (offset occupies [15:8] via SKC_META_RESET)
#define SKC_META_ADD(meta,prev,blocks,pks,rks)  \
  if (blocks & (1<<prev))                       \
    meta += 1;                                  \
  if (pks    & (1<<prev))                       \
    meta += 1<<16;                              \
  if (rks    & (1<<prev))                       \
    meta += 1<<24;

#endif

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  if (diffs & (1<<prev)) {                      \
    SKC_META_LOCAL_STORE(meta,prev);            \
    SKC_META_RESET(meta,row);                   \
  }                                             \
  SKC_META_ADD(meta,prev,                       \
               valid_blocks,                    \
               valid_pks,                       \
               valid);

  HS_SLAB_ROWS();

  //
  // ATOMICALLY ADD THE CARRIED OUT METAS
  //
#if 0 // BUG
  if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
    SKC_META_LOCAL_ADD(meta);
#else
  if (meta != 0)
    SKC_META_LOCAL_ADD(meta);
#endif

  //
  // NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
  //

  // convert the slab offset to an extent offset
  bool const is_offset = (get_sub_group_local_id() & 3) == 1;
  uint const adjust    = is_offset ? gmem_base - 1 : 0;

  //
  // only process the meta components found in this slab
  //
  uint const cc_min = c_min * SKC_META_COMPONENTS;
  uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
  uint       cc     = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();

  // first iteration is guarded below cc_min; remaining iterations only
  // need the upper bound
  if ((cc >= cc_min) && (cc <= cc_max))
    {
      uint const c = shared.c[cc];

      if (c != 0)
        atomic_add(metas+cc,c+adjust);
    }

  cc += HS_LANES_PER_WARP;

  for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
    {
      uint const c = shared.c[cc];

      if (c != 0)
        atomic_add(metas+cc,c+adjust);
    }
}

//
//
//