aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/compute/skc/segment_ttck.cl
blob: 6ac068bee6fcbe3ff5e0f8d9691ab05cf1fb1864 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTCK KEY.  IF THE TTCK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//

#include <hs/cl/gen9/hs_cl_macros.h>

#include "atomic_cl.h"
#include "tile.h"

//
//
//

#define HS_KEYS_PER_SLAB  (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK      (HS_LANES_PER_WARP - 1)

//
//
//

#define SKC_YX_NEQ(row,prev)                \
  (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)

//
//
//

__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttck(__global HS_KEY_TYPE              * SKC_RESTRICT const vout,
                        __global uint                     * SKC_RESTRICT const indices,
                        __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
{
  uint const global_id = get_global_id(0);
  uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
  uint const gmem_idx  = gmem_base + (global_id & HS_LANE_MASK);
  uint const lane_idx  = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;

  //
  // LOAD ALL THE ROWS
  //
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                                           \
  HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];

  HS_SLAB_ROWS();

  //
  // LOAD LAST REGISTER FROM COLUMN TO LEFT
  //
  uint  diffs = 0;
  uint2 r0    = r1;

  if (gmem_base > 0) {
    // if this is the first key in any slab but the first then it
    // broadcast loads the last key in previous slab
    r0.hi = as_uint2(vout[gmem_base - 1]).hi;
  } else if (get_sub_group_local_id() == 0) {
    // if this is the first lane in the first slab
    diffs = 1;
  }

  // now shuffle in the last key from the column to the left
  r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);

  //
  // FIND ALL DIFFERENCES IN SLAB
  //
  uint valid = 0;

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  valid |= ((r##row != SKC_ULONG_MAX) << prev);

  HS_SLAB_ROWS();

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  diffs |= (SKC_YX_NEQ(row,prev) << prev);

  HS_SLAB_ROWS();

  //
  // SUM UP THE DIFFERENCES
  //
  uint const valid_diffs = valid & diffs;
  uint const count       = popcount(valid_diffs);
  uint const inclusive   = sub_group_scan_inclusive_add(count);
  uint const exclusive   = inclusive - count;

  //
  // RESERVE SPACE IN THE INDICES ARRAY
  //
  uint next = 0;

  if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
    next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset

  // distribute base across subgroup
  next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);

  //
  // STORE THE INDICES
  //
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  if (valid_diffs & (1 << prev))                \
    indices[next++] = lane_idx + prev;

  HS_SLAB_ROWS();

  //
  // TRANSPOSE THE SLAB AND STORE IT
  //
  HS_TRANSPOSE_SLAB();
}

//
//
//