src/compute/hs/cl/gen9/hs_cl_macros.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199

//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//

#ifndef HS_CL_MACROS_ONCE
#define HS_CL_MACROS_ONCE

//
//
//

#include "hs_cl.h"

//
// Inter-lane compare exchange
//

// default
//
// Orders two keys using min()/max(): after the expansion `a` holds
// the smaller key and `b` the larger one.  Both arguments must be
// assignable and are evaluated more than once.
#define HS_CMP_XCHG_V0(a,b)                     \
  {                                             \
    HS_KEY_TYPE const hs_lo = min(a,b);         \
    HS_KEY_TYPE const hs_hi = max(a,b);         \
    a = hs_lo;                                  \
    b = hs_hi;                                  \
  }

// super slow
//
// XOR-based variant: `a` is lowered to the smaller key, then `b` is
// rebuilt as (b ^ new_a ^ old_a), which leaves `b` untouched when `a`
// did not change and turns it into the old `a` when it did.  Only
// meaningful for integer key types because of the XOR.
#define HS_CMP_XCHG_V1(a,b)                     \
  {                                             \
    HS_KEY_TYPE const hs_a0 = a;                \
    if (!(a < b)) {                             \
      a = b;                                    \
    }                                           \
    b = b ^ (a ^ hs_a0);                        \
  }

// best
//
// Branching variant: swaps `a` and `b` only when a >= b, so that
// afterwards `a` holds the smaller key and `b` the larger one.
//
// The conditional is enclosed in its own block, like the other
// variants, so that an `else` following the macro at the expansion
// site can never bind to the macro's internal `if`.
#define HS_CMP_XCHG_V2(a,b)                     \
  {                                             \
    if ((a) >= (b)) {                           \
      HS_KEY_TYPE const t = a;                  \
      a = b;                                    \
      b = t;                                    \
    }                                           \
  }

// good
//
// Select-based variant: both inputs are snapshotted, a single
// "needs swap" flag (a >= b) is computed, and each output is chosen
// from the snapshots with a conditional expression.
#define HS_CMP_XCHG_V3(a,b)                     \
  {                                             \
    HS_KEY_TYPE const hs_a0   = a;              \
    HS_KEY_TYPE const hs_b0   = b;              \
    int         const hs_swap = hs_a0 >= hs_b0; \
    a = hs_swap ? hs_b0 : hs_a0;                \
    b = hs_swap ? hs_a0 : hs_b0;                \
  }

//
//
//

// Pick the compare-exchange variant from the key width in words:
// two-word keys use the branching V2, one-word keys use the min/max V0.
// No HS_CMP_XCHG is defined for any other HS_KEY_WORDS value.
#if   (HS_KEY_WORDS == 2)
#define HS_CMP_XCHG(a,b)  HS_CMP_XCHG_V2(a,b)
#elif (HS_KEY_WORDS == 1)
#define HS_CMP_XCHG(a,b)  HS_CMP_XCHG_V0(a,b)
#endif

//
// Conditional inter-subgroup flip/half compare exchange
//

// HS_CMP_FLIP(i,a,b)
//
// Fetches `a` and `b` from lane `flip_lane_idx` with
// intel_sub_group_shuffle() and then replaces `a` with
// HS_COND_MIN_MAX(t_lt, a, shuffled b) and `b` with
// HS_COND_MIN_MAX(t_lt, b, shuffled a).  Both shuffles are issued
// before either argument is overwritten.
//
// `flip_lane_idx` and `t_lt` are not parameters; they must be in scope
// at the expansion site.  `i` is not used by the expansion.
#define HS_CMP_FLIP(i,a,b)                                                      \
  {                                                                             \
    HS_KEY_TYPE const hs_peer_a = intel_sub_group_shuffle(a,flip_lane_idx);     \
    HS_KEY_TYPE const hs_peer_b = intel_sub_group_shuffle(b,flip_lane_idx);     \
    a = HS_COND_MIN_MAX(t_lt,a,hs_peer_b);                                      \
    b = HS_COND_MIN_MAX(t_lt,b,hs_peer_a);                                      \
  }

// HS_CMP_HALF(i,a)
//
// Fetches `a` from lane `half_lane_idx` with intel_sub_group_shuffle()
// and replaces `a` with HS_COND_MIN_MAX(t_lt, a, shuffled a).
//
// `half_lane_idx` and `t_lt` are not parameters; they must be in scope
// at the expansion site.  `i` is not used by the expansion.
#define HS_CMP_HALF(i,a)                                                        \
  {                                                                             \
    HS_KEY_TYPE const hs_peer_a = intel_sub_group_shuffle(a,half_lane_idx);     \
    a = HS_COND_MIN_MAX(t_lt,a,hs_peer_a);                                      \
  }

//
// The device's comparison operator might return what we actually
// want.  For example, it appears GEN 'cmp' returns {true:-1,false:0}.
//

#define HS_CMP_IS_ZERO_ONE

//
// HS_LTE_TO_MASK(a,b) : key-wide mask derived from (a <= b)
// HS_CMP_TO_MASK(a)   : key-wide mask derived from an already computed
//                       comparison result `a`
//
// Every argument and every expansion is fully parenthesized so that
// expression arguments (e.g. HS_CMP_TO_MASK(x < y)) and use inside
// larger expressions keep the intended precedence.
//

#ifdef HS_CMP_IS_ZERO_ONE
// OpenCL requires a {true: +1, false: 0} scalar result
// (a <= b) -> { +1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b) ((HS_KEY_TYPE)(-((a) <= (b))))
#define HS_CMP_TO_MASK(a)   ((HS_KEY_TYPE)(-(a)))
#else
// However, OpenCL requires { -1, 0 } for vectors
// (a <= b) -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b) ((a) <= (b)) // FIXME for uint64
#define HS_CMP_TO_MASK(a)   (a)
#endif

//
// The flip/half comparisons rely on a "conditional min/max":
//
//  - if the flag is true, return min(a,b)
//  - otherwise, return max(a,b)
//
// What's a little surprising is that the bit-masking sequence
// (HS_COND_MIN_MAX_V1) is faster than the plain conditional
// (HS_COND_MIN_MAX_V0) for 32-bit keys.
//
// I suspect either a code generation problem or that the sequence
// maps well to the GEN instruction set.
//
// We mostly care about 64-bit keys and unsurprisingly the plain
// conditional (HS_COND_MIN_MAX_V0) is fastest for this wider type.
//

// this is what you would normally use
//
// Yields min(a,b) when `lt` is 1 and max(a,b) when `lt` is 0.  `lt` is
// XORed with the result of (a <= b), so it must be exactly 0 or 1.
// The expansion is fully parenthesized so it can be embedded in a
// larger expression.
#define HS_COND_MIN_MAX_V0(lt,a,b) ((((a) <= (b)) ^ (lt)) ? (b) : (a))

// this seems to be faster for 32-bit keys
//
// Yields min(a,b) when `lt` is true and max(a,b) otherwise: starts
// from (lt ? b : a) and XORs in (a ^ b) whenever HS_LTE_TO_MASK(a,b)
// is all ones, which flips the selection to the other operand.
// The expansion is fully parenthesized so it can be embedded in a
// larger expression.
#define HS_COND_MIN_MAX_V1(lt,a,b) (((lt) ? (b) : (a)) ^ (((a) ^ (b)) & HS_LTE_TO_MASK(a,b)))

//
//
//

// Pick the conditional min/max variant from the key width in words:
// two-word keys use the plain conditional V0, one-word keys use the
// masking V1.  No HS_COND_MIN_MAX is defined for any other
// HS_KEY_WORDS value.
#if   (HS_KEY_WORDS == 2)
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
#elif (HS_KEY_WORDS == 1)
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b)
#endif

//
// This snarl of macros is for transposing a "slab" of sorted elements
// into linear order.
//
// This can occur as the last step in hs_sort() or via a custom kernel
// that inspects the slab and then transposes and stores it to memory.
//
// The slab format can be inspected more efficiently than a linear
// arrangement.
//
// The prime example is detecting when adjacent keys (in sort order)
// have differing high order bits ("key changes").  The index of each
// change is recorded to an auxiliary array.
//
// A post-processing step like this needs to be able to navigate the
// slab and eventually transpose and store the slab in linear order.
//

// HS_TRANSPOSE_REG pastes `prefix` and `row` into one identifier
// (e.g. prefix `r` and row `3` name `r3`).
// HS_TRANSPOSE_DECL starts a `HS_KEY_TYPE const` declaration of that
// identifier; the expansion site supplies the initializer.
#define HS_TRANSPOSE_REG(prefix,row)   prefix##row
#define HS_TRANSPOSE_DECL(prefix,row)  HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row)

// HS_TRANSPOSE_DELTA(level) : HS_LANES_PER_WARP + 2^(level-1)
// HS_TRANSPOSE_IF(level)    : bit (level-1) of this lane's
//                             get_sub_group_local_id()
// HS_TRANSPOSE_LL(level)    : 0 when that bit is set, else the delta
// HS_TRANSPOSE_UR(level)    : the delta when that bit is set, else 0
//
// `level` and the LL/UR conditional expansions are parenthesized so the
// macros keep their meaning when used inside larger expressions.
#define HS_TRANSPOSE_DELTA(level)     (HS_LANES_PER_WARP + (1 << ((level)-1)))
#define HS_TRANSPOSE_IF(level)        ((get_sub_group_local_id() >> ((level) - 1)) & 1)

#define HS_TRANSPOSE_LL(level)        (HS_TRANSPOSE_IF(level) ? 0 : HS_TRANSPOSE_DELTA(level))
#define HS_TRANSPOSE_UR(level)        (HS_TRANSPOSE_IF(level) ? HS_TRANSPOSE_DELTA(level) : 0)

// Names of the per-level shuffle deltas: delta_ll_<level> and
// delta_ur_<level>.
#define HS_TRANSPOSE_DELTA_LL(level)  delta_ll_##level
#define HS_TRANSPOSE_DELTA_UR(level)  delta_ur_##level

// Declares both per-level deltas as `uint const` locals, initialized
// from HS_TRANSPOSE_LL(level) and HS_TRANSPOSE_UR(level).  The
// expansion already ends with a semicolon.
#define HS_TRANSPOSE_STAGE(level)                                       \
  uint const HS_TRANSPOSE_DELTA_LL(level) = HS_TRANSPOSE_LL(level);     \
  uint const HS_TRANSPOSE_DELTA_UR(level) = HS_TRANSPOSE_UR(level);

// HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur)
//
// Declares two new const registers, <prefix_curr><row_ll> and
// <prefix_curr><row_ur>, from the pair <prefix_prev><row_ll> and
// <prefix_prev><row_ur>:
//
//  - the row_ll register via intel_sub_group_shuffle_down() with
//    delta_ll_<level>
//  - the row_ur register via intel_sub_group_shuffle_up() with
//    delta_ur_<level>
//
// The delta_ll_<level> / delta_ur_<level> constants must already be in
// scope, e.g. from HS_TRANSPOSE_STAGE(level).  The expansion already
// ends with a semicolon.
//
// The final line deliberately carries no line continuation so the
// definition cannot absorb whatever line follows it.
#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
  HS_TRANSPOSE_DECL(prefix_curr,row_ll) =                               \
    intel_sub_group_shuffle_down(HS_TRANSPOSE_REG(prefix_prev,row_ll),  \
                                 HS_TRANSPOSE_REG(prefix_prev,row_ur),  \
                                 HS_TRANSPOSE_DELTA_LL(level));         \
  HS_TRANSPOSE_DECL(prefix_curr,row_ur) =                               \
    intel_sub_group_shuffle_up(HS_TRANSPOSE_REG(prefix_prev,row_ll),    \
                               HS_TRANSPOSE_REG(prefix_prev,row_ur),    \
                               HS_TRANSPOSE_DELTA_UR(level));

// #define HS_TRANSPOSE_LOAD(row)                                        \
//   HS_TRANSPOSE_DECL(0,row) = (vout + gmem_idx)[(row-1) << HS_LANES_PER_WARP_LOG2];

// HS_TRANSPOSE_REMAP(prefix,row_from,row_to)
//
// Stores register <prefix><row_from> to
// (vout + gmem_idx)[(row_to - 1) << HS_LANES_PER_WARP_LOG2].
//
// `vout` and `gmem_idx` are not parameters; they must be in scope at
// the expansion site.  `row_to` is 1-based and is parenthesized so an
// expression argument keeps its own precedence.  The expansion already
// ends with a semicolon.
#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to)                      \
  (vout + gmem_idx)[((row_to)-1) << HS_LANES_PER_WARP_LOG2] =           \
    HS_TRANSPOSE_REG(prefix,row_from);

//
// undefine these if you want to override
//
// Both hooks take no arguments and expand to nothing by default.
//

#define HS_TRANSPOSE_PREAMBLE()
#define HS_TRANSPOSE_BODY()

//
//
//

#endif

//
//
//