aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h
blob: 224d5c9d91a4e910b0974e797978b8b2bdb2d3d5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

#ifndef SKC_ONCE_DEVICE_CL_12_H
#define SKC_ONCE_DEVICE_CL_12_H

//
// FIXME -- THERE ARE SOME DUPLICATED TYPEDEFS IN THIS FILE
//
// THESE WILL GO AWAY AS THE TYPING GET POLISHED AND SIMPLIFIED
//

#include "block.h"

//
// HOW TO SELECT A SUBBLOCK AND BLOCK SIZES:
//
// 1) The subblock size should match the natural SIMT/SIMD width of
//    the target device.
//
// 2) Either a square or rectangular (1:2) tile size is chosen.  The
//    tile size is usually determined by the amount of SMEM available
//    to a render kernel subgroup and desired multiprocessor
//    occupancy.
//
// 3) If the tile is rectangular then the block size must be at least
//    twice the size of the subblock size.
//
// 4) A large block size can decrease allocation overhead but there
//    will be diminishing returns as the block size increases.
//

#define SKC_DEVICE_BLOCK_WORDS_LOG2             6  // CHANGE "WORDS" TO "SIZE" ?
#define SKC_DEVICE_SUBBLOCK_WORDS_LOG2          3

#define SKC_TILE_WIDTH_LOG2                     SKC_DEVICE_SUBBLOCK_WORDS_LOG2
#define SKC_TILE_HEIGHT_LOG2                    (SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + 1)

/////////////////////////////////////////////////////////////////
//
// BLOCK POOL INIT
//

#define SKC_BP_INIT_IDS_KERNEL_ATTRIBS
#define SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS      __attribute__((reqd_work_group_size(2,1,1)))

/////////////////////////////////////////////////////////////////
//
// PATHS ALLOC
//

#define SKC_PATHS_ALLOC_KERNEL_ATTRIBS          __attribute__((reqd_work_group_size(1,1,1)))

/////////////////////////////////////////////////////////////////
//
// PATHS COPY
//

#define SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2       SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK?
#define SKC_PATHS_COPY_ELEM_WORDS               1
#define SKC_PATHS_COPY_ELEM_EXPAND()            SKC_EXPAND_1()

#define SKC_PATHS_COPY_KERNEL_ATTRIBS           __attribute__((intel_reqd_sub_group_size(SKC_PATHS_COPY_SUBGROUP_SIZE)))

#define SKC_IS_NOT_PATH_HEAD(sg,I)              ((sg) + get_sub_group_local_id() >= SKC_PATH_HEAD_WORDS)

typedef skc_uint  skc_paths_copy_elem;
typedef skc_uint  skc_pb_idx_v;

/////////////////////////////////////////////////////////////////
//
// FILLS EXPAND
//

#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2     SKC_DEVICE_SUBBLOCK_WORDS_LOG2
#define SKC_FILLS_EXPAND_ELEM_WORDS             1

#define SKC_FILLS_EXPAND_KERNEL_ATTRIBS         __attribute__((intel_reqd_sub_group_size(SKC_FILLS_EXPAND_SUBGROUP_SIZE)))

/////////////////////////////////////////////////////////////////
//
// RASTER ALLOC
//
// NOTE -- Intel subgroup shuffles aren't supported in SIMD32 which is
// why use of the subgroup broadcast produces a compiler error. So a
// subgroup of size 16 is this widest we can require.
//

#define SKC_RASTERS_ALLOC_GROUP_SIZE            16

#if (SKC_RASTERS_ALLOC_GROUP_SIZE <= 16)

#define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS        __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE)))
#define SKC_RASTERS_ALLOC_LOCAL_ID()            get_sub_group_local_id()
#define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v)      sub_group_scan_inclusive_add(v)
#define SKC_RASTERS_ALLOC_BROADCAST(v,i)        sub_group_broadcast(v,i)

#else

#define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS        __attribute__((reqd_work_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE,1,1)))
#define SKC_RASTERS_ALLOC_LOCAL_ID()            get_local_id(0)
#define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v)      work_group_scan_inclusive_add(v)
#define SKC_RASTERS_ALLOC_BROADCAST(v,i)        work_group_broadcast(v,i)

#endif

/////////////////////////////////////////////////////////////////
//
// RASTERIZE
//

#define SKC_RASTERIZE_SUBGROUP_SIZE             SKC_DEVICE_SUBBLOCK_WORDS
#define SKC_RASTERIZE_VECTOR_SIZE_LOG2          0
#define SKC_RASTERIZE_WORKGROUP_SUBGROUPS       1

#define SKC_RASTERIZE_KERNEL_ATTRIBS                                    \
  __attribute__((intel_reqd_sub_group_size(SKC_RASTERIZE_SUBGROUP_SIZE))) \
  __attribute__((reqd_work_group_size(SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_WORKGROUP_SUBGROUPS, 1, 1)))

#define SKC_RASTERIZE_FLOAT                     float
#define SKC_RASTERIZE_UINT                      uint
#define SKC_RASTERIZE_INT                       int
#define SKC_RASTERIZE_PREDICATE                 bool
#define SKC_RASTERIZE_POOL                      uint

#define SKC_RASTERIZE_TILE_HASH_X_BITS          1
#define SKC_RASTERIZE_TILE_HASH_Y_BITS          2

typedef skc_block_id_t skc_block_id_v_t;
typedef skc_uint2      skc_ttsk_v_t;
typedef skc_uint2      skc_ttsk_s_t;

// SKC_STATIC_ASSERT(SKC_RASTERIZE_POOL_SIZE > SKC_RASTERIZE_SUBGROUP_SIZE);

/////////////////////////////////////////////////////////////////
//
// PREFIX
//

#define SKC_PREFIX_SUBGROUP_SIZE               8 // for now this had better be SKC_DEVICE_SUBBLOCK_WORDS
#define SKC_PREFIX_WORKGROUP_SUBGROUPS         1

#define SKC_PREFIX_KERNEL_ATTRIBS                                       \
  __attribute__((intel_reqd_sub_group_size(SKC_PREFIX_SUBGROUP_SIZE)))  \
  __attribute__((reqd_work_group_size(SKC_PREFIX_SUBGROUP_SIZE * SKC_PREFIX_WORKGROUP_SUBGROUPS, 1, 1)))

#define SKC_PREFIX_TTP_V                       skc_uint2
#define SKC_PREFIX_TTS_V_BITFIELD              skc_int

#define SKC_PREFIX_TTS_VECTOR_INT_EXPAND       SKC_EXPAND_1

#define SKC_PREFIX_SMEM_ZERO                   ulong
#define SKC_PREFIX_SMEM_ZERO_WIDTH             (sizeof(SKC_PREFIX_SMEM_ZERO) / sizeof(skc_ttp_t))
#define SKC_PREFIX_SMEM_COUNT_BLOCK_ID         8

#define SKC_PREFIX_BLOCK_ID_V_SIZE             SKC_PREFIX_SUBGROUP_SIZE

#define SKC_PREFIX_TTXK_V_SIZE                 SKC_PREFIX_SUBGROUP_SIZE
#define SKC_PREFIX_TTXK_V_MASK                 (SKC_PREFIX_TTXK_V_SIZE - 1)

typedef skc_uint       skc_bp_elem_t;

typedef skc_uint2      skc_ttrk_e_t;
typedef skc_uint2      skc_ttsk_v_t;
typedef skc_uint2      skc_ttsk_s_t;
typedef skc_uint2      skc_ttpk_s_t;
typedef skc_uint2      skc_ttxk_v_t;

typedef skc_int        skc_tts_v_t;

typedef skc_int        skc_ttp_t;

typedef skc_uint       skc_raster_yx_s;

typedef skc_block_id_t skc_block_id_v_t;
typedef skc_block_id_t skc_block_id_s_t;

/////////////////////////////////////////////////////////////////
//
// PLACE
//

#define SKC_PLACE_SUBGROUP_SIZE                16
#define SKC_PLACE_WORKGROUP_SUBGROUPS          1

#define SKC_PLACE_KERNEL_ATTRIBS                                       \
  __attribute__((intel_reqd_sub_group_size(SKC_PLACE_SUBGROUP_SIZE)))  \
  __attribute__((reqd_work_group_size(SKC_PLACE_SUBGROUP_SIZE * SKC_PLACE_WORKGROUP_SUBGROUPS, 1, 1)))

typedef skc_uint  skc_bp_elem_t;

typedef skc_uint  skc_ttsk_lo_t;
typedef skc_uint  skc_ttsk_hi_t;

typedef skc_uint  skc_ttpk_lo_t;
typedef skc_uint  skc_ttpk_hi_t;

typedef skc_uint  skc_ttxk_lo_t;
typedef skc_uint  skc_ttxk_hi_t;

typedef skc_uint2 skc_ttck_t;

typedef skc_bool  skc_pred_v_t;
typedef skc_int   skc_int_v_t;

/////////////////////////////////////////////////////////////////
//
// RENDER
//

#define SKC_ARCH_GEN9

#if defined(__OPENCL_C_VERSION__)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define SKC_RENDER_SUBGROUP_SIZE               8
#define SKC_RENDER_WORKGROUP_SUBGROUPS         1

#define SKC_RENDER_KERNEL_ATTRIBS                                       \
  __attribute__((intel_reqd_sub_group_size(SKC_RENDER_SUBGROUP_SIZE)))  \
  __attribute__((reqd_work_group_size(SKC_RENDER_SUBGROUP_SIZE * SKC_RENDER_WORKGROUP_SUBGROUPS, 1, 1)))

#define SKC_RENDER_SCANLINE_VECTOR_SIZE        2

#define SKC_RENDER_REGS_COLOR_R                2
#define SKC_RENDER_REGS_COVER_R                3

#define SKC_RENDER_TTSB_EXPAND()               SKC_EXPAND_1()

#define SKC_RENDER_TTS_V                       skc_int
#define SKC_RENDER_TTS_V_BITFIELD              skc_int

#define SKC_RENDER_TTP_V                       skc_int2
#define SKC_RENDER_AREA_V                      skc_int2

#define SKC_RENDER_TILE_COLOR_PAIR             half2
#define SKC_RENDER_TILE_COLOR_PAIR_LOAD(x,v)   vload2(x,v)

#define SKC_RENDER_SURFACE_COLOR               half4
#define SKC_RENDER_SURFACE_WRITE               write_imageh

// #define SKC_RENDER_TTXB_VECTOR_INT             int2
// #define SKC_RENDER_TTXB_VECTOR_UINT            uint2

#define SKC_RENDER_WIDE_AA                     ulong // SLM = 64 bytes/clock

#define SKC_RENDER_TILE_COLOR                  half2
#define SKC_RENDER_TILE_COVER                  half2

#define SKC_RENDER_ACC_COVER_INT               int2
#define SKC_RENDER_ACC_COVER_UINT              uint2

#define SKC_RENDER_GRADIENT_FLOAT              float2
#define SKC_RENDER_GRADIENT_INT                int2
#define SKC_RENDER_GRADIENT_STOP               int2
#define SKC_RENDER_GRADIENT_FRAC               half2
#define SKC_RENDER_GRADIENT_COLOR_STOP         half

#define SKC_RENDER_SURFACE_U8_RGBA             uint2

#define SKC_RENDER_TILE_COLOR_VECTOR           uint16
#define SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT uint
#define SKC_RENDER_TILE_COLOR_VECTOR_COUNT     ((sizeof(SKC_RENDER_TILE_COLOR) * 4 * SKC_TILE_WIDTH) / sizeof(SKC_RENDER_TILE_COLOR_VECTOR))

/////////////////////////////////////////////////////////////////
//
// PATHS & RASTERS RECLAIM
//
// FIXME -- investigate enabling the stride option for a smaller grid
// that iterates over a fixed number of threads.  Since reclamation is
// a low-priority task, it's probably reasonable to trade longer
// reclamation times for lower occupancy of the device because it
// might delay the fastpath of the pipeline.
//

#define SKC_RECLAIM_ARRAY_SIZE                  (7 * 8 / 2) // 8 EUs with 7 hardware threads divided by 2 is half a sub-slice

/////////////////////////////////////////////////////////////////
//
// PATHS RECLAIM
//

#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2    SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK?
#define SKC_PATHS_RECLAIM_LOCAL_ELEMS           1
#define SKC_PATHS_RECLAIM_KERNEL_ATTRIBS        __attribute__((intel_reqd_sub_group_size(SKC_PATHS_RECLAIM_SUBGROUP_SIZE)))

/////////////////////////////////////////////////////////////////
//
// RASTERS RECLAIM
//

#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2  SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK?
#define SKC_RASTERS_RECLAIM_LOCAL_ELEMS         1
#define SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS      __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)))

//
// COMMON -- FIXME -- HOIST THESE ELSEWHERE
//

#define SKC_DEVICE_BLOCK_WORDS                 (1u << SKC_DEVICE_BLOCK_WORDS_LOG2)
#define SKC_DEVICE_SUBBLOCK_WORDS              (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2)

#define SKC_DEVICE_BLOCK_DWORDS                (SKC_DEVICE_BLOCK_WORDS / 2)

#define SKC_DEVICE_BLOCK_WORDS_MASK            SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2)
#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK    SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2 - SKC_DEVICE_SUBBLOCK_WORDS_LOG2)

#define SKC_DEVICE_SUBBLOCKS_PER_BLOCK         (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS)

#define SKC_TILE_RATIO                         (SKC_TILE_HEIGHT / SKC_TILE_WIDTH)

//
//
//

#define SKC_PATHS_COPY_SUBGROUP_SIZE           (1 << SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2)
#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE        (1 << SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2)
#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE      (1 << SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2)
#define SKC_FILLS_EXPAND_SUBGROUP_SIZE         (1 << SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2)

//
//
//

#endif

//
//
//