path: root/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
Diffstat (limited to 'src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl')
-rw-r--r--   src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl   788
1 file changed, 394 insertions, 394 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
index e9accde307..9db82d5f98 100644
--- a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
@@ -1,394 +1,394 @@
-/*
- * Copyright 2018 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
-// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
-// KERNEL WILL NEED TO BE UPDATED
-//
-
-#include "tile.h"
-#include "raster_builder_cl_12.h" // need meta_in structure
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
-#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
-
-//
-// THE BEST TYPE TO ZERO SMEM
-//
-
-#define SKC_ZERO_TYPE ulong
-#define SKC_ZERO_WORDS 2
-
-//
-// THE ORDER OF COMPONENTS IS:
-//
-// 0: blocks
-// 1: offset
-// 2: pk
-// 3: rk
-//
-
-#if (HS_KEYS_PER_SLAB < 256)
-
-#define SKC_META_TYPE uint
-#define SKC_META_WORDS 1
-
-#define SKC_COMPONENT_TYPE uchar
-
-#else
-
-#define SKC_META_TYPE uint2
-#define SKC_META_WORDS 2
-
-#define SKC_COMPONENT_TYPE ushort
-
-#endif
-
-//
-//
-//
-
-#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
-#define SKC_COHORT_TYPE uchar
-#else
-#define SKC_COHORT_TYPE ushort
-#endif
-
-//
-//
-//
-
-#define SKC_COHORT_ID(row) \
- as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
-
-//
-// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
-//
-
-#define SKC_IS_BLOCK(row) \
- ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
-
-#define SKC_YX(row,prev) \
- (as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
-
-#define SKC_IS_PK(row,prev) \
- ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
-
-//
-// COHORT SIZE IS ALWAYS A POWER-OF-TWO
-// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
-//
-// COHORT SIZE >= SUBGROUP SIZE
-//
-
-#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
-
-#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
-#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
-#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
-
-#define SKC_META_COMPONENTS 4
-#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
-
-//
-//
-//
-
-__kernel
-__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
-void
-skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
- __global uint * SKC_RESTRICT const metas)
-{
- __local union
- {
- SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
- SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT];
- SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT];
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
- uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
- uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
-
- //
- // LOAD ALL THE ROWS
- //
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
-
- HS_SLAB_ROWS();
-
- //
- // LOAD LAST REGISTER FROM COLUMN TO LEFT
- //
- uint diffs = 0;
- uint2 r0 = 0;
-
- if (gmem_base > 0) {
-    // if this is any slab but the first, then broadcast-load the
-    // last key in the previous slab
- r0.hi = as_uint2(vout[gmem_base - 1]).hi;
- } else {
- // otherwise broadcast the first key in the first slab
- r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
- // and mark it as an implicit diff
- if (get_sub_group_local_id() == 0)
- diffs = 1;
- }
-
- // now shuffle in the last key from the column to the left
- r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
-
- // shift away y/x
- SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
-
- //
- // EXTRACT ALL COHORT IDS EARLY...
- //
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
-
- HS_SLAB_ROWS();
-
- //
- // DEBUG
- //
-#if 0
- if (gmem_base == HS_KEYS_PER_SLAB * 7)
- {
- if (get_sub_group_local_id() == 0)
- printf("\n%llX ",as_ulong(r0));
- else
- printf("%llX ",as_ulong(r0));
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (get_sub_group_local_id() == 0) \
- printf("\n%llX ",r##row); \
- else \
- printf("%llX ",r##row);
-
- HS_SLAB_ROWS();
- }
-#endif
-
- //
- // CAPTURE ALL CONDITIONS WE CARE ABOUT
- //
- // Diffs must be captured before cohorts
- //
- uint valid = 0;
- uint blocks = 0;
- uint pks = 0;
- SKC_COHORT_TYPE c_max = 0;
-
- //
- // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
- // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
- //
-#if 0
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- diffs |= ((c##row != c##prev) << prev);
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- blocks |= (SKC_IS_BLOCK(row) << prev);
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
-  pks |= (SKC_IS_PK(row,prev) << prev);
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- valid |= ((r##row != SKC_ULONG_MAX) << prev);
-
- HS_SLAB_ROWS();
-
-#else
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (c##row != c##prev) \
- diffs |= 1<<prev;
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (SKC_IS_BLOCK(row)) \
- blocks |= 1<<prev;
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (SKC_IS_PK(row,prev)) \
- pks |= 1<<prev;
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (r##row != SKC_ULONG_MAX) { \
- valid |= 1<<prev; \
- c_max = max(c_max,c##row); \
- }
-
- HS_SLAB_ROWS();
-
-#endif
-
- //
- // TRANSPOSE THE SLAB AND STORE IT
- //
- HS_TRANSPOSE_SLAB();
-
- // the min cohort is the first key in the slab
- uint const c_min = sub_group_broadcast(c1,0);
-
- // the max cohort is the max across all lanes
- c_max = sub_group_reduce_max(c_max);
-
-#if 0 // REMOVE ME LATER
- if (get_sub_group_local_id() == 0)
- printf("%3u : ( %3u , %3u )\n",
- get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
-#endif
-
- //
- // ZERO SMEM
- //
- // zero only the meta info for the cohort ids found in this slab
- //
-#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
- uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
- uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
-
- for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
- shared.z[zz] = 0;
-#else
- // ERROR -- it's highly unlikely that the zero type is smaller than
- // the meta type
-#error("Unsupported right now...")
-#endif
-
- //
- // ACCUMULATE AND STORE META INFO
- //
- uint const valid_blocks = valid & blocks;
- uint const valid_pks = valid & pks & ~diffs;
- SKC_META_TYPE meta = ( 0 );
-
-#define SKC_META_LOCAL_ADD(meta) \
- atomic_add(shared.m+HS_REG_LAST(c),meta);
-
-#define SKC_META_LOCAL_STORE(meta,prev) \
- shared.m[c##prev] = meta;
-
- // note this is purposefully off by +1
-#define SKC_META_RESET(meta,curr) \
- meta = ((gmem_off + curr) << 8);
-
-#if 0
-
- // FIXME -- this can be tweaked to shift directly
-#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
- meta += ((((blocks >> prev) & 1) ) | \
- (((pks >> prev) & 1) << 16) | \
- (((rks >> prev) & 1) << 24));
-
-#else
-
-#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
- if (blocks & (1<<prev)) \
- meta += 1; \
- if (pks & (1<<prev)) \
- meta += 1<<16; \
- if (rks & (1<<prev)) \
- meta += 1<<24;
-
-#endif
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (diffs & (1<<prev)) { \
- SKC_META_LOCAL_STORE(meta,prev); \
- SKC_META_RESET(meta,row); \
- } \
- SKC_META_ADD(meta,prev, \
- valid_blocks, \
- valid_pks, \
- valid);
-
- HS_SLAB_ROWS();
-
- //
- // ATOMICALLY ADD THE CARRIED OUT METAS
- //
-#if 0 // BUG
- if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
- SKC_META_LOCAL_ADD(meta);
-#else
- if (meta != 0)
- SKC_META_LOCAL_ADD(meta);
-#endif
-
- //
- // NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
- //
-
- // convert the slab offset to an extent offset
- bool const is_offset = (get_sub_group_local_id() & 3) == 1;
- uint const adjust = is_offset ? gmem_base - 1 : 0;
-
- //
- // only process the meta components found in this slab
- //
- uint const cc_min = c_min * SKC_META_COMPONENTS;
- uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
- uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
-
- if ((cc >= cc_min) && (cc <= cc_max))
- {
- uint const c = shared.c[cc];
-
- if (c != 0)
- atomic_add(metas+cc,c+adjust);
- }
-
- cc += HS_LANES_PER_WARP;
-
- for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
- {
- uint const c = shared.c[cc];
-
- if (c != 0)
- atomic_add(metas+cc,c+adjust);
- }
-}
-
-//
-//
-//
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
+// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
+// KERNEL WILL NEED TO BE UPDATED
+//
+
+#include "tile.h"
+#include "raster_builder_cl_12.h" // need meta_in structure
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
+#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
+
+//
+// THE BEST TYPE TO ZERO SMEM
+//
+
+#define SKC_ZERO_TYPE ulong
+#define SKC_ZERO_WORDS 2
+
+//
+// THE ORDER OF COMPONENTS IS:
+//
+// 0: blocks
+// 1: offset
+// 2: pk
+// 3: rk
+//
+
+#if (HS_KEYS_PER_SLAB < 256)
+
+#define SKC_META_TYPE uint
+#define SKC_META_WORDS 1
+
+#define SKC_COMPONENT_TYPE uchar
+
+#else
+
+#define SKC_META_TYPE uint2
+#define SKC_META_WORDS 2
+
+#define SKC_COMPONENT_TYPE ushort
+
+#endif
+
+//
+//
+//
+
+#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
+#define SKC_COHORT_TYPE uchar
+#else
+#define SKC_COHORT_TYPE ushort
+#endif
+
+//
+//
+//
+
+#define SKC_COHORT_ID(row) \
+ as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
+
+//
+// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
+//
+
+#define SKC_IS_BLOCK(row) \
+ ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+
+#define SKC_YX(row,prev) \
+ (as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
+
+#define SKC_IS_PK(row,prev) \
+ ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
+
+//
+// COHORT SIZE IS ALWAYS A POWER-OF-TWO
+// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
+//
+// COHORT SIZE >= SUBGROUP SIZE
+//
+
+#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
+
+#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
+#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
+#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
+
+#define SKC_META_COMPONENTS 4
+#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
+
+//
+//
+//
+
+__kernel
+__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
+void
+skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
+ __global uint * SKC_RESTRICT const metas)
+{
+ __local union
+ {
+ SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
+ SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT];
+ SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT];
+ } shared;
+
+ uint const global_id = get_global_id(0);
+ uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
+ uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
+ uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
+
+ //
+ // LOAD ALL THE ROWS
+ //
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
+
+ HS_SLAB_ROWS();
+
+ //
+ // LOAD LAST REGISTER FROM COLUMN TO LEFT
+ //
+ uint diffs = 0;
+ uint2 r0 = 0;
+
+ if (gmem_base > 0) {
+    // if this is any slab but the first, then broadcast-load the
+    // last key in the previous slab
+ r0.hi = as_uint2(vout[gmem_base - 1]).hi;
+ } else {
+ // otherwise broadcast the first key in the first slab
+ r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
+ // and mark it as an implicit diff
+ if (get_sub_group_local_id() == 0)
+ diffs = 1;
+ }
+
+ // now shuffle in the last key from the column to the left
+ r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
+
+ // shift away y/x
+ SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
+
+ //
+ // EXTRACT ALL COHORT IDS EARLY...
+ //
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
+
+ HS_SLAB_ROWS();
+
+ //
+ // DEBUG
+ //
+#if 0
+ if (gmem_base == HS_KEYS_PER_SLAB * 7)
+ {
+ if (get_sub_group_local_id() == 0)
+ printf("\n%llX ",as_ulong(r0));
+ else
+ printf("%llX ",as_ulong(r0));
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ if (get_sub_group_local_id() == 0) \
+ printf("\n%llX ",r##row); \
+ else \
+ printf("%llX ",r##row);
+
+ HS_SLAB_ROWS();
+ }
+#endif
+
+ //
+ // CAPTURE ALL CONDITIONS WE CARE ABOUT
+ //
+ // Diffs must be captured before cohorts
+ //
+ uint valid = 0;
+ uint blocks = 0;
+ uint pks = 0;
+ SKC_COHORT_TYPE c_max = 0;
+
+ //
+ // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
+ // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
+ //
+#if 0
+
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ diffs |= ((c##row != c##prev) << prev);
+
+ HS_SLAB_ROWS();
+
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ blocks |= (SKC_IS_BLOCK(row) << prev);
+
+ HS_SLAB_ROWS();
+
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+  pks |= (SKC_IS_PK(row,prev) << prev);
+
+ HS_SLAB_ROWS();
+
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ valid |= ((r##row != SKC_ULONG_MAX) << prev);
+
+ HS_SLAB_ROWS();
+
+#else
+
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ if (c##row != c##prev) \
+ diffs |= 1<<prev;
+
+ HS_SLAB_ROWS();
+
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ if (SKC_IS_BLOCK(row)) \
+ blocks |= 1<<prev;
+
+ HS_SLAB_ROWS();
+
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ if (SKC_IS_PK(row,prev)) \
+ pks |= 1<<prev;
+
+ HS_SLAB_ROWS();
+
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ if (r##row != SKC_ULONG_MAX) { \
+ valid |= 1<<prev; \
+ c_max = max(c_max,c##row); \
+ }
+
+ HS_SLAB_ROWS();
+
+#endif
+
+ //
+ // TRANSPOSE THE SLAB AND STORE IT
+ //
+ HS_TRANSPOSE_SLAB();
+
+ // the min cohort is the first key in the slab
+ uint const c_min = sub_group_broadcast(c1,0);
+
+ // the max cohort is the max across all lanes
+ c_max = sub_group_reduce_max(c_max);
+
+#if 0 // REMOVE ME LATER
+ if (get_sub_group_local_id() == 0)
+ printf("%3u : ( %3u , %3u )\n",
+ get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
+#endif
+
+ //
+ // ZERO SMEM
+ //
+ // zero only the meta info for the cohort ids found in this slab
+ //
+#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
+ uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
+ uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
+
+ for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
+ shared.z[zz] = 0;
+#else
+ // ERROR -- it's highly unlikely that the zero type is smaller than
+ // the meta type
+#error("Unsupported right now...")
+#endif
+
+ //
+ // ACCUMULATE AND STORE META INFO
+ //
+ uint const valid_blocks = valid & blocks;
+ uint const valid_pks = valid & pks & ~diffs;
+ SKC_META_TYPE meta = ( 0 );
+
+#define SKC_META_LOCAL_ADD(meta) \
+ atomic_add(shared.m+HS_REG_LAST(c),meta);
+
+#define SKC_META_LOCAL_STORE(meta,prev) \
+ shared.m[c##prev] = meta;
+
+ // note this is purposefully off by +1
+#define SKC_META_RESET(meta,curr) \
+ meta = ((gmem_off + curr) << 8);
+
+#if 0
+
+ // FIXME -- this can be tweaked to shift directly
+#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
+ meta += ((((blocks >> prev) & 1) ) | \
+ (((pks >> prev) & 1) << 16) | \
+ (((rks >> prev) & 1) << 24));
+
+#else
+
+#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
+ if (blocks & (1<<prev)) \
+ meta += 1; \
+ if (pks & (1<<prev)) \
+ meta += 1<<16; \
+ if (rks & (1<<prev)) \
+ meta += 1<<24;
+
+#endif
+
+#undef HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev) \
+ if (diffs & (1<<prev)) { \
+ SKC_META_LOCAL_STORE(meta,prev); \
+ SKC_META_RESET(meta,row); \
+ } \
+ SKC_META_ADD(meta,prev, \
+ valid_blocks, \
+ valid_pks, \
+ valid);
+
+ HS_SLAB_ROWS();
+
+ //
+ // ATOMICALLY ADD THE CARRIED OUT METAS
+ //
+#if 0 // BUG
+ if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
+ SKC_META_LOCAL_ADD(meta);
+#else
+ if (meta != 0)
+ SKC_META_LOCAL_ADD(meta);
+#endif
+
+ //
+ // NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
+ //
+
+ // convert the slab offset to an extent offset
+ bool const is_offset = (get_sub_group_local_id() & 3) == 1;
+ uint const adjust = is_offset ? gmem_base - 1 : 0;
+
+ //
+ // only process the meta components found in this slab
+ //
+ uint const cc_min = c_min * SKC_META_COMPONENTS;
+ uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
+ uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
+
+ if ((cc >= cc_min) && (cc <= cc_max))
+ {
+ uint const c = shared.c[cc];
+
+ if (c != 0)
+ atomic_add(metas+cc,c+adjust);
+ }
+
+ cc += HS_LANES_PER_WARP;
+
+ for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
+ {
+ uint const c = shared.c[cc];
+
+ if (c != 0)
+ atomic_add(metas+cc,c+adjust);
+ }
+}
+
+//
+//
+//
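For reference, the per-cohort meta word accumulated by SKC_META_RESET and SKC_META_ADD above packs the four components listed in the "ORDER OF COMPONENTS" comment into one 32-bit value in the (HS_KEYS_PER_SLAB < 256) configuration: blocks in bits 0-7, the key offset in bits 8-15, pk in bits 16-23 and rk in bits 24-31. The following is a minimal host-side sketch of unpacking such a word, assuming a little-endian device so that the uchar view shared.c[] sees the components in that order; the struct and function names are illustrative only and not part of the SKC API.

/* Minimal sketch: unpack one 32-bit TTRK meta word into its four
 * components (0: blocks, 1: offset, 2: pk, 3: rk), assuming the
 * uint/uchar configuration used when HS_KEYS_PER_SLAB < 256 and a
 * little-endian device. Names here are hypothetical. */
#include <stdint.h>
#include <stdio.h>

struct ttrk_meta
{
  uint8_t blocks;  /* bits  0- 7: block count  (SKC_META_ADD)   */
  uint8_t offset;  /* bits  8-15: key offset   (SKC_META_RESET) */
  uint8_t pk;      /* bits 16-23: pk key count                  */
  uint8_t rk;      /* bits 24-31: rk key count                  */
};

static struct ttrk_meta
ttrk_meta_unpack(uint32_t const meta)
{
  struct ttrk_meta const m = {
    .blocks = (uint8_t)(meta      ),
    .offset = (uint8_t)(meta >>  8),
    .pk     = (uint8_t)(meta >> 16),
    .rk     = (uint8_t)(meta >> 24)
  };
  return m;
}

int main(void)
{
  /* example: 3 blocks, offset 5, 2 pk keys, 7 rk keys */
  uint32_t const meta = 3u | (5u << 8) | (2u << 16) | (7u << 24);

  struct ttrk_meta const m = ttrk_meta_unpack(meta);

  printf("blocks=%u offset=%u pk=%u rk=%u\n",
         (unsigned)m.blocks, (unsigned)m.offset,
         (unsigned)m.pk,     (unsigned)m.rk);

  return 0;
}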