diff options
Diffstat (limited to 'src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl')
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl | 260 |
1 files changed, 130 insertions, 130 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl index 378d51d8d7..7f48978782 100644 --- a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl @@ -1,130 +1,130 @@ -/*
- * Copyright 2018 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
-// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS
-// KERNEL WILL NEED TO BE UPDATED
-//
-
-#include "tile.h"
-#include "atomic_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
-#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
-
-//
-//
-//
-
-#define SKC_YX_NEQ(row,prev) \
- (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
-
-//
-//
-//
-
-__kernel
-__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
-void
-skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
- __global uint * SKC_RESTRICT const indices,
- __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
-{
- uint const global_id = get_global_id(0);
- uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
- uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
- uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
-
- //
- // LOAD ALL THE ROWS
- //
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
-
- HS_SLAB_ROWS();
-
- //
- // LOAD LAST REGISTER FROM COLUMN TO LEFT
- //
- uint diffs = 0;
- uint2 r0 = r1;
-
- if (gmem_base > 0) {
- // if this is the first key in any slab but the first then it
- // broadcast loads the last key in previous slab
- r0.hi = as_uint2(vout[gmem_base - 1]).hi;
- } else if (get_sub_group_local_id() == 0) {
- // if this is the first lane in the first slab
- diffs = 1;
- }
-
- // now shuffle in the last key from the column to the left
- r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
-
- //
- // FIND ALL DIFFERENCES IN SLAB
- //
- uint valid = 0;
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- valid |= ((r##row != SKC_ULONG_MAX) << prev);
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- diffs |= (SKC_YX_NEQ(row,prev) << prev);
-
- HS_SLAB_ROWS();
-
- //
- // SUM UP THE DIFFERENCES
- //
- uint const valid_diffs = valid & diffs;
- uint const count = popcount(valid_diffs);
- uint const inclusive = sub_group_scan_inclusive_add(count);
- uint const exclusive = inclusive - count;
-
- //
- // RESERVE SPACE IN THE INDICES ARRAY
- //
- uint next = 0;
-
- if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
- next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
-
- // distribute base across subgroup
- next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
-
- //
- // STORE THE INDICES
- //
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (valid_diffs & (1 << prev)) \
- indices[next++] = lane_idx + prev;
-
- HS_SLAB_ROWS();
-
- //
- // TRANSPOSE THE SLAB AND STORE IT
- //
- HS_TRANSPOSE_SLAB();
-}
-
-//
-//
-//
+/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE +// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS +// KERNEL WILL NEED TO BE UPDATED +// + +#include "tile.h" +#include "atomic_cl.h" +#include "kernel_cl_12.h" + +// +// +// + +#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) +#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) + +// +// +// + +#define SKC_YX_NEQ(row,prev) \ + (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0) + +// +// +// + +__kernel +__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) +void +skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout, + __global uint * SKC_RESTRICT const indices, + __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics) +{ + uint const global_id = get_global_id(0); + uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; + uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); + uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; + + // + // LOAD ALL THE ROWS + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; + + HS_SLAB_ROWS(); + + // + // LOAD LAST REGISTER FROM COLUMN TO LEFT + // + uint diffs = 0; + uint2 r0 = r1; + + if (gmem_base > 0) { + // if this is the first key in any slab but the first then it + // broadcast loads the last key in previous slab + r0.hi = as_uint2(vout[gmem_base - 1]).hi; + } else if (get_sub_group_local_id() == 0) { + // if this is the first lane in the first slab + diffs = 1; + } + + // now shuffle in the last key from the column to the left + r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); + + // + // FIND ALL DIFFERENCES IN SLAB + // + uint valid = 0; + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + valid |= ((r##row != SKC_ULONG_MAX) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + diffs |= (SKC_YX_NEQ(row,prev) << prev); + + HS_SLAB_ROWS(); + + // + // SUM UP THE DIFFERENCES + // + uint const valid_diffs = valid & diffs; + uint const count = popcount(valid_diffs); + uint const inclusive = sub_group_scan_inclusive_add(count); + uint const exclusive = inclusive - count; + + // + // RESERVE SPACE IN THE INDICES ARRAY + // + uint next = 0; + + if (get_sub_group_local_id() == HS_LANES_PER_WARP-1) + next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset + + // distribute base across subgroup + next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1); + + // + // STORE THE INDICES + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (valid_diffs & (1 << prev)) \ + indices[next++] = lane_idx + prev; + + HS_SLAB_ROWS(); + + // + // TRANSPOSE THE SLAB AND STORE IT + // + HS_TRANSPOSE_SLAB(); +} + +// +// +// |