diff options
author | Allan MacKinnon <allanmac@google.com> | 2018-06-21 09:09:56 -0700 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2018-06-21 16:52:47 +0000 |
commit | c110e7941e4e051ad9004412de7b419da8bcf270 (patch) | |
tree | f3f0bfab677b0581d237db540b19bb2e97d40338 /src/compute/hs | |
parent | 867ce8fc8eef76e26b1e56be66badffc3d5ec3ae (diff) |
OpenGL interop is simplified when the cl_context is not created by SKC.
Added GEN9 HotSort kernels so the hs_cl_gen9 lib and hs_bench_cl app can be built.
Bug: skia:
Change-Id: I5b21d33499a6ec3524f39a51443981802b722c8b
Reviewed-on: https://skia-review.googlesource.com/136608
Commit-Queue: Allan MacKinnon <allanmac@google.com>
Reviewed-by: Mike Reed <reed@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
Diffstat (limited to 'src/compute/hs')
-rw-r--r-- | src/compute/hs/cl/gen9/hs_cl.cl | 10082 |
-rw-r--r-- | src/compute/hs/cl/gen9/hs_cl.h | 122 |
-rw-r--r-- | src/compute/hs/cl/gen9/hs_cl_macros.h | 199 |
3 files changed, 10403 insertions, 0 deletions
diff --git a/src/compute/hs/cl/gen9/hs_cl.cl b/src/compute/hs/cl/gen9/hs_cl.cl new file mode 100644 index 0000000000..63627ad068 --- /dev/null +++ b/src/compute/hs/cl/gen9/hs_cl.cl @@ -0,0 +1,10082 @@ +// +// Copyright 2016 Google Inc. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// + +#include <hs_cl_macros.h> + +// +// +// + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_transpose(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = get_global_id(0); + uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); + + HS_KEY_TYPE r1 = (vout + gmem_idx)[0 * 8]; + HS_KEY_TYPE r2 = (vout + gmem_idx)[1 * 8]; + HS_KEY_TYPE r3 = (vout + gmem_idx)[2 * 8]; + HS_KEY_TYPE r4 = (vout + gmem_idx)[3 * 8]; + HS_KEY_TYPE r5 = (vout + gmem_idx)[4 * 8]; + HS_KEY_TYPE r6 = (vout + gmem_idx)[5 * 8]; + HS_KEY_TYPE r7 = (vout + gmem_idx)[6 * 8]; + HS_KEY_TYPE r8 = (vout + gmem_idx)[7 * 8]; + HS_KEY_TYPE r9 = (vout + gmem_idx)[8 * 8]; + HS_KEY_TYPE r10 = (vout + gmem_idx)[9 * 8]; + HS_KEY_TYPE r11 = (vout + gmem_idx)[10 * 8]; + HS_KEY_TYPE r12 = (vout + gmem_idx)[11 * 8]; + HS_KEY_TYPE r13 = (vout + gmem_idx)[12 * 8]; + HS_KEY_TYPE r14 = (vout + gmem_idx)[13 * 8]; + HS_KEY_TYPE r15 = (vout + gmem_idx)[14 * 8]; + HS_KEY_TYPE r16 = (vout + gmem_idx)[15 * 8]; + HS_TRANSPOSE_SLAB() +} + +__kernel __attribute__((reqd_work_group_size(128, 1, 1))) +__attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_bs_4(__global HS_KEY_TYPE const* const restrict vin, + __global HS_KEY_TYPE* const restrict vout) +{ + __local union + { + HS_KEY_TYPE m[16 * 128]; + } shared; + + uint const global_id = get_global_id(0); + uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); + + HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8]; + HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8]; + HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8]; + HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8]; + HS_KEY_TYPE r5 = (vin + 
gmem_idx)[4 * 8]; + HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8]; + HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8]; + HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8]; + HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8]; + HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8]; + HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8]; + HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8]; + HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8]; + HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8]; + HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8]; + HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8]; + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r6, r11) + HS_CMP_XCHG(r7, r10) + HS_CMP_XCHG(r4, r13) + HS_CMP_XCHG(r14, r15) + HS_CMP_XCHG(r8, r12) + HS_CMP_XCHG(r2, r3) + HS_CMP_XCHG(r5, r9) + HS_CMP_XCHG(r2, r5) + HS_CMP_XCHG(r8, r14) + HS_CMP_XCHG(r3, r9) + HS_CMP_XCHG(r12, r15) + HS_CMP_XCHG(r3, r5) + HS_CMP_XCHG(r6, r7) + HS_CMP_XCHG(r10, r11) + HS_CMP_XCHG(r12, r14) + HS_CMP_XCHG(r4, r9) + HS_CMP_XCHG(r8, r13) + HS_CMP_XCHG(r7, r9) + HS_CMP_XCHG(r11, r13) + HS_CMP_XCHG(r4, r6) + HS_CMP_XCHG(r8, r10) + HS_CMP_XCHG(r4, r5) + HS_CMP_XCHG(r6, r7) + HS_CMP_XCHG(r8, r9) + HS_CMP_XCHG(r10, r11) + HS_CMP_XCHG(r12, r13) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + { + uint const flip_lane_mask = 1; + uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; + int 
const t_lt = get_sub_group_local_id() < flip_lane_idx; + HS_CMP_FLIP(0, r1, r16) + HS_CMP_FLIP(1, r2, r15) + HS_CMP_FLIP(2, r3, r14) + HS_CMP_FLIP(3, r4, r13) + HS_CMP_FLIP(4, r5, r12) + HS_CMP_FLIP(5, r6, r11) + HS_CMP_FLIP(6, r7, r10) + HS_CMP_FLIP(7, r8, r9) + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + { + uint const flip_lane_mask = 3; + uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; + int const t_lt = get_sub_group_local_id() < flip_lane_idx; + HS_CMP_FLIP(0, r1, r16) + HS_CMP_FLIP(1, r2, r15) + HS_CMP_FLIP(2, r3, r14) + HS_CMP_FLIP(3, r4, r13) + HS_CMP_FLIP(4, r5, r12) + HS_CMP_FLIP(5, r6, r11) + HS_CMP_FLIP(6, r7, r10) + HS_CMP_FLIP(7, r8, r9) + } + { + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + 
HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + { + uint const flip_lane_mask = 7; + uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; + int const t_lt = get_sub_group_local_id() < flip_lane_idx; + HS_CMP_FLIP(0, r1, r16) + HS_CMP_FLIP(1, r2, r15) + HS_CMP_FLIP(2, r3, r14) + HS_CMP_FLIP(3, r4, r13) + HS_CMP_FLIP(4, r5, r12) + HS_CMP_FLIP(5, r6, r11) + HS_CMP_FLIP(6, r7, r10) + HS_CMP_FLIP(7, r8, r9) + } + { + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) + } + { + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + 
HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + uint const smem_l_idx = get_sub_group_id() * 128 + get_sub_group_local_id(); + uint const smem_r_idx = + (get_sub_group_id() ^ 1) * 128 + (get_sub_group_local_id() ^ 7); + (shared.m + get_local_id(0))[16 * 8 * 0] = r1; + (shared.m + get_local_id(0))[16 * 8 * 1] = r16; + (shared.m + get_local_id(0))[16 * 8 * 2] = r2; + (shared.m + get_local_id(0))[16 * 8 * 3] = r15; + (shared.m + get_local_id(0))[16 * 8 * 4] = r3; + (shared.m + get_local_id(0))[16 * 8 * 5] = r14; + (shared.m + get_local_id(0))[16 * 8 * 6] = r4; + (shared.m + get_local_id(0))[16 * 8 * 7] = r13; + (shared.m + get_local_id(0))[16 * 8 * 8] = r5; + (shared.m + get_local_id(0))[16 * 8 * 9] = r12; + (shared.m + get_local_id(0))[16 * 8 * 10] = r6; + (shared.m + get_local_id(0))[16 * 8 * 11] = r11; + (shared.m + get_local_id(0))[16 * 8 * 12] = r7; + (shared.m + get_local_id(0))[16 * 8 * 13] = r10; + (shared.m + get_local_id(0))[16 * 8 * 14] = r8; + (shared.m + get_local_id(0))[16 * 8 * 15] = r9; + barrier(CLK_LOCAL_MEM_FENCE); + { + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + 
smem_r_idx)[8] = r0_2; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16]; + HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24]; + HS_CMP_XCHG(r1_1, r1_2) + (shared.m + smem_l_idx)[16] = r1_1; + (shared.m + smem_r_idx)[24] = r1_2; + } + { + HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[32]; + HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[40]; + HS_CMP_XCHG(r2_1, r2_2) + (shared.m + smem_l_idx)[32] = r2_1; + (shared.m + smem_r_idx)[40] = r2_2; + } + { + HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[48]; + HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[56]; + HS_CMP_XCHG(r3_1, r3_2) + (shared.m + smem_l_idx)[48] = r3_1; + (shared.m + smem_r_idx)[56] = r3_2; + } + { + HS_KEY_TYPE r4_1 = (shared.m + smem_l_idx)[64]; + HS_KEY_TYPE r4_2 = (shared.m + smem_r_idx)[72]; + HS_CMP_XCHG(r4_1, r4_2) + (shared.m + smem_l_idx)[64] = r4_1; + (shared.m + smem_r_idx)[72] = r4_2; + } + { + HS_KEY_TYPE r5_1 = (shared.m + smem_l_idx)[80]; + HS_KEY_TYPE r5_2 = (shared.m + smem_r_idx)[88]; + HS_CMP_XCHG(r5_1, r5_2) + (shared.m + smem_l_idx)[80] = r5_1; + (shared.m + smem_r_idx)[88] = r5_2; + } + { + HS_KEY_TYPE r6_1 = (shared.m + smem_l_idx)[96]; + HS_KEY_TYPE r6_2 = (shared.m + smem_r_idx)[104]; + HS_CMP_XCHG(r6_1, r6_2) + (shared.m + smem_l_idx)[96] = r6_1; + (shared.m + smem_r_idx)[104] = r6_2; + } + { + HS_KEY_TYPE r7_1 = (shared.m + smem_l_idx)[112]; + HS_KEY_TYPE r7_2 = (shared.m + smem_r_idx)[120]; + HS_CMP_XCHG(r7_1, r7_2) + (shared.m + smem_l_idx)[112] = r7_1; + (shared.m + smem_r_idx)[120] = r7_2; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + r1 = (shared.m + get_local_id(0))[16 * 8 * 0]; + r16 = (shared.m + get_local_id(0))[16 * 8 * 1]; + r2 = (shared.m + get_local_id(0))[16 * 8 * 2]; + r15 = (shared.m + get_local_id(0))[16 * 8 * 3]; + r3 = (shared.m + get_local_id(0))[16 * 8 * 4]; + r14 = (shared.m + get_local_id(0))[16 * 8 * 5]; + r4 = (shared.m + get_local_id(0))[16 * 8 * 6]; + r13 = (shared.m + get_local_id(0))[16 * 8 * 7]; + r5 = (shared.m + get_local_id(0))[16 * 8 * 8]; + r12 = 
(shared.m + get_local_id(0))[16 * 8 * 9]; + r6 = (shared.m + get_local_id(0))[16 * 8 * 10]; + r11 = (shared.m + get_local_id(0))[16 * 8 * 11]; + r7 = (shared.m + get_local_id(0))[16 * 8 * 12]; + r10 = (shared.m + get_local_id(0))[16 * 8 * 13]; + r8 = (shared.m + get_local_id(0))[16 * 8 * 14]; + r9 = (shared.m + get_local_id(0))[16 * 8 * 15]; + { { uint const half_lane_mask = 4; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) 
+HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(shared.m + get_local_id(0))[16 * 8 * 0] = r1; +(shared.m + get_local_id(0))[16 * 8 * 1] = r16; +(shared.m + get_local_id(0))[16 * 8 * 2] = r2; +(shared.m + get_local_id(0))[16 * 8 * 3] = r15; +(shared.m + get_local_id(0))[16 * 8 * 4] = r3; +(shared.m + get_local_id(0))[16 * 8 * 5] = r14; +(shared.m + get_local_id(0))[16 * 8 * 6] = r4; +(shared.m + get_local_id(0))[16 * 8 * 7] = r13; +(shared.m + get_local_id(0))[16 * 8 * 8] = r5; +(shared.m + get_local_id(0))[16 * 8 * 9] = r12; +(shared.m + get_local_id(0))[16 * 8 * 10] = r6; +(shared.m + get_local_id(0))[16 * 8 * 11] = r11; +(shared.m + get_local_id(0))[16 * 8 * 12] = r7; +(shared.m + get_local_id(0))[16 * 8 * 13] = r10; +(shared.m + get_local_id(0))[16 * 8 * 14] = r8; +(shared.m + get_local_id(0))[16 * 8 * 15] = r9; +barrier(CLK_LOCAL_MEM_FENCE); +{ + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; + HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16]; + HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24]; + HS_CMP_XCHG(r0_2, r0_3) + HS_CMP_XCHG(r0_1, r0_4) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_l_idx)[8] = r0_2; + (shared.m + smem_r_idx)[16] = r0_3; + (shared.m + smem_r_idx)[24] = r0_4; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[32]; + HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[40]; + 
HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[48]; + HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[56]; + HS_CMP_XCHG(r1_2, r1_3) + HS_CMP_XCHG(r1_1, r1_4) + HS_CMP_XCHG(r1_3, r1_4) + HS_CMP_XCHG(r1_1, r1_2) + (shared.m + smem_l_idx)[32] = r1_1; + (shared.m + smem_l_idx)[40] = r1_2; + (shared.m + smem_r_idx)[48] = r1_3; + (shared.m + smem_r_idx)[56] = r1_4; + } + { + HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[64]; + HS_KEY_TYPE r2_2 = (shared.m + smem_l_idx)[72]; + HS_KEY_TYPE r2_3 = (shared.m + smem_r_idx)[80]; + HS_KEY_TYPE r2_4 = (shared.m + smem_r_idx)[88]; + HS_CMP_XCHG(r2_2, r2_3) + HS_CMP_XCHG(r2_1, r2_4) + HS_CMP_XCHG(r2_3, r2_4) + HS_CMP_XCHG(r2_1, r2_2) + (shared.m + smem_l_idx)[64] = r2_1; + (shared.m + smem_l_idx)[72] = r2_2; + (shared.m + smem_r_idx)[80] = r2_3; + (shared.m + smem_r_idx)[88] = r2_4; + } + { + HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[96]; + HS_KEY_TYPE r3_2 = (shared.m + smem_l_idx)[104]; + HS_KEY_TYPE r3_3 = (shared.m + smem_r_idx)[112]; + HS_KEY_TYPE r3_4 = (shared.m + smem_r_idx)[120]; + HS_CMP_XCHG(r3_2, r3_3) + HS_CMP_XCHG(r3_1, r3_4) + HS_CMP_XCHG(r3_3, r3_4) + HS_CMP_XCHG(r3_1, r3_2) + (shared.m + smem_l_idx)[96] = r3_1; + (shared.m + smem_l_idx)[104] = r3_2; + (shared.m + smem_r_idx)[112] = r3_3; + (shared.m + smem_r_idx)[120] = r3_4; + } +} +barrier(CLK_LOCAL_MEM_FENCE); +r1 = (shared.m + get_local_id(0))[16 * 8 * 0]; +r16 = (shared.m + get_local_id(0))[16 * 8 * 1]; +r2 = (shared.m + get_local_id(0))[16 * 8 * 2]; +r15 = (shared.m + get_local_id(0))[16 * 8 * 3]; +r3 = (shared.m + get_local_id(0))[16 * 8 * 4]; +r14 = (shared.m + get_local_id(0))[16 * 8 * 5]; +r4 = (shared.m + get_local_id(0))[16 * 8 * 6]; +r13 = (shared.m + get_local_id(0))[16 * 8 * 7]; +r5 = (shared.m + get_local_id(0))[16 * 8 * 8]; +r12 = (shared.m + get_local_id(0))[16 * 8 * 9]; +r6 = (shared.m + get_local_id(0))[16 * 8 * 10]; +r11 = (shared.m + get_local_id(0))[16 * 8 * 11]; +r7 = (shared.m + get_local_id(0))[16 * 8 * 12]; +r10 = (shared.m + get_local_id(0))[16 * 
8 * 13]; +r8 = (shared.m + get_local_id(0))[16 * 8 * 14]; +r9 = (shared.m + get_local_id(0))[16 * 8 * 15]; +{ { uint const half_lane_mask = 4; +uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; +int const t_lt = get_sub_group_local_id() < half_lane_idx; +HS_CMP_HALF(0, r1) +HS_CMP_HALF(1, r2) +HS_CMP_HALF(2, r3) +HS_CMP_HALF(3, r4) +HS_CMP_HALF(4, r5) +HS_CMP_HALF(5, r6) +HS_CMP_HALF(6, r7) +HS_CMP_HALF(7, r8) +HS_CMP_HALF(8, r9) +HS_CMP_HALF(9, r10) +HS_CMP_HALF(10, r11) +HS_CMP_HALF(11, r12) +HS_CMP_HALF(12, r13) +HS_CMP_HALF(13, r14) +HS_CMP_HALF(14, r15) +HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, 
r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(shared.m + get_local_id(0))[16 * 8 * 0] = r1; +(shared.m + get_local_id(0))[16 * 8 * 1] = r16; +(shared.m + get_local_id(0))[16 * 8 * 2] = r2; +(shared.m + get_local_id(0))[16 * 8 * 3] = r15; +(shared.m + get_local_id(0))[16 * 8 * 4] = r3; +(shared.m + get_local_id(0))[16 * 8 * 5] = r14; +(shared.m + get_local_id(0))[16 * 8 * 6] = r4; +(shared.m + get_local_id(0))[16 * 8 * 7] = r13; +(shared.m + get_local_id(0))[16 * 8 * 8] = r5; +(shared.m + get_local_id(0))[16 * 8 * 9] = r12; +(shared.m + get_local_id(0))[16 * 8 * 10] = r6; +(shared.m + get_local_id(0))[16 * 8 * 11] = r11; +(shared.m + get_local_id(0))[16 * 8 * 12] = r7; +(shared.m + get_local_id(0))[16 * 8 * 13] = r10; +(shared.m + get_local_id(0))[16 * 8 * 14] = r8; +(shared.m + get_local_id(0))[16 * 8 * 15] = r9; +barrier(CLK_LOCAL_MEM_FENCE); +{ + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; + HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16]; + HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24]; + HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[32]; + HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[40]; + HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[48]; + HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[56]; + HS_CMP_XCHG(r0_4, r0_5) + HS_CMP_XCHG(r0_3, r0_6) + HS_CMP_XCHG(r0_2, r0_7) + HS_CMP_XCHG(r0_1, r0_8) + HS_CMP_XCHG(r0_5, r0_7) + HS_CMP_XCHG(r0_6, r0_8) + HS_CMP_XCHG(r0_5, r0_6) + HS_CMP_XCHG(r0_7, r0_8) + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_2, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_l_idx)[8] = r0_2; + (shared.m + smem_l_idx)[16] = r0_3; 
+ (shared.m + smem_l_idx)[24] = r0_4; + (shared.m + smem_r_idx)[32] = r0_5; + (shared.m + smem_r_idx)[40] = r0_6; + (shared.m + smem_r_idx)[48] = r0_7; + (shared.m + smem_r_idx)[56] = r0_8; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[64]; + HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[72]; + HS_KEY_TYPE r1_3 = (shared.m + smem_l_idx)[80]; + HS_KEY_TYPE r1_4 = (shared.m + smem_l_idx)[88]; + HS_KEY_TYPE r1_5 = (shared.m + smem_r_idx)[96]; + HS_KEY_TYPE r1_6 = (shared.m + smem_r_idx)[104]; + HS_KEY_TYPE r1_7 = (shared.m + smem_r_idx)[112]; + HS_KEY_TYPE r1_8 = (shared.m + smem_r_idx)[120]; + HS_CMP_XCHG(r1_4, r1_5) + HS_CMP_XCHG(r1_3, r1_6) + HS_CMP_XCHG(r1_2, r1_7) + HS_CMP_XCHG(r1_1, r1_8) + HS_CMP_XCHG(r1_5, r1_7) + HS_CMP_XCHG(r1_6, r1_8) + HS_CMP_XCHG(r1_5, r1_6) + HS_CMP_XCHG(r1_7, r1_8) + HS_CMP_XCHG(r1_1, r1_3) + HS_CMP_XCHG(r1_2, r1_4) + HS_CMP_XCHG(r1_1, r1_2) + HS_CMP_XCHG(r1_3, r1_4) + (shared.m + smem_l_idx)[64] = r1_1; + (shared.m + smem_l_idx)[72] = r1_2; + (shared.m + smem_l_idx)[80] = r1_3; + (shared.m + smem_l_idx)[88] = r1_4; + (shared.m + smem_r_idx)[96] = r1_5; + (shared.m + smem_r_idx)[104] = r1_6; + (shared.m + smem_r_idx)[112] = r1_7; + (shared.m + smem_r_idx)[120] = r1_8; + } +} +barrier(CLK_LOCAL_MEM_FENCE); +r1 = (shared.m + get_local_id(0))[16 * 8 * 0]; +r16 = (shared.m + get_local_id(0))[16 * 8 * 1]; +r2 = (shared.m + get_local_id(0))[16 * 8 * 2]; +r15 = (shared.m + get_local_id(0))[16 * 8 * 3]; +r3 = (shared.m + get_local_id(0))[16 * 8 * 4]; +r14 = (shared.m + get_local_id(0))[16 * 8 * 5]; +r4 = (shared.m + get_local_id(0))[16 * 8 * 6]; +r13 = (shared.m + get_local_id(0))[16 * 8 * 7]; +r5 = (shared.m + get_local_id(0))[16 * 8 * 8]; +r12 = (shared.m + get_local_id(0))[16 * 8 * 9]; +r6 = (shared.m + get_local_id(0))[16 * 8 * 10]; +r11 = (shared.m + get_local_id(0))[16 * 8 * 11]; +r7 = (shared.m + get_local_id(0))[16 * 8 * 12]; +r10 = (shared.m + get_local_id(0))[16 * 8 * 13]; +r8 = (shared.m + get_local_id(0))[16 * 8 * 14]; +r9 = 
(shared.m + get_local_id(0))[16 * 8 * 15]; +{ { uint const half_lane_mask = 4; +uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; +int const t_lt = get_sub_group_local_id() < half_lane_idx; +HS_CMP_HALF(0, r1) +HS_CMP_HALF(1, r2) +HS_CMP_HALF(2, r3) +HS_CMP_HALF(3, r4) +HS_CMP_HALF(4, r5) +HS_CMP_HALF(5, r6) +HS_CMP_HALF(6, r7) +HS_CMP_HALF(7, r8) +HS_CMP_HALF(8, r9) +HS_CMP_HALF(9, r10) +HS_CMP_HALF(10, r11) +HS_CMP_HALF(11, r12) +HS_CMP_HALF(12, r13) +HS_CMP_HALF(13, r14) +HS_CMP_HALF(14, r15) +HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, 
r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(shared.m + get_local_id(0))[16 * 8 * 0] = r1; +(shared.m + get_local_id(0))[16 * 8 * 1] = r16; +(shared.m + get_local_id(0))[16 * 8 * 2] = r2; +(shared.m + get_local_id(0))[16 * 8 * 3] = r15; +(shared.m + get_local_id(0))[16 * 8 * 4] = r3; +(shared.m + get_local_id(0))[16 * 8 * 5] = r14; +(shared.m + get_local_id(0))[16 * 8 * 6] = r4; +(shared.m + get_local_id(0))[16 * 8 * 7] = r13; +(shared.m + get_local_id(0))[16 * 8 * 8] = r5; +(shared.m + get_local_id(0))[16 * 8 * 9] = r12; +(shared.m + get_local_id(0))[16 * 8 * 10] = r6; +(shared.m + get_local_id(0))[16 * 8 * 11] = r11; +(shared.m + get_local_id(0))[16 * 8 * 12] = r7; +(shared.m + get_local_id(0))[16 * 8 * 13] = r10; +(shared.m + get_local_id(0))[16 * 8 * 14] = r8; +(shared.m + get_local_id(0))[16 * 8 * 15] = r9; +barrier(CLK_LOCAL_MEM_FENCE); +{ + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; + HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16]; + HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24]; + HS_KEY_TYPE r0_5 = (shared.m + smem_l_idx)[32]; + HS_KEY_TYPE r0_6 = (shared.m + smem_l_idx)[40]; + HS_KEY_TYPE r0_7 = (shared.m + smem_l_idx)[48]; + HS_KEY_TYPE r0_8 = (shared.m + smem_l_idx)[56]; + HS_KEY_TYPE r0_9 = (shared.m + smem_r_idx)[64]; + HS_KEY_TYPE r0_10 = (shared.m + smem_r_idx)[72]; + HS_KEY_TYPE r0_11 = (shared.m + smem_r_idx)[80]; + HS_KEY_TYPE r0_12 = (shared.m + smem_r_idx)[88]; + HS_KEY_TYPE r0_13 = (shared.m + smem_r_idx)[96]; + HS_KEY_TYPE r0_14 = (shared.m + smem_r_idx)[104]; + HS_KEY_TYPE r0_15 = (shared.m + smem_r_idx)[112]; + HS_KEY_TYPE r0_16 = (shared.m + smem_r_idx)[120]; + HS_CMP_XCHG(r0_8, r0_9) + HS_CMP_XCHG(r0_7, r0_10) + HS_CMP_XCHG(r0_6, r0_11) 
+ HS_CMP_XCHG(r0_5, r0_12) + HS_CMP_XCHG(r0_4, r0_13) + HS_CMP_XCHG(r0_3, r0_14) + HS_CMP_XCHG(r0_2, r0_15) + HS_CMP_XCHG(r0_1, r0_16) + HS_CMP_XCHG(r0_9, r0_13) + HS_CMP_XCHG(r0_11, r0_15) + HS_CMP_XCHG(r0_9, r0_11) + HS_CMP_XCHG(r0_13, r0_15) + HS_CMP_XCHG(r0_10, r0_14) + HS_CMP_XCHG(r0_12, r0_16) + HS_CMP_XCHG(r0_10, r0_12) + HS_CMP_XCHG(r0_14, r0_16) + HS_CMP_XCHG(r0_9, r0_10) + HS_CMP_XCHG(r0_11, r0_12) + HS_CMP_XCHG(r0_13, r0_14) + HS_CMP_XCHG(r0_15, r0_16) + HS_CMP_XCHG(r0_1, r0_5) + HS_CMP_XCHG(r0_3, r0_7) + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_5, r0_7) + HS_CMP_XCHG(r0_2, r0_6) + HS_CMP_XCHG(r0_4, r0_8) + HS_CMP_XCHG(r0_2, r0_4) + HS_CMP_XCHG(r0_6, r0_8) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_5, r0_6) + HS_CMP_XCHG(r0_7, r0_8) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_l_idx)[8] = r0_2; + (shared.m + smem_l_idx)[16] = r0_3; + (shared.m + smem_l_idx)[24] = r0_4; + (shared.m + smem_l_idx)[32] = r0_5; + (shared.m + smem_l_idx)[40] = r0_6; + (shared.m + smem_l_idx)[48] = r0_7; + (shared.m + smem_l_idx)[56] = r0_8; + (shared.m + smem_r_idx)[64] = r0_9; + (shared.m + smem_r_idx)[72] = r0_10; + (shared.m + smem_r_idx)[80] = r0_11; + (shared.m + smem_r_idx)[88] = r0_12; + (shared.m + smem_r_idx)[96] = r0_13; + (shared.m + smem_r_idx)[104] = r0_14; + (shared.m + smem_r_idx)[112] = r0_15; + (shared.m + smem_r_idx)[120] = r0_16; + } +} +barrier(CLK_LOCAL_MEM_FENCE); +r1 = (shared.m + get_local_id(0))[16 * 8 * 0]; +r16 = (shared.m + get_local_id(0))[16 * 8 * 1]; +r2 = (shared.m + get_local_id(0))[16 * 8 * 2]; +r15 = (shared.m + get_local_id(0))[16 * 8 * 3]; +r3 = (shared.m + get_local_id(0))[16 * 8 * 4]; +r14 = (shared.m + get_local_id(0))[16 * 8 * 5]; +r4 = (shared.m + get_local_id(0))[16 * 8 * 6]; +r13 = (shared.m + get_local_id(0))[16 * 8 * 7]; +r5 = (shared.m + get_local_id(0))[16 * 8 * 8]; +r12 = (shared.m + get_local_id(0))[16 * 8 * 9]; +r6 = (shared.m + get_local_id(0))[16 * 8 * 10]; +r11 = (shared.m + 
get_local_id(0))[16 * 8 * 11]; +r7 = (shared.m + get_local_id(0))[16 * 8 * 12]; +r10 = (shared.m + get_local_id(0))[16 * 8 * 13]; +r8 = (shared.m + get_local_id(0))[16 * 8 * 14]; +r9 = (shared.m + get_local_id(0))[16 * 8 * 15]; +{ { uint const half_lane_mask = 4; +uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; +int const t_lt = get_sub_group_local_id() < half_lane_idx; +HS_CMP_HALF(0, r1) +HS_CMP_HALF(1, r2) +HS_CMP_HALF(2, r3) +HS_CMP_HALF(3, r4) +HS_CMP_HALF(4, r5) +HS_CMP_HALF(5, r6) +HS_CMP_HALF(6, r7) +HS_CMP_HALF(7, r8) +HS_CMP_HALF(8, r9) +HS_CMP_HALF(9, r10) +HS_CMP_HALF(10, r11) +HS_CMP_HALF(11, r12) +HS_CMP_HALF(12, r13) +HS_CMP_HALF(13, r14) +HS_CMP_HALF(14, r15) +HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) 
+HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(vout + gmem_idx)[0 * 8] = r1; +(vout + gmem_idx)[1 * 8] = r2; +(vout + gmem_idx)[2 * 8] = r3; +(vout + gmem_idx)[3 * 8] = r4; +(vout + gmem_idx)[4 * 8] = r5; +(vout + gmem_idx)[5 * 8] = r6; +(vout + gmem_idx)[6 * 8] = r7; +(vout + gmem_idx)[7 * 8] = r8; +(vout + gmem_idx)[8 * 8] = r9; +(vout + gmem_idx)[9 * 8] = r10; +(vout + gmem_idx)[10 * 8] = r11; +(vout + gmem_idx)[11 * 8] = r12; +(vout + gmem_idx)[12 * 8] = r13; +(vout + gmem_idx)[13 * 8] = r14; +(vout + gmem_idx)[14 * 8] = r15; +(vout + gmem_idx)[15 * 8] = r16; +} +/* hs_kernel_bs_3: runs with a required work-group size of 64 and a required Intel sub-group size of 8 (see the attributes that follow). Each work-item loads 16 keys from vin (8 elements apart, starting at gmem_idx), applies HS_CMP_XCHG / HS_CMP_FLIP / HS_CMP_HALF steps to them in registers, then performs three rounds of exchange through the __local array shared.m, each round bracketed by barriers, and finally stores the 16 keys to the same offsets in vout. The HS_* macros and HS_KEY_TYPE come from hs_cl_macros.h and are not visible here; presumably this is a block-sort kernel assembled from sorting/merging networks - TODO confirm against the generator. */ +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +__attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_bs_3(__global HS_KEY_TYPE const* const restrict vin, + __global HS_KEY_TYPE* const restrict vout) +{ +/* Scratch space in local memory: 16 keys for each of the 64 work-items. */ __local union + { + HS_KEY_TYPE m[16 * 64]; + } shared; + + uint const global_id = get_global_id(0); + uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); +/* Load this work-item's 16 keys; consecutive registers are 8 elements apart in vin. */ + HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8]; + HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8]; + HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8]; + HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8]; + HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8]; + HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8]; + HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8]; + HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8]; + HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8]; + HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8]; + HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8]; + HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8]; + HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8]; + 
HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8]; + HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8]; + HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8]; +/* In-register compare-exchange sequence over r1..r16; presumably a 16-input sorting network (HS_CMP_XCHG is defined in hs_cl_macros.h, not shown here). */ HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r6, r11) + HS_CMP_XCHG(r7, r10) + HS_CMP_XCHG(r4, r13) + HS_CMP_XCHG(r14, r15) + HS_CMP_XCHG(r8, r12) + HS_CMP_XCHG(r2, r3) + HS_CMP_XCHG(r5, r9) + HS_CMP_XCHG(r2, r5) + HS_CMP_XCHG(r8, r14) + HS_CMP_XCHG(r3, r9) + HS_CMP_XCHG(r12, r15) + HS_CMP_XCHG(r3, r5) + HS_CMP_XCHG(r6, r7) + HS_CMP_XCHG(r10, r11) + HS_CMP_XCHG(r12, r14) + HS_CMP_XCHG(r4, r9) + HS_CMP_XCHG(r8, r13) + HS_CMP_XCHG(r7, r9) + HS_CMP_XCHG(r11, r13) + HS_CMP_XCHG(r4, r6) + HS_CMP_XCHG(r8, r10) + HS_CMP_XCHG(r4, r5) + HS_CMP_XCHG(r6, r7) + HS_CMP_XCHG(r8, r9) + HS_CMP_XCHG(r10, r11) + HS_CMP_XCHG(r12, r13) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) +/* Lane-exchange steps: the partner lane index is this lane's sub-group-local id XOR the mask, and t_lt is nonzero when this lane's id is the lower of the pair. */ { + uint const flip_lane_mask = 1; + uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; + int const t_lt = get_sub_group_local_id() < flip_lane_idx; + HS_CMP_FLIP(0, r1, r16) + HS_CMP_FLIP(1, r2, r15) + HS_CMP_FLIP(2, r3, r14) + HS_CMP_FLIP(3, r4, r13) + HS_CMP_FLIP(4, r5, r12) + HS_CMP_FLIP(5, r6, r11) + HS_CMP_FLIP(6, r7, r10) + HS_CMP_FLIP(7, r8, r9) + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) 
+ HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + { + uint const flip_lane_mask = 3; + uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; + int const t_lt = get_sub_group_local_id() < flip_lane_idx; + HS_CMP_FLIP(0, r1, r16) + HS_CMP_FLIP(1, r2, r15) + HS_CMP_FLIP(2, r3, r14) + HS_CMP_FLIP(3, r4, r13) + HS_CMP_FLIP(4, r5, r12) + HS_CMP_FLIP(5, r6, r11) + HS_CMP_FLIP(6, r7, r10) + HS_CMP_FLIP(7, r8, r9) + } + { + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, 
r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + { + uint const flip_lane_mask = 7; + uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; + int const t_lt = get_sub_group_local_id() < flip_lane_idx; + HS_CMP_FLIP(0, r1, r16) + HS_CMP_FLIP(1, r2, r15) + HS_CMP_FLIP(2, r3, r14) + HS_CMP_FLIP(3, r4, r13) + HS_CMP_FLIP(4, r5, r12) + HS_CMP_FLIP(5, r6, r11) + HS_CMP_FLIP(6, r7, r10) + HS_CMP_FLIP(7, r8, r9) + } + { + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) + } + { + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, 
r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) +/* Offsets into shared.m used by the local-memory rounds: smem_l_idx is derived from this sub-group id and lane id; smem_r_idx from the neighbouring sub-group (id ^ 1) and the mirrored lane (lane ^ 7). */ uint const smem_l_idx = get_sub_group_id() * 64 + get_sub_group_local_id(); + uint const smem_r_idx = + (get_sub_group_id() ^ 1) * 64 + (get_sub_group_local_id() ^ 7); +/* Store the 16 registers to shared.m, 64 elements apart, in the order r1, r16, r2, r15, ..., r8, r9. */ (shared.m + get_local_id(0))[8 * 8 * 0] = r1; + (shared.m + get_local_id(0))[8 * 8 * 1] = r16; + (shared.m + get_local_id(0))[8 * 8 * 2] = r2; + (shared.m + get_local_id(0))[8 * 8 * 3] = r15; + (shared.m + get_local_id(0))[8 * 8 * 4] = r3; + (shared.m + get_local_id(0))[8 * 8 * 5] = r14; + (shared.m + get_local_id(0))[8 * 8 * 6] = r4; + (shared.m + get_local_id(0))[8 * 8 * 7] = r13; + (shared.m + get_local_id(0))[8 * 8 * 8] = r5; + (shared.m + get_local_id(0))[8 * 8 * 9] = r12; + (shared.m + get_local_id(0))[8 * 8 * 10] = r6; + (shared.m + get_local_id(0))[8 * 8 * 11] = r11; + (shared.m + get_local_id(0))[8 * 8 * 12] = r7; + (shared.m + get_local_id(0))[8 * 8 * 13] = r10; + (shared.m + get_local_id(0))[8 * 8 * 14] = r8; + (shared.m + get_local_id(0))[8 * 8 * 15] = r9; +/* Work-group barrier: the stores above must be complete before other work-items read shared.m below. */ barrier(CLK_LOCAL_MEM_FENCE); + { + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_r_idx)[8] = r0_2; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16]; + HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24]; + HS_CMP_XCHG(r1_1, r1_2) + (shared.m + smem_l_idx)[16] = r1_1; + (shared.m + smem_r_idx)[24] = r1_2; + } + { + HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[32]; + HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[40]; + HS_CMP_XCHG(r2_1, r2_2) + (shared.m + smem_l_idx)[32] = r2_1; + 
(shared.m + smem_r_idx)[40] = r2_2; + } + { + HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[48]; + HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[56]; + HS_CMP_XCHG(r3_1, r3_2) + (shared.m + smem_l_idx)[48] = r3_1; + (shared.m + smem_r_idx)[56] = r3_2; + } + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512]; + HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[520]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[512] = r0_1; + (shared.m + smem_r_idx)[520] = r0_2; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[528]; + HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[536]; + HS_CMP_XCHG(r1_1, r1_2) + (shared.m + smem_l_idx)[528] = r1_1; + (shared.m + smem_r_idx)[536] = r1_2; + } + { + HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[544]; + HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[552]; + HS_CMP_XCHG(r2_1, r2_2) + (shared.m + smem_l_idx)[544] = r2_1; + (shared.m + smem_r_idx)[552] = r2_2; + } + { + HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[560]; + HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[568]; + HS_CMP_XCHG(r3_1, r3_2) + (shared.m + smem_l_idx)[560] = r3_1; + (shared.m + smem_r_idx)[568] = r3_2; + } + } + barrier(CLK_LOCAL_MEM_FENCE); +/* Reload the registers from shared.m using the same layout as the stores before the exchange round. */ r1 = (shared.m + get_local_id(0))[8 * 8 * 0]; + r16 = (shared.m + get_local_id(0))[8 * 8 * 1]; + r2 = (shared.m + get_local_id(0))[8 * 8 * 2]; + r15 = (shared.m + get_local_id(0))[8 * 8 * 3]; + r3 = (shared.m + get_local_id(0))[8 * 8 * 4]; + r14 = (shared.m + get_local_id(0))[8 * 8 * 5]; + r4 = (shared.m + get_local_id(0))[8 * 8 * 6]; + r13 = (shared.m + get_local_id(0))[8 * 8 * 7]; + r5 = (shared.m + get_local_id(0))[8 * 8 * 8]; + r12 = (shared.m + get_local_id(0))[8 * 8 * 9]; + r6 = (shared.m + get_local_id(0))[8 * 8 * 10]; + r11 = (shared.m + get_local_id(0))[8 * 8 * 11]; + r7 = (shared.m + get_local_id(0))[8 * 8 * 12]; + r10 = (shared.m + get_local_id(0))[8 * 8 * 13]; + r8 = (shared.m + get_local_id(0))[8 * 8 * 14]; + r9 = (shared.m + get_local_id(0))[8 * 8 * 15]; + { { uint const half_lane_mask = 4; + uint const half_lane_idx = 
get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) 
+HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(shared.m + get_local_id(0))[8 * 8 * 0] = r1; +(shared.m + get_local_id(0))[8 * 8 * 1] = r16; +(shared.m + get_local_id(0))[8 * 8 * 2] = r2; +(shared.m + get_local_id(0))[8 * 8 * 3] = r15; +(shared.m + get_local_id(0))[8 * 8 * 4] = r3; +(shared.m + get_local_id(0))[8 * 8 * 5] = r14; +(shared.m + get_local_id(0))[8 * 8 * 6] = r4; +(shared.m + get_local_id(0))[8 * 8 * 7] = r13; +(shared.m + get_local_id(0))[8 * 8 * 8] = r5; +(shared.m + get_local_id(0))[8 * 8 * 9] = r12; +(shared.m + get_local_id(0))[8 * 8 * 10] = r6; +(shared.m + get_local_id(0))[8 * 8 * 11] = r11; +(shared.m + get_local_id(0))[8 * 8 * 12] = r7; +(shared.m + get_local_id(0))[8 * 8 * 13] = r10; +(shared.m + get_local_id(0))[8 * 8 * 14] = r8; +(shared.m + get_local_id(0))[8 * 8 * 15] = r9; +barrier(CLK_LOCAL_MEM_FENCE); +{ + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; + HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16]; + HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24]; + HS_CMP_XCHG(r0_2, r0_3) + HS_CMP_XCHG(r0_1, r0_4) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_l_idx)[8] = r0_2; + (shared.m + smem_r_idx)[16] = r0_3; + (shared.m + smem_r_idx)[24] = r0_4; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[32]; + HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[40]; + HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[48]; + HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[56]; + HS_CMP_XCHG(r1_2, r1_3) + HS_CMP_XCHG(r1_1, r1_4) + HS_CMP_XCHG(r1_3, r1_4) + HS_CMP_XCHG(r1_1, r1_2) + (shared.m + smem_l_idx)[32] = r1_1; + (shared.m + smem_l_idx)[40] = r1_2; + (shared.m + smem_r_idx)[48] = r1_3; + (shared.m + smem_r_idx)[56] = r1_4; + } + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512]; + HS_KEY_TYPE r0_2 = 
(shared.m + smem_l_idx)[520]; + HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[528]; + HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[536]; + HS_CMP_XCHG(r0_2, r0_3) + HS_CMP_XCHG(r0_1, r0_4) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[512] = r0_1; + (shared.m + smem_l_idx)[520] = r0_2; + (shared.m + smem_r_idx)[528] = r0_3; + (shared.m + smem_r_idx)[536] = r0_4; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[544]; + HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[552]; + HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[560]; + HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[568]; + HS_CMP_XCHG(r1_2, r1_3) + HS_CMP_XCHG(r1_1, r1_4) + HS_CMP_XCHG(r1_3, r1_4) + HS_CMP_XCHG(r1_1, r1_2) + (shared.m + smem_l_idx)[544] = r1_1; + (shared.m + smem_l_idx)[552] = r1_2; + (shared.m + smem_r_idx)[560] = r1_3; + (shared.m + smem_r_idx)[568] = r1_4; + } +} +barrier(CLK_LOCAL_MEM_FENCE); +r1 = (shared.m + get_local_id(0))[8 * 8 * 0]; +r16 = (shared.m + get_local_id(0))[8 * 8 * 1]; +r2 = (shared.m + get_local_id(0))[8 * 8 * 2]; +r15 = (shared.m + get_local_id(0))[8 * 8 * 3]; +r3 = (shared.m + get_local_id(0))[8 * 8 * 4]; +r14 = (shared.m + get_local_id(0))[8 * 8 * 5]; +r4 = (shared.m + get_local_id(0))[8 * 8 * 6]; +r13 = (shared.m + get_local_id(0))[8 * 8 * 7]; +r5 = (shared.m + get_local_id(0))[8 * 8 * 8]; +r12 = (shared.m + get_local_id(0))[8 * 8 * 9]; +r6 = (shared.m + get_local_id(0))[8 * 8 * 10]; +r11 = (shared.m + get_local_id(0))[8 * 8 * 11]; +r7 = (shared.m + get_local_id(0))[8 * 8 * 12]; +r10 = (shared.m + get_local_id(0))[8 * 8 * 13]; +r8 = (shared.m + get_local_id(0))[8 * 8 * 14]; +r9 = (shared.m + get_local_id(0))[8 * 8 * 15]; +{ { uint const half_lane_mask = 4; +uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; +int const t_lt = get_sub_group_local_id() < half_lane_idx; +HS_CMP_HALF(0, r1) +HS_CMP_HALF(1, r2) +HS_CMP_HALF(2, r3) +HS_CMP_HALF(3, r4) +HS_CMP_HALF(4, r5) +HS_CMP_HALF(5, r6) +HS_CMP_HALF(6, r7) +HS_CMP_HALF(7, r8) 
+HS_CMP_HALF(8, r9) +HS_CMP_HALF(9, r10) +HS_CMP_HALF(10, r11) +HS_CMP_HALF(11, r12) +HS_CMP_HALF(12, r13) +HS_CMP_HALF(13, r14) +HS_CMP_HALF(14, r15) +HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(shared.m + get_local_id(0))[8 * 8 * 0] = r1; +(shared.m + get_local_id(0))[8 
* 8 * 1] = r16; +(shared.m + get_local_id(0))[8 * 8 * 2] = r2; +(shared.m + get_local_id(0))[8 * 8 * 3] = r15; +(shared.m + get_local_id(0))[8 * 8 * 4] = r3; +(shared.m + get_local_id(0))[8 * 8 * 5] = r14; +(shared.m + get_local_id(0))[8 * 8 * 6] = r4; +(shared.m + get_local_id(0))[8 * 8 * 7] = r13; +(shared.m + get_local_id(0))[8 * 8 * 8] = r5; +(shared.m + get_local_id(0))[8 * 8 * 9] = r12; +(shared.m + get_local_id(0))[8 * 8 * 10] = r6; +(shared.m + get_local_id(0))[8 * 8 * 11] = r11; +(shared.m + get_local_id(0))[8 * 8 * 12] = r7; +(shared.m + get_local_id(0))[8 * 8 * 13] = r10; +(shared.m + get_local_id(0))[8 * 8 * 14] = r8; +(shared.m + get_local_id(0))[8 * 8 * 15] = r9; +barrier(CLK_LOCAL_MEM_FENCE); +{ + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; + HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16]; + HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24]; + HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[32]; + HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[40]; + HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[48]; + HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[56]; + HS_CMP_XCHG(r0_4, r0_5) + HS_CMP_XCHG(r0_3, r0_6) + HS_CMP_XCHG(r0_2, r0_7) + HS_CMP_XCHG(r0_1, r0_8) + HS_CMP_XCHG(r0_5, r0_7) + HS_CMP_XCHG(r0_6, r0_8) + HS_CMP_XCHG(r0_5, r0_6) + HS_CMP_XCHG(r0_7, r0_8) + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_2, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_l_idx)[8] = r0_2; + (shared.m + smem_l_idx)[16] = r0_3; + (shared.m + smem_l_idx)[24] = r0_4; + (shared.m + smem_r_idx)[32] = r0_5; + (shared.m + smem_r_idx)[40] = r0_6; + (shared.m + smem_r_idx)[48] = r0_7; + (shared.m + smem_r_idx)[56] = r0_8; + } + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512]; + HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[520]; + HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[528]; + HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[536]; + HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[544]; 
+ HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[552]; + HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[560]; + HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[568]; + HS_CMP_XCHG(r0_4, r0_5) + HS_CMP_XCHG(r0_3, r0_6) + HS_CMP_XCHG(r0_2, r0_7) + HS_CMP_XCHG(r0_1, r0_8) + HS_CMP_XCHG(r0_5, r0_7) + HS_CMP_XCHG(r0_6, r0_8) + HS_CMP_XCHG(r0_5, r0_6) + HS_CMP_XCHG(r0_7, r0_8) + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_2, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + (shared.m + smem_l_idx)[512] = r0_1; + (shared.m + smem_l_idx)[520] = r0_2; + (shared.m + smem_l_idx)[528] = r0_3; + (shared.m + smem_l_idx)[536] = r0_4; + (shared.m + smem_r_idx)[544] = r0_5; + (shared.m + smem_r_idx)[552] = r0_6; + (shared.m + smem_r_idx)[560] = r0_7; + (shared.m + smem_r_idx)[568] = r0_8; + } +} +barrier(CLK_LOCAL_MEM_FENCE); +r1 = (shared.m + get_local_id(0))[8 * 8 * 0]; +r16 = (shared.m + get_local_id(0))[8 * 8 * 1]; +r2 = (shared.m + get_local_id(0))[8 * 8 * 2]; +r15 = (shared.m + get_local_id(0))[8 * 8 * 3]; +r3 = (shared.m + get_local_id(0))[8 * 8 * 4]; +r14 = (shared.m + get_local_id(0))[8 * 8 * 5]; +r4 = (shared.m + get_local_id(0))[8 * 8 * 6]; +r13 = (shared.m + get_local_id(0))[8 * 8 * 7]; +r5 = (shared.m + get_local_id(0))[8 * 8 * 8]; +r12 = (shared.m + get_local_id(0))[8 * 8 * 9]; +r6 = (shared.m + get_local_id(0))[8 * 8 * 10]; +r11 = (shared.m + get_local_id(0))[8 * 8 * 11]; +r7 = (shared.m + get_local_id(0))[8 * 8 * 12]; +r10 = (shared.m + get_local_id(0))[8 * 8 * 13]; +r8 = (shared.m + get_local_id(0))[8 * 8 * 14]; +r9 = (shared.m + get_local_id(0))[8 * 8 * 15]; +{ { uint const half_lane_mask = 4; +uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; +int const t_lt = get_sub_group_local_id() < half_lane_idx; +HS_CMP_HALF(0, r1) +HS_CMP_HALF(1, r2) +HS_CMP_HALF(2, r3) +HS_CMP_HALF(3, r4) +HS_CMP_HALF(4, r5) +HS_CMP_HALF(5, r6) +HS_CMP_HALF(6, r7) +HS_CMP_HALF(7, r8) +HS_CMP_HALF(8, r9) +HS_CMP_HALF(9, r10) +HS_CMP_HALF(10, r11) +HS_CMP_HALF(11, r12) 
+HS_CMP_HALF(12, r13) +HS_CMP_HALF(13, r14) +HS_CMP_HALF(14, r15) +HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +/* Store the 16 keys to vout at the same offsets they were loaded from in vin. */(vout + gmem_idx)[0 * 8] = r1; +(vout + gmem_idx)[1 * 8] = r2; +(vout + gmem_idx)[2 * 8] = r3; +(vout + gmem_idx)[3 * 8] = r4; +(vout + gmem_idx)[4 * 8] = r5; 
+(vout + gmem_idx)[5 * 8] = r6; +(vout + gmem_idx)[6 * 8] = r7; +(vout + gmem_idx)[7 * 8] = r8; +(vout + gmem_idx)[8 * 8] = r9; +(vout + gmem_idx)[9 * 8] = r10; +(vout + gmem_idx)[10 * 8] = r11; +(vout + gmem_idx)[11 * 8] = r12; +(vout + gmem_idx)[12 * 8] = r13; +(vout + gmem_idx)[13 * 8] = r14; +(vout + gmem_idx)[14 * 8] = r15; +(vout + gmem_idx)[15 * 8] = r16; +} + +__kernel __attribute__((reqd_work_group_size(32, 1, 1))) +__attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_bs_2(__global HS_KEY_TYPE const* const restrict vin, + __global HS_KEY_TYPE* const restrict vout) +{ + __local union + { + HS_KEY_TYPE m[16 * 32]; + } shared; + + uint const global_id = get_global_id(0); + uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); + + HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8]; + HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8]; + HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8]; + HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8]; + HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8]; + HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8]; + HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8]; + HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8]; + HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8]; + HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8]; + HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8]; + HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8]; + HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8]; + HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8]; + HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8]; + HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8]; + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r4, 
r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r6, r11) + HS_CMP_XCHG(r7, r10) + HS_CMP_XCHG(r4, r13) + HS_CMP_XCHG(r14, r15) + HS_CMP_XCHG(r8, r12) + HS_CMP_XCHG(r2, r3) + HS_CMP_XCHG(r5, r9) + HS_CMP_XCHG(r2, r5) + HS_CMP_XCHG(r8, r14) + HS_CMP_XCHG(r3, r9) + HS_CMP_XCHG(r12, r15) + HS_CMP_XCHG(r3, r5) + HS_CMP_XCHG(r6, r7) + HS_CMP_XCHG(r10, r11) + HS_CMP_XCHG(r12, r14) + HS_CMP_XCHG(r4, r9) + HS_CMP_XCHG(r8, r13) + HS_CMP_XCHG(r7, r9) + HS_CMP_XCHG(r11, r13) + HS_CMP_XCHG(r4, r6) + HS_CMP_XCHG(r8, r10) + HS_CMP_XCHG(r4, r5) + HS_CMP_XCHG(r6, r7) + HS_CMP_XCHG(r8, r9) + HS_CMP_XCHG(r10, r11) + HS_CMP_XCHG(r12, r13) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + { + uint const flip_lane_mask = 1; + uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; + int const t_lt = get_sub_group_local_id() < flip_lane_idx; + HS_CMP_FLIP(0, r1, r16) + HS_CMP_FLIP(1, r2, r15) + HS_CMP_FLIP(2, r3, r14) + HS_CMP_FLIP(3, r4, r13) + HS_CMP_FLIP(4, r5, r12) + HS_CMP_FLIP(5, r6, r11) + HS_CMP_FLIP(6, r7, r10) + HS_CMP_FLIP(7, r8, r9) + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + { + uint const flip_lane_mask = 3; 
+ uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; + int const t_lt = get_sub_group_local_id() < flip_lane_idx; + HS_CMP_FLIP(0, r1, r16) + HS_CMP_FLIP(1, r2, r15) + HS_CMP_FLIP(2, r3, r14) + HS_CMP_FLIP(3, r4, r13) + HS_CMP_FLIP(4, r5, r12) + HS_CMP_FLIP(5, r6, r11) + HS_CMP_FLIP(6, r7, r10) + HS_CMP_FLIP(7, r8, r9) + } + { + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + { + uint const flip_lane_mask = 7; + uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; + int const t_lt = get_sub_group_local_id() < flip_lane_idx; + HS_CMP_FLIP(0, r1, r16) + HS_CMP_FLIP(1, r2, r15) + HS_CMP_FLIP(2, r3, r14) + HS_CMP_FLIP(3, r4, r13) + HS_CMP_FLIP(4, r5, r12) + HS_CMP_FLIP(5, r6, r11) + HS_CMP_FLIP(6, r7, r10) + HS_CMP_FLIP(7, r8, r9) + } + { + uint const 
half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) + } + { + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + uint const smem_l_idx = get_sub_group_id() * 32 + get_sub_group_local_id(); + uint const smem_r_idx = + (get_sub_group_id() ^ 1) * 32 + (get_sub_group_local_id() ^ 7); + (shared.m + get_local_id(0))[4 * 8 * 0] = r1; + (shared.m + 
get_local_id(0))[4 * 8 * 1] = r16; + (shared.m + get_local_id(0))[4 * 8 * 2] = r2; + (shared.m + get_local_id(0))[4 * 8 * 3] = r15; + (shared.m + get_local_id(0))[4 * 8 * 4] = r3; + (shared.m + get_local_id(0))[4 * 8 * 5] = r14; + (shared.m + get_local_id(0))[4 * 8 * 6] = r4; + (shared.m + get_local_id(0))[4 * 8 * 7] = r13; + (shared.m + get_local_id(0))[4 * 8 * 8] = r5; + (shared.m + get_local_id(0))[4 * 8 * 9] = r12; + (shared.m + get_local_id(0))[4 * 8 * 10] = r6; + (shared.m + get_local_id(0))[4 * 8 * 11] = r11; + (shared.m + get_local_id(0))[4 * 8 * 12] = r7; + (shared.m + get_local_id(0))[4 * 8 * 13] = r10; + (shared.m + get_local_id(0))[4 * 8 * 14] = r8; + (shared.m + get_local_id(0))[4 * 8 * 15] = r9; + barrier(CLK_LOCAL_MEM_FENCE); + { + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_r_idx)[8] = r0_2; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16]; + HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24]; + HS_CMP_XCHG(r1_1, r1_2) + (shared.m + smem_l_idx)[16] = r1_1; + (shared.m + smem_r_idx)[24] = r1_2; + } + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128]; + HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[136]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[128] = r0_1; + (shared.m + smem_r_idx)[136] = r0_2; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[144]; + HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[152]; + HS_CMP_XCHG(r1_1, r1_2) + (shared.m + smem_l_idx)[144] = r1_1; + (shared.m + smem_r_idx)[152] = r1_2; + } + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[256]; + HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[264]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[256] = r0_1; + (shared.m + smem_r_idx)[264] = r0_2; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[272]; + HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[280]; + HS_CMP_XCHG(r1_1, r1_2) + (shared.m + smem_l_idx)[272] = r1_1; + (shared.m + 
smem_r_idx)[280] = r1_2; + } + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[384]; + HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[392]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[384] = r0_1; + (shared.m + smem_r_idx)[392] = r0_2; + } + { + HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[400]; + HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[408]; + HS_CMP_XCHG(r1_1, r1_2) + (shared.m + smem_l_idx)[400] = r1_1; + (shared.m + smem_r_idx)[408] = r1_2; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + r1 = (shared.m + get_local_id(0))[4 * 8 * 0]; + r16 = (shared.m + get_local_id(0))[4 * 8 * 1]; + r2 = (shared.m + get_local_id(0))[4 * 8 * 2]; + r15 = (shared.m + get_local_id(0))[4 * 8 * 3]; + r3 = (shared.m + get_local_id(0))[4 * 8 * 4]; + r14 = (shared.m + get_local_id(0))[4 * 8 * 5]; + r4 = (shared.m + get_local_id(0))[4 * 8 * 6]; + r13 = (shared.m + get_local_id(0))[4 * 8 * 7]; + r5 = (shared.m + get_local_id(0))[4 * 8 * 8]; + r12 = (shared.m + get_local_id(0))[4 * 8 * 9]; + r6 = (shared.m + get_local_id(0))[4 * 8 * 10]; + r11 = (shared.m + get_local_id(0))[4 * 8 * 11]; + r7 = (shared.m + get_local_id(0))[4 * 8 * 12]; + r10 = (shared.m + get_local_id(0))[4 * 8 * 13]; + r8 = (shared.m + get_local_id(0))[4 * 8 * 14]; + r9 = (shared.m + get_local_id(0))[4 * 8 * 15]; + { { uint const half_lane_mask = 4; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + 
HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(shared.m + get_local_id(0))[4 * 8 * 0] = r1; +(shared.m + get_local_id(0))[4 * 8 * 1] = r16; +(shared.m + get_local_id(0))[4 * 8 * 2] = r2; +(shared.m + get_local_id(0))[4 * 8 * 3] = r15; +(shared.m + get_local_id(0))[4 * 8 * 4] = r3; +(shared.m + get_local_id(0))[4 * 8 * 5] = r14; +(shared.m + get_local_id(0))[4 * 8 * 6] = r4; +(shared.m + get_local_id(0))[4 * 8 * 7] = r13; +(shared.m + get_local_id(0))[4 * 8 * 8] = r5; +(shared.m + 
get_local_id(0))[4 * 8 * 9] = r12; +(shared.m + get_local_id(0))[4 * 8 * 10] = r6; +(shared.m + get_local_id(0))[4 * 8 * 11] = r11; +(shared.m + get_local_id(0))[4 * 8 * 12] = r7; +(shared.m + get_local_id(0))[4 * 8 * 13] = r10; +(shared.m + get_local_id(0))[4 * 8 * 14] = r8; +(shared.m + get_local_id(0))[4 * 8 * 15] = r9; +barrier(CLK_LOCAL_MEM_FENCE); +{ + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; + HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16]; + HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24]; + HS_CMP_XCHG(r0_2, r0_3) + HS_CMP_XCHG(r0_1, r0_4) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_l_idx)[8] = r0_2; + (shared.m + smem_r_idx)[16] = r0_3; + (shared.m + smem_r_idx)[24] = r0_4; + } + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128]; + HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[136]; + HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[144]; + HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[152]; + HS_CMP_XCHG(r0_2, r0_3) + HS_CMP_XCHG(r0_1, r0_4) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[128] = r0_1; + (shared.m + smem_l_idx)[136] = r0_2; + (shared.m + smem_r_idx)[144] = r0_3; + (shared.m + smem_r_idx)[152] = r0_4; + } + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[256]; + HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[264]; + HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[272]; + HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[280]; + HS_CMP_XCHG(r0_2, r0_3) + HS_CMP_XCHG(r0_1, r0_4) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[256] = r0_1; + (shared.m + smem_l_idx)[264] = r0_2; + (shared.m + smem_r_idx)[272] = r0_3; + (shared.m + smem_r_idx)[280] = r0_4; + } + { + HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[384]; + HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[392]; + HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[400]; + HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[408]; + HS_CMP_XCHG(r0_2, r0_3) + 
HS_CMP_XCHG(r0_1, r0_4) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[384] = r0_1; + (shared.m + smem_l_idx)[392] = r0_2; + (shared.m + smem_r_idx)[400] = r0_3; + (shared.m + smem_r_idx)[408] = r0_4; + } +} +barrier(CLK_LOCAL_MEM_FENCE); +r1 = (shared.m + get_local_id(0))[4 * 8 * 0]; +r16 = (shared.m + get_local_id(0))[4 * 8 * 1]; +r2 = (shared.m + get_local_id(0))[4 * 8 * 2]; +r15 = (shared.m + get_local_id(0))[4 * 8 * 3]; +r3 = (shared.m + get_local_id(0))[4 * 8 * 4]; +r14 = (shared.m + get_local_id(0))[4 * 8 * 5]; +r4 = (shared.m + get_local_id(0))[4 * 8 * 6]; +r13 = (shared.m + get_local_id(0))[4 * 8 * 7]; +r5 = (shared.m + get_local_id(0))[4 * 8 * 8]; +r12 = (shared.m + get_local_id(0))[4 * 8 * 9]; +r6 = (shared.m + get_local_id(0))[4 * 8 * 10]; +r11 = (shared.m + get_local_id(0))[4 * 8 * 11]; +r7 = (shared.m + get_local_id(0))[4 * 8 * 12]; +r10 = (shared.m + get_local_id(0))[4 * 8 * 13]; +r8 = (shared.m + get_local_id(0))[4 * 8 * 14]; +r9 = (shared.m + get_local_id(0))[4 * 8 * 15]; +{ { uint const half_lane_mask = 4; +uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; +int const t_lt = get_sub_group_local_id() < half_lane_idx; +HS_CMP_HALF(0, r1) +HS_CMP_HALF(1, r2) +HS_CMP_HALF(2, r3) +HS_CMP_HALF(3, r4) +HS_CMP_HALF(4, r5) +HS_CMP_HALF(5, r6) +HS_CMP_HALF(6, r7) +HS_CMP_HALF(7, r8) +HS_CMP_HALF(8, r9) +HS_CMP_HALF(9, r10) +HS_CMP_HALF(10, r11) +HS_CMP_HALF(11, r12) +HS_CMP_HALF(12, r13) +HS_CMP_HALF(13, r14) +HS_CMP_HALF(14, r15) +HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + 
HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(vout + gmem_idx)[0 * 8] = r1; +(vout + gmem_idx)[1 * 8] = r2; +(vout + gmem_idx)[2 * 8] = r3; +(vout + gmem_idx)[3 * 8] = r4; +(vout + gmem_idx)[4 * 8] = r5; +(vout + gmem_idx)[5 * 8] = r6; +(vout + gmem_idx)[6 * 8] = r7; +(vout + gmem_idx)[7 * 8] = r8; +(vout + gmem_idx)[8 * 8] = r9; +(vout + gmem_idx)[9 * 8] = r10; +(vout + gmem_idx)[10 * 8] = r11; +(vout + gmem_idx)[11 * 8] = r12; +(vout + gmem_idx)[12 * 8] = r13; +(vout + gmem_idx)[13 * 8] = r14; +(vout + gmem_idx)[14 * 8] = r15; +(vout + gmem_idx)[15 * 8] = r16; +}
+/*
+ * hs_kernel_bs_1: one of the GEN9 HotSort kernels added by this commit.
+ *
+ * Launch shape is fixed by the attributes below: a work-group of 16
+ * work-items with an Intel sub-group size of 8, i.e. two sub-groups per
+ * work-group.
+ *
+ * Each work-item reads 16 keys from vin into r1..r16, applies a fixed
+ * sequence of HS_CMP_XCHG / HS_CMP_FLIP / HS_CMP_HALF steps to them,
+ * exchanges keys with the other sub-group through shared.m (bracketed by
+ * two local-memory barriers), applies a final HS_CMP_HALF / HS_CMP_XCHG
+ * sequence, and writes r1..r16 to vout at the indices they were read from.
+ *
+ * NOTE(review): HS_KEY_TYPE and the HS_CMP_* macros come from
+ * hs_cl_macros.h, which is not visible here; presumably they implement the
+ * compare-exchange steps of a sorting/merging network -- confirm against
+ * that header. The statement order below is significant; do not reorder.
+ */
+__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bs_1(__global HS_KEY_TYPE const* const restrict vin,
+               __global HS_KEY_TYPE* const restrict vout)
+{
+  __local union
+  {
+    HS_KEY_TYPE m[16 * 16];
+  } shared;
+  // Every run of 8 consecutive work-items maps to its own 128-key span; (global_id & 7) selects the column inside it.
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+  // Load 16 keys per work-item, 8 elements apart.
+  HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
+  HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
+  HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
+  HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
+  HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
+  HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
+  HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
+  HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
+  HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
+  HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
+  HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
+  HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
+  HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
+  HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
+  HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
+  HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
+  HS_CMP_XCHG(r1, r2) // first of 60 steps confined to this work-item's own r1..r16
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r6, r11)
+  HS_CMP_XCHG(r7, r10)
+  HS_CMP_XCHG(r4, r13)
+  HS_CMP_XCHG(r14, r15)
+  HS_CMP_XCHG(r8, r12)
+  HS_CMP_XCHG(r2, r3)
+  HS_CMP_XCHG(r5, r9)
+  HS_CMP_XCHG(r2, r5)
+  HS_CMP_XCHG(r8, r14)
+  HS_CMP_XCHG(r3, r9)
+  HS_CMP_XCHG(r12, r15)
+  HS_CMP_XCHG(r3, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r14)
+  HS_CMP_XCHG(r4, r9)
+  HS_CMP_XCHG(r8, r13)
+  HS_CMP_XCHG(r7, r9)
+  HS_CMP_XCHG(r11, r13)
+  HS_CMP_XCHG(r4, r6)
+  HS_CMP_XCHG(r8, r10)
+  HS_CMP_XCHG(r4, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r8, r9)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r13)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  {
+    uint const flip_lane_mask = 1; // partner lane is (lane ^ 1)
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 3; // partner lane is (lane ^ 3)
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 7; // partner lane is (lane ^ 7)
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 2;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  uint const smem_l_idx = get_sub_group_id() * 16 + get_sub_group_local_id(); // index based on this sub-group's id and lane
+  uint const smem_r_idx =
+      (get_sub_group_id() ^ 1) * 16 + (get_sub_group_local_id() ^ 7); // other sub-group's id, lane order reversed (lane ^ 7)
+  (shared.m + get_local_id(0))[2 * 8 * 0] = r1; // rows of 16 keys, stored in the order r1, r16, r2, r15, ...
+  (shared.m + get_local_id(0))[2 * 8 * 1] = r16;
+  (shared.m + get_local_id(0))[2 * 8 * 2] = r2;
+  (shared.m + get_local_id(0))[2 * 8 * 3] = r15;
+  (shared.m + get_local_id(0))[2 * 8 * 4] = r3;
+  (shared.m + get_local_id(0))[2 * 8 * 5] = r14;
+  (shared.m + get_local_id(0))[2 * 8 * 6] = r4;
+  (shared.m + get_local_id(0))[2 * 8 * 7] = r13;
+  (shared.m + get_local_id(0))[2 * 8 * 8] = r5;
+  (shared.m + get_local_id(0))[2 * 8 * 9] = r12;
+  (shared.m + get_local_id(0))[2 * 8 * 10] = r6;
+  (shared.m + get_local_id(0))[2 * 8 * 11] = r11;
+  (shared.m + get_local_id(0))[2 * 8 * 12] = r7;
+  (shared.m + get_local_id(0))[2 * 8 * 13] = r10;
+  (shared.m + get_local_id(0))[2 * 8 * 14] = r8;
+  (shared.m + get_local_id(0))[2 * 8 * 15] = r9;
+  barrier(CLK_LOCAL_MEM_FENCE); // stores above must be visible before the cross-sub-group pass reads them
+  {
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[0] = r0_1;
+      (shared.m + smem_r_idx)[8] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[32];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[40];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[32] = r0_1;
+      (shared.m + smem_r_idx)[40] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[64];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[72];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[64] = r0_1;
+      (shared.m + smem_r_idx)[72] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[96];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[104];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[96] = r0_1;
+      (shared.m + smem_r_idx)[104] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[136];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[128] = r0_1;
+      (shared.m + smem_r_idx)[136] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[160];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[168];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[160] = r0_1;
+      (shared.m + smem_r_idx)[168] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[192];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[200];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[192] = r0_1;
+      (shared.m + smem_r_idx)[200] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[224];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[232];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[224] = r0_1;
+      (shared.m + smem_r_idx)[232] = r0_2;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE); // cross-sub-group pass must finish before keys are read back
+  r1 = (shared.m + get_local_id(0))[2 * 8 * 0];
+  r16 = (shared.m + get_local_id(0))[2 * 8 * 1];
+  r2 = (shared.m + get_local_id(0))[2 * 8 * 2];
+  r15 = (shared.m + get_local_id(0))[2 * 8 * 3];
+  r3 = (shared.m + get_local_id(0))[2 * 8 * 4];
+  r14 = (shared.m + get_local_id(0))[2 * 8 * 5];
+  r4 = (shared.m + get_local_id(0))[2 * 8 * 6];
+  r13 = (shared.m + get_local_id(0))[2 * 8 * 7];
+  r5 = (shared.m + get_local_id(0))[2 * 8 * 8];
+  r12 = (shared.m + get_local_id(0))[2 * 8 * 9];
+  r6 = (shared.m + get_local_id(0))[2 * 8 * 10];
+  r11 = (shared.m + get_local_id(0))[2 * 8 * 11];
+  r7 = (shared.m + get_local_id(0))[2 * 8 * 12];
+  r10 = (shared.m + get_local_id(0))[2 * 8 * 13];
+  r8 = (shared.m + get_local_id(0))[2 * 8 * 14];
+  r9 = (shared.m + get_local_id(0))[2 * 8 * 15];
+  { { uint const half_lane_mask = 4;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2) +
HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(vout + gmem_idx)[0 * 8] = r1; +(vout + gmem_idx)[1 * 8] = r2; +(vout + gmem_idx)[2 * 8] = r3; +(vout + gmem_idx)[3 * 8] = r4; +(vout + gmem_idx)[4 * 8] = r5; +(vout + gmem_idx)[5 * 8] = r6; +(vout + gmem_idx)[6 * 8] = r7; +(vout + gmem_idx)[7 * 8] = r8; +(vout + gmem_idx)[8 * 8] = r9; +(vout + gmem_idx)[9 * 8] = r10; +(vout + gmem_idx)[10 * 8] = r11; +(vout + gmem_idx)[11 * 8] = r12; +(vout + gmem_idx)[12 * 8] = r13; +(vout + gmem_idx)[13 * 8] = r14; +(vout + gmem_idx)[14 * 8] = r15; +(vout + gmem_idx)[15 * 8] = r16; +}
+/*
+ * hs_kernel_bs_0: one of the GEN9 HotSort kernels added by this commit.
+ *
+ * Launch shape is fixed by the attributes below: a work-group of 8
+ * work-items with an Intel sub-group size of 8, i.e. exactly one sub-group
+ * per work-group.
+ *
+ * Each work-item reads 16 keys from vin into r1..r16, applies a fixed
+ * sequence of HS_CMP_XCHG / HS_CMP_FLIP / HS_CMP_HALF steps to them, and
+ * writes r1..r16 to vout at the indices they were read from.
+ *
+ * This kernel declares no __local storage. The generated code used to
+ * carry an empty, never-referenced `__local union { } shared;` here; an
+ * empty member list is not valid ISO C / OpenCL C, so it has been dropped.
+ *
+ * NOTE(review): HS_KEY_TYPE and the HS_CMP_* macros come from
+ * hs_cl_macros.h, which is not visible here; presumably they implement the
+ * compare-exchange steps of a sorting/merging network, with FLIP/HALF
+ * exchanging with the lane selected by flip_lane_idx / half_lane_idx --
+ * confirm against that header. The statement order below is significant;
+ * do not reorder.
+ */
+__kernel __attribute__((reqd_work_group_size(8, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bs_0(__global HS_KEY_TYPE const* const restrict vin,
+               __global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); // 8 work-items share one 128-key span; (global_id & 7) selects the column
+  // Load 16 keys per work-item, 8 elements apart.
+  HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
+  HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
+  HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
+  HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
+  HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
+  HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
+  HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
+  HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
+  HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
+  HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
+  HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
+  HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
+  HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
+  HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
+  HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
+  HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
+  HS_CMP_XCHG(r1, r2) // first of 60 steps confined to this work-item's own r1..r16
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r6, r11)
+  HS_CMP_XCHG(r7, r10)
+  HS_CMP_XCHG(r4, r13)
+  HS_CMP_XCHG(r14, r15)
+  HS_CMP_XCHG(r8, r12)
+  HS_CMP_XCHG(r2, r3)
+  HS_CMP_XCHG(r5, r9)
+  HS_CMP_XCHG(r2, r5)
+  HS_CMP_XCHG(r8, r14)
+  HS_CMP_XCHG(r3, r9)
+  HS_CMP_XCHG(r12, r15)
+  HS_CMP_XCHG(r3, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r14)
+  HS_CMP_XCHG(r4, r9)
+  HS_CMP_XCHG(r8, r13)
+  HS_CMP_XCHG(r7, r9)
+  HS_CMP_XCHG(r11, r13)
+  HS_CMP_XCHG(r4, r6)
+  HS_CMP_XCHG(r8, r10)
+  HS_CMP_XCHG(r4, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r8, r9)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r13)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  {
+    uint const flip_lane_mask = 1; // partner lane is (lane ^ 1)
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 3; // partner lane is (lane ^ 3)
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 7; // partner lane is (lane ^ 7)
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 2;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  (vout + gmem_idx)[0 * 8] = r1; // store back to the same indices the keys were loaded from
+  (vout + gmem_idx)[1 * 8] = r2;
+  (vout + gmem_idx)[2 * 8] = r3;
+  (vout + gmem_idx)[3 * 8] = r4;
+  (vout + gmem_idx)[4 * 8] = r5;
+  (vout + gmem_idx)[5 * 8] = r6;
+  (vout + gmem_idx)[6 * 8] = r7;
+  (vout + gmem_idx)[7 * 8] = r8;
+  (vout + gmem_idx)[8 * 8] = r9;
+  (vout + gmem_idx)[9 * 8] = r10;
+  (vout + gmem_idx)[10 * 8] = r11;
+  (vout + gmem_idx)[11 * 8] = r12;
+  (vout + gmem_idx)[12 * 8] = r13;
+  (vout + gmem_idx)[13 * 8] = r14;
+  (vout + gmem_idx)[14 * 8] = r15;
+  (vout + gmem_idx)[15 * 8] = r16;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_bc_4(__global HS_KEY_TYPE* const restrict vout) +{ + __local union + { + HS_KEY_TYPE m[16 * 128]; + } shared; + + uint const global_id = get_global_id(0); + uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); + + uint const gmem_l_idx = (global_id / 128) * 2048 + (global_id & 127); + uint const smem_l_idx = get_sub_group_id() * 128 + get_sub_group_local_id(); + { + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (vout +
gmem_l_idx)[128]; + HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256]; + HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384]; + HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[512]; + HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[640]; + HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[768]; + HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[896]; + HS_KEY_TYPE r0_9 = (vout + gmem_l_idx)[1024]; + HS_KEY_TYPE r0_10 = (vout + gmem_l_idx)[1152]; + HS_KEY_TYPE r0_11 = (vout + gmem_l_idx)[1280]; + HS_KEY_TYPE r0_12 = (vout + gmem_l_idx)[1408]; + HS_KEY_TYPE r0_13 = (vout + gmem_l_idx)[1536]; + HS_KEY_TYPE r0_14 = (vout + gmem_l_idx)[1664]; + HS_KEY_TYPE r0_15 = (vout + gmem_l_idx)[1792]; + HS_KEY_TYPE r0_16 = (vout + gmem_l_idx)[1920]; + HS_CMP_XCHG(r0_1, r0_9) + HS_CMP_XCHG(r0_5, r0_13) + HS_CMP_XCHG(r0_1, r0_5) + HS_CMP_XCHG(r0_9, r0_13) + HS_CMP_XCHG(r0_3, r0_11) + HS_CMP_XCHG(r0_7, r0_15) + HS_CMP_XCHG(r0_3, r0_7) + HS_CMP_XCHG(r0_11, r0_15) + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_5, r0_7) + HS_CMP_XCHG(r0_9, r0_11) + HS_CMP_XCHG(r0_13, r0_15) + HS_CMP_XCHG(r0_2, r0_10) + HS_CMP_XCHG(r0_6, r0_14) + HS_CMP_XCHG(r0_2, r0_6) + HS_CMP_XCHG(r0_10, r0_14) + HS_CMP_XCHG(r0_4, r0_12) + HS_CMP_XCHG(r0_8, r0_16) + HS_CMP_XCHG(r0_4, r0_8) + HS_CMP_XCHG(r0_12, r0_16) + HS_CMP_XCHG(r0_2, r0_4) + HS_CMP_XCHG(r0_6, r0_8) + HS_CMP_XCHG(r0_10, r0_12) + HS_CMP_XCHG(r0_14, r0_16) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_5, r0_6) + HS_CMP_XCHG(r0_7, r0_8) + HS_CMP_XCHG(r0_9, r0_10) + HS_CMP_XCHG(r0_11, r0_12) + HS_CMP_XCHG(r0_13, r0_14) + HS_CMP_XCHG(r0_15, r0_16) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_l_idx)[8] = r0_2; + (shared.m + smem_l_idx)[16] = r0_3; + (shared.m + smem_l_idx)[24] = r0_4; + (shared.m + smem_l_idx)[32] = r0_5; + (shared.m + smem_l_idx)[40] = r0_6; + (shared.m + smem_l_idx)[48] = r0_7; + (shared.m + smem_l_idx)[56] = r0_8; + (shared.m + smem_l_idx)[64] = r0_9; + (shared.m + smem_l_idx)[72] = r0_10; + (shared.m + smem_l_idx)[80] = r0_11; + (shared.m + smem_l_idx)[88] 
= r0_12; + (shared.m + smem_l_idx)[96] = r0_13; + (shared.m + smem_l_idx)[104] = r0_14; + (shared.m + smem_l_idx)[112] = r0_15; + (shared.m + smem_l_idx)[120] = r0_16; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[16 * 8 * 0]; + HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[16 * 8 * 1]; + HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[16 * 8 * 2]; + HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[16 * 8 * 3]; + HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[16 * 8 * 4]; + HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[16 * 8 * 5]; + HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[16 * 8 * 6]; + HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[16 * 8 * 7]; + HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[16 * 8 * 8]; + HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[16 * 8 * 9]; + HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[16 * 8 * 10]; + HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[16 * 8 * 11]; + HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[16 * 8 * 12]; + HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[16 * 8 * 13]; + HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[16 * 8 * 14]; + HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[16 * 8 * 15]; + { { uint const half_lane_mask = 4; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + 
HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(vout + gmem_idx)[0 * 8] = r1; +(vout + gmem_idx)[1 * 8] = r2; +(vout + gmem_idx)[2 * 8] = r3; +(vout + gmem_idx)[3 * 8] = r4; +(vout + gmem_idx)[4 * 8] = r5; +(vout + gmem_idx)[5 * 8] = r6; +(vout + gmem_idx)[6 * 8] = r7; +(vout + gmem_idx)[7 * 8] = r8; +(vout + gmem_idx)[8 * 8] = r9; +(vout + gmem_idx)[9 * 8] = r10; +(vout + gmem_idx)[10 * 8] = r11; +(vout + gmem_idx)[11 * 8] = r12; +(vout + gmem_idx)[12 * 8] = r13; +(vout + gmem_idx)[13 * 8] = r14; +(vout + gmem_idx)[14 * 8] = r15; +(vout + gmem_idx)[15 * 8] = r16; +} + 
+/* hs_kernel_bc_3: reads and rewrites vout in place, staging through a 16 * 64 key __local buffer. Each work-item loads two groups of eight keys from vout (gmem_l_idx + 128*k, then the same offsets + 64), runs a 12-step HS_CMP_XCHG sequence on each group, and stores the groups to shared.m at smem_l_idx + 8*k and smem_l_idx + 512 + 8*k. After barrier(CLK_LOCAL_MEM_FENCE) it reloads 16 keys from shared.m at get_local_id(0) + 64*k, applies HS_CMP_HALF rounds with half_lane_mask 4, 2 and 1, runs a 32-step HS_CMP_XCHG sequence and stores r1..r16 to vout at gmem_idx + 8*k. The reloads stay inside shared.m only while get_local_id(0) < 64. NOTE(review): no reqd_work_group_size attribute is visible on this kernel - confirm the host launches it with 64 work-items per group. HS_CMP_XCHG and HS_CMP_HALF are defined in hs_cl_macros.h (not visible here); presumably compare-exchange primitives - verify there. */ __kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_bc_3(__global HS_KEY_TYPE* const restrict vout) +{ + __local union + { + HS_KEY_TYPE m[16 * 64]; + } shared; + + uint const global_id = get_global_id(0); + uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); + + uint const gmem_l_idx = (global_id / 64) * 1024 + (global_id & 63); + uint const smem_l_idx = get_sub_group_id() * 64 + get_sub_group_local_id(); + { + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128]; + HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256]; + HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384]; + HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[512]; + HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[640]; + HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[768]; + HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[896]; + HS_CMP_XCHG(r0_1, r0_5) + HS_CMP_XCHG(r0_3, r0_7) + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_5, r0_7) + HS_CMP_XCHG(r0_2, r0_6) + HS_CMP_XCHG(r0_4, r0_8) + HS_CMP_XCHG(r0_2, r0_4) + HS_CMP_XCHG(r0_6, r0_8) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_5, r0_6) + HS_CMP_XCHG(r0_7, r0_8) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_l_idx)[8] = r0_2; + (shared.m + smem_l_idx)[16] = r0_3; + (shared.m + smem_l_idx)[24] = r0_4; + (shared.m + smem_l_idx)[32] = r0_5; + (shared.m + smem_l_idx)[40] = r0_6; + (shared.m + smem_l_idx)[48] = r0_7; + (shared.m + smem_l_idx)[56] = r0_8; + } + { /* second group of eight keys: same pattern, source offsets shifted by 64, staged 512 keys further into shared.m */ + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192]; + HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[320]; + HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[448]; + HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[576]; + HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[704]; + HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[832]; + HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[960]; + HS_CMP_XCHG(r0_1, r0_5) + HS_CMP_XCHG(r0_3, r0_7) + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_5, r0_7) + HS_CMP_XCHG(r0_2, r0_6) + HS_CMP_XCHG(r0_4, r0_8) + HS_CMP_XCHG(r0_2, 
r0_4) + HS_CMP_XCHG(r0_6, r0_8) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + HS_CMP_XCHG(r0_5, r0_6) + HS_CMP_XCHG(r0_7, r0_8) + (shared.m + smem_l_idx)[512] = r0_1; + (shared.m + smem_l_idx)[520] = r0_2; + (shared.m + smem_l_idx)[528] = r0_3; + (shared.m + smem_l_idx)[536] = r0_4; + (shared.m + smem_l_idx)[544] = r0_5; + (shared.m + smem_l_idx)[552] = r0_6; + (shared.m + smem_l_idx)[560] = r0_7; + (shared.m + smem_l_idx)[568] = r0_8; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[8 * 8 * 0]; + HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[8 * 8 * 1]; + HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[8 * 8 * 2]; + HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[8 * 8 * 3]; + HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[8 * 8 * 4]; + HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[8 * 8 * 5]; + HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[8 * 8 * 6]; + HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[8 * 8 * 7]; + HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[8 * 8 * 8]; + HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[8 * 8 * 9]; + HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[8 * 8 * 10]; + HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[8 * 8 * 11]; + HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[8 * 8 * 12]; + HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[8 * 8 * 13]; + HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[8 * 8 * 14]; + HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[8 * 8 * 15]; + { { uint const half_lane_mask = 4; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const 
half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(vout + gmem_idx)[0 * 8] = r1; +(vout + gmem_idx)[1 * 8] = r2; +(vout + gmem_idx)[2 * 8] = r3; +(vout + gmem_idx)[3 * 8] = r4; +(vout + gmem_idx)[4 * 8] = r5; +(vout + gmem_idx)[5 * 8] = r6; +(vout + gmem_idx)[6 * 8] = r7; +(vout + gmem_idx)[7 * 8] = r8; +(vout + 
gmem_idx)[8 * 8] = r9; +(vout + gmem_idx)[9 * 8] = r10; +(vout + gmem_idx)[10 * 8] = r11; +(vout + gmem_idx)[11 * 8] = r12; +(vout + gmem_idx)[12 * 8] = r13; +(vout + gmem_idx)[13 * 8] = r14; +(vout + gmem_idx)[14 * 8] = r15; +(vout + gmem_idx)[15 * 8] = r16; +} + /* hs_kernel_bc_2: reads and rewrites vout in place, staging through a 16 * 32 key __local buffer. Each work-item loads four groups of four keys from vout (gmem_l_idx + 128*k + 32*g, k = 0..3, g = 0..3), runs a 4-step HS_CMP_XCHG sequence on each group, and stores group g to shared.m at smem_l_idx + 8*k + 128*g. After barrier(CLK_LOCAL_MEM_FENCE) it reloads 16 keys from shared.m at get_local_id(0) + 32*k, applies HS_CMP_HALF rounds with half_lane_mask 4, 2 and 1, runs a 32-step HS_CMP_XCHG sequence and stores r1..r16 to vout at gmem_idx + 8*k. The reloads stay inside shared.m only while get_local_id(0) < 32. NOTE(review): no reqd_work_group_size attribute is visible on this kernel - confirm the host launches it with 32 work-items per group. HS_CMP_XCHG and HS_CMP_HALF are defined in hs_cl_macros.h (not visible here); presumably compare-exchange primitives - verify there. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_bc_2(__global HS_KEY_TYPE* const restrict vout) +{ + __local union + { + HS_KEY_TYPE m[16 * 32]; + } shared; + + uint const global_id = get_global_id(0); + uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); + + uint const gmem_l_idx = (global_id / 32) * 512 + (global_id & 31); + uint const smem_l_idx = get_sub_group_id() * 32 + get_sub_group_local_id(); + { + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128]; + HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256]; + HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384]; + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_2, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_l_idx)[8] = r0_2; + (shared.m + smem_l_idx)[16] = r0_3; + (shared.m + smem_l_idx)[24] = r0_4; + } + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[32]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[160]; + HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[288]; + HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[416]; + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_2, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + (shared.m + smem_l_idx)[128] = r0_1; + (shared.m + smem_l_idx)[136] = r0_2; + (shared.m + smem_l_idx)[144] = r0_3; + (shared.m + smem_l_idx)[152] = r0_4; + } + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192]; + HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[320]; + HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[448]; + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_2, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + (shared.m + smem_l_idx)[256] = r0_1; + (shared.m + smem_l_idx)[264] = r0_2; + (shared.m + 
smem_l_idx)[272] = r0_3; + (shared.m + smem_l_idx)[280] = r0_4; + } + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[96]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[224]; + HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[352]; + HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[480]; + HS_CMP_XCHG(r0_1, r0_3) + HS_CMP_XCHG(r0_2, r0_4) + HS_CMP_XCHG(r0_1, r0_2) + HS_CMP_XCHG(r0_3, r0_4) + (shared.m + smem_l_idx)[384] = r0_1; + (shared.m + smem_l_idx)[392] = r0_2; + (shared.m + smem_l_idx)[400] = r0_3; + (shared.m + smem_l_idx)[408] = r0_4; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[4 * 8 * 0]; + HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[4 * 8 * 1]; + HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[4 * 8 * 2]; + HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[4 * 8 * 3]; + HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[4 * 8 * 4]; + HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[4 * 8 * 5]; + HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[4 * 8 * 6]; + HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[4 * 8 * 7]; + HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[4 * 8 * 8]; + HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[4 * 8 * 9]; + HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[4 * 8 * 10]; + HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[4 * 8 * 11]; + HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[4 * 8 * 12]; + HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[4 * 8 * 13]; + HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[4 * 8 * 14]; + HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[4 * 8 * 15]; + { { uint const half_lane_mask = 4; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, 
r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(vout + gmem_idx)[0 * 8] = r1; +(vout + gmem_idx)[1 * 8] = r2; +(vout + gmem_idx)[2 * 8] = r3; +(vout + gmem_idx)[3 * 8] = r4; +(vout + gmem_idx)[4 * 8] = r5; +(vout + gmem_idx)[5 * 8] = r6; +(vout + 
gmem_idx)[6 * 8] = r7; +(vout + gmem_idx)[7 * 8] = r8; +(vout + gmem_idx)[8 * 8] = r9; +(vout + gmem_idx)[9 * 8] = r10; +(vout + gmem_idx)[10 * 8] = r11; +(vout + gmem_idx)[11 * 8] = r12; +(vout + gmem_idx)[12 * 8] = r13; +(vout + gmem_idx)[13 * 8] = r14; +(vout + gmem_idx)[14 * 8] = r15; +(vout + gmem_idx)[15 * 8] = r16; +} + /* hs_kernel_bc_1: reads and rewrites vout in place, staging through a 16 * 16 key __local buffer. Each work-item loads eight pairs of keys from vout (gmem_l_idx + 128*k + 16*g, k = 0..1, g = 0..7), applies one HS_CMP_XCHG to each pair, and stores pair g to shared.m at smem_l_idx + 8*k + 32*g. After barrier(CLK_LOCAL_MEM_FENCE) it reloads 16 keys from shared.m at get_local_id(0) + 16*k, applies HS_CMP_HALF rounds with half_lane_mask 4, 2 and 1, runs a 32-step HS_CMP_XCHG sequence and stores r1..r16 to vout at gmem_idx + 8*k. The reloads stay inside shared.m only while get_local_id(0) < 16. NOTE(review): no reqd_work_group_size attribute is visible on this kernel - confirm the host launches it with 16 work-items per group. HS_CMP_XCHG and HS_CMP_HALF are defined in hs_cl_macros.h (not visible here); presumably compare-exchange primitives - verify there. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_bc_1(__global HS_KEY_TYPE* const restrict vout) +{ + __local union + { + HS_KEY_TYPE m[16 * 16]; + } shared; + + uint const global_id = get_global_id(0); + uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); + + uint const gmem_l_idx = (global_id / 16) * 256 + (global_id & 15); + uint const smem_l_idx = get_sub_group_id() * 16 + get_sub_group_local_id(); + { + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[0] = r0_1; + (shared.m + smem_l_idx)[8] = r0_2; + } + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[16]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[144]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[32] = r0_1; + (shared.m + smem_l_idx)[40] = r0_2; + } + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[32]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[160]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[64] = r0_1; + (shared.m + smem_l_idx)[72] = r0_2; + } + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[48]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[176]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[96] = r0_1; + (shared.m + smem_l_idx)[104] = r0_2; + } + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[128] = r0_1; + (shared.m + smem_l_idx)[136] = r0_2; + } + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[80]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[208]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[160] = r0_1; + (shared.m + smem_l_idx)[168] = r0_2; + } + { + 
HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[96]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[224]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[192] = r0_1; + (shared.m + smem_l_idx)[200] = r0_2; + } + { + HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[112]; + HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[240]; + HS_CMP_XCHG(r0_1, r0_2) + (shared.m + smem_l_idx)[224] = r0_1; + (shared.m + smem_l_idx)[232] = r0_2; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[2 * 8 * 0]; + HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[2 * 8 * 1]; + HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[2 * 8 * 2]; + HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[2 * 8 * 3]; + HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[2 * 8 * 4]; + HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[2 * 8 * 5]; + HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[2 * 8 * 6]; + HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[2 * 8 * 7]; + HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[2 * 8 * 8]; + HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[2 * 8 * 9]; + HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[2 * 8 * 10]; + HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[2 * 8 * 11]; + HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[2 * 8 * 12]; + HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[2 * 8 * 13]; + HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[2 * 8 * 14]; + HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[2 * 8 * 15]; + { { uint const half_lane_mask = 4; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx 
= get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(vout + gmem_idx)[0 * 8] = r1; +(vout + gmem_idx)[1 * 8] = r2; +(vout + gmem_idx)[2 * 8] = r3; +(vout + gmem_idx)[3 * 8] = r4; +(vout + gmem_idx)[4 * 8] = r5; +(vout + gmem_idx)[5 * 8] = r6; +(vout + gmem_idx)[6 * 8] = r7; +(vout + gmem_idx)[7 * 8] = r8; +(vout + gmem_idx)[8 * 8] = r9; +(vout + gmem_idx)[9 * 8] = 
r10; +(vout + gmem_idx)[10 * 8] = r11; +(vout + gmem_idx)[11 * 8] = r12; +(vout + gmem_idx)[12 * 8] = r13; +(vout + gmem_idx)[13 * 8] = r14; +(vout + gmem_idx)[14 * 8] = r15; +(vout + gmem_idx)[15 * 8] = r16; +} + /* hs_kernel_bc_0: reads and rewrites vout in place without staging through local memory. Each work-item loads 16 keys from vout at gmem_idx + 8*k (k = 0..15), applies HS_CMP_HALF rounds with half_lane_mask 4, 2 and 1, runs a 32-step HS_CMP_XCHG sequence over r1..r16 and stores the registers back to the same 16 locations. HS_CMP_XCHG and HS_CMP_HALF are defined in hs_cl_macros.h (not visible here); presumably compare-exchange primitives - verify there. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_bc_0(__global HS_KEY_TYPE* const restrict vout) +{ /* NOTE(review): the union below has no members and `shared` is never named in this kernel's visible text. An empty union is not valid ISO C99 (compilers accept it as an extension); consider dropping it in the generator once it is confirmed that none of the HS_* macros capture `shared`. */ + __local union + { + } shared; + + uint const global_id = get_global_id(0); + uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); + + HS_KEY_TYPE r1 = (vout + gmem_idx)[0 * 8]; + HS_KEY_TYPE r2 = (vout + gmem_idx)[1 * 8]; + HS_KEY_TYPE r3 = (vout + gmem_idx)[2 * 8]; + HS_KEY_TYPE r4 = (vout + gmem_idx)[3 * 8]; + HS_KEY_TYPE r5 = (vout + gmem_idx)[4 * 8]; + HS_KEY_TYPE r6 = (vout + gmem_idx)[5 * 8]; + HS_KEY_TYPE r7 = (vout + gmem_idx)[6 * 8]; + HS_KEY_TYPE r8 = (vout + gmem_idx)[7 * 8]; + HS_KEY_TYPE r9 = (vout + gmem_idx)[8 * 8]; + HS_KEY_TYPE r10 = (vout + gmem_idx)[9 * 8]; + HS_KEY_TYPE r11 = (vout + gmem_idx)[10 * 8]; + HS_KEY_TYPE r12 = (vout + gmem_idx)[11 * 8]; + HS_KEY_TYPE r13 = (vout + gmem_idx)[12 * 8]; + HS_KEY_TYPE r14 = (vout + gmem_idx)[13 * 8]; + HS_KEY_TYPE r15 = (vout + gmem_idx)[14 * 8]; + HS_KEY_TYPE r16 = (vout + gmem_idx)[15 * 8]; + { { uint const half_lane_mask = 4; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 2; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + 
HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +{ + uint const half_lane_mask = 1; + uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; + int const t_lt = get_sub_group_local_id() < half_lane_idx; + HS_CMP_HALF(0, r1) + HS_CMP_HALF(1, r2) + HS_CMP_HALF(2, r3) + HS_CMP_HALF(3, r4) + HS_CMP_HALF(4, r5) + HS_CMP_HALF(5, r6) + HS_CMP_HALF(6, r7) + HS_CMP_HALF(7, r8) + HS_CMP_HALF(8, r9) + HS_CMP_HALF(9, r10) + HS_CMP_HALF(10, r11) + HS_CMP_HALF(11, r12) + HS_CMP_HALF(12, r13) + HS_CMP_HALF(13, r14) + HS_CMP_HALF(14, r15) + HS_CMP_HALF(15, r16) +} +HS_CMP_XCHG(r1, r9) +HS_CMP_XCHG(r5, r13) +HS_CMP_XCHG(r1, r5) +HS_CMP_XCHG(r9, r13) +HS_CMP_XCHG(r3, r11) +HS_CMP_XCHG(r7, r15) +HS_CMP_XCHG(r3, r7) +HS_CMP_XCHG(r11, r15) +HS_CMP_XCHG(r1, r3) +HS_CMP_XCHG(r5, r7) +HS_CMP_XCHG(r9, r11) +HS_CMP_XCHG(r13, r15) +HS_CMP_XCHG(r2, r10) +HS_CMP_XCHG(r6, r14) +HS_CMP_XCHG(r2, r6) +HS_CMP_XCHG(r10, r14) +HS_CMP_XCHG(r4, r12) +HS_CMP_XCHG(r8, r16) +HS_CMP_XCHG(r4, r8) +HS_CMP_XCHG(r12, r16) +HS_CMP_XCHG(r2, r4) +HS_CMP_XCHG(r6, r8) +HS_CMP_XCHG(r10, r12) +HS_CMP_XCHG(r14, r16) +HS_CMP_XCHG(r1, r2) +HS_CMP_XCHG(r3, r4) +HS_CMP_XCHG(r5, r6) +HS_CMP_XCHG(r7, r8) +HS_CMP_XCHG(r9, r10) +HS_CMP_XCHG(r11, r12) +HS_CMP_XCHG(r13, r14) +HS_CMP_XCHG(r15, r16) +} +(vout + gmem_idx)[0 * 8] = r1; +(vout + gmem_idx)[1 * 8] = r2; +(vout + gmem_idx)[2 * 8] = r3; +(vout + gmem_idx)[3 * 8] = r4; +(vout + gmem_idx)[4 * 8] = r5; +(vout + gmem_idx)[5 * 8] = r6; +(vout + gmem_idx)[6 * 8] = r7; +(vout + gmem_idx)[7 * 8] = r8; +(vout + gmem_idx)[8 * 8] = r9; +(vout + gmem_idx)[9 * 8] = r10; +(vout + gmem_idx)[10 * 8] = r11; +(vout + gmem_idx)[11 * 8] = r12; +(vout + gmem_idx)[12 * 8] = r13; +(vout + gmem_idx)[13 * 8] = r14; +(vout + gmem_idx)[14 * 8] = r15; +(vout + 
gmem_idx)[15 * 8] = r16; +} + /* hs_kernel_fm_1: reads and rewrites vout in place. Each work-item derives warp_idx and warp_lane_idx from its global id, selects a span of merge_keys keys starting at merge_base, and loads r1..r16 from merge_l at multiples of merge_stride. merge_r_off is merge_keys - merge_l_end - 1, so merge_r[0] sits at the mirror image of r16's position within the span. If merge_idx < fm_full, 16 more keys r17..r32 are loaded from merge_r, each is passed through HS_CMP_XCHG with its mirrored left key (r16 with r17 down to r1 with r32), a 32-step HS_CMP_XCHG sequence runs over r17..r32 and they are stored back to merge_r. Otherwise fm_frac selects how many right-hand keys take part: 8, 4 or 2, and any other value takes the single-key path. NOTE(review): confirm the host only passes 1, 2, 4 or 8 in fm_frac when merge_idx >= fm_full. Finally a 32-step HS_CMP_XCHG sequence runs over r1..r16 and they are stored back to merge_l. HS_CMP_XCHG is defined in hs_cl_macros.h (not visible here); presumably a register compare-exchange - verify there. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_1(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 >> 0; + + uint const merge_stride = 16 * 8 << 0; /* evaluates to 128 */ + uint const merge_keys = merge_stride * 32; /* 128 * 32 = 4096 */ + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 0)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * 
merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * 
merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + 
HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_2(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 >> 1; + + uint const merge_stride = 16 * 8 << 1; + uint const merge_keys = merge_stride * 32; + + uint 
const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 1)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + 
HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + 
HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + 
HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_3(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 >> 2; + + uint const merge_stride = 16 * 8 << 2; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 2)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * 
merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) 
+ HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, 
r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) 
+ HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_4(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 >> 3; + + uint const merge_stride = 16 * 8 << 3; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 3)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + 
HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) 
+ HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = 
merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + 
merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_5(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 >> 4; + + uint const merge_stride = 16 * 8 << 4; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 4)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 
= merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * 
merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac 
== 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_6(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 
>> 5; + + uint const merge_stride = 16 * 8 << 5; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 5)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = 
merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * 
merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + 
HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_5(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = (warp_idx / 16) >> 0; + + uint const merge_stride = 16 * 8 << 0; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + uint const merge_off = (warp_idx - merge_idx * (16 << 0)) * 8; + + __global HS_KEY_TYPE* const restrict merge_ptr = + vout + (merge_base + merge_off + warp_lane_idx); + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * 
merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; + HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + 
HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * 
merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8; + merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_7(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 >> 6; + + uint const merge_stride = 16 * 8 << 6; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 6)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE 
r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + 
HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 
* merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + 
merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_6(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = (warp_idx / 16) >> 1; + + uint const merge_stride = 16 * 8 << 1; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + uint const merge_off = (warp_idx - merge_idx * (16 << 1)) * 8; + + __global HS_KEY_TYPE* const restrict merge_ptr = + vout + (merge_base + merge_off + warp_lane_idx); + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE 
r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; + HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + 
HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8; + merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_8(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 >> 7; + + uint const merge_stride = 16 * 8 << 7; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 7)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = 
merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, 
r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, 
r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + 
HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_7(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = (warp_idx / 16) >> 2; + + uint const merge_stride = 16 * 8 << 2; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + uint const merge_off = (warp_idx - merge_idx * (16 << 2)) * 8; + + __global HS_KEY_TYPE* const restrict merge_ptr = + vout + (merge_base + merge_off + warp_lane_idx); + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * 
merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; + HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, 
r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8; + merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + 
merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + /* hs_kernel_fm_9: in-place merge pass over vout. Each work-item is one lane (global_id & 7) of an 8-wide sub-group and handles up to 32 keys spaced merge_stride (16 * 8 << 8) apart: 16 "left" keys read through merge_l and up to 16 "right" keys read through merge_r. Spans with merge_idx < fm_full use all 16 right rows; otherwise fm_frac selects 8, 4 or 2 right rows, and any other value means 1. NOTE(review): HS_KEY_TYPE and HS_CMP_XCHG are defined outside this chunk (hs_cl_macros.h is included at the top of the file); HS_CMP_XCHG is presumably an ordering compare-exchange of its two arguments - confirm. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_9(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + /* warp_idx / 16 >> 8 equals warp_idx / (16 << 8): every (16 << 8) consecutive 8-lane groups share one merge_idx, and each merge_idx covers merge_keys = 32 * merge_stride keys. */ uint const merge_idx = warp_idx / 16 >> 8; + + uint const merge_stride = 16 * 8 << 8; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 8)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + /* merge_l_end is the offset of this lane's last left key, merge_l[15 * merge_stride]; merge_r_off mirrors it within the merge_keys span, so merge_r[0] (r17) sits opposite merge_l[15] (r16) and merge_r[15] (r32) opposite merge_l[0] (r1). */ int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { /* Full span: load all 16 right rows. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + 
HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + /* Compare-exchange each left key with its mirrored right key. */ HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + /* Fixed comparator sequence over the right keys r17..r32 (odd-numbered registers, then even-numbered, then adjacent pairs); the order is part of the network, so keep it as is. */ HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + /* Store the right keys back through merge_r. */ merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + 
merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { /* Partial span, fm_frac == 8: only right rows r17..r24 are loaded and they pair with r16..r9. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { /* Partial span, fm_frac == 4: right rows r17..r20 pair with r16..r13. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else 
if (fm_frac == 2) { /* Partial span, fm_frac == 2: right rows r17 and r18 pair with r16 and r15. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { /* Any other fm_frac: a single right row, r17, paired with r16. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + /* Left half: fixed comparator sequence over r1..r16 (odd-numbered registers, then even-numbered, then adjacent pairs), executed after every branch above. */ HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + /* Store the left keys back through merge_l. */ merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + /* hs_kernel_hm_8: in-place merge pass over vout. Each work-item is one lane (global_id & 7) of an 8-wide sub-group; it loads 32 keys spaced merge_stride (16 * 8 << 3) apart starting at merge_ptr, runs a fixed sequence of HS_CMP_XCHG comparators over them and writes all 32 back to the same slots. NOTE(review): HS_KEY_TYPE and HS_CMP_XCHG are defined outside this chunk (hs_cl_macros.h is included at the top of the file); HS_CMP_XCHG is presumably an ordering compare-exchange of its two arguments - confirm. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_8(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + /* (warp_idx / 16) >> 3 equals warp_idx / (16 << 3): every (16 << 3) consecutive 8-lane groups share one merge_idx; merge_off below is this group's position inside that span. */ uint const merge_idx = (warp_idx / 16) >> 3; + + uint const 
merge_stride = 16 * 8 << 3; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + uint const merge_off = (warp_idx - merge_idx * (16 << 3)) * 8; + + __global HS_KEY_TYPE* const restrict merge_ptr = + vout + (merge_base + merge_off + warp_lane_idx); + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; + /* Comparator sequence: odd-numbered registers first (pair distances 16, 8, 4, 2), then even-numbered ones, then adjacent pairs. The order is part of the network, so keep it as is. */ HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + 
HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + /* Write the keys back to the slots they were loaded from, highest row first. */ merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + 
merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8; + merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + /* hs_kernel_fm_10: in-place merge pass over vout. Each work-item is one lane (global_id & 7) of an 8-wide sub-group and handles up to 32 keys spaced merge_stride (16 * 8 << 9) apart: 16 "left" keys read through merge_l and up to 16 "right" keys read through merge_r. Spans with merge_idx < fm_full use all 16 right rows; otherwise fm_frac selects 8, 4 or 2 right rows, and any other value means 1. NOTE(review): HS_KEY_TYPE and HS_CMP_XCHG are defined outside this chunk (hs_cl_macros.h is included at the top of the file); HS_CMP_XCHG is presumably an ordering compare-exchange of its two arguments - confirm. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_10(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + /* warp_idx / 16 >> 9 equals warp_idx / (16 << 9): every (16 << 9) consecutive 8-lane groups share one merge_idx, and each merge_idx covers merge_keys = 32 * merge_stride keys. */ uint const merge_idx = warp_idx / 16 >> 9; + + uint const merge_stride = 16 * 8 << 9; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 9)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + /* merge_l_end is the offset of this lane's last left key, merge_l[15 * merge_stride]; merge_r_off mirrors it within the merge_keys span, so merge_r[0] (r17) sits opposite merge_l[15] (r16) and merge_r[15] (r32) opposite merge_l[0] (r1). */ int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + 
HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { /* Full span: load all 16 right rows. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + /* Compare-exchange each left key with its mirrored right key. */ HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + /* Fixed comparator sequence over the right keys r17..r32 (odd-numbered registers, then even-numbered, then adjacent pairs); the order is part of the network, so keep it as is. */ HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + 
HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + /* Store the right keys back through merge_r. */ merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { /* Partial span, fm_frac == 8: only right rows r17..r24 are loaded and they pair with r16..r9. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) 
+ merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { /* Partial span, fm_frac == 4: right rows r17..r20 pair with r16..r13. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { /* Partial span, fm_frac == 2: right rows r17 and r18 pair with r16 and r15. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { /* Any other fm_frac: a single right row, r17, paired with r16. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + /* Left half: fixed comparator sequence over r1..r16 (odd-numbered registers, then even-numbered, then adjacent pairs), executed after every branch above. */ HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + /* Store the left keys back through merge_l. */ merge_l[15 * merge_stride] = r16; + merge_l[14 * 
merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + /* hs_kernel_hm_9: in-place merge pass over vout. Each work-item is one lane (global_id & 7) of an 8-wide sub-group; it loads 32 keys spaced merge_stride (16 * 8 << 4) apart starting at merge_ptr, runs a fixed sequence of HS_CMP_XCHG comparators over them and writes all 32 back to the same slots. NOTE(review): HS_KEY_TYPE and HS_CMP_XCHG are defined outside this chunk (hs_cl_macros.h is included at the top of the file); HS_CMP_XCHG is presumably an ordering compare-exchange of its two arguments - confirm. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_9(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + /* (warp_idx / 16) >> 4 equals warp_idx / (16 << 4): every (16 << 4) consecutive 8-lane groups share one merge_idx; merge_off below is this group's position inside that span. */ uint const merge_idx = (warp_idx / 16) >> 4; + + uint const merge_stride = 16 * 8 << 4; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + uint const merge_off = (warp_idx - merge_idx * (16 << 4)) * 8; + + __global HS_KEY_TYPE* const restrict merge_ptr = + vout + (merge_base + merge_off + warp_lane_idx); + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * 
merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; + /* Comparator sequence: odd-numbered registers first (pair distances 16, 8, 4, 2), then even-numbered ones, then adjacent pairs. The order is part of the network, so keep it as is. */ HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, 
r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + /* Write the keys back through merge_ptr, highest row first. */ merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8; + merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + /* hs_kernel_fm_11: in-place merge pass over vout. Each work-item is one lane (global_id & 7) of an 8-wide sub-group and handles up to 32 keys spaced merge_stride (16 * 8 << 10) apart: 16 "left" keys read through merge_l and up to 16 "right" keys read through merge_r. Spans with merge_idx < fm_full use all 16 right rows; otherwise fm_frac selects 8, 4 or 2 right rows, and any other value means 1. NOTE(review): HS_KEY_TYPE and HS_CMP_XCHG are defined outside this chunk (hs_cl_macros.h is included at the top of the file); HS_CMP_XCHG is presumably an ordering compare-exchange of its two arguments - confirm. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_11(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = 
(uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + /* warp_idx / 16 >> 10 equals warp_idx / (16 << 10): every (16 << 10) consecutive 8-lane groups share one merge_idx, and each merge_idx covers merge_keys = 32 * merge_stride keys. */ uint const merge_idx = warp_idx / 16 >> 10; + + uint const merge_stride = 16 * 8 << 10; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 10)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + /* merge_l_end is the offset of this lane's last left key, merge_l[15 * merge_stride]; merge_r_off mirrors it within the merge_keys span, so merge_r[0] (r17) sits opposite merge_l[15] (r16) and merge_r[15] (r32) opposite merge_l[0] (r1). */ int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { /* Full span: load all 16 right rows. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE 
r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + /* Compare-exchange each left key with its mirrored right key. */ HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + /* Fixed comparator sequence over the right keys r17..r32 (odd-numbered registers, then even-numbered, then adjacent pairs); the order is part of the network, so keep it as is. */ HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + /* Store the right keys back through merge_r. */ merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { /* Partial span, fm_frac == 8: only right rows r17..r24 are loaded and they pair with r16..r9. */ + HS_KEY_TYPE r17 = merge_r[0 * 
merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { /* Partial span, fm_frac == 4: right rows r17..r20 pair with r16..r13. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { /* Partial span, fm_frac == 2: right rows r17 and r18 pair with r16 and r15. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { /* Any other fm_frac: a single right row, r17, paired with r16. */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) 
+ merge_r[0 * merge_stride] = r17; + } + /* Left half: fixed comparator sequence over r1..r16 (odd-numbered registers, then even-numbered, then adjacent pairs), executed after every branch above. */ HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + /* Store the left keys back through merge_l. */ merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_10(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = (warp_idx / 16) >> 5; + + uint const merge_stride = 16 * 8 << 5; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + uint const merge_off = (warp_idx - merge_idx * (16 << 5)) * 8; + + __global HS_KEY_TYPE* const restrict merge_ptr = + vout + (merge_base + merge_off + warp_lane_idx); + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + 
HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; + HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + 
HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * 
merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8; + merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + /* hs_kernel_fm_12: in-place merge pass over vout in which the right half of a merge region may be only partly present. Each work-item loads 16 left keys (r1..r16 through merge_l) and, depending on fm_full/fm_frac, 16, 8, 4, 2 or 1 right keys (r17.. through merge_r). merge_l[i * merge_stride] and merge_r[(15 - i) * merge_stride] mirror each other: their offsets from merge_base sum to merge_keys - 1. The right keys are first paired with the left keys in reverse row order (r16 with r17, r15 with r18, ...), then each side is finished at halving row distances and stored back. vout: key buffer, read and written. fm_full: merge regions with an index below this use all 16 right rows. fm_frac: number of right rows used by the remaining regions (8, 4 or 2; any other value is handled as 1). NOTE(review): HS_KEY_TYPE and HS_CMP_XCHG come from hs_cl_macros.h, not visible here; the host-side meaning of fm_full/fm_frac should be confirmed against the caller. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_12(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; /* 8 work-items per sub-group, matching the kernel attribute */ + uint const warp_lane_idx = global_id & 7; /* position inside that group of 8 */ + + uint const merge_idx = warp_idx / 16 >> 11; /* == warp_idx / 32768: index of the merge region */ + + uint const merge_stride = 16 * 8 << 11; /* 262144 keys between consecutive rows */ + uint const merge_keys = merge_stride * 32; /* 8388608 keys per merge region (32 rows) */ + + uint const merge_base = merge_idx * merge_keys; /* first key of this region */ + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 11)) * 8 + warp_lane_idx; /* this work-item's offset inside a row */ + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; /* offset of this work-item's last left row (row 15) */ + + int const merge_r_off = merge_keys - merge_l_end - 1; /* mirror image of merge_l_end; NOTE(review): declared int while the other offsets are uint */ + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); /* the 16 left rows are loaded next */ + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 *
merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { /* full region: all 16 right rows are loaded */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; /* pair left and right keys in reverse row order */ + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) /* finish the right keys: row distances 8, 4, 2, 1 */ + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) +
HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { /* partial region: 8 right rows */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { /* partial region: 4 right rows */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; +
HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { /* partial region: 2 right rows */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { /* any other fm_frac: a single right row */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } /* finish the left 16 keys: row distances 8, 4, 2, 1 */ + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) /* store the left rows back */ + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 *
merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + /* hs_kernel_hm_11: in-place merge pass over vout. Each work-item loads 32 keys spaced merge_stride apart, applies a fixed HS_CMP_XCHG sequence in which every key is paired at row distances 16, 8, 4, 2, 1 (in that order), and writes the 32 keys back to the slots they came from. vout: key buffer, read and written. NOTE(review): HS_KEY_TYPE and HS_CMP_XCHG come from hs_cl_macros.h, which is not visible here; presumably a compare-exchange that orders its two arguments -- confirm. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_11(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; /* 8 work-items per sub-group, matching the kernel attribute */ + uint const warp_lane_idx = global_id & 7; /* position inside that group of 8 */ + + uint const merge_idx = (warp_idx / 16) >> 6; /* == warp_idx / 1024: index of the merge region */ + + uint const merge_stride = 16 * 8 << 6; /* 8192 keys between consecutive rows */ + uint const merge_keys = merge_stride * 32; /* 262144 keys per merge region (32 rows) */ + + uint const merge_base = merge_idx * merge_keys; /* first key of this region */ + uint const merge_off = (warp_idx - merge_idx * (16 << 6)) * 8; /* (warp_idx % 1024) * 8: this group's offset inside a row */ + + __global HS_KEY_TYPE* const restrict merge_ptr = + vout + (merge_base + merge_off + warp_lane_idx); /* rows 0..31 are loaded next */ + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 *
merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; /* compare-exchange network follows; statement order is significant */ + HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12)
+ HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8; + merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + /* hs_kernel_fm_13: in-place merge pass over vout in which the right half of a merge region may be only partly present. Each work-item loads 16 left keys (r1..r16 through merge_l) and, depending on fm_full/fm_frac, 16, 8, 4, 2 or 1 right keys (r17.. through merge_r). merge_l[i * merge_stride] and merge_r[(15 - i) * merge_stride] mirror each other: their offsets from merge_base sum to merge_keys - 1. The right keys are first paired with the left keys in reverse row order (r16 with r17, r15 with r18, ...), then each side is finished at halving row distances and stored back. vout: key buffer, read and written. fm_full: merge regions with an index below this use all 16 right rows. fm_frac: number of right rows used by the remaining regions (8, 4 or 2; any other value is handled as 1). NOTE(review): HS_KEY_TYPE and HS_CMP_XCHG come from hs_cl_macros.h, not visible here; the host-side meaning of fm_full/fm_frac should be confirmed against the caller. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_13(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; /* 8 work-items per sub-group, matching the kernel attribute */ + uint const warp_lane_idx = global_id & 7; /* position inside that group of 8 */ + + uint const merge_idx = warp_idx / 16 >> 12; /* == warp_idx / 65536: index of the merge region */ + + uint const merge_stride = 16 * 8 << 12; /* 524288 keys between consecutive rows */ + uint const merge_keys = merge_stride * 32; /* 16777216 keys per merge region (32 rows) */ + + uint const merge_base = merge_idx * merge_keys; /* first key of this region */ + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 12)) *
8 + warp_lane_idx; /* this work-item's offset inside a row */ + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; /* offset of this work-item's last left row (row 15) */ + + int const merge_r_off = merge_keys - merge_l_end - 1; /* mirror image of merge_l_end; NOTE(review): declared int while the other offsets are uint */ + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); /* the 16 left rows are loaded next */ + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { /* full region: all 16 right rows are loaded */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; /* pair left and right keys in reverse row order */ + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11,
r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) /* finish the right keys: row distances 8, 4, 2, 1 */ + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { /* partial region: 8 right rows */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17)
+ HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { /* partial region: 4 right rows */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { /* partial region: 2 right rows */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { /* any other fm_frac: a single right row */ + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } /* finish the left 16 keys: row distances 8, 4, 2, 1 */ + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) +
HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + /* hs_kernel_hm_12: in-place merge pass over vout. Each work-item loads 32 keys spaced merge_stride apart, applies a fixed HS_CMP_XCHG sequence in which every key is paired at row distances 16, 8, 4, 2, 1 (in that order), and writes the 32 keys back to the slots they came from. vout: key buffer, read and written. NOTE(review): HS_KEY_TYPE and HS_CMP_XCHG come from hs_cl_macros.h, which is not visible here; presumably a compare-exchange that orders its two arguments -- confirm. */ +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_12(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; /* 8 work-items per sub-group, matching the kernel attribute */ + uint const warp_lane_idx = global_id & 7; /* position inside that group of 8 */ + + uint const merge_idx = (warp_idx / 16) >> 7; /* == warp_idx / 2048: index of the merge region */ + + uint const merge_stride = 16 * 8 << 7; /* 16384 keys between consecutive rows */ + uint const merge_keys = merge_stride * 32; /* 524288 keys per merge region (32 rows) */ + + uint const merge_base = merge_idx * merge_keys; /* first key of this region */ + uint const merge_off = (warp_idx - merge_idx * (16 << 7)) * 8; /* (warp_idx % 2048) * 8: this group's offset inside a row */ + + __global HS_KEY_TYPE* const restrict merge_ptr = + vout + (merge_base + merge_off + warp_lane_idx); /* rows 0..31 are loaded next */ + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 =
merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; /* compare-exchange network follows; statement order is significant */ + HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) +
HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) /* store all 32 rows back in place */ + merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8;
+ merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_14(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 >> 13; + + uint const merge_stride = 16 * 8 << 13; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 13)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * 
merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = 
r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; 
+ merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_13(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + 
uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = (warp_idx / 16) >> 8; + + uint const merge_stride = 16 * 8 << 8; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + uint const merge_off = (warp_idx - merge_idx * (16 << 8)) * 8; + + __global HS_KEY_TYPE* const restrict merge_ptr = + vout + (merge_base + merge_off + warp_lane_idx); + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; + 
HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + 
merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8; + merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_15(__global HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 >> 14; + + uint const merge_stride = 16 * 8 << 14; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 14)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * 
merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) 
+ HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, 
r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) 
+ HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_14(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = (warp_idx / 16) >> 9; + + uint const merge_stride = 16 * 8 << 9; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + uint const merge_off = (warp_idx - merge_idx * (16 << 9)) * 8; + + __global HS_KEY_TYPE* const restrict merge_ptr = + vout + (merge_base + merge_off + warp_lane_idx); + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * 
merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; + HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + 
HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8; + merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_fm_16(__global 
HS_KEY_TYPE* const restrict vout, + uint const fm_full, + uint const fm_frac) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = warp_idx / 16 >> 15; + + uint const merge_stride = 16 * 8 << 15; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + + uint const merge_l_off = + (warp_idx - merge_idx * (16 << 15)) * 8 + warp_lane_idx; + uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; + + int const merge_r_off = merge_keys - merge_l_end - 1; + + __global HS_KEY_TYPE* const restrict merge_l = + vout + (merge_base + merge_l_off); + __global HS_KEY_TYPE* const restrict merge_r = + vout + (merge_base + merge_r_off); + + HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; + if (merge_idx < fm_full) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; + 
HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; + HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; + HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; + HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; + HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; + HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; + HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r8, r25) + HS_CMP_XCHG(r7, r26) + HS_CMP_XCHG(r6, r27) + HS_CMP_XCHG(r5, r28) + HS_CMP_XCHG(r4, r29) + HS_CMP_XCHG(r3, r30) + HS_CMP_XCHG(r2, r31) + HS_CMP_XCHG(r1, r32) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_r[15 * merge_stride] = r32; + merge_r[14 * merge_stride] = r31; + merge_r[13 * merge_stride] = r30; + merge_r[12 * merge_stride] = r29; + merge_r[11 * merge_stride] = r28; + merge_r[10 * merge_stride] = r27; + merge_r[9 * merge_stride] = r26; + merge_r[8 * merge_stride] = r25; + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] 
= r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 8) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; + HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; + HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; + HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r12, r21) + HS_CMP_XCHG(r11, r22) + HS_CMP_XCHG(r10, r23) + HS_CMP_XCHG(r9, r24) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + merge_r[7 * merge_stride] = r24; + merge_r[6 * merge_stride] = r23; + merge_r[5 * merge_stride] = r22; + merge_r[4 * merge_stride] = r21; + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 4) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; + HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r14, r19) + HS_CMP_XCHG(r13, r20) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + merge_r[3 * merge_stride] = r20; + merge_r[2 * merge_stride] = r19; + merge_r[1 * merge_stride] = r18; + merge_r[0 * merge_stride] = r17; + } else if (fm_frac == 2) { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; + HS_CMP_XCHG(r16, r17) + HS_CMP_XCHG(r15, r18) + HS_CMP_XCHG(r17, r18) + merge_r[1 * merge_stride] = r18; + merge_r[0 * 
merge_stride] = r17; + } else { + HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; + HS_CMP_XCHG(r16, r17) + merge_r[0 * merge_stride] = r17; + } + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r3, r11) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + merge_l[15 * merge_stride] = r16; + merge_l[14 * merge_stride] = r15; + merge_l[13 * merge_stride] = r14; + merge_l[12 * merge_stride] = r13; + merge_l[11 * merge_stride] = r12; + merge_l[10 * merge_stride] = r11; + merge_l[9 * merge_stride] = r10; + merge_l[8 * merge_stride] = r9; + merge_l[7 * merge_stride] = r8; + merge_l[6 * merge_stride] = r7; + merge_l[5 * merge_stride] = r6; + merge_l[4 * merge_stride] = r5; + merge_l[3 * merge_stride] = r4; + merge_l[2 * merge_stride] = r3; + merge_l[1 * merge_stride] = r2; + merge_l[0 * merge_stride] = r1; +} + +__kernel __attribute__((intel_reqd_sub_group_size(8))) void +hs_kernel_hm_15(__global HS_KEY_TYPE* const restrict vout) +{ + uint const global_id = (uint)get_global_id(0); + uint const warp_idx = global_id / 8; + uint const warp_lane_idx = global_id & 7; + + uint const merge_idx = (warp_idx / 16) >> 10; + + uint const merge_stride = 16 * 8 << 10; + uint const merge_keys = merge_stride * 32; + + uint const merge_base = merge_idx * merge_keys; + uint const merge_off = (warp_idx - merge_idx * (16 << 10)) * 8; + + __global HS_KEY_TYPE* const restrict merge_ptr 
= + vout + (merge_base + merge_off + warp_lane_idx); + + HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; + HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; + HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; + HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; + HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; + HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; + HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; + HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; + HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; + HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; + HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; + HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; + HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; + HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; + HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; + HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; + HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; + HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; + HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; + HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; + HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; + HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; + HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; + HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; + HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; + HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; + HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; + HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; + HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; + HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; + HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; + HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; + HS_CMP_XCHG(r1, r17) + HS_CMP_XCHG(r9, r25) + HS_CMP_XCHG(r1, r9) + HS_CMP_XCHG(r17, r25) + HS_CMP_XCHG(r5, r21) + HS_CMP_XCHG(r13, r29) + HS_CMP_XCHG(r5, r13) + HS_CMP_XCHG(r21, r29) + HS_CMP_XCHG(r1, r5) + HS_CMP_XCHG(r9, r13) + HS_CMP_XCHG(r17, r21) + HS_CMP_XCHG(r25, r29) + HS_CMP_XCHG(r3, r19) + HS_CMP_XCHG(r11, r27) + HS_CMP_XCHG(r3, r11) + 
HS_CMP_XCHG(r19, r27) + HS_CMP_XCHG(r7, r23) + HS_CMP_XCHG(r15, r31) + HS_CMP_XCHG(r7, r15) + HS_CMP_XCHG(r23, r31) + HS_CMP_XCHG(r3, r7) + HS_CMP_XCHG(r11, r15) + HS_CMP_XCHG(r19, r23) + HS_CMP_XCHG(r27, r31) + HS_CMP_XCHG(r1, r3) + HS_CMP_XCHG(r5, r7) + HS_CMP_XCHG(r9, r11) + HS_CMP_XCHG(r13, r15) + HS_CMP_XCHG(r17, r19) + HS_CMP_XCHG(r21, r23) + HS_CMP_XCHG(r25, r27) + HS_CMP_XCHG(r29, r31) + HS_CMP_XCHG(r2, r18) + HS_CMP_XCHG(r10, r26) + HS_CMP_XCHG(r2, r10) + HS_CMP_XCHG(r18, r26) + HS_CMP_XCHG(r6, r22) + HS_CMP_XCHG(r14, r30) + HS_CMP_XCHG(r6, r14) + HS_CMP_XCHG(r22, r30) + HS_CMP_XCHG(r2, r6) + HS_CMP_XCHG(r10, r14) + HS_CMP_XCHG(r18, r22) + HS_CMP_XCHG(r26, r30) + HS_CMP_XCHG(r4, r20) + HS_CMP_XCHG(r12, r28) + HS_CMP_XCHG(r4, r12) + HS_CMP_XCHG(r20, r28) + HS_CMP_XCHG(r8, r24) + HS_CMP_XCHG(r16, r32) + HS_CMP_XCHG(r8, r16) + HS_CMP_XCHG(r24, r32) + HS_CMP_XCHG(r4, r8) + HS_CMP_XCHG(r12, r16) + HS_CMP_XCHG(r20, r24) + HS_CMP_XCHG(r28, r32) + HS_CMP_XCHG(r2, r4) + HS_CMP_XCHG(r6, r8) + HS_CMP_XCHG(r10, r12) + HS_CMP_XCHG(r14, r16) + HS_CMP_XCHG(r18, r20) + HS_CMP_XCHG(r22, r24) + HS_CMP_XCHG(r26, r28) + HS_CMP_XCHG(r30, r32) + HS_CMP_XCHG(r1, r2) + HS_CMP_XCHG(r3, r4) + HS_CMP_XCHG(r5, r6) + HS_CMP_XCHG(r7, r8) + HS_CMP_XCHG(r9, r10) + HS_CMP_XCHG(r11, r12) + HS_CMP_XCHG(r13, r14) + HS_CMP_XCHG(r15, r16) + HS_CMP_XCHG(r17, r18) + HS_CMP_XCHG(r19, r20) + HS_CMP_XCHG(r21, r22) + HS_CMP_XCHG(r23, r24) + HS_CMP_XCHG(r25, r26) + HS_CMP_XCHG(r27, r28) + HS_CMP_XCHG(r29, r30) + HS_CMP_XCHG(r31, r32) + merge_ptr[31 * merge_stride] = r32; + merge_ptr[30 * merge_stride] = r31; + merge_ptr[29 * merge_stride] = r30; + merge_ptr[28 * merge_stride] = r29; + merge_ptr[27 * merge_stride] = r28; + merge_ptr[26 * merge_stride] = r27; + merge_ptr[25 * merge_stride] = r26; + merge_ptr[24 * merge_stride] = r25; + merge_ptr[23 * merge_stride] = r24; + merge_ptr[22 * merge_stride] = r23; + merge_ptr[21 * merge_stride] = r22; + merge_ptr[20 * merge_stride] = r21; + merge_ptr[19 * 
merge_stride] = r20; + merge_ptr[18 * merge_stride] = r19; + merge_ptr[17 * merge_stride] = r18; + merge_ptr[16 * merge_stride] = r17; + merge_ptr[15 * merge_stride] = r16; + merge_ptr[14 * merge_stride] = r15; + merge_ptr[13 * merge_stride] = r14; + merge_ptr[12 * merge_stride] = r13; + merge_ptr[11 * merge_stride] = r12; + merge_ptr[10 * merge_stride] = r11; + merge_ptr[9 * merge_stride] = r10; + merge_ptr[8 * merge_stride] = r9; + merge_ptr[7 * merge_stride] = r8; + merge_ptr[6 * merge_stride] = r7; + merge_ptr[5 * merge_stride] = r6; + merge_ptr[4 * merge_stride] = r5; + merge_ptr[3 * merge_stride] = r4; + merge_ptr[2 * merge_stride] = r3; + merge_ptr[1 * merge_stride] = r2; + merge_ptr[0 * merge_stride] = r1; +} + +// +// +// diff --git a/src/compute/hs/cl/gen9/hs_cl.h b/src/compute/hs/cl/gen9/hs_cl.h new file mode 100644 index 0000000000..a33b2b7b93 --- /dev/null +++ b/src/compute/hs/cl/gen9/hs_cl.h @@ -0,0 +1,122 @@ +// +// Copyright 2016 Google Inc. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
//

#ifndef HS_CL_ONCE
#define HS_CL_ONCE

//
// Target configuration constants.
//
// HS_LANES_PER_WARP evaluates to (1 << 3) == 8.  The macros that
// consume the remaining values are not all visible in this header, so
// only the values themselves are recorded here.
//

#define HS_LANES_PER_WARP_LOG2  3
#define HS_LANES_PER_WARP       (1 << HS_LANES_PER_WARP_LOG2)

#define HS_BS_WARPS             16
#define HS_BS_WARPS_LOG2_RU     4
#define HS_BC_WARPS_LOG2_MAX    4

#define HS_FM_BLOCKS_LOG2_MIN   1
#define HS_HM_BLOCKS_LOG2_MIN   1

//
// Key description: keys are `ulong`.
// NOTE(review): HS_KEY_WORDS presumably counts 32-bit words -- confirm.
//

#define HS_KEY_WORDS            2
#define HS_KEY_TYPE             ulong

//
// Each lane owns HS_KEYS_PER_LANE keys; HS_REG_LAST(c) pastes the
// literal 16 onto the prefix `c` (e.g. HS_REG_LAST(r) -> r16).
//

#define HS_KEYS_PER_LANE        16
#define HS_REG_LAST(c)          c##16

//
// Expands to nothing; terminates the backslash-continued lists below.
//

#define HS_EMPTY

//
// Expands HS_SLAB_ROW(row, row-1) once per row, for rows 1..16.
// HS_SLAB_ROW is not defined here and must be supplied at the
// expansion site.
//

#define HS_SLAB_ROWS()                          \
  HS_SLAB_ROW(  1,  0 )                         \
  HS_SLAB_ROW(  2,  1 )                         \
  HS_SLAB_ROW(  3,  2 )                         \
  HS_SLAB_ROW(  4,  3 )                         \
  HS_SLAB_ROW(  5,  4 )                         \
  HS_SLAB_ROW(  6,  5 )                         \
  HS_SLAB_ROW(  7,  6 )                         \
  HS_SLAB_ROW(  8,  7 )                         \
  HS_SLAB_ROW(  9,  8 )                         \
  HS_SLAB_ROW( 10,  9 )                         \
  HS_SLAB_ROW( 11, 10 )                         \
  HS_SLAB_ROW( 12, 11 )                         \
  HS_SLAB_ROW( 13, 12 )                         \
  HS_SLAB_ROW( 14, 13 )                         \
  HS_SLAB_ROW( 15, 14 )                         \
  HS_SLAB_ROW( 16, 15 )                         \
  HS_EMPTY

//
// Slab transpose schedule: three stage set-ups, then three levels of
// eight blends each (r -> s -> t -> u), then sixteen remaps of the
// `u` registers.  The STAGE / BLEND / REMAP macros are not defined in
// this header.
//

#define HS_TRANSPOSE_SLAB()                     \
  HS_TRANSPOSE_STAGE( 1 )                       \
  HS_TRANSPOSE_STAGE( 2 )                       \
  HS_TRANSPOSE_STAGE( 3 )                       \
  HS_TRANSPOSE_BLEND( r, s, 1,  2,  1 )         \
  HS_TRANSPOSE_BLEND( r, s, 1,  4,  3 )         \
  HS_TRANSPOSE_BLEND( r, s, 1,  6,  5 )         \
  HS_TRANSPOSE_BLEND( r, s, 1,  8,  7 )         \
  HS_TRANSPOSE_BLEND( r, s, 1, 10,  9 )         \
  HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 )         \
  HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 )         \
  HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 )         \
  HS_TRANSPOSE_BLEND( s, t, 2,  3,  1 )         \
  HS_TRANSPOSE_BLEND( s, t, 2,  4,  2 )         \
  HS_TRANSPOSE_BLEND( s, t, 2,  7,  5 )         \
  HS_TRANSPOSE_BLEND( s, t, 2,  8,  6 )         \
  HS_TRANSPOSE_BLEND( s, t, 2, 11,  9 )         \
  HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 )         \
  HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 )         \
  HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 )         \
  HS_TRANSPOSE_BLEND( t, u, 3,  5,  1 )         \
  HS_TRANSPOSE_BLEND( t, u, 3,  6,  2 )         \
  HS_TRANSPOSE_BLEND( t, u, 3,  7,  3 )         \
  HS_TRANSPOSE_BLEND( t, u, 3,  8,  4 )         \
  HS_TRANSPOSE_BLEND( t, u, 3, 13,  9 )         \
  HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 )         \
  HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 )         \
  HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 )         \
  HS_TRANSPOSE_REMAP( u,  1,  1 )               \
  HS_TRANSPOSE_REMAP( u,  2,  3 )               \
  HS_TRANSPOSE_REMAP( u,  3,  5 )               \
  HS_TRANSPOSE_REMAP( u,  4,  7 )               \
  HS_TRANSPOSE_REMAP( u,  5,  9 )               \
  HS_TRANSPOSE_REMAP( u,  6, 11 )               \
  HS_TRANSPOSE_REMAP( u,  7, 13 )               \
  HS_TRANSPOSE_REMAP( u,  8, 15 )               \
  HS_TRANSPOSE_REMAP( u,  9,  2 )               \
  HS_TRANSPOSE_REMAP( u, 10,  4 )               \
  HS_TRANSPOSE_REMAP( u, 11,  6 )               \
  HS_TRANSPOSE_REMAP( u, 12,  8 )               \
  HS_TRANSPOSE_REMAP( u, 13, 10 )               \
  HS_TRANSPOSE_REMAP( u, 14, 12 )               \
  HS_TRANSPOSE_REMAP( u, 15, 14 )               \
  HS_TRANSPOSE_REMAP( u, 16, 16 )               \
  HS_EMPTY

//
// HS_FM_BLOCKS_LOG2_<n> == n - 1, for n = 1..16
//

#define HS_FM_BLOCKS_LOG2_1     0
#define HS_FM_BLOCKS_LOG2_2     1
#define HS_FM_BLOCKS_LOG2_3     2
#define HS_FM_BLOCKS_LOG2_4     3
#define HS_FM_BLOCKS_LOG2_5     4
#define HS_FM_BLOCKS_LOG2_6     5
#define HS_FM_BLOCKS_LOG2_7     6
#define HS_FM_BLOCKS_LOG2_8     7
#define HS_FM_BLOCKS_LOG2_9     8
#define HS_FM_BLOCKS_LOG2_10    9
#define HS_FM_BLOCKS_LOG2_11    10
#define HS_FM_BLOCKS_LOG2_12    11
#define HS_FM_BLOCKS_LOG2_13    12
#define HS_FM_BLOCKS_LOG2_14    13
#define HS_FM_BLOCKS_LOG2_15    14
#define HS_FM_BLOCKS_LOG2_16    15

//
// HS_HM_BLOCKS_LOG2_<n> == n - 5, for n = 5..15
//

#define HS_HM_BLOCKS_LOG2_5     0
#define HS_HM_BLOCKS_LOG2_6     1
#define HS_HM_BLOCKS_LOG2_7     2
#define HS_HM_BLOCKS_LOG2_8     3
#define HS_HM_BLOCKS_LOG2_9     4
#define HS_HM_BLOCKS_LOG2_10    5
#define HS_HM_BLOCKS_LOG2_11    6
#define HS_HM_BLOCKS_LOG2_12    7
#define HS_HM_BLOCKS_LOG2_13    8
#define HS_HM_BLOCKS_LOG2_14    9
#define HS_HM_BLOCKS_LOG2_15    10

#endif

//
//
//

//
// diff --git a/src/compute/hs/cl/gen9/hs_cl_macros.h b/src/compute/hs/cl/gen9/hs_cl_macros.h
// new file mode 100644 index 0000000000..d314fe88ae
// --- /dev/null
// +++ b/src/compute/hs/cl/gen9/hs_cl_macros.h
// @@ -0,0 +1,199 @@
//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+// + +#ifndef HS_CL_MACROS_ONCE +#define HS_CL_MACROS_ONCE + +// +// +// + +#include "hs_cl.h" + +// +// Inter-lane compare exchange +// + +// default +#define HS_CMP_XCHG_V0(a,b) \ + { \ + HS_KEY_TYPE const t = min(a,b); \ + b = max(a,b); \ + a = t; \ + } + +// super slow +#define HS_CMP_XCHG_V1(a,b) \ + { \ + HS_KEY_TYPE const tmp = a; \ + a = (a < b) ? a : b; \ + b ^= a ^ tmp; \ + } + +// best +#define HS_CMP_XCHG_V2(a,b) \ + if (a >= b) { \ + HS_KEY_TYPE const t = a; \ + a = b; \ + b = t; \ + } + +// good +#define HS_CMP_XCHG_V3(a,b) \ + { \ + int const ge = a >= b; \ + HS_KEY_TYPE const t = a; \ + a = ge ? b : a; \ + b = ge ? t : b; \ + } + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b) +#elif (HS_KEY_WORDS == 2) +#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b) +#endif + +// +// Conditional inter-subgroup flip/half compare exchange +// + +#define HS_CMP_FLIP(i,a,b) \ + { \ + HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,flip_lane_idx); \ + HS_KEY_TYPE const tb = intel_sub_group_shuffle(b,flip_lane_idx); \ + a = HS_COND_MIN_MAX(t_lt,a,tb); \ + b = HS_COND_MIN_MAX(t_lt,b,ta); \ + } + +#define HS_CMP_HALF(i,a) \ + { \ + HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,half_lane_idx); \ + a = HS_COND_MIN_MAX(t_lt,a,ta); \ + } + +// +// The device's comparison operator might return what we actually +// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}. 
+// + +#define HS_CMP_IS_ZERO_ONE + +#ifdef HS_CMP_IS_ZERO_ONE +// OpenCL requires a {true: +1, false: 0} scalar result +// (a < b) -> { +1, 0 } -> NEGATE -> { 0, 0xFFFFFFFF } +#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b)) +#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a) +#else +// However, OpenCL requires { -1, 0 } for vectors +// (a < b) -> { 0xFFFFFFFF, 0 } +#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64 +#define HS_CMP_TO_MASK(a) (a) +#endif + +// +// The flip/half comparisons rely on a "conditional min/max": +// +// - if the flag is false, return min(a,b) +// - otherwise, return max(a,b) +// +// What's a little surprising is that sequence (1) is faster than (2) +// for 32-bit keys. +// +// I suspect either a code generation problem or that the sequence +// maps well to the GEN instruction set. +// +// We mostly care about 64-bit keys and unsurprisingly sequence (2) is +// fastest for this wider type. +// + +// this is what you would normally use +#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) ^ lt) ? b : a + +// this seems to be faster for 32-bit keys +#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b)) + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b) +#elif (HS_KEY_WORDS == 2) +#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b) +#endif + +// +// This snarl of macros is for transposing a "slab" of sorted elements +// into linear order. +// +// This can occur as the last step in hs_sort() or via a custom kernel +// that inspects the slab and then transposes and stores it to memory. +// +// The slab format can be inspected more efficiently than a linear +// arrangement. +// +// The prime example is detecting when adjacent keys (in sort order) +// have differing high order bits ("key changes"). The index of each +// change is recorded to an auxilary array. 
+// +// A post-processing step like this needs to be able to navigate the +// slab and eventually transpose and store the slab in linear order. +// + +#define HS_TRANSPOSE_REG(prefix,row) prefix##row +#define HS_TRANSPOSE_DECL(prefix,row) HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row) + +#define HS_TRANSPOSE_DELTA(level) (HS_LANES_PER_WARP + (1 << (level-1))) +#define HS_TRANSPOSE_IF(level) ((get_sub_group_local_id() >> (level - 1)) & 1) + +#define HS_TRANSPOSE_LL(level) HS_TRANSPOSE_IF(level) ? 0 : HS_TRANSPOSE_DELTA(level) +#define HS_TRANSPOSE_UR(level) HS_TRANSPOSE_IF(level) ? HS_TRANSPOSE_DELTA(level) : 0 + +#define HS_TRANSPOSE_DELTA_LL(level) delta_ll_##level +#define HS_TRANSPOSE_DELTA_UR(level) delta_ur_##level + +#define HS_TRANSPOSE_STAGE(level) \ + uint const HS_TRANSPOSE_DELTA_LL(level) = HS_TRANSPOSE_LL(level); \ + uint const HS_TRANSPOSE_DELTA_UR(level) = HS_TRANSPOSE_UR(level); + +#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \ + HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \ + intel_sub_group_shuffle_down(HS_TRANSPOSE_REG(prefix_prev,row_ll), \ + HS_TRANSPOSE_REG(prefix_prev,row_ur), \ + HS_TRANSPOSE_DELTA_LL(level)); \ + HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \ + intel_sub_group_shuffle_up(HS_TRANSPOSE_REG(prefix_prev,row_ll), \ + HS_TRANSPOSE_REG(prefix_prev,row_ur), \ + HS_TRANSPOSE_DELTA_UR(level)); \ + +// #define HS_TRANSPOSE_LOAD(row) \ +// HS_TRANSPOSE_DECL(0,row) = (vout + gmem_idx)[(row-1) << HS_LANES_PER_WARP_LOG2]; + +#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \ + (vout + gmem_idx)[(row_to-1) << HS_LANES_PER_WARP_LOG2] = \ + HS_TRANSPOSE_REG(prefix,row_from); + +// +// undefine these if you want to override +// + +#define HS_TRANSPOSE_PREAMBLE() +#define HS_TRANSPOSE_BODY() + +// +// +// + +#endif + +// +// +// |