aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/compute
diff options
context:
space:
mode:
authorGravatar Allan MacKinnon <allanmac@google.com>2018-07-16 15:57:05 -0700
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2018-07-17 17:01:41 +0000
commit9e0d7e4072e43495a3907bb2bac7824e8e60c368 (patch)
treebaaff58dd81c1dc5e26668a8d517cbdf568bdb94 /src/compute
parent53c876900247ad700ce28f7b33031047a6cff402 (diff)
Bug fixes and improvements to SKC and HotSort. Vulkan is WIP.
Bug: skia: Change-Id: Iffc75a5b4dfcbfa4a6c23d972bb9798c2f550335 Reviewed-on: https://skia-review.googlesource.com/141582 Reviewed-by: Mike Reed <reed@google.com> Reviewed-by: Allan MacKinnon <allanmac@google.com> Commit-Queue: Allan MacKinnon <allanmac@google.com>
Diffstat (limited to 'src/compute')
-rw-r--r--src/compute/common/cl/assert_cl.c2
-rw-r--r--src/compute/common/cl/assert_cl.h1
-rw-r--r--src/compute/common/cl/find_cl.c11
-rw-r--r--src/compute/common/cl/find_cl.h1
-rw-r--r--src/compute/common/macros.h41
-rw-r--r--src/compute/common/util.c4
-rw-r--r--src/compute/common/util.h2
-rw-r--r--src/compute/hs/cl/bench/main.c262
-rw-r--r--src/compute/hs/cl/gen9/hs_cl.cl10082
-rw-r--r--src/compute/hs/cl/gen9/hs_cl_macros.h199
-rw-r--r--src/compute/hs/cl/gen9/make_all.bat16
-rw-r--r--src/compute/hs/cl/hs_cl_launcher.c1524
-rw-r--r--src/compute/hs/cl/hs_cl_launcher.h62
-rw-r--r--src/compute/hs/cl/hs_cl_target.h63
-rw-r--r--src/compute/hs/cl/intel/gen8/u32/make_all.bat16
-rw-r--r--src/compute/hs/cl/intel/gen8/u32/make_inl_cl.bat (renamed from src/compute/hs/cl/gen9/make_inl_cl.bat)1
-rw-r--r--src/compute/hs/cl/intel/gen8/u32b32/make_all.bat16
-rw-r--r--src/compute/hs/cl/intel/gen8/u32b32/make_inl_cl.bat77
-rw-r--r--src/compute/hs/cl/intel/gen8/u64/hs_cl.cl4851
-rw-r--r--src/compute/hs/cl/intel/gen8/u64/hs_cl.h (renamed from src/compute/hs/cl/gen9/hs_cl.h)52
-rw-r--r--src/compute/hs/cl/intel/gen8/u64/hs_cl_macros.h361
-rw-r--r--src/compute/hs/cl/intel/gen8/u64/hs_target.h115
-rw-r--r--src/compute/hs/cl/intel/gen8/u64/make_all.bat26
-rw-r--r--src/compute/hs/cl/intel/gen8/u64/make_inl_cl.bat113
-rw-r--r--src/compute/hs/cl/intel/gen9lp/u32/make_inl_cl.bat77
-rw-r--r--src/compute/hs/cl/intel/gen9lp/u32b32/make_inl_cl.bat77
-rw-r--r--src/compute/hs/cl/intel/gen9lp/u64/make_inl_cl.bat77
-rw-r--r--src/compute/hs/gen/gen.h112
-rw-r--r--src/compute/hs/gen/main.c532
-rw-r--r--src/compute/hs/gen/networks_merging.c4
-rw-r--r--src/compute/hs/gen/networks_sorting.c4
-rw-r--r--src/compute/hs/gen/target_cuda.c600
-rw-r--r--src/compute/hs/gen/target_cuda_sm3x.c776
-rw-r--r--src/compute/hs/gen/target_debug.c73
-rw-r--r--src/compute/hs/gen/target_glsl.c674
-rw-r--r--src/compute/hs/gen/target_igp_genx.c672
-rw-r--r--src/compute/hs/gen/target_opencl.c600
-rw-r--r--src/compute/hs/gen/transpose.c61
-rw-r--r--src/compute/hs/gen/transpose.h6
-rw-r--r--src/compute/hs/vk/hs_spirv_target.h77
-rw-r--r--src/compute/hs/vk/hs_vk_launcher.c248
-rw-r--r--src/compute/hs/vk/hs_vk_launcher.h88
-rw-r--r--src/compute/hs/vk/intel/gen8/u32/make_all.bat48
-rw-r--r--src/compute/hs/vk/intel/gen8/u32b32/make_all.bat48
-rw-r--r--src/compute/hs/vk/intel/gen8/u64/hs_glsl.h100
-rw-r--r--src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h417
-rw-r--r--src/compute/hs/vk/intel/gen8/u64/hs_kernels.h75
-rw-r--r--src/compute/hs/vk/intel/gen8/u64/hs_target.h113
-rw-r--r--src/compute/hs/vk/intel/gen8/u64/make_all.bat79
-rw-r--r--src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat48
-rw-r--r--src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat48
-rw-r--r--src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat48
-rw-r--r--src/compute/skc/extent_ring.c1
-rw-r--r--src/compute/skc/main.c6
-rw-r--r--src/compute/skc/path_builder.h1
-rw-r--r--src/compute/skc/platforms/cl_12/allocator_device_cl.c1
-rw-r--r--src/compute/skc/platforms/cl_12/allocator_device_cl.h1
-rw-r--r--src/compute/skc/platforms/cl_12/composition_cl_12.c23
-rw-r--r--src/compute/skc/platforms/cl_12/export_cl_12.h1
-rw-r--r--src/compute/skc/platforms/cl_12/extent_cl_12.c2
-rw-r--r--src/compute/skc/platforms/cl_12/interop/interop_glfw.c4
-rw-r--r--src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c22
-rw-r--r--src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h6
-rw-r--r--src/compute/skc/platforms/cl_12/kernels/rasterize.cl4
-rw-r--r--src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl16
-rw-r--r--src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl32
-rw-r--r--src/compute/skc/platforms/cl_12/raster_builder_cl_12.c20
-rw-r--r--src/compute/skc/platforms/cl_12/runtime_cl_12.c1
-rw-r--r--src/compute/skc/platforms/cl_12/runtime_cl_12.h3
-rw-r--r--src/compute/skc/platforms/cl_12/surface_cl_12.c5
-rw-r--r--src/compute/skc/skc.h1
-rw-r--r--src/compute/skc/styling.h1
-rw-r--r--src/compute/skc/styling_types.h2
-rw-r--r--src/compute/skc/surface.c6
-rw-r--r--src/compute/skc/surface.h1
-rw-r--r--src/compute/skc/weakref.h2
76 files changed, 10634 insertions, 13208 deletions
diff --git a/src/compute/common/cl/assert_cl.c b/src/compute/common/cl/assert_cl.c
index 5d420586b3..944256daec 100644
--- a/src/compute/common/cl/assert_cl.c
+++ b/src/compute/common/cl/assert_cl.c
@@ -129,7 +129,7 @@ assert_cl(cl_int const code, char const * const file, int const line, bool const
char const * const cl_err_str = cl_get_error_string(code);
fprintf(stderr,
- "\"%s\", line %d: cl_assert (%d) = \"%s\"",
+ "\"%s\", line %d: assert_cl( %d ) = \"%s\"",
file,line,code,cl_err_str);
if (abort)
diff --git a/src/compute/common/cl/assert_cl.h b/src/compute/common/cl/assert_cl.h
index 517ada8d37..efe698f29e 100644
--- a/src/compute/common/cl/assert_cl.h
+++ b/src/compute/common/cl/assert_cl.h
@@ -53,4 +53,3 @@ cl_get_event_command_type_string(cl_command_type const type);
//
//
//
-
diff --git a/src/compute/common/cl/find_cl.c b/src/compute/common/cl/find_cl.c
index a04d9ebd69..6c500c0865 100644
--- a/src/compute/common/cl/find_cl.c
+++ b/src/compute/common/cl/find_cl.c
@@ -45,7 +45,7 @@ clFindIdsByName(char const * const target_platform_substring,
cl(GetPlatformIDs(0,NULL,&platform_count));
- cl_platform_id * const platform_ids = ALLOCA(sizeof(*platform_ids) * platform_count);
+ cl_platform_id * const platform_ids = ALLOCA_MACRO(sizeof(*platform_ids) * platform_count);
cl(GetPlatformIDs(platform_count,platform_ids,NULL));
@@ -62,7 +62,7 @@ clFindIdsByName(char const * const target_platform_substring,
NULL,
&platform_name_size));
- char * const platform_name = ALLOCA(platform_name_size);
+ char * const platform_name = ALLOCA_MACRO(platform_name_size);
cl(GetPlatformInfo(platform_ids[ii],
CL_PLATFORM_NAME,
@@ -93,7 +93,7 @@ clFindIdsByName(char const * const target_platform_substring,
NULL,
&device_count);
- cl_device_id * const device_ids = ALLOCA(sizeof(*device_ids) * device_count);
+ cl_device_id * const device_ids = ALLOCA_MACRO(sizeof(*device_ids) * device_count);
cl_err = clGetDeviceIDs(platform_ids[ii],
CL_DEVICE_TYPE_ALL,
@@ -121,8 +121,8 @@ clFindIdsByName(char const * const target_platform_substring,
NULL,
&driver_version_size));
- char * const device_name = ALLOCA(device_name_size);
- char * const driver_version = ALLOCA(driver_version_size);
+ char * const device_name = ALLOCA_MACRO(device_name_size);
+ char * const driver_version = ALLOCA_MACRO(driver_version_size);
cl(GetDeviceInfo(device_ids[jj],
CL_DEVICE_NAME,
@@ -207,4 +207,3 @@ clFindIdsByName(char const * const target_platform_substring,
//
//
//
-
diff --git a/src/compute/common/cl/find_cl.h b/src/compute/common/cl/find_cl.h
index 5143e39f85..6dbfe10838 100644
--- a/src/compute/common/cl/find_cl.h
+++ b/src/compute/common/cl/find_cl.h
@@ -32,4 +32,3 @@ clFindIdsByName(char const * const target_platform_substring,
//
//
//
-
diff --git a/src/compute/common/macros.h b/src/compute/common/macros.h
index 52dc8689fc..266b58f108 100644
--- a/src/compute/common/macros.h
+++ b/src/compute/common/macros.h
@@ -12,16 +12,35 @@
//
//
-#define ARRAY_LENGTH(x) (sizeof(x)/sizeof(x[0]))
+#include <stdint.h>
//
//
//
-#define MAX_MACRO(a,b) (((a) > (b)) ? (a) : (b))
-#define MIN_MACRO(a,b) (((a) < (b)) ? (a) : (b))
-#define GTE_MACRO(a,b) ((a) >= (b))
-#define LT_MACRO(a,b) ((a) < (b))
+#define ARRAY_LENGTH_MACRO(x) (sizeof(x)/sizeof(x[0]))
+#define OFFSET_OF_MACRO(t,m) ((size_t)&(((t*)0)->m))
+#define MEMBER_SIZE_MACRO(t,m) sizeof(((t*)0)->m)
+
+
+//
+//
+//
+
+#define MAX_MACRO(a,b) (((a) > (b)) ? (a) : (b))
+#define MIN_MACRO(a,b) (((a) < (b)) ? (a) : (b))
+#define GTE_MACRO(a,b) ((a) >= (b))
+#define LT_MACRO(a,b) ((a) < (b))
+
+//
+//
+//
+
+#if defined(_MSC_VER)
+#define ALLOCA_MACRO(n) _alloca(n)
+#else
+#define ALLOCA_MACRO(n) alloca(n)
+#endif
//
//
@@ -34,14 +53,14 @@
#define BITS_TO_MASK_AT_64(n,b) (BITS_TO_MASK_64(n)<<(b))
//
-//
+// Convert 4 byte pointer to network order dword to a host order.
//
-#if defined(_MSC_VER)
-#define ALLOCA(n) _alloca(n)
-#else
-#define ALLOCA(n) alloca(n)
-#endif
+#define NPBTOHL_MACRO(pb4) ((((pb4)[0])<<24) | (((pb4)[1])<<16) | \
+ (((pb4)[2])<< 8) | (pb4)[3])
+
+#define NTOHL_MACRO(nl) ntohl(nl)
+
//
//
//
diff --git a/src/compute/common/util.c b/src/compute/common/util.c
index eb05d91a9f..51a8e0128e 100644
--- a/src/compute/common/util.c
+++ b/src/compute/common/util.c
@@ -59,12 +59,11 @@ pow2_rd_u32(uint32_t n)
uint32_t
msb_idx_u32(uint32_t n)
{
-
#ifdef _MSC_VER
uint32_t index;
- _BitScanReverse(&index,n);
+ _BitScanReverse((unsigned long *)&index,n);
return index;
@@ -78,7 +77,6 @@ msb_idx_u32(uint32_t n)
#error "No msb_index()"
#endif
-
}
//
diff --git a/src/compute/common/util.h b/src/compute/common/util.h
index 7d5a7b4600..113e26d789 100644
--- a/src/compute/common/util.h
+++ b/src/compute/common/util.h
@@ -27,5 +27,3 @@ uint32_t msb_idx_u32(uint32_t n); // 0-based bit position
//
//
//
-
-
diff --git a/src/compute/hs/cl/bench/main.c b/src/compute/hs/cl/bench/main.c
index 3b9ef6e1c7..bfa7c1da38 100644
--- a/src/compute/hs/cl/bench/main.c
+++ b/src/compute/hs/cl/bench/main.c
@@ -32,9 +32,10 @@
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
-#include "macros.h"
-#include "assert_cl.h"
-#include "find_cl.h"
+#include "common/macros.h"
+#include "common/cl/assert_cl.h"
+#include "common/cl/find_cl.h"
+
#include "hs_cl_launcher.h"
//
@@ -90,10 +91,10 @@ char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count);
static
char const *
-hs_cpu_sort(void * sorted_h,
- uint32_t const count,
- struct hs_info const * const info,
- double * const cpu_ns)
+hs_cpu_sort(uint32_t const hs_words,
+ void * sorted_h,
+ uint32_t const count,
+ double * const cpu_ns)
{
char const * algo;
@@ -101,7 +102,7 @@ hs_cpu_sort(void * sorted_h,
QueryPerformanceCounter(&t0);
- if (info->words == 1)
+ if (hs_words == 1)
algo = hs_cpu_sort_u32(sorted_h,count);
else
algo = hs_cpu_sort_u64(sorted_h,count);
@@ -117,27 +118,34 @@ hs_cpu_sort(void * sorted_h,
static
bool
-hs_verify_linear(void * sorted_h, void * vout_h, const uint32_t count, struct hs_info const * const info)
+hs_verify_linear(uint32_t const hs_words,
+ void * sorted_h,
+ void * vout_h,
+ uint32_t const count)
{
- return memcmp(sorted_h, vout_h, sizeof(uint32_t) * info->words * count) == 0;
+ return memcmp(sorted_h, vout_h, sizeof(uint32_t) * hs_words * count) == 0;
}
static
void
-hs_transpose_slabs_u32(uint32_t * vout_h, const uint32_t count, struct hs_info const * const info)
+hs_transpose_slabs_u32(uint32_t const hs_words,
+ uint32_t const hs_width,
+ uint32_t const hs_height,
+ uint32_t * vout_h,
+ uint32_t const count)
{
- uint32_t const slab_keys = info->keys * info->lanes;
- size_t const slab_size = sizeof(uint32_t) * info->words * slab_keys;
- uint32_t * const slab = _alloca(slab_size);
+ uint32_t const slab_keys = hs_width * hs_height;
+ size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
+ uint32_t * const slab = ALLOCA_MACRO(slab_size);
uint32_t slab_count = count / slab_keys;
while (slab_count-- > 0)
{
memcpy(slab,vout_h,slab_size);
- for (uint32_t row=0; row<info->keys; row++)
- for (uint32_t col=0; col<info->lanes; col++)
- vout_h[col * info->keys + row] = slab[row * info->lanes + col];
+ for (uint32_t row=0; row<hs_height; row++)
+ for (uint32_t col=0; col<hs_width; col++)
+ vout_h[col * hs_height + row] = slab[row * hs_width + col];
vout_h += slab_keys;
}
@@ -145,20 +153,24 @@ hs_transpose_slabs_u32(uint32_t * vout_h, const uint32_t count, struct hs_info c
static
void
-hs_transpose_slabs_u64(uint64_t * vout_h, const uint32_t count, struct hs_info const * const info)
+hs_transpose_slabs_u64(uint32_t const hs_words,
+ uint32_t const hs_width,
+ uint32_t const hs_height,
+ uint64_t * vout_h,
+ uint32_t const count)
{
- uint32_t const slab_keys = info->keys * info->lanes;
- size_t const slab_size = sizeof(uint32_t) * info->words * slab_keys;
- uint64_t * const slab = _alloca(slab_size);
+ uint32_t const slab_keys = hs_width * hs_height;
+ size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
+ uint64_t * const slab = ALLOCA_MACRO(slab_size);
uint32_t slab_count = count / slab_keys;
while (slab_count-- > 0)
{
memcpy(slab,vout_h,slab_size);
- for (uint32_t row=0; row<info->keys; row++)
- for (uint32_t col=0; col<info->lanes; col++)
- vout_h[col * info->keys + row] = slab[row * info->lanes + col];
+ for (uint32_t row=0; row<hs_height; row++)
+ for (uint32_t col=0; col<hs_width; col++)
+ vout_h[col * hs_height + row] = slab[row * hs_width + col];
vout_h += slab_keys;
}
@@ -166,12 +178,16 @@ hs_transpose_slabs_u64(uint64_t * vout_h, const uint32_t count, struct hs_info c
static
void
-hs_transpose_slabs(void * vout_h, const uint32_t count, struct hs_info const * const info)
+hs_transpose_slabs(uint32_t const hs_words,
+ uint32_t const hs_width,
+ uint32_t const hs_height,
+ void * vout_h,
+ uint32_t const count)
{
- if (info->words == 1)
- hs_transpose_slabs_u32(vout_h,count,info);
+ if (hs_words == 1)
+ hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
else
- hs_transpose_slabs_u64(vout_h,count,info);
+ hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
}
//
@@ -180,18 +196,18 @@ hs_transpose_slabs(void * vout_h, const uint32_t count, struct hs_info const * c
static
void
-hs_debug_u32(
- uint32_t const * vout_h,
- uint32_t const count,
- struct hs_info const * const info)
+hs_debug_u32(uint32_t const hs_width,
+ uint32_t const hs_height,
+ uint32_t const * vout_h,
+ uint32_t const count)
{
- uint32_t const slab = info->keys * info->lanes;
- uint32_t const slabs = (count + slab - 1) / slab;
+ uint32_t const slab_keys = hs_width * hs_height;
+ uint32_t const slabs = (count + slab_keys - 1) / slab_keys;
for (uint32_t ss=0; ss<slabs; ss++) {
fprintf(stderr,"%u\n",ss);
- for (uint32_t cc=0; cc<info->keys; cc++) {
- for (uint32_t rr=0; rr<info->lanes; rr++)
+ for (uint32_t cc=0; cc<hs_height; cc++) {
+ for (uint32_t rr=0; rr<hs_width; rr++)
fprintf(stderr,"%8X ",*vout_h++);
fprintf(stderr,"\n");
}
@@ -200,17 +216,18 @@ hs_debug_u32(
static
void
-hs_debug_u64(uint64_t const * vout_h,
- uint32_t const count,
- struct hs_info const * const info)
+hs_debug_u64(uint32_t const hs_width,
+ uint32_t const hs_height,
+ uint64_t const * vout_h,
+ uint32_t const count)
{
- uint32_t const slab = info->keys * info->lanes;
- uint32_t const slabs = (count + slab - 1) / slab;
+ uint32_t const slab_keys = hs_width * hs_height;
+ uint32_t const slabs = (count + slab_keys - 1) / slab_keys;
for (uint32_t ss=0; ss<slabs; ss++) {
fprintf(stderr,"%u\n",ss);
- for (uint32_t cc=0; cc<info->keys; cc++) {
- for (uint32_t rr=0; rr<info->lanes; rr++)
+ for (uint32_t cc=0; cc<hs_height; cc++) {
+ for (uint32_t rr=0; rr<hs_width; rr++)
fprintf(stderr,"%16llX ",*vout_h++);
fprintf(stderr,"\n");
}
@@ -275,7 +292,10 @@ hs_dummy_kernel_release()
static
void
-hs_dummy_kernel_enqueue(cl_command_queue cq, cl_event * const event)
+hs_dummy_kernel_enqueue(cl_command_queue cq,
+ uint32_t wait_list_size,
+ cl_event const * wait_list,
+ cl_event * event)
{
size_t const global_work_size = 1;
@@ -285,8 +305,8 @@ hs_dummy_kernel_enqueue(cl_command_queue cq, cl_event * const event)
NULL,
&global_work_size,
NULL,
- 0,
- NULL,
+ wait_list_size,
+ wait_list,
event));
}
@@ -298,8 +318,12 @@ static
void
hs_bench(cl_context context,
cl_command_queue cq,
+ cl_command_queue cq_profile,
char const * const device_name,
- struct hs_info const * const info,
+ uint32_t const hs_words,
+ uint32_t const hs_width,
+ uint32_t const hs_height,
+ struct hs_cl const * const hs,
uint32_t const count_lo,
uint32_t const count_hi,
uint32_t const count_step,
@@ -318,14 +342,13 @@ hs_bench(cl_context context,
//
uint32_t count_hi_padded_in, count_hi_padded_out;
- hs_pad(count_hi,&count_hi_padded_in,&count_hi_padded_out);
+ hs_cl_pad(hs,count_hi,&count_hi_padded_in,&count_hi_padded_out);
//
// SIZE
//
- size_t const key_size = sizeof(uint32_t) * info->words;
+ size_t const key_size = sizeof(uint32_t) * hs_words;
- size_t const size_hi = count_hi * key_size;
size_t const size_hi_in = count_hi_padded_in * key_size;
size_t const size_hi_out = count_hi_padded_out * key_size;
@@ -363,7 +386,7 @@ hs_bench(cl_context context,
&cl_err); cl_ok(cl_err);
// fill with random numbers
- hs_fill_rand(random_h,count_hi,info->words);
+ hs_fill_rand(random_h,count_hi,hs_words);
//
// UNMAP
@@ -379,16 +402,14 @@ hs_bench(cl_context context,
// compute padding before sorting
uint32_t count_padded_in, count_padded_out;
- hs_pad(count,&count_padded_in,&count_padded_out);
+ hs_cl_pad(hs,count,&count_padded_in,&count_padded_out);
cl_ulong elapsed_ns_min = ULONG_MAX;
cl_ulong elapsed_ns_max = 0;
cl_ulong elapsed_ns_sum = 0;
-#if 1
- cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
- cl(Finish(cq));
-#endif
+ cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
+ cl(Finish(cq));
for (uint32_t ii=0; ii<warmup+loops; ii++)
{
@@ -410,23 +431,23 @@ hs_bench(cl_context context,
//
// sort vin
//
- cl_event start, end;
+ cl_event start, complete, end;
- hs_dummy_kernel_enqueue(cq,&start);
-
- cl(EnqueueBarrierWithWaitList(cq,0,NULL,NULL));
+ hs_dummy_kernel_enqueue(cq_profile,0,NULL,&start);
// note hs_sort enqueues a final barrier
- hs_sort(cq,
- vin,vout,
- count,
- count_padded_in,
- count_padded_out,
- linearize);
+ hs_cl_sort(hs,
+ cq,
+ 1,&start,&complete,
+ vin,vout,
+ count,
+ count_padded_in,
+ count_padded_out,
+ linearize);
- hs_dummy_kernel_enqueue(cq,&end);
+ hs_dummy_kernel_enqueue(cq_profile,1,&complete,&end);
- cl(Finish(cq));
+ cl(Finish(cq_profile));
//
// measure duration
@@ -439,7 +460,6 @@ hs_bench(cl_context context,
sizeof(cl_ulong),
&t_start,
NULL));
- cl(ReleaseEvent(start));
// end
cl(GetEventProfilingInfo(end,
@@ -447,13 +467,16 @@ hs_bench(cl_context context,
sizeof(cl_ulong),
&t_end,
NULL));
- cl(ReleaseEvent(end));
cl_ulong const t = t_end - t_start;
elapsed_ns_min = MIN_MACRO(elapsed_ns_min,t);
elapsed_ns_max = MAX_MACRO(elapsed_ns_max,t);
elapsed_ns_sum += t;
+
+ cl(ReleaseEvent(start));
+ cl(ReleaseEvent(complete));
+ cl(ReleaseEvent(end));
}
//
@@ -485,27 +508,27 @@ hs_bench(cl_context context,
double cpu_ns;
- char const * const algo = hs_cpu_sort(sorted_h,count_padded_in,info,&cpu_ns);
+ char const * const algo = hs_cpu_sort(hs_words,sorted_h,count_padded_in,&cpu_ns);
//
// EXPLICITLY TRANSPOSE THE CPU SORTED SLABS IF NOT LINEARIZING
//
if (!linearize) {
- hs_transpose_slabs(vout_h,count_padded_in,info);
+ hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
}
//
// VERIFY
//
- bool const verified = hs_verify_linear(sorted_h,vout_h,count_padded_in,info);
+ bool const verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in);
#ifndef NDEBUG
if (!verified)
{
- if (info->words == 1)
- hs_debug_u32(vout_h,count,info);
+ if (hs_words == 1)
+ hs_debug_u32(hs_width,hs_height,vout_h,count);
else // ulong
- hs_debug_u64(vout_h,count,info);
+ hs_debug_u64(hs_width,hs_height,vout_h,count);
}
#endif
@@ -519,7 +542,7 @@ hs_bench(cl_context context,
//
fprintf(stdout,"%s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
device_name,
- (info->words == 1) ? "uint" : "ulong",
+ (hs_words == 1) ? "uint" : "ulong",
linearize ? "linear" : "slab",
verified ? " OK " : "*FAIL*",
count,
@@ -555,8 +578,15 @@ hs_bench(cl_context context,
//
//
+#define HS_TARGET_NAME hs_target
+#include "intel/gen8/u64/hs_target.h"
+
+//
+//
+//
+
int
-main(int argc, char** argv)
+main(int argc, char const * argv[])
{
char const * const target_platform_substring = "Intel";
char const * const target_device_substring = "Graphics";
@@ -601,43 +631,64 @@ main(int argc, char** argv)
//
// create command queue
//
- cl_command_queue_properties const props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
-
#if 0 // OPENCL 2.0
- cl_queue_properties queue_properties[] =
- {
- CL_QUEUE_PROPERTIES, (cl_queue_properties)props,
- 0
- };
+
+ cl_queue_properties props[] = {
+ CL_QUEUE_PROPERTIES,
+ (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+#ifndef NDEBUG
+ (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
+#endif
+ 0
+ };
+
+ cl_queue_properties props_profile[] = {
+ CL_QUEUE_PROPERTIES,
+ (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
+ 0
+ };
cl_command_queue cq = clCreateCommandQueueWithProperties(context,
device_id,
- queue_properties,
+ props,
&cl_err); cl_ok(cl_err);
+
+ cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context,
+ device_id,
+ props_profile,
+ &cl_err); cl_ok(cl_err);
#else // OPENCL 1.2
+
cl_command_queue cq = clCreateCommandQueue(context,
device_id,
- props,
+#ifndef NDEBUG
+ CL_QUEUE_PROFILING_ENABLE |
+#endif
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
&cl_err); cl_ok(cl_err);
+
+ cl_command_queue cq_profile = clCreateCommandQueue(context,
+ device_id,
+ CL_QUEUE_PROFILING_ENABLE,
+ &cl_err); cl_ok(cl_err);
#endif
//
+ // Intel GEN workaround -- create dummy kernel for semi-accurate
+ // profiling on an out-of-order queue.
+ //
+ hs_dummy_kernel_create(context,device_id);
+
+ //
// create kernels
//
fprintf(stdout,"Creating... ");
- struct hs_info info;
-
- hs_create(context,device_id,&info);
+ struct hs_cl * const hs = hs_cl_create(&hs_target,context,device_id);
fprintf(stdout,"done.\n");
//
- // create dummy kernel for profiling
- //
- hs_dummy_kernel_create(context,device_id);
-
- //
//
//
#ifdef NDEBUG
@@ -651,7 +702,7 @@ main(int argc, char** argv)
//
// sort sizes and loops
//
- uint32_t const kpb = info.keys * info.lanes;
+ uint32_t const kpb = hs_target.config.slab.height << hs_target.config.slab.width_log2;
uint32_t const count_lo = (argc <= 1) ? kpb : strtoul(argv[1],NULL,0);
uint32_t const count_hi = (argc <= 2) ? count_lo : strtoul(argv[2],NULL,0);
@@ -663,15 +714,30 @@ main(int argc, char** argv)
//
// benchmark
//
- hs_bench(context,cq,device_name,&info,count_lo,count_hi,count_step,loops,warmup,linearize);
+ hs_bench(context,
+ cq,cq_profile,
+ device_name,
+ hs_target.config.words.key + hs_target.config.words.val,
+ 1 << hs_target.config.slab.width_log2,
+ hs_target.config.slab.height,
+ hs,
+ count_lo,
+ count_hi,
+ count_step,
+ loops,
+ warmup,
+ linearize);
//
// release everything
//
+ hs_cl_release(hs);
+
hs_dummy_kernel_release();
- hs_release();
cl(ReleaseCommandQueue(cq));
+ cl(ReleaseCommandQueue(cq_profile));
+
cl(ReleaseContext(context));
return 0;
diff --git a/src/compute/hs/cl/gen9/hs_cl.cl b/src/compute/hs/cl/gen9/hs_cl.cl
deleted file mode 100644
index 63627ad068..0000000000
--- a/src/compute/hs/cl/gen9/hs_cl.cl
+++ /dev/null
@@ -1,10082 +0,0 @@
-//
-// Copyright 2016 Google Inc.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-//
-
-#include <hs_cl_macros.h>
-
-//
-//
-//
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_transpose(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- HS_KEY_TYPE r1 = (vout + gmem_idx)[0 * 8];
- HS_KEY_TYPE r2 = (vout + gmem_idx)[1 * 8];
- HS_KEY_TYPE r3 = (vout + gmem_idx)[2 * 8];
- HS_KEY_TYPE r4 = (vout + gmem_idx)[3 * 8];
- HS_KEY_TYPE r5 = (vout + gmem_idx)[4 * 8];
- HS_KEY_TYPE r6 = (vout + gmem_idx)[5 * 8];
- HS_KEY_TYPE r7 = (vout + gmem_idx)[6 * 8];
- HS_KEY_TYPE r8 = (vout + gmem_idx)[7 * 8];
- HS_KEY_TYPE r9 = (vout + gmem_idx)[8 * 8];
- HS_KEY_TYPE r10 = (vout + gmem_idx)[9 * 8];
- HS_KEY_TYPE r11 = (vout + gmem_idx)[10 * 8];
- HS_KEY_TYPE r12 = (vout + gmem_idx)[11 * 8];
- HS_KEY_TYPE r13 = (vout + gmem_idx)[12 * 8];
- HS_KEY_TYPE r14 = (vout + gmem_idx)[13 * 8];
- HS_KEY_TYPE r15 = (vout + gmem_idx)[14 * 8];
- HS_KEY_TYPE r16 = (vout + gmem_idx)[15 * 8];
- HS_TRANSPOSE_SLAB()
-}
-
-__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
-__attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_bs_4(__global HS_KEY_TYPE const* const restrict vin,
- __global HS_KEY_TYPE* const restrict vout)
-{
- __local union
- {
- HS_KEY_TYPE m[16 * 128];
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
- HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
- HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
- HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
- HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
- HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
- HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
- HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
- HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
- HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
- HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
- HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
- HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
- HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
- HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
- HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r6, r11)
- HS_CMP_XCHG(r7, r10)
- HS_CMP_XCHG(r4, r13)
- HS_CMP_XCHG(r14, r15)
- HS_CMP_XCHG(r8, r12)
- HS_CMP_XCHG(r2, r3)
- HS_CMP_XCHG(r5, r9)
- HS_CMP_XCHG(r2, r5)
- HS_CMP_XCHG(r8, r14)
- HS_CMP_XCHG(r3, r9)
- HS_CMP_XCHG(r12, r15)
- HS_CMP_XCHG(r3, r5)
- HS_CMP_XCHG(r6, r7)
- HS_CMP_XCHG(r10, r11)
- HS_CMP_XCHG(r12, r14)
- HS_CMP_XCHG(r4, r9)
- HS_CMP_XCHG(r8, r13)
- HS_CMP_XCHG(r7, r9)
- HS_CMP_XCHG(r11, r13)
- HS_CMP_XCHG(r4, r6)
- HS_CMP_XCHG(r8, r10)
- HS_CMP_XCHG(r4, r5)
- HS_CMP_XCHG(r6, r7)
- HS_CMP_XCHG(r8, r9)
- HS_CMP_XCHG(r10, r11)
- HS_CMP_XCHG(r12, r13)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- {
- uint const flip_lane_mask = 1;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- {
- uint const flip_lane_mask = 3;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- {
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- {
- uint const flip_lane_mask = 7;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- {
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- {
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- uint const smem_l_idx = get_sub_group_id() * 128 + get_sub_group_local_id();
- uint const smem_r_idx =
- (get_sub_group_id() ^ 1) * 128 + (get_sub_group_local_id() ^ 7);
- (shared.m + get_local_id(0))[16 * 8 * 0] = r1;
- (shared.m + get_local_id(0))[16 * 8 * 1] = r16;
- (shared.m + get_local_id(0))[16 * 8 * 2] = r2;
- (shared.m + get_local_id(0))[16 * 8 * 3] = r15;
- (shared.m + get_local_id(0))[16 * 8 * 4] = r3;
- (shared.m + get_local_id(0))[16 * 8 * 5] = r14;
- (shared.m + get_local_id(0))[16 * 8 * 6] = r4;
- (shared.m + get_local_id(0))[16 * 8 * 7] = r13;
- (shared.m + get_local_id(0))[16 * 8 * 8] = r5;
- (shared.m + get_local_id(0))[16 * 8 * 9] = r12;
- (shared.m + get_local_id(0))[16 * 8 * 10] = r6;
- (shared.m + get_local_id(0))[16 * 8 * 11] = r11;
- (shared.m + get_local_id(0))[16 * 8 * 12] = r7;
- (shared.m + get_local_id(0))[16 * 8 * 13] = r10;
- (shared.m + get_local_id(0))[16 * 8 * 14] = r8;
- (shared.m + get_local_id(0))[16 * 8 * 15] = r9;
- barrier(CLK_LOCAL_MEM_FENCE);
- {
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_r_idx)[8] = r0_2;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16];
- HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24];
- HS_CMP_XCHG(r1_1, r1_2)
- (shared.m + smem_l_idx)[16] = r1_1;
- (shared.m + smem_r_idx)[24] = r1_2;
- }
- {
- HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[32];
- HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[40];
- HS_CMP_XCHG(r2_1, r2_2)
- (shared.m + smem_l_idx)[32] = r2_1;
- (shared.m + smem_r_idx)[40] = r2_2;
- }
- {
- HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[48];
- HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[56];
- HS_CMP_XCHG(r3_1, r3_2)
- (shared.m + smem_l_idx)[48] = r3_1;
- (shared.m + smem_r_idx)[56] = r3_2;
- }
- {
- HS_KEY_TYPE r4_1 = (shared.m + smem_l_idx)[64];
- HS_KEY_TYPE r4_2 = (shared.m + smem_r_idx)[72];
- HS_CMP_XCHG(r4_1, r4_2)
- (shared.m + smem_l_idx)[64] = r4_1;
- (shared.m + smem_r_idx)[72] = r4_2;
- }
- {
- HS_KEY_TYPE r5_1 = (shared.m + smem_l_idx)[80];
- HS_KEY_TYPE r5_2 = (shared.m + smem_r_idx)[88];
- HS_CMP_XCHG(r5_1, r5_2)
- (shared.m + smem_l_idx)[80] = r5_1;
- (shared.m + smem_r_idx)[88] = r5_2;
- }
- {
- HS_KEY_TYPE r6_1 = (shared.m + smem_l_idx)[96];
- HS_KEY_TYPE r6_2 = (shared.m + smem_r_idx)[104];
- HS_CMP_XCHG(r6_1, r6_2)
- (shared.m + smem_l_idx)[96] = r6_1;
- (shared.m + smem_r_idx)[104] = r6_2;
- }
- {
- HS_KEY_TYPE r7_1 = (shared.m + smem_l_idx)[112];
- HS_KEY_TYPE r7_2 = (shared.m + smem_r_idx)[120];
- HS_CMP_XCHG(r7_1, r7_2)
- (shared.m + smem_l_idx)[112] = r7_1;
- (shared.m + smem_r_idx)[120] = r7_2;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- r1 = (shared.m + get_local_id(0))[16 * 8 * 0];
- r16 = (shared.m + get_local_id(0))[16 * 8 * 1];
- r2 = (shared.m + get_local_id(0))[16 * 8 * 2];
- r15 = (shared.m + get_local_id(0))[16 * 8 * 3];
- r3 = (shared.m + get_local_id(0))[16 * 8 * 4];
- r14 = (shared.m + get_local_id(0))[16 * 8 * 5];
- r4 = (shared.m + get_local_id(0))[16 * 8 * 6];
- r13 = (shared.m + get_local_id(0))[16 * 8 * 7];
- r5 = (shared.m + get_local_id(0))[16 * 8 * 8];
- r12 = (shared.m + get_local_id(0))[16 * 8 * 9];
- r6 = (shared.m + get_local_id(0))[16 * 8 * 10];
- r11 = (shared.m + get_local_id(0))[16 * 8 * 11];
- r7 = (shared.m + get_local_id(0))[16 * 8 * 12];
- r10 = (shared.m + get_local_id(0))[16 * 8 * 13];
- r8 = (shared.m + get_local_id(0))[16 * 8 * 14];
- r9 = (shared.m + get_local_id(0))[16 * 8 * 15];
- { { uint const half_lane_mask = 4;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(shared.m + get_local_id(0))[16 * 8 * 0] = r1;
-(shared.m + get_local_id(0))[16 * 8 * 1] = r16;
-(shared.m + get_local_id(0))[16 * 8 * 2] = r2;
-(shared.m + get_local_id(0))[16 * 8 * 3] = r15;
-(shared.m + get_local_id(0))[16 * 8 * 4] = r3;
-(shared.m + get_local_id(0))[16 * 8 * 5] = r14;
-(shared.m + get_local_id(0))[16 * 8 * 6] = r4;
-(shared.m + get_local_id(0))[16 * 8 * 7] = r13;
-(shared.m + get_local_id(0))[16 * 8 * 8] = r5;
-(shared.m + get_local_id(0))[16 * 8 * 9] = r12;
-(shared.m + get_local_id(0))[16 * 8 * 10] = r6;
-(shared.m + get_local_id(0))[16 * 8 * 11] = r11;
-(shared.m + get_local_id(0))[16 * 8 * 12] = r7;
-(shared.m + get_local_id(0))[16 * 8 * 13] = r10;
-(shared.m + get_local_id(0))[16 * 8 * 14] = r8;
-(shared.m + get_local_id(0))[16 * 8 * 15] = r9;
-barrier(CLK_LOCAL_MEM_FENCE);
-{
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
- HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16];
- HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24];
- HS_CMP_XCHG(r0_2, r0_3)
- HS_CMP_XCHG(r0_1, r0_4)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_l_idx)[8] = r0_2;
- (shared.m + smem_r_idx)[16] = r0_3;
- (shared.m + smem_r_idx)[24] = r0_4;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[32];
- HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[40];
- HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[48];
- HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[56];
- HS_CMP_XCHG(r1_2, r1_3)
- HS_CMP_XCHG(r1_1, r1_4)
- HS_CMP_XCHG(r1_3, r1_4)
- HS_CMP_XCHG(r1_1, r1_2)
- (shared.m + smem_l_idx)[32] = r1_1;
- (shared.m + smem_l_idx)[40] = r1_2;
- (shared.m + smem_r_idx)[48] = r1_3;
- (shared.m + smem_r_idx)[56] = r1_4;
- }
- {
- HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[64];
- HS_KEY_TYPE r2_2 = (shared.m + smem_l_idx)[72];
- HS_KEY_TYPE r2_3 = (shared.m + smem_r_idx)[80];
- HS_KEY_TYPE r2_4 = (shared.m + smem_r_idx)[88];
- HS_CMP_XCHG(r2_2, r2_3)
- HS_CMP_XCHG(r2_1, r2_4)
- HS_CMP_XCHG(r2_3, r2_4)
- HS_CMP_XCHG(r2_1, r2_2)
- (shared.m + smem_l_idx)[64] = r2_1;
- (shared.m + smem_l_idx)[72] = r2_2;
- (shared.m + smem_r_idx)[80] = r2_3;
- (shared.m + smem_r_idx)[88] = r2_4;
- }
- {
- HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[96];
- HS_KEY_TYPE r3_2 = (shared.m + smem_l_idx)[104];
- HS_KEY_TYPE r3_3 = (shared.m + smem_r_idx)[112];
- HS_KEY_TYPE r3_4 = (shared.m + smem_r_idx)[120];
- HS_CMP_XCHG(r3_2, r3_3)
- HS_CMP_XCHG(r3_1, r3_4)
- HS_CMP_XCHG(r3_3, r3_4)
- HS_CMP_XCHG(r3_1, r3_2)
- (shared.m + smem_l_idx)[96] = r3_1;
- (shared.m + smem_l_idx)[104] = r3_2;
- (shared.m + smem_r_idx)[112] = r3_3;
- (shared.m + smem_r_idx)[120] = r3_4;
- }
-}
-barrier(CLK_LOCAL_MEM_FENCE);
-r1 = (shared.m + get_local_id(0))[16 * 8 * 0];
-r16 = (shared.m + get_local_id(0))[16 * 8 * 1];
-r2 = (shared.m + get_local_id(0))[16 * 8 * 2];
-r15 = (shared.m + get_local_id(0))[16 * 8 * 3];
-r3 = (shared.m + get_local_id(0))[16 * 8 * 4];
-r14 = (shared.m + get_local_id(0))[16 * 8 * 5];
-r4 = (shared.m + get_local_id(0))[16 * 8 * 6];
-r13 = (shared.m + get_local_id(0))[16 * 8 * 7];
-r5 = (shared.m + get_local_id(0))[16 * 8 * 8];
-r12 = (shared.m + get_local_id(0))[16 * 8 * 9];
-r6 = (shared.m + get_local_id(0))[16 * 8 * 10];
-r11 = (shared.m + get_local_id(0))[16 * 8 * 11];
-r7 = (shared.m + get_local_id(0))[16 * 8 * 12];
-r10 = (shared.m + get_local_id(0))[16 * 8 * 13];
-r8 = (shared.m + get_local_id(0))[16 * 8 * 14];
-r9 = (shared.m + get_local_id(0))[16 * 8 * 15];
-{ { uint const half_lane_mask = 4;
-uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
-int const t_lt = get_sub_group_local_id() < half_lane_idx;
-HS_CMP_HALF(0, r1)
-HS_CMP_HALF(1, r2)
-HS_CMP_HALF(2, r3)
-HS_CMP_HALF(3, r4)
-HS_CMP_HALF(4, r5)
-HS_CMP_HALF(5, r6)
-HS_CMP_HALF(6, r7)
-HS_CMP_HALF(7, r8)
-HS_CMP_HALF(8, r9)
-HS_CMP_HALF(9, r10)
-HS_CMP_HALF(10, r11)
-HS_CMP_HALF(11, r12)
-HS_CMP_HALF(12, r13)
-HS_CMP_HALF(13, r14)
-HS_CMP_HALF(14, r15)
-HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(shared.m + get_local_id(0))[16 * 8 * 0] = r1;
-(shared.m + get_local_id(0))[16 * 8 * 1] = r16;
-(shared.m + get_local_id(0))[16 * 8 * 2] = r2;
-(shared.m + get_local_id(0))[16 * 8 * 3] = r15;
-(shared.m + get_local_id(0))[16 * 8 * 4] = r3;
-(shared.m + get_local_id(0))[16 * 8 * 5] = r14;
-(shared.m + get_local_id(0))[16 * 8 * 6] = r4;
-(shared.m + get_local_id(0))[16 * 8 * 7] = r13;
-(shared.m + get_local_id(0))[16 * 8 * 8] = r5;
-(shared.m + get_local_id(0))[16 * 8 * 9] = r12;
-(shared.m + get_local_id(0))[16 * 8 * 10] = r6;
-(shared.m + get_local_id(0))[16 * 8 * 11] = r11;
-(shared.m + get_local_id(0))[16 * 8 * 12] = r7;
-(shared.m + get_local_id(0))[16 * 8 * 13] = r10;
-(shared.m + get_local_id(0))[16 * 8 * 14] = r8;
-(shared.m + get_local_id(0))[16 * 8 * 15] = r9;
-barrier(CLK_LOCAL_MEM_FENCE);
-{
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
- HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16];
- HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24];
- HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[32];
- HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[40];
- HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[48];
- HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[56];
- HS_CMP_XCHG(r0_4, r0_5)
- HS_CMP_XCHG(r0_3, r0_6)
- HS_CMP_XCHG(r0_2, r0_7)
- HS_CMP_XCHG(r0_1, r0_8)
- HS_CMP_XCHG(r0_5, r0_7)
- HS_CMP_XCHG(r0_6, r0_8)
- HS_CMP_XCHG(r0_5, r0_6)
- HS_CMP_XCHG(r0_7, r0_8)
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_l_idx)[8] = r0_2;
- (shared.m + smem_l_idx)[16] = r0_3;
- (shared.m + smem_l_idx)[24] = r0_4;
- (shared.m + smem_r_idx)[32] = r0_5;
- (shared.m + smem_r_idx)[40] = r0_6;
- (shared.m + smem_r_idx)[48] = r0_7;
- (shared.m + smem_r_idx)[56] = r0_8;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[64];
- HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[72];
- HS_KEY_TYPE r1_3 = (shared.m + smem_l_idx)[80];
- HS_KEY_TYPE r1_4 = (shared.m + smem_l_idx)[88];
- HS_KEY_TYPE r1_5 = (shared.m + smem_r_idx)[96];
- HS_KEY_TYPE r1_6 = (shared.m + smem_r_idx)[104];
- HS_KEY_TYPE r1_7 = (shared.m + smem_r_idx)[112];
- HS_KEY_TYPE r1_8 = (shared.m + smem_r_idx)[120];
- HS_CMP_XCHG(r1_4, r1_5)
- HS_CMP_XCHG(r1_3, r1_6)
- HS_CMP_XCHG(r1_2, r1_7)
- HS_CMP_XCHG(r1_1, r1_8)
- HS_CMP_XCHG(r1_5, r1_7)
- HS_CMP_XCHG(r1_6, r1_8)
- HS_CMP_XCHG(r1_5, r1_6)
- HS_CMP_XCHG(r1_7, r1_8)
- HS_CMP_XCHG(r1_1, r1_3)
- HS_CMP_XCHG(r1_2, r1_4)
- HS_CMP_XCHG(r1_1, r1_2)
- HS_CMP_XCHG(r1_3, r1_4)
- (shared.m + smem_l_idx)[64] = r1_1;
- (shared.m + smem_l_idx)[72] = r1_2;
- (shared.m + smem_l_idx)[80] = r1_3;
- (shared.m + smem_l_idx)[88] = r1_4;
- (shared.m + smem_r_idx)[96] = r1_5;
- (shared.m + smem_r_idx)[104] = r1_6;
- (shared.m + smem_r_idx)[112] = r1_7;
- (shared.m + smem_r_idx)[120] = r1_8;
- }
-}
-barrier(CLK_LOCAL_MEM_FENCE);
-r1 = (shared.m + get_local_id(0))[16 * 8 * 0];
-r16 = (shared.m + get_local_id(0))[16 * 8 * 1];
-r2 = (shared.m + get_local_id(0))[16 * 8 * 2];
-r15 = (shared.m + get_local_id(0))[16 * 8 * 3];
-r3 = (shared.m + get_local_id(0))[16 * 8 * 4];
-r14 = (shared.m + get_local_id(0))[16 * 8 * 5];
-r4 = (shared.m + get_local_id(0))[16 * 8 * 6];
-r13 = (shared.m + get_local_id(0))[16 * 8 * 7];
-r5 = (shared.m + get_local_id(0))[16 * 8 * 8];
-r12 = (shared.m + get_local_id(0))[16 * 8 * 9];
-r6 = (shared.m + get_local_id(0))[16 * 8 * 10];
-r11 = (shared.m + get_local_id(0))[16 * 8 * 11];
-r7 = (shared.m + get_local_id(0))[16 * 8 * 12];
-r10 = (shared.m + get_local_id(0))[16 * 8 * 13];
-r8 = (shared.m + get_local_id(0))[16 * 8 * 14];
-r9 = (shared.m + get_local_id(0))[16 * 8 * 15];
-{ { uint const half_lane_mask = 4;
-uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
-int const t_lt = get_sub_group_local_id() < half_lane_idx;
-HS_CMP_HALF(0, r1)
-HS_CMP_HALF(1, r2)
-HS_CMP_HALF(2, r3)
-HS_CMP_HALF(3, r4)
-HS_CMP_HALF(4, r5)
-HS_CMP_HALF(5, r6)
-HS_CMP_HALF(6, r7)
-HS_CMP_HALF(7, r8)
-HS_CMP_HALF(8, r9)
-HS_CMP_HALF(9, r10)
-HS_CMP_HALF(10, r11)
-HS_CMP_HALF(11, r12)
-HS_CMP_HALF(12, r13)
-HS_CMP_HALF(13, r14)
-HS_CMP_HALF(14, r15)
-HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(shared.m + get_local_id(0))[16 * 8 * 0] = r1;
-(shared.m + get_local_id(0))[16 * 8 * 1] = r16;
-(shared.m + get_local_id(0))[16 * 8 * 2] = r2;
-(shared.m + get_local_id(0))[16 * 8 * 3] = r15;
-(shared.m + get_local_id(0))[16 * 8 * 4] = r3;
-(shared.m + get_local_id(0))[16 * 8 * 5] = r14;
-(shared.m + get_local_id(0))[16 * 8 * 6] = r4;
-(shared.m + get_local_id(0))[16 * 8 * 7] = r13;
-(shared.m + get_local_id(0))[16 * 8 * 8] = r5;
-(shared.m + get_local_id(0))[16 * 8 * 9] = r12;
-(shared.m + get_local_id(0))[16 * 8 * 10] = r6;
-(shared.m + get_local_id(0))[16 * 8 * 11] = r11;
-(shared.m + get_local_id(0))[16 * 8 * 12] = r7;
-(shared.m + get_local_id(0))[16 * 8 * 13] = r10;
-(shared.m + get_local_id(0))[16 * 8 * 14] = r8;
-(shared.m + get_local_id(0))[16 * 8 * 15] = r9;
-barrier(CLK_LOCAL_MEM_FENCE);
-{
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
- HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16];
- HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24];
- HS_KEY_TYPE r0_5 = (shared.m + smem_l_idx)[32];
- HS_KEY_TYPE r0_6 = (shared.m + smem_l_idx)[40];
- HS_KEY_TYPE r0_7 = (shared.m + smem_l_idx)[48];
- HS_KEY_TYPE r0_8 = (shared.m + smem_l_idx)[56];
- HS_KEY_TYPE r0_9 = (shared.m + smem_r_idx)[64];
- HS_KEY_TYPE r0_10 = (shared.m + smem_r_idx)[72];
- HS_KEY_TYPE r0_11 = (shared.m + smem_r_idx)[80];
- HS_KEY_TYPE r0_12 = (shared.m + smem_r_idx)[88];
- HS_KEY_TYPE r0_13 = (shared.m + smem_r_idx)[96];
- HS_KEY_TYPE r0_14 = (shared.m + smem_r_idx)[104];
- HS_KEY_TYPE r0_15 = (shared.m + smem_r_idx)[112];
- HS_KEY_TYPE r0_16 = (shared.m + smem_r_idx)[120];
- HS_CMP_XCHG(r0_8, r0_9)
- HS_CMP_XCHG(r0_7, r0_10)
- HS_CMP_XCHG(r0_6, r0_11)
- HS_CMP_XCHG(r0_5, r0_12)
- HS_CMP_XCHG(r0_4, r0_13)
- HS_CMP_XCHG(r0_3, r0_14)
- HS_CMP_XCHG(r0_2, r0_15)
- HS_CMP_XCHG(r0_1, r0_16)
- HS_CMP_XCHG(r0_9, r0_13)
- HS_CMP_XCHG(r0_11, r0_15)
- HS_CMP_XCHG(r0_9, r0_11)
- HS_CMP_XCHG(r0_13, r0_15)
- HS_CMP_XCHG(r0_10, r0_14)
- HS_CMP_XCHG(r0_12, r0_16)
- HS_CMP_XCHG(r0_10, r0_12)
- HS_CMP_XCHG(r0_14, r0_16)
- HS_CMP_XCHG(r0_9, r0_10)
- HS_CMP_XCHG(r0_11, r0_12)
- HS_CMP_XCHG(r0_13, r0_14)
- HS_CMP_XCHG(r0_15, r0_16)
- HS_CMP_XCHG(r0_1, r0_5)
- HS_CMP_XCHG(r0_3, r0_7)
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_5, r0_7)
- HS_CMP_XCHG(r0_2, r0_6)
- HS_CMP_XCHG(r0_4, r0_8)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_6, r0_8)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_5, r0_6)
- HS_CMP_XCHG(r0_7, r0_8)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_l_idx)[8] = r0_2;
- (shared.m + smem_l_idx)[16] = r0_3;
- (shared.m + smem_l_idx)[24] = r0_4;
- (shared.m + smem_l_idx)[32] = r0_5;
- (shared.m + smem_l_idx)[40] = r0_6;
- (shared.m + smem_l_idx)[48] = r0_7;
- (shared.m + smem_l_idx)[56] = r0_8;
- (shared.m + smem_r_idx)[64] = r0_9;
- (shared.m + smem_r_idx)[72] = r0_10;
- (shared.m + smem_r_idx)[80] = r0_11;
- (shared.m + smem_r_idx)[88] = r0_12;
- (shared.m + smem_r_idx)[96] = r0_13;
- (shared.m + smem_r_idx)[104] = r0_14;
- (shared.m + smem_r_idx)[112] = r0_15;
- (shared.m + smem_r_idx)[120] = r0_16;
- }
-}
-barrier(CLK_LOCAL_MEM_FENCE);
-r1 = (shared.m + get_local_id(0))[16 * 8 * 0];
-r16 = (shared.m + get_local_id(0))[16 * 8 * 1];
-r2 = (shared.m + get_local_id(0))[16 * 8 * 2];
-r15 = (shared.m + get_local_id(0))[16 * 8 * 3];
-r3 = (shared.m + get_local_id(0))[16 * 8 * 4];
-r14 = (shared.m + get_local_id(0))[16 * 8 * 5];
-r4 = (shared.m + get_local_id(0))[16 * 8 * 6];
-r13 = (shared.m + get_local_id(0))[16 * 8 * 7];
-r5 = (shared.m + get_local_id(0))[16 * 8 * 8];
-r12 = (shared.m + get_local_id(0))[16 * 8 * 9];
-r6 = (shared.m + get_local_id(0))[16 * 8 * 10];
-r11 = (shared.m + get_local_id(0))[16 * 8 * 11];
-r7 = (shared.m + get_local_id(0))[16 * 8 * 12];
-r10 = (shared.m + get_local_id(0))[16 * 8 * 13];
-r8 = (shared.m + get_local_id(0))[16 * 8 * 14];
-r9 = (shared.m + get_local_id(0))[16 * 8 * 15];
-{ { uint const half_lane_mask = 4;
-uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
-int const t_lt = get_sub_group_local_id() < half_lane_idx;
-HS_CMP_HALF(0, r1)
-HS_CMP_HALF(1, r2)
-HS_CMP_HALF(2, r3)
-HS_CMP_HALF(3, r4)
-HS_CMP_HALF(4, r5)
-HS_CMP_HALF(5, r6)
-HS_CMP_HALF(6, r7)
-HS_CMP_HALF(7, r8)
-HS_CMP_HALF(8, r9)
-HS_CMP_HALF(9, r10)
-HS_CMP_HALF(10, r11)
-HS_CMP_HALF(11, r12)
-HS_CMP_HALF(12, r13)
-HS_CMP_HALF(13, r14)
-HS_CMP_HALF(14, r15)
-HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(vout + gmem_idx)[0 * 8] = r1;
-(vout + gmem_idx)[1 * 8] = r2;
-(vout + gmem_idx)[2 * 8] = r3;
-(vout + gmem_idx)[3 * 8] = r4;
-(vout + gmem_idx)[4 * 8] = r5;
-(vout + gmem_idx)[5 * 8] = r6;
-(vout + gmem_idx)[6 * 8] = r7;
-(vout + gmem_idx)[7 * 8] = r8;
-(vout + gmem_idx)[8 * 8] = r9;
-(vout + gmem_idx)[9 * 8] = r10;
-(vout + gmem_idx)[10 * 8] = r11;
-(vout + gmem_idx)[11 * 8] = r12;
-(vout + gmem_idx)[12 * 8] = r13;
-(vout + gmem_idx)[13 * 8] = r14;
-(vout + gmem_idx)[14 * 8] = r15;
-(vout + gmem_idx)[15 * 8] = r16;
-}
-
-__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
-__attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_bs_3(__global HS_KEY_TYPE const* const restrict vin,
- __global HS_KEY_TYPE* const restrict vout)
-{
- __local union
- {
- HS_KEY_TYPE m[16 * 64];
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
- HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
- HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
- HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
- HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
- HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
- HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
- HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
- HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
- HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
- HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
- HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
- HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
- HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
- HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
- HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r6, r11)
- HS_CMP_XCHG(r7, r10)
- HS_CMP_XCHG(r4, r13)
- HS_CMP_XCHG(r14, r15)
- HS_CMP_XCHG(r8, r12)
- HS_CMP_XCHG(r2, r3)
- HS_CMP_XCHG(r5, r9)
- HS_CMP_XCHG(r2, r5)
- HS_CMP_XCHG(r8, r14)
- HS_CMP_XCHG(r3, r9)
- HS_CMP_XCHG(r12, r15)
- HS_CMP_XCHG(r3, r5)
- HS_CMP_XCHG(r6, r7)
- HS_CMP_XCHG(r10, r11)
- HS_CMP_XCHG(r12, r14)
- HS_CMP_XCHG(r4, r9)
- HS_CMP_XCHG(r8, r13)
- HS_CMP_XCHG(r7, r9)
- HS_CMP_XCHG(r11, r13)
- HS_CMP_XCHG(r4, r6)
- HS_CMP_XCHG(r8, r10)
- HS_CMP_XCHG(r4, r5)
- HS_CMP_XCHG(r6, r7)
- HS_CMP_XCHG(r8, r9)
- HS_CMP_XCHG(r10, r11)
- HS_CMP_XCHG(r12, r13)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- {
- uint const flip_lane_mask = 1;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- {
- uint const flip_lane_mask = 3;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- {
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- {
- uint const flip_lane_mask = 7;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- {
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- {
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- uint const smem_l_idx = get_sub_group_id() * 64 + get_sub_group_local_id();
- uint const smem_r_idx =
- (get_sub_group_id() ^ 1) * 64 + (get_sub_group_local_id() ^ 7);
- (shared.m + get_local_id(0))[8 * 8 * 0] = r1;
- (shared.m + get_local_id(0))[8 * 8 * 1] = r16;
- (shared.m + get_local_id(0))[8 * 8 * 2] = r2;
- (shared.m + get_local_id(0))[8 * 8 * 3] = r15;
- (shared.m + get_local_id(0))[8 * 8 * 4] = r3;
- (shared.m + get_local_id(0))[8 * 8 * 5] = r14;
- (shared.m + get_local_id(0))[8 * 8 * 6] = r4;
- (shared.m + get_local_id(0))[8 * 8 * 7] = r13;
- (shared.m + get_local_id(0))[8 * 8 * 8] = r5;
- (shared.m + get_local_id(0))[8 * 8 * 9] = r12;
- (shared.m + get_local_id(0))[8 * 8 * 10] = r6;
- (shared.m + get_local_id(0))[8 * 8 * 11] = r11;
- (shared.m + get_local_id(0))[8 * 8 * 12] = r7;
- (shared.m + get_local_id(0))[8 * 8 * 13] = r10;
- (shared.m + get_local_id(0))[8 * 8 * 14] = r8;
- (shared.m + get_local_id(0))[8 * 8 * 15] = r9;
- barrier(CLK_LOCAL_MEM_FENCE);
- {
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_r_idx)[8] = r0_2;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16];
- HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24];
- HS_CMP_XCHG(r1_1, r1_2)
- (shared.m + smem_l_idx)[16] = r1_1;
- (shared.m + smem_r_idx)[24] = r1_2;
- }
- {
- HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[32];
- HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[40];
- HS_CMP_XCHG(r2_1, r2_2)
- (shared.m + smem_l_idx)[32] = r2_1;
- (shared.m + smem_r_idx)[40] = r2_2;
- }
- {
- HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[48];
- HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[56];
- HS_CMP_XCHG(r3_1, r3_2)
- (shared.m + smem_l_idx)[48] = r3_1;
- (shared.m + smem_r_idx)[56] = r3_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[520];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[512] = r0_1;
- (shared.m + smem_r_idx)[520] = r0_2;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[528];
- HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[536];
- HS_CMP_XCHG(r1_1, r1_2)
- (shared.m + smem_l_idx)[528] = r1_1;
- (shared.m + smem_r_idx)[536] = r1_2;
- }
- {
- HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[544];
- HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[552];
- HS_CMP_XCHG(r2_1, r2_2)
- (shared.m + smem_l_idx)[544] = r2_1;
- (shared.m + smem_r_idx)[552] = r2_2;
- }
- {
- HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[560];
- HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[568];
- HS_CMP_XCHG(r3_1, r3_2)
- (shared.m + smem_l_idx)[560] = r3_1;
- (shared.m + smem_r_idx)[568] = r3_2;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- r1 = (shared.m + get_local_id(0))[8 * 8 * 0];
- r16 = (shared.m + get_local_id(0))[8 * 8 * 1];
- r2 = (shared.m + get_local_id(0))[8 * 8 * 2];
- r15 = (shared.m + get_local_id(0))[8 * 8 * 3];
- r3 = (shared.m + get_local_id(0))[8 * 8 * 4];
- r14 = (shared.m + get_local_id(0))[8 * 8 * 5];
- r4 = (shared.m + get_local_id(0))[8 * 8 * 6];
- r13 = (shared.m + get_local_id(0))[8 * 8 * 7];
- r5 = (shared.m + get_local_id(0))[8 * 8 * 8];
- r12 = (shared.m + get_local_id(0))[8 * 8 * 9];
- r6 = (shared.m + get_local_id(0))[8 * 8 * 10];
- r11 = (shared.m + get_local_id(0))[8 * 8 * 11];
- r7 = (shared.m + get_local_id(0))[8 * 8 * 12];
- r10 = (shared.m + get_local_id(0))[8 * 8 * 13];
- r8 = (shared.m + get_local_id(0))[8 * 8 * 14];
- r9 = (shared.m + get_local_id(0))[8 * 8 * 15];
- { { uint const half_lane_mask = 4;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(shared.m + get_local_id(0))[8 * 8 * 0] = r1;
-(shared.m + get_local_id(0))[8 * 8 * 1] = r16;
-(shared.m + get_local_id(0))[8 * 8 * 2] = r2;
-(shared.m + get_local_id(0))[8 * 8 * 3] = r15;
-(shared.m + get_local_id(0))[8 * 8 * 4] = r3;
-(shared.m + get_local_id(0))[8 * 8 * 5] = r14;
-(shared.m + get_local_id(0))[8 * 8 * 6] = r4;
-(shared.m + get_local_id(0))[8 * 8 * 7] = r13;
-(shared.m + get_local_id(0))[8 * 8 * 8] = r5;
-(shared.m + get_local_id(0))[8 * 8 * 9] = r12;
-(shared.m + get_local_id(0))[8 * 8 * 10] = r6;
-(shared.m + get_local_id(0))[8 * 8 * 11] = r11;
-(shared.m + get_local_id(0))[8 * 8 * 12] = r7;
-(shared.m + get_local_id(0))[8 * 8 * 13] = r10;
-(shared.m + get_local_id(0))[8 * 8 * 14] = r8;
-(shared.m + get_local_id(0))[8 * 8 * 15] = r9;
-barrier(CLK_LOCAL_MEM_FENCE);
-{
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
- HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16];
- HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24];
- HS_CMP_XCHG(r0_2, r0_3)
- HS_CMP_XCHG(r0_1, r0_4)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_l_idx)[8] = r0_2;
- (shared.m + smem_r_idx)[16] = r0_3;
- (shared.m + smem_r_idx)[24] = r0_4;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[32];
- HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[40];
- HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[48];
- HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[56];
- HS_CMP_XCHG(r1_2, r1_3)
- HS_CMP_XCHG(r1_1, r1_4)
- HS_CMP_XCHG(r1_3, r1_4)
- HS_CMP_XCHG(r1_1, r1_2)
- (shared.m + smem_l_idx)[32] = r1_1;
- (shared.m + smem_l_idx)[40] = r1_2;
- (shared.m + smem_r_idx)[48] = r1_3;
- (shared.m + smem_r_idx)[56] = r1_4;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[520];
- HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[528];
- HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[536];
- HS_CMP_XCHG(r0_2, r0_3)
- HS_CMP_XCHG(r0_1, r0_4)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[512] = r0_1;
- (shared.m + smem_l_idx)[520] = r0_2;
- (shared.m + smem_r_idx)[528] = r0_3;
- (shared.m + smem_r_idx)[536] = r0_4;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[544];
- HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[552];
- HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[560];
- HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[568];
- HS_CMP_XCHG(r1_2, r1_3)
- HS_CMP_XCHG(r1_1, r1_4)
- HS_CMP_XCHG(r1_3, r1_4)
- HS_CMP_XCHG(r1_1, r1_2)
- (shared.m + smem_l_idx)[544] = r1_1;
- (shared.m + smem_l_idx)[552] = r1_2;
- (shared.m + smem_r_idx)[560] = r1_3;
- (shared.m + smem_r_idx)[568] = r1_4;
- }
-}
-barrier(CLK_LOCAL_MEM_FENCE);
-r1 = (shared.m + get_local_id(0))[8 * 8 * 0];
-r16 = (shared.m + get_local_id(0))[8 * 8 * 1];
-r2 = (shared.m + get_local_id(0))[8 * 8 * 2];
-r15 = (shared.m + get_local_id(0))[8 * 8 * 3];
-r3 = (shared.m + get_local_id(0))[8 * 8 * 4];
-r14 = (shared.m + get_local_id(0))[8 * 8 * 5];
-r4 = (shared.m + get_local_id(0))[8 * 8 * 6];
-r13 = (shared.m + get_local_id(0))[8 * 8 * 7];
-r5 = (shared.m + get_local_id(0))[8 * 8 * 8];
-r12 = (shared.m + get_local_id(0))[8 * 8 * 9];
-r6 = (shared.m + get_local_id(0))[8 * 8 * 10];
-r11 = (shared.m + get_local_id(0))[8 * 8 * 11];
-r7 = (shared.m + get_local_id(0))[8 * 8 * 12];
-r10 = (shared.m + get_local_id(0))[8 * 8 * 13];
-r8 = (shared.m + get_local_id(0))[8 * 8 * 14];
-r9 = (shared.m + get_local_id(0))[8 * 8 * 15];
-{ { uint const half_lane_mask = 4;
-uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
-int const t_lt = get_sub_group_local_id() < half_lane_idx;
-HS_CMP_HALF(0, r1)
-HS_CMP_HALF(1, r2)
-HS_CMP_HALF(2, r3)
-HS_CMP_HALF(3, r4)
-HS_CMP_HALF(4, r5)
-HS_CMP_HALF(5, r6)
-HS_CMP_HALF(6, r7)
-HS_CMP_HALF(7, r8)
-HS_CMP_HALF(8, r9)
-HS_CMP_HALF(9, r10)
-HS_CMP_HALF(10, r11)
-HS_CMP_HALF(11, r12)
-HS_CMP_HALF(12, r13)
-HS_CMP_HALF(13, r14)
-HS_CMP_HALF(14, r15)
-HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(shared.m + get_local_id(0))[8 * 8 * 0] = r1;
-(shared.m + get_local_id(0))[8 * 8 * 1] = r16;
-(shared.m + get_local_id(0))[8 * 8 * 2] = r2;
-(shared.m + get_local_id(0))[8 * 8 * 3] = r15;
-(shared.m + get_local_id(0))[8 * 8 * 4] = r3;
-(shared.m + get_local_id(0))[8 * 8 * 5] = r14;
-(shared.m + get_local_id(0))[8 * 8 * 6] = r4;
-(shared.m + get_local_id(0))[8 * 8 * 7] = r13;
-(shared.m + get_local_id(0))[8 * 8 * 8] = r5;
-(shared.m + get_local_id(0))[8 * 8 * 9] = r12;
-(shared.m + get_local_id(0))[8 * 8 * 10] = r6;
-(shared.m + get_local_id(0))[8 * 8 * 11] = r11;
-(shared.m + get_local_id(0))[8 * 8 * 12] = r7;
-(shared.m + get_local_id(0))[8 * 8 * 13] = r10;
-(shared.m + get_local_id(0))[8 * 8 * 14] = r8;
-(shared.m + get_local_id(0))[8 * 8 * 15] = r9;
-barrier(CLK_LOCAL_MEM_FENCE);
-{
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
- HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16];
- HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24];
- HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[32];
- HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[40];
- HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[48];
- HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[56];
- HS_CMP_XCHG(r0_4, r0_5)
- HS_CMP_XCHG(r0_3, r0_6)
- HS_CMP_XCHG(r0_2, r0_7)
- HS_CMP_XCHG(r0_1, r0_8)
- HS_CMP_XCHG(r0_5, r0_7)
- HS_CMP_XCHG(r0_6, r0_8)
- HS_CMP_XCHG(r0_5, r0_6)
- HS_CMP_XCHG(r0_7, r0_8)
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_l_idx)[8] = r0_2;
- (shared.m + smem_l_idx)[16] = r0_3;
- (shared.m + smem_l_idx)[24] = r0_4;
- (shared.m + smem_r_idx)[32] = r0_5;
- (shared.m + smem_r_idx)[40] = r0_6;
- (shared.m + smem_r_idx)[48] = r0_7;
- (shared.m + smem_r_idx)[56] = r0_8;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[520];
- HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[528];
- HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[536];
- HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[544];
- HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[552];
- HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[560];
- HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[568];
- HS_CMP_XCHG(r0_4, r0_5)
- HS_CMP_XCHG(r0_3, r0_6)
- HS_CMP_XCHG(r0_2, r0_7)
- HS_CMP_XCHG(r0_1, r0_8)
- HS_CMP_XCHG(r0_5, r0_7)
- HS_CMP_XCHG(r0_6, r0_8)
- HS_CMP_XCHG(r0_5, r0_6)
- HS_CMP_XCHG(r0_7, r0_8)
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- (shared.m + smem_l_idx)[512] = r0_1;
- (shared.m + smem_l_idx)[520] = r0_2;
- (shared.m + smem_l_idx)[528] = r0_3;
- (shared.m + smem_l_idx)[536] = r0_4;
- (shared.m + smem_r_idx)[544] = r0_5;
- (shared.m + smem_r_idx)[552] = r0_6;
- (shared.m + smem_r_idx)[560] = r0_7;
- (shared.m + smem_r_idx)[568] = r0_8;
- }
-}
-barrier(CLK_LOCAL_MEM_FENCE);
-r1 = (shared.m + get_local_id(0))[8 * 8 * 0];
-r16 = (shared.m + get_local_id(0))[8 * 8 * 1];
-r2 = (shared.m + get_local_id(0))[8 * 8 * 2];
-r15 = (shared.m + get_local_id(0))[8 * 8 * 3];
-r3 = (shared.m + get_local_id(0))[8 * 8 * 4];
-r14 = (shared.m + get_local_id(0))[8 * 8 * 5];
-r4 = (shared.m + get_local_id(0))[8 * 8 * 6];
-r13 = (shared.m + get_local_id(0))[8 * 8 * 7];
-r5 = (shared.m + get_local_id(0))[8 * 8 * 8];
-r12 = (shared.m + get_local_id(0))[8 * 8 * 9];
-r6 = (shared.m + get_local_id(0))[8 * 8 * 10];
-r11 = (shared.m + get_local_id(0))[8 * 8 * 11];
-r7 = (shared.m + get_local_id(0))[8 * 8 * 12];
-r10 = (shared.m + get_local_id(0))[8 * 8 * 13];
-r8 = (shared.m + get_local_id(0))[8 * 8 * 14];
-r9 = (shared.m + get_local_id(0))[8 * 8 * 15];
-{ { uint const half_lane_mask = 4;
-uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
-int const t_lt = get_sub_group_local_id() < half_lane_idx;
-HS_CMP_HALF(0, r1)
-HS_CMP_HALF(1, r2)
-HS_CMP_HALF(2, r3)
-HS_CMP_HALF(3, r4)
-HS_CMP_HALF(4, r5)
-HS_CMP_HALF(5, r6)
-HS_CMP_HALF(6, r7)
-HS_CMP_HALF(7, r8)
-HS_CMP_HALF(8, r9)
-HS_CMP_HALF(9, r10)
-HS_CMP_HALF(10, r11)
-HS_CMP_HALF(11, r12)
-HS_CMP_HALF(12, r13)
-HS_CMP_HALF(13, r14)
-HS_CMP_HALF(14, r15)
-HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(vout + gmem_idx)[0 * 8] = r1;
-(vout + gmem_idx)[1 * 8] = r2;
-(vout + gmem_idx)[2 * 8] = r3;
-(vout + gmem_idx)[3 * 8] = r4;
-(vout + gmem_idx)[4 * 8] = r5;
-(vout + gmem_idx)[5 * 8] = r6;
-(vout + gmem_idx)[6 * 8] = r7;
-(vout + gmem_idx)[7 * 8] = r8;
-(vout + gmem_idx)[8 * 8] = r9;
-(vout + gmem_idx)[9 * 8] = r10;
-(vout + gmem_idx)[10 * 8] = r11;
-(vout + gmem_idx)[11 * 8] = r12;
-(vout + gmem_idx)[12 * 8] = r13;
-(vout + gmem_idx)[13 * 8] = r14;
-(vout + gmem_idx)[14 * 8] = r15;
-(vout + gmem_idx)[15 * 8] = r16;
-}
-
-__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
-__attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_bs_2(__global HS_KEY_TYPE const* const restrict vin,
- __global HS_KEY_TYPE* const restrict vout)
-{
- __local union
- {
- HS_KEY_TYPE m[16 * 32];
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
- HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
- HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
- HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
- HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
- HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
- HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
- HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
- HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
- HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
- HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
- HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
- HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
- HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
- HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
- HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r6, r11)
- HS_CMP_XCHG(r7, r10)
- HS_CMP_XCHG(r4, r13)
- HS_CMP_XCHG(r14, r15)
- HS_CMP_XCHG(r8, r12)
- HS_CMP_XCHG(r2, r3)
- HS_CMP_XCHG(r5, r9)
- HS_CMP_XCHG(r2, r5)
- HS_CMP_XCHG(r8, r14)
- HS_CMP_XCHG(r3, r9)
- HS_CMP_XCHG(r12, r15)
- HS_CMP_XCHG(r3, r5)
- HS_CMP_XCHG(r6, r7)
- HS_CMP_XCHG(r10, r11)
- HS_CMP_XCHG(r12, r14)
- HS_CMP_XCHG(r4, r9)
- HS_CMP_XCHG(r8, r13)
- HS_CMP_XCHG(r7, r9)
- HS_CMP_XCHG(r11, r13)
- HS_CMP_XCHG(r4, r6)
- HS_CMP_XCHG(r8, r10)
- HS_CMP_XCHG(r4, r5)
- HS_CMP_XCHG(r6, r7)
- HS_CMP_XCHG(r8, r9)
- HS_CMP_XCHG(r10, r11)
- HS_CMP_XCHG(r12, r13)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- {
- uint const flip_lane_mask = 1;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- {
- uint const flip_lane_mask = 3;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- {
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- {
- uint const flip_lane_mask = 7;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- {
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- {
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- uint const smem_l_idx = get_sub_group_id() * 32 + get_sub_group_local_id();
- uint const smem_r_idx =
- (get_sub_group_id() ^ 1) * 32 + (get_sub_group_local_id() ^ 7);
- (shared.m + get_local_id(0))[4 * 8 * 0] = r1;
- (shared.m + get_local_id(0))[4 * 8 * 1] = r16;
- (shared.m + get_local_id(0))[4 * 8 * 2] = r2;
- (shared.m + get_local_id(0))[4 * 8 * 3] = r15;
- (shared.m + get_local_id(0))[4 * 8 * 4] = r3;
- (shared.m + get_local_id(0))[4 * 8 * 5] = r14;
- (shared.m + get_local_id(0))[4 * 8 * 6] = r4;
- (shared.m + get_local_id(0))[4 * 8 * 7] = r13;
- (shared.m + get_local_id(0))[4 * 8 * 8] = r5;
- (shared.m + get_local_id(0))[4 * 8 * 9] = r12;
- (shared.m + get_local_id(0))[4 * 8 * 10] = r6;
- (shared.m + get_local_id(0))[4 * 8 * 11] = r11;
- (shared.m + get_local_id(0))[4 * 8 * 12] = r7;
- (shared.m + get_local_id(0))[4 * 8 * 13] = r10;
- (shared.m + get_local_id(0))[4 * 8 * 14] = r8;
- (shared.m + get_local_id(0))[4 * 8 * 15] = r9;
- barrier(CLK_LOCAL_MEM_FENCE);
- {
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_r_idx)[8] = r0_2;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16];
- HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24];
- HS_CMP_XCHG(r1_1, r1_2)
- (shared.m + smem_l_idx)[16] = r1_1;
- (shared.m + smem_r_idx)[24] = r1_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[136];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[128] = r0_1;
- (shared.m + smem_r_idx)[136] = r0_2;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[144];
- HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[152];
- HS_CMP_XCHG(r1_1, r1_2)
- (shared.m + smem_l_idx)[144] = r1_1;
- (shared.m + smem_r_idx)[152] = r1_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[256];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[264];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[256] = r0_1;
- (shared.m + smem_r_idx)[264] = r0_2;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[272];
- HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[280];
- HS_CMP_XCHG(r1_1, r1_2)
- (shared.m + smem_l_idx)[272] = r1_1;
- (shared.m + smem_r_idx)[280] = r1_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[384];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[392];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[384] = r0_1;
- (shared.m + smem_r_idx)[392] = r0_2;
- }
- {
- HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[400];
- HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[408];
- HS_CMP_XCHG(r1_1, r1_2)
- (shared.m + smem_l_idx)[400] = r1_1;
- (shared.m + smem_r_idx)[408] = r1_2;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- r1 = (shared.m + get_local_id(0))[4 * 8 * 0];
- r16 = (shared.m + get_local_id(0))[4 * 8 * 1];
- r2 = (shared.m + get_local_id(0))[4 * 8 * 2];
- r15 = (shared.m + get_local_id(0))[4 * 8 * 3];
- r3 = (shared.m + get_local_id(0))[4 * 8 * 4];
- r14 = (shared.m + get_local_id(0))[4 * 8 * 5];
- r4 = (shared.m + get_local_id(0))[4 * 8 * 6];
- r13 = (shared.m + get_local_id(0))[4 * 8 * 7];
- r5 = (shared.m + get_local_id(0))[4 * 8 * 8];
- r12 = (shared.m + get_local_id(0))[4 * 8 * 9];
- r6 = (shared.m + get_local_id(0))[4 * 8 * 10];
- r11 = (shared.m + get_local_id(0))[4 * 8 * 11];
- r7 = (shared.m + get_local_id(0))[4 * 8 * 12];
- r10 = (shared.m + get_local_id(0))[4 * 8 * 13];
- r8 = (shared.m + get_local_id(0))[4 * 8 * 14];
- r9 = (shared.m + get_local_id(0))[4 * 8 * 15];
- { { uint const half_lane_mask = 4;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(shared.m + get_local_id(0))[4 * 8 * 0] = r1;
-(shared.m + get_local_id(0))[4 * 8 * 1] = r16;
-(shared.m + get_local_id(0))[4 * 8 * 2] = r2;
-(shared.m + get_local_id(0))[4 * 8 * 3] = r15;
-(shared.m + get_local_id(0))[4 * 8 * 4] = r3;
-(shared.m + get_local_id(0))[4 * 8 * 5] = r14;
-(shared.m + get_local_id(0))[4 * 8 * 6] = r4;
-(shared.m + get_local_id(0))[4 * 8 * 7] = r13;
-(shared.m + get_local_id(0))[4 * 8 * 8] = r5;
-(shared.m + get_local_id(0))[4 * 8 * 9] = r12;
-(shared.m + get_local_id(0))[4 * 8 * 10] = r6;
-(shared.m + get_local_id(0))[4 * 8 * 11] = r11;
-(shared.m + get_local_id(0))[4 * 8 * 12] = r7;
-(shared.m + get_local_id(0))[4 * 8 * 13] = r10;
-(shared.m + get_local_id(0))[4 * 8 * 14] = r8;
-(shared.m + get_local_id(0))[4 * 8 * 15] = r9;
-barrier(CLK_LOCAL_MEM_FENCE);
-{
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
- HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16];
- HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24];
- HS_CMP_XCHG(r0_2, r0_3)
- HS_CMP_XCHG(r0_1, r0_4)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_l_idx)[8] = r0_2;
- (shared.m + smem_r_idx)[16] = r0_3;
- (shared.m + smem_r_idx)[24] = r0_4;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[136];
- HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[144];
- HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[152];
- HS_CMP_XCHG(r0_2, r0_3)
- HS_CMP_XCHG(r0_1, r0_4)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[128] = r0_1;
- (shared.m + smem_l_idx)[136] = r0_2;
- (shared.m + smem_r_idx)[144] = r0_3;
- (shared.m + smem_r_idx)[152] = r0_4;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[256];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[264];
- HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[272];
- HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[280];
- HS_CMP_XCHG(r0_2, r0_3)
- HS_CMP_XCHG(r0_1, r0_4)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[256] = r0_1;
- (shared.m + smem_l_idx)[264] = r0_2;
- (shared.m + smem_r_idx)[272] = r0_3;
- (shared.m + smem_r_idx)[280] = r0_4;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[384];
- HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[392];
- HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[400];
- HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[408];
- HS_CMP_XCHG(r0_2, r0_3)
- HS_CMP_XCHG(r0_1, r0_4)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[384] = r0_1;
- (shared.m + smem_l_idx)[392] = r0_2;
- (shared.m + smem_r_idx)[400] = r0_3;
- (shared.m + smem_r_idx)[408] = r0_4;
- }
-}
-barrier(CLK_LOCAL_MEM_FENCE);
-r1 = (shared.m + get_local_id(0))[4 * 8 * 0];
-r16 = (shared.m + get_local_id(0))[4 * 8 * 1];
-r2 = (shared.m + get_local_id(0))[4 * 8 * 2];
-r15 = (shared.m + get_local_id(0))[4 * 8 * 3];
-r3 = (shared.m + get_local_id(0))[4 * 8 * 4];
-r14 = (shared.m + get_local_id(0))[4 * 8 * 5];
-r4 = (shared.m + get_local_id(0))[4 * 8 * 6];
-r13 = (shared.m + get_local_id(0))[4 * 8 * 7];
-r5 = (shared.m + get_local_id(0))[4 * 8 * 8];
-r12 = (shared.m + get_local_id(0))[4 * 8 * 9];
-r6 = (shared.m + get_local_id(0))[4 * 8 * 10];
-r11 = (shared.m + get_local_id(0))[4 * 8 * 11];
-r7 = (shared.m + get_local_id(0))[4 * 8 * 12];
-r10 = (shared.m + get_local_id(0))[4 * 8 * 13];
-r8 = (shared.m + get_local_id(0))[4 * 8 * 14];
-r9 = (shared.m + get_local_id(0))[4 * 8 * 15];
-{ { uint const half_lane_mask = 4;
-uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
-int const t_lt = get_sub_group_local_id() < half_lane_idx;
-HS_CMP_HALF(0, r1)
-HS_CMP_HALF(1, r2)
-HS_CMP_HALF(2, r3)
-HS_CMP_HALF(3, r4)
-HS_CMP_HALF(4, r5)
-HS_CMP_HALF(5, r6)
-HS_CMP_HALF(6, r7)
-HS_CMP_HALF(7, r8)
-HS_CMP_HALF(8, r9)
-HS_CMP_HALF(9, r10)
-HS_CMP_HALF(10, r11)
-HS_CMP_HALF(11, r12)
-HS_CMP_HALF(12, r13)
-HS_CMP_HALF(13, r14)
-HS_CMP_HALF(14, r15)
-HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(vout + gmem_idx)[0 * 8] = r1;
-(vout + gmem_idx)[1 * 8] = r2;
-(vout + gmem_idx)[2 * 8] = r3;
-(vout + gmem_idx)[3 * 8] = r4;
-(vout + gmem_idx)[4 * 8] = r5;
-(vout + gmem_idx)[5 * 8] = r6;
-(vout + gmem_idx)[6 * 8] = r7;
-(vout + gmem_idx)[7 * 8] = r8;
-(vout + gmem_idx)[8 * 8] = r9;
-(vout + gmem_idx)[9 * 8] = r10;
-(vout + gmem_idx)[10 * 8] = r11;
-(vout + gmem_idx)[11 * 8] = r12;
-(vout + gmem_idx)[12 * 8] = r13;
-(vout + gmem_idx)[13 * 8] = r14;
-(vout + gmem_idx)[14 * 8] = r15;
-(vout + gmem_idx)[15 * 8] = r16;
-}
-
-__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
-__attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_bs_1(__global HS_KEY_TYPE const* const restrict vin,
- __global HS_KEY_TYPE* const restrict vout)
-{
- __local union
- {
- HS_KEY_TYPE m[16 * 16];
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
- HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
- HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
- HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
- HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
- HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
- HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
- HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
- HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
- HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
- HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
- HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
- HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
- HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
- HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
- HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r6, r11)
- HS_CMP_XCHG(r7, r10)
- HS_CMP_XCHG(r4, r13)
- HS_CMP_XCHG(r14, r15)
- HS_CMP_XCHG(r8, r12)
- HS_CMP_XCHG(r2, r3)
- HS_CMP_XCHG(r5, r9)
- HS_CMP_XCHG(r2, r5)
- HS_CMP_XCHG(r8, r14)
- HS_CMP_XCHG(r3, r9)
- HS_CMP_XCHG(r12, r15)
- HS_CMP_XCHG(r3, r5)
- HS_CMP_XCHG(r6, r7)
- HS_CMP_XCHG(r10, r11)
- HS_CMP_XCHG(r12, r14)
- HS_CMP_XCHG(r4, r9)
- HS_CMP_XCHG(r8, r13)
- HS_CMP_XCHG(r7, r9)
- HS_CMP_XCHG(r11, r13)
- HS_CMP_XCHG(r4, r6)
- HS_CMP_XCHG(r8, r10)
- HS_CMP_XCHG(r4, r5)
- HS_CMP_XCHG(r6, r7)
- HS_CMP_XCHG(r8, r9)
- HS_CMP_XCHG(r10, r11)
- HS_CMP_XCHG(r12, r13)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- {
- uint const flip_lane_mask = 1;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- {
- uint const flip_lane_mask = 3;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- {
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- {
- uint const flip_lane_mask = 7;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- {
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- {
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- uint const smem_l_idx = get_sub_group_id() * 16 + get_sub_group_local_id();
- uint const smem_r_idx =
- (get_sub_group_id() ^ 1) * 16 + (get_sub_group_local_id() ^ 7);
- (shared.m + get_local_id(0))[2 * 8 * 0] = r1;
- (shared.m + get_local_id(0))[2 * 8 * 1] = r16;
- (shared.m + get_local_id(0))[2 * 8 * 2] = r2;
- (shared.m + get_local_id(0))[2 * 8 * 3] = r15;
- (shared.m + get_local_id(0))[2 * 8 * 4] = r3;
- (shared.m + get_local_id(0))[2 * 8 * 5] = r14;
- (shared.m + get_local_id(0))[2 * 8 * 6] = r4;
- (shared.m + get_local_id(0))[2 * 8 * 7] = r13;
- (shared.m + get_local_id(0))[2 * 8 * 8] = r5;
- (shared.m + get_local_id(0))[2 * 8 * 9] = r12;
- (shared.m + get_local_id(0))[2 * 8 * 10] = r6;
- (shared.m + get_local_id(0))[2 * 8 * 11] = r11;
- (shared.m + get_local_id(0))[2 * 8 * 12] = r7;
- (shared.m + get_local_id(0))[2 * 8 * 13] = r10;
- (shared.m + get_local_id(0))[2 * 8 * 14] = r8;
- (shared.m + get_local_id(0))[2 * 8 * 15] = r9;
- barrier(CLK_LOCAL_MEM_FENCE);
- {
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_r_idx)[8] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[32];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[40];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[32] = r0_1;
- (shared.m + smem_r_idx)[40] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[64];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[72];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[64] = r0_1;
- (shared.m + smem_r_idx)[72] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[96];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[104];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[96] = r0_1;
- (shared.m + smem_r_idx)[104] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[136];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[128] = r0_1;
- (shared.m + smem_r_idx)[136] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[160];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[168];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[160] = r0_1;
- (shared.m + smem_r_idx)[168] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[192];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[200];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[192] = r0_1;
- (shared.m + smem_r_idx)[200] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[224];
- HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[232];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[224] = r0_1;
- (shared.m + smem_r_idx)[232] = r0_2;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- r1 = (shared.m + get_local_id(0))[2 * 8 * 0];
- r16 = (shared.m + get_local_id(0))[2 * 8 * 1];
- r2 = (shared.m + get_local_id(0))[2 * 8 * 2];
- r15 = (shared.m + get_local_id(0))[2 * 8 * 3];
- r3 = (shared.m + get_local_id(0))[2 * 8 * 4];
- r14 = (shared.m + get_local_id(0))[2 * 8 * 5];
- r4 = (shared.m + get_local_id(0))[2 * 8 * 6];
- r13 = (shared.m + get_local_id(0))[2 * 8 * 7];
- r5 = (shared.m + get_local_id(0))[2 * 8 * 8];
- r12 = (shared.m + get_local_id(0))[2 * 8 * 9];
- r6 = (shared.m + get_local_id(0))[2 * 8 * 10];
- r11 = (shared.m + get_local_id(0))[2 * 8 * 11];
- r7 = (shared.m + get_local_id(0))[2 * 8 * 12];
- r10 = (shared.m + get_local_id(0))[2 * 8 * 13];
- r8 = (shared.m + get_local_id(0))[2 * 8 * 14];
- r9 = (shared.m + get_local_id(0))[2 * 8 * 15];
- { { uint const half_lane_mask = 4;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(vout + gmem_idx)[0 * 8] = r1;
-(vout + gmem_idx)[1 * 8] = r2;
-(vout + gmem_idx)[2 * 8] = r3;
-(vout + gmem_idx)[3 * 8] = r4;
-(vout + gmem_idx)[4 * 8] = r5;
-(vout + gmem_idx)[5 * 8] = r6;
-(vout + gmem_idx)[6 * 8] = r7;
-(vout + gmem_idx)[7 * 8] = r8;
-(vout + gmem_idx)[8 * 8] = r9;
-(vout + gmem_idx)[9 * 8] = r10;
-(vout + gmem_idx)[10 * 8] = r11;
-(vout + gmem_idx)[11 * 8] = r12;
-(vout + gmem_idx)[12 * 8] = r13;
-(vout + gmem_idx)[13 * 8] = r14;
-(vout + gmem_idx)[14 * 8] = r15;
-(vout + gmem_idx)[15 * 8] = r16;
-}
-
-__kernel __attribute__((reqd_work_group_size(8, 1, 1)))
-__attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_bs_0(__global HS_KEY_TYPE const* const restrict vin,
- __global HS_KEY_TYPE* const restrict vout)
-{
- __local union
- {
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
- HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
- HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
- HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
- HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
- HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
- HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
- HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
- HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
- HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
- HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
- HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
- HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
- HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
- HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
- HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r6, r11)
- HS_CMP_XCHG(r7, r10)
- HS_CMP_XCHG(r4, r13)
- HS_CMP_XCHG(r14, r15)
- HS_CMP_XCHG(r8, r12)
- HS_CMP_XCHG(r2, r3)
- HS_CMP_XCHG(r5, r9)
- HS_CMP_XCHG(r2, r5)
- HS_CMP_XCHG(r8, r14)
- HS_CMP_XCHG(r3, r9)
- HS_CMP_XCHG(r12, r15)
- HS_CMP_XCHG(r3, r5)
- HS_CMP_XCHG(r6, r7)
- HS_CMP_XCHG(r10, r11)
- HS_CMP_XCHG(r12, r14)
- HS_CMP_XCHG(r4, r9)
- HS_CMP_XCHG(r8, r13)
- HS_CMP_XCHG(r7, r9)
- HS_CMP_XCHG(r11, r13)
- HS_CMP_XCHG(r4, r6)
- HS_CMP_XCHG(r8, r10)
- HS_CMP_XCHG(r4, r5)
- HS_CMP_XCHG(r6, r7)
- HS_CMP_XCHG(r8, r9)
- HS_CMP_XCHG(r10, r11)
- HS_CMP_XCHG(r12, r13)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- {
- uint const flip_lane_mask = 1;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- {
- uint const flip_lane_mask = 3;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- {
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- {
- uint const flip_lane_mask = 7;
- uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
- int const t_lt = get_sub_group_local_id() < flip_lane_idx;
- HS_CMP_FLIP(0, r1, r16)
- HS_CMP_FLIP(1, r2, r15)
- HS_CMP_FLIP(2, r3, r14)
- HS_CMP_FLIP(3, r4, r13)
- HS_CMP_FLIP(4, r5, r12)
- HS_CMP_FLIP(5, r6, r11)
- HS_CMP_FLIP(6, r7, r10)
- HS_CMP_FLIP(7, r8, r9)
- }
- {
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- {
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- (vout + gmem_idx)[0 * 8] = r1;
- (vout + gmem_idx)[1 * 8] = r2;
- (vout + gmem_idx)[2 * 8] = r3;
- (vout + gmem_idx)[3 * 8] = r4;
- (vout + gmem_idx)[4 * 8] = r5;
- (vout + gmem_idx)[5 * 8] = r6;
- (vout + gmem_idx)[6 * 8] = r7;
- (vout + gmem_idx)[7 * 8] = r8;
- (vout + gmem_idx)[8 * 8] = r9;
- (vout + gmem_idx)[9 * 8] = r10;
- (vout + gmem_idx)[10 * 8] = r11;
- (vout + gmem_idx)[11 * 8] = r12;
- (vout + gmem_idx)[12 * 8] = r13;
- (vout + gmem_idx)[13 * 8] = r14;
- (vout + gmem_idx)[14 * 8] = r15;
- (vout + gmem_idx)[15 * 8] = r16;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_bc_4(__global HS_KEY_TYPE* const restrict vout)
-{
- __local union
- {
- HS_KEY_TYPE m[16 * 128];
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- uint const gmem_l_idx = (global_id / 128) * 2048 + (global_id & 127);
- uint const smem_l_idx = get_sub_group_id() * 128 + get_sub_group_local_id();
- {
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128];
- HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256];
- HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384];
- HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[512];
- HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[640];
- HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[768];
- HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[896];
- HS_KEY_TYPE r0_9 = (vout + gmem_l_idx)[1024];
- HS_KEY_TYPE r0_10 = (vout + gmem_l_idx)[1152];
- HS_KEY_TYPE r0_11 = (vout + gmem_l_idx)[1280];
- HS_KEY_TYPE r0_12 = (vout + gmem_l_idx)[1408];
- HS_KEY_TYPE r0_13 = (vout + gmem_l_idx)[1536];
- HS_KEY_TYPE r0_14 = (vout + gmem_l_idx)[1664];
- HS_KEY_TYPE r0_15 = (vout + gmem_l_idx)[1792];
- HS_KEY_TYPE r0_16 = (vout + gmem_l_idx)[1920];
- HS_CMP_XCHG(r0_1, r0_9)
- HS_CMP_XCHG(r0_5, r0_13)
- HS_CMP_XCHG(r0_1, r0_5)
- HS_CMP_XCHG(r0_9, r0_13)
- HS_CMP_XCHG(r0_3, r0_11)
- HS_CMP_XCHG(r0_7, r0_15)
- HS_CMP_XCHG(r0_3, r0_7)
- HS_CMP_XCHG(r0_11, r0_15)
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_5, r0_7)
- HS_CMP_XCHG(r0_9, r0_11)
- HS_CMP_XCHG(r0_13, r0_15)
- HS_CMP_XCHG(r0_2, r0_10)
- HS_CMP_XCHG(r0_6, r0_14)
- HS_CMP_XCHG(r0_2, r0_6)
- HS_CMP_XCHG(r0_10, r0_14)
- HS_CMP_XCHG(r0_4, r0_12)
- HS_CMP_XCHG(r0_8, r0_16)
- HS_CMP_XCHG(r0_4, r0_8)
- HS_CMP_XCHG(r0_12, r0_16)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_6, r0_8)
- HS_CMP_XCHG(r0_10, r0_12)
- HS_CMP_XCHG(r0_14, r0_16)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_5, r0_6)
- HS_CMP_XCHG(r0_7, r0_8)
- HS_CMP_XCHG(r0_9, r0_10)
- HS_CMP_XCHG(r0_11, r0_12)
- HS_CMP_XCHG(r0_13, r0_14)
- HS_CMP_XCHG(r0_15, r0_16)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_l_idx)[8] = r0_2;
- (shared.m + smem_l_idx)[16] = r0_3;
- (shared.m + smem_l_idx)[24] = r0_4;
- (shared.m + smem_l_idx)[32] = r0_5;
- (shared.m + smem_l_idx)[40] = r0_6;
- (shared.m + smem_l_idx)[48] = r0_7;
- (shared.m + smem_l_idx)[56] = r0_8;
- (shared.m + smem_l_idx)[64] = r0_9;
- (shared.m + smem_l_idx)[72] = r0_10;
- (shared.m + smem_l_idx)[80] = r0_11;
- (shared.m + smem_l_idx)[88] = r0_12;
- (shared.m + smem_l_idx)[96] = r0_13;
- (shared.m + smem_l_idx)[104] = r0_14;
- (shared.m + smem_l_idx)[112] = r0_15;
- (shared.m + smem_l_idx)[120] = r0_16;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[16 * 8 * 0];
- HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[16 * 8 * 1];
- HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[16 * 8 * 2];
- HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[16 * 8 * 3];
- HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[16 * 8 * 4];
- HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[16 * 8 * 5];
- HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[16 * 8 * 6];
- HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[16 * 8 * 7];
- HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[16 * 8 * 8];
- HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[16 * 8 * 9];
- HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[16 * 8 * 10];
- HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[16 * 8 * 11];
- HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[16 * 8 * 12];
- HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[16 * 8 * 13];
- HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[16 * 8 * 14];
- HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[16 * 8 * 15];
- { { uint const half_lane_mask = 4;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(vout + gmem_idx)[0 * 8] = r1;
-(vout + gmem_idx)[1 * 8] = r2;
-(vout + gmem_idx)[2 * 8] = r3;
-(vout + gmem_idx)[3 * 8] = r4;
-(vout + gmem_idx)[4 * 8] = r5;
-(vout + gmem_idx)[5 * 8] = r6;
-(vout + gmem_idx)[6 * 8] = r7;
-(vout + gmem_idx)[7 * 8] = r8;
-(vout + gmem_idx)[8 * 8] = r9;
-(vout + gmem_idx)[9 * 8] = r10;
-(vout + gmem_idx)[10 * 8] = r11;
-(vout + gmem_idx)[11 * 8] = r12;
-(vout + gmem_idx)[12 * 8] = r13;
-(vout + gmem_idx)[13 * 8] = r14;
-(vout + gmem_idx)[14 * 8] = r15;
-(vout + gmem_idx)[15 * 8] = r16;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_bc_3(__global HS_KEY_TYPE* const restrict vout)
-{
- __local union
- {
- HS_KEY_TYPE m[16 * 64];
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- uint const gmem_l_idx = (global_id / 64) * 1024 + (global_id & 63);
- uint const smem_l_idx = get_sub_group_id() * 64 + get_sub_group_local_id();
- {
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128];
- HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256];
- HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384];
- HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[512];
- HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[640];
- HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[768];
- HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[896];
- HS_CMP_XCHG(r0_1, r0_5)
- HS_CMP_XCHG(r0_3, r0_7)
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_5, r0_7)
- HS_CMP_XCHG(r0_2, r0_6)
- HS_CMP_XCHG(r0_4, r0_8)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_6, r0_8)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_5, r0_6)
- HS_CMP_XCHG(r0_7, r0_8)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_l_idx)[8] = r0_2;
- (shared.m + smem_l_idx)[16] = r0_3;
- (shared.m + smem_l_idx)[24] = r0_4;
- (shared.m + smem_l_idx)[32] = r0_5;
- (shared.m + smem_l_idx)[40] = r0_6;
- (shared.m + smem_l_idx)[48] = r0_7;
- (shared.m + smem_l_idx)[56] = r0_8;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192];
- HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[320];
- HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[448];
- HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[576];
- HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[704];
- HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[832];
- HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[960];
- HS_CMP_XCHG(r0_1, r0_5)
- HS_CMP_XCHG(r0_3, r0_7)
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_5, r0_7)
- HS_CMP_XCHG(r0_2, r0_6)
- HS_CMP_XCHG(r0_4, r0_8)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_6, r0_8)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- HS_CMP_XCHG(r0_5, r0_6)
- HS_CMP_XCHG(r0_7, r0_8)
- (shared.m + smem_l_idx)[512] = r0_1;
- (shared.m + smem_l_idx)[520] = r0_2;
- (shared.m + smem_l_idx)[528] = r0_3;
- (shared.m + smem_l_idx)[536] = r0_4;
- (shared.m + smem_l_idx)[544] = r0_5;
- (shared.m + smem_l_idx)[552] = r0_6;
- (shared.m + smem_l_idx)[560] = r0_7;
- (shared.m + smem_l_idx)[568] = r0_8;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[8 * 8 * 0];
- HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[8 * 8 * 1];
- HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[8 * 8 * 2];
- HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[8 * 8 * 3];
- HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[8 * 8 * 4];
- HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[8 * 8 * 5];
- HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[8 * 8 * 6];
- HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[8 * 8 * 7];
- HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[8 * 8 * 8];
- HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[8 * 8 * 9];
- HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[8 * 8 * 10];
- HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[8 * 8 * 11];
- HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[8 * 8 * 12];
- HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[8 * 8 * 13];
- HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[8 * 8 * 14];
- HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[8 * 8 * 15];
- { { uint const half_lane_mask = 4;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(vout + gmem_idx)[0 * 8] = r1;
-(vout + gmem_idx)[1 * 8] = r2;
-(vout + gmem_idx)[2 * 8] = r3;
-(vout + gmem_idx)[3 * 8] = r4;
-(vout + gmem_idx)[4 * 8] = r5;
-(vout + gmem_idx)[5 * 8] = r6;
-(vout + gmem_idx)[6 * 8] = r7;
-(vout + gmem_idx)[7 * 8] = r8;
-(vout + gmem_idx)[8 * 8] = r9;
-(vout + gmem_idx)[9 * 8] = r10;
-(vout + gmem_idx)[10 * 8] = r11;
-(vout + gmem_idx)[11 * 8] = r12;
-(vout + gmem_idx)[12 * 8] = r13;
-(vout + gmem_idx)[13 * 8] = r14;
-(vout + gmem_idx)[14 * 8] = r15;
-(vout + gmem_idx)[15 * 8] = r16;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_bc_2(__global HS_KEY_TYPE* const restrict vout)
-{
- __local union
- {
- HS_KEY_TYPE m[16 * 32];
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- uint const gmem_l_idx = (global_id / 32) * 512 + (global_id & 31);
- uint const smem_l_idx = get_sub_group_id() * 32 + get_sub_group_local_id();
- {
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128];
- HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256];
- HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384];
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_l_idx)[8] = r0_2;
- (shared.m + smem_l_idx)[16] = r0_3;
- (shared.m + smem_l_idx)[24] = r0_4;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[32];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[160];
- HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[288];
- HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[416];
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- (shared.m + smem_l_idx)[128] = r0_1;
- (shared.m + smem_l_idx)[136] = r0_2;
- (shared.m + smem_l_idx)[144] = r0_3;
- (shared.m + smem_l_idx)[152] = r0_4;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192];
- HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[320];
- HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[448];
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- (shared.m + smem_l_idx)[256] = r0_1;
- (shared.m + smem_l_idx)[264] = r0_2;
- (shared.m + smem_l_idx)[272] = r0_3;
- (shared.m + smem_l_idx)[280] = r0_4;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[96];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[224];
- HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[352];
- HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[480];
- HS_CMP_XCHG(r0_1, r0_3)
- HS_CMP_XCHG(r0_2, r0_4)
- HS_CMP_XCHG(r0_1, r0_2)
- HS_CMP_XCHG(r0_3, r0_4)
- (shared.m + smem_l_idx)[384] = r0_1;
- (shared.m + smem_l_idx)[392] = r0_2;
- (shared.m + smem_l_idx)[400] = r0_3;
- (shared.m + smem_l_idx)[408] = r0_4;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[4 * 8 * 0];
- HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[4 * 8 * 1];
- HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[4 * 8 * 2];
- HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[4 * 8 * 3];
- HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[4 * 8 * 4];
- HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[4 * 8 * 5];
- HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[4 * 8 * 6];
- HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[4 * 8 * 7];
- HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[4 * 8 * 8];
- HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[4 * 8 * 9];
- HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[4 * 8 * 10];
- HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[4 * 8 * 11];
- HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[4 * 8 * 12];
- HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[4 * 8 * 13];
- HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[4 * 8 * 14];
- HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[4 * 8 * 15];
- { { uint const half_lane_mask = 4;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(vout + gmem_idx)[0 * 8] = r1;
-(vout + gmem_idx)[1 * 8] = r2;
-(vout + gmem_idx)[2 * 8] = r3;
-(vout + gmem_idx)[3 * 8] = r4;
-(vout + gmem_idx)[4 * 8] = r5;
-(vout + gmem_idx)[5 * 8] = r6;
-(vout + gmem_idx)[6 * 8] = r7;
-(vout + gmem_idx)[7 * 8] = r8;
-(vout + gmem_idx)[8 * 8] = r9;
-(vout + gmem_idx)[9 * 8] = r10;
-(vout + gmem_idx)[10 * 8] = r11;
-(vout + gmem_idx)[11 * 8] = r12;
-(vout + gmem_idx)[12 * 8] = r13;
-(vout + gmem_idx)[13 * 8] = r14;
-(vout + gmem_idx)[14 * 8] = r15;
-(vout + gmem_idx)[15 * 8] = r16;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_bc_1(__global HS_KEY_TYPE* const restrict vout)
-{
- __local union
- {
- HS_KEY_TYPE m[16 * 16];
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- uint const gmem_l_idx = (global_id / 16) * 256 + (global_id & 15);
- uint const smem_l_idx = get_sub_group_id() * 16 + get_sub_group_local_id();
- {
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[0] = r0_1;
- (shared.m + smem_l_idx)[8] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[16];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[144];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[32] = r0_1;
- (shared.m + smem_l_idx)[40] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[32];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[160];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[64] = r0_1;
- (shared.m + smem_l_idx)[72] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[48];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[176];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[96] = r0_1;
- (shared.m + smem_l_idx)[104] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[128] = r0_1;
- (shared.m + smem_l_idx)[136] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[80];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[208];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[160] = r0_1;
- (shared.m + smem_l_idx)[168] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[96];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[224];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[192] = r0_1;
- (shared.m + smem_l_idx)[200] = r0_2;
- }
- {
- HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[112];
- HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[240];
- HS_CMP_XCHG(r0_1, r0_2)
- (shared.m + smem_l_idx)[224] = r0_1;
- (shared.m + smem_l_idx)[232] = r0_2;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[2 * 8 * 0];
- HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[2 * 8 * 1];
- HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[2 * 8 * 2];
- HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[2 * 8 * 3];
- HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[2 * 8 * 4];
- HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[2 * 8 * 5];
- HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[2 * 8 * 6];
- HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[2 * 8 * 7];
- HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[2 * 8 * 8];
- HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[2 * 8 * 9];
- HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[2 * 8 * 10];
- HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[2 * 8 * 11];
- HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[2 * 8 * 12];
- HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[2 * 8 * 13];
- HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[2 * 8 * 14];
- HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[2 * 8 * 15];
- { { uint const half_lane_mask = 4;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(vout + gmem_idx)[0 * 8] = r1;
-(vout + gmem_idx)[1 * 8] = r2;
-(vout + gmem_idx)[2 * 8] = r3;
-(vout + gmem_idx)[3 * 8] = r4;
-(vout + gmem_idx)[4 * 8] = r5;
-(vout + gmem_idx)[5 * 8] = r6;
-(vout + gmem_idx)[6 * 8] = r7;
-(vout + gmem_idx)[7 * 8] = r8;
-(vout + gmem_idx)[8 * 8] = r9;
-(vout + gmem_idx)[9 * 8] = r10;
-(vout + gmem_idx)[10 * 8] = r11;
-(vout + gmem_idx)[11 * 8] = r12;
-(vout + gmem_idx)[12 * 8] = r13;
-(vout + gmem_idx)[13 * 8] = r14;
-(vout + gmem_idx)[14 * 8] = r15;
-(vout + gmem_idx)[15 * 8] = r16;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_bc_0(__global HS_KEY_TYPE* const restrict vout)
-{
- __local union
- {
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
-
- HS_KEY_TYPE r1 = (vout + gmem_idx)[0 * 8];
- HS_KEY_TYPE r2 = (vout + gmem_idx)[1 * 8];
- HS_KEY_TYPE r3 = (vout + gmem_idx)[2 * 8];
- HS_KEY_TYPE r4 = (vout + gmem_idx)[3 * 8];
- HS_KEY_TYPE r5 = (vout + gmem_idx)[4 * 8];
- HS_KEY_TYPE r6 = (vout + gmem_idx)[5 * 8];
- HS_KEY_TYPE r7 = (vout + gmem_idx)[6 * 8];
- HS_KEY_TYPE r8 = (vout + gmem_idx)[7 * 8];
- HS_KEY_TYPE r9 = (vout + gmem_idx)[8 * 8];
- HS_KEY_TYPE r10 = (vout + gmem_idx)[9 * 8];
- HS_KEY_TYPE r11 = (vout + gmem_idx)[10 * 8];
- HS_KEY_TYPE r12 = (vout + gmem_idx)[11 * 8];
- HS_KEY_TYPE r13 = (vout + gmem_idx)[12 * 8];
- HS_KEY_TYPE r14 = (vout + gmem_idx)[13 * 8];
- HS_KEY_TYPE r15 = (vout + gmem_idx)[14 * 8];
- HS_KEY_TYPE r16 = (vout + gmem_idx)[15 * 8];
- { { uint const half_lane_mask = 4;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 2;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-{
- uint const half_lane_mask = 1;
- uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
- int const t_lt = get_sub_group_local_id() < half_lane_idx;
- HS_CMP_HALF(0, r1)
- HS_CMP_HALF(1, r2)
- HS_CMP_HALF(2, r3)
- HS_CMP_HALF(3, r4)
- HS_CMP_HALF(4, r5)
- HS_CMP_HALF(5, r6)
- HS_CMP_HALF(6, r7)
- HS_CMP_HALF(7, r8)
- HS_CMP_HALF(8, r9)
- HS_CMP_HALF(9, r10)
- HS_CMP_HALF(10, r11)
- HS_CMP_HALF(11, r12)
- HS_CMP_HALF(12, r13)
- HS_CMP_HALF(13, r14)
- HS_CMP_HALF(14, r15)
- HS_CMP_HALF(15, r16)
-}
-HS_CMP_XCHG(r1, r9)
-HS_CMP_XCHG(r5, r13)
-HS_CMP_XCHG(r1, r5)
-HS_CMP_XCHG(r9, r13)
-HS_CMP_XCHG(r3, r11)
-HS_CMP_XCHG(r7, r15)
-HS_CMP_XCHG(r3, r7)
-HS_CMP_XCHG(r11, r15)
-HS_CMP_XCHG(r1, r3)
-HS_CMP_XCHG(r5, r7)
-HS_CMP_XCHG(r9, r11)
-HS_CMP_XCHG(r13, r15)
-HS_CMP_XCHG(r2, r10)
-HS_CMP_XCHG(r6, r14)
-HS_CMP_XCHG(r2, r6)
-HS_CMP_XCHG(r10, r14)
-HS_CMP_XCHG(r4, r12)
-HS_CMP_XCHG(r8, r16)
-HS_CMP_XCHG(r4, r8)
-HS_CMP_XCHG(r12, r16)
-HS_CMP_XCHG(r2, r4)
-HS_CMP_XCHG(r6, r8)
-HS_CMP_XCHG(r10, r12)
-HS_CMP_XCHG(r14, r16)
-HS_CMP_XCHG(r1, r2)
-HS_CMP_XCHG(r3, r4)
-HS_CMP_XCHG(r5, r6)
-HS_CMP_XCHG(r7, r8)
-HS_CMP_XCHG(r9, r10)
-HS_CMP_XCHG(r11, r12)
-HS_CMP_XCHG(r13, r14)
-HS_CMP_XCHG(r15, r16)
-}
-(vout + gmem_idx)[0 * 8] = r1;
-(vout + gmem_idx)[1 * 8] = r2;
-(vout + gmem_idx)[2 * 8] = r3;
-(vout + gmem_idx)[3 * 8] = r4;
-(vout + gmem_idx)[4 * 8] = r5;
-(vout + gmem_idx)[5 * 8] = r6;
-(vout + gmem_idx)[6 * 8] = r7;
-(vout + gmem_idx)[7 * 8] = r8;
-(vout + gmem_idx)[8 * 8] = r9;
-(vout + gmem_idx)[9 * 8] = r10;
-(vout + gmem_idx)[10 * 8] = r11;
-(vout + gmem_idx)[11 * 8] = r12;
-(vout + gmem_idx)[12 * 8] = r13;
-(vout + gmem_idx)[13 * 8] = r14;
-(vout + gmem_idx)[14 * 8] = r15;
-(vout + gmem_idx)[15 * 8] = r16;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_1(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 0;
-
- uint const merge_stride = 16 * 8 << 0;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 0)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_2(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 1;
-
- uint const merge_stride = 16 * 8 << 1;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 1)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_3(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 2;
-
- uint const merge_stride = 16 * 8 << 2;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 2)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_4(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 3;
-
- uint const merge_stride = 16 * 8 << 3;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 3)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_5(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 4;
-
- uint const merge_stride = 16 * 8 << 4;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 4)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_6(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 5;
-
- uint const merge_stride = 16 * 8 << 5;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 5)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_5(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 0;
-
- uint const merge_stride = 16 * 8 << 0;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 0)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_7(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 6;
-
- uint const merge_stride = 16 * 8 << 6;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 6)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_6(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 1;
-
- uint const merge_stride = 16 * 8 << 1;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 1)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_8(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 7;
-
- uint const merge_stride = 16 * 8 << 7;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 7)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_7(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 2;
-
- uint const merge_stride = 16 * 8 << 2;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 2)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_9(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 8;
-
- uint const merge_stride = 16 * 8 << 8;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 8)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_8(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 3;
-
- uint const merge_stride = 16 * 8 << 3;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 3)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_10(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 9;
-
- uint const merge_stride = 16 * 8 << 9;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 9)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_9(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 4;
-
- uint const merge_stride = 16 * 8 << 4;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 4)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_11(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 10;
-
- uint const merge_stride = 16 * 8 << 10;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 10)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_10(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 5;
-
- uint const merge_stride = 16 * 8 << 5;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 5)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_12(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 11;
-
- uint const merge_stride = 16 * 8 << 11;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 11)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_11(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 6;
-
- uint const merge_stride = 16 * 8 << 6;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 6)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_13(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 12;
-
- uint const merge_stride = 16 * 8 << 12;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 12)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_12(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 7;
-
- uint const merge_stride = 16 * 8 << 7;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 7)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_14(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 13;
-
- uint const merge_stride = 16 * 8 << 13;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 13)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_13(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 8;
-
- uint const merge_stride = 16 * 8 << 8;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 8)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_15(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 14;
-
- uint const merge_stride = 16 * 8 << 14;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 14)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_14(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 9;
-
- uint const merge_stride = 16 * 8 << 9;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 9)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_fm_16(__global HS_KEY_TYPE* const restrict vout,
- uint const fm_full,
- uint const fm_frac)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = warp_idx / 16 >> 15;
-
- uint const merge_stride = 16 * 8 << 15;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
-
- uint const merge_l_off =
- (warp_idx - merge_idx * (16 << 15)) * 8 + warp_lane_idx;
- uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
-
- int const merge_r_off = merge_keys - merge_l_end - 1;
-
- __global HS_KEY_TYPE* const restrict merge_l =
- vout + (merge_base + merge_l_off);
- __global HS_KEY_TYPE* const restrict merge_r =
- vout + (merge_base + merge_r_off);
-
- HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
- if (merge_idx < fm_full) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
- HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
- HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
- HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
- HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
- HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
- HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
- HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r8, r25)
- HS_CMP_XCHG(r7, r26)
- HS_CMP_XCHG(r6, r27)
- HS_CMP_XCHG(r5, r28)
- HS_CMP_XCHG(r4, r29)
- HS_CMP_XCHG(r3, r30)
- HS_CMP_XCHG(r2, r31)
- HS_CMP_XCHG(r1, r32)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_r[15 * merge_stride] = r32;
- merge_r[14 * merge_stride] = r31;
- merge_r[13 * merge_stride] = r30;
- merge_r[12 * merge_stride] = r29;
- merge_r[11 * merge_stride] = r28;
- merge_r[10 * merge_stride] = r27;
- merge_r[9 * merge_stride] = r26;
- merge_r[8 * merge_stride] = r25;
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 8) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
- HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
- HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
- HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r12, r21)
- HS_CMP_XCHG(r11, r22)
- HS_CMP_XCHG(r10, r23)
- HS_CMP_XCHG(r9, r24)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- merge_r[7 * merge_stride] = r24;
- merge_r[6 * merge_stride] = r23;
- merge_r[5 * merge_stride] = r22;
- merge_r[4 * merge_stride] = r21;
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 4) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
- HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r14, r19)
- HS_CMP_XCHG(r13, r20)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- merge_r[3 * merge_stride] = r20;
- merge_r[2 * merge_stride] = r19;
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else if (fm_frac == 2) {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- HS_CMP_XCHG(r15, r18)
- HS_CMP_XCHG(r17, r18)
- merge_r[1 * merge_stride] = r18;
- merge_r[0 * merge_stride] = r17;
- } else {
- HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
- HS_CMP_XCHG(r16, r17)
- merge_r[0 * merge_stride] = r17;
- }
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- merge_l[15 * merge_stride] = r16;
- merge_l[14 * merge_stride] = r15;
- merge_l[13 * merge_stride] = r14;
- merge_l[12 * merge_stride] = r13;
- merge_l[11 * merge_stride] = r12;
- merge_l[10 * merge_stride] = r11;
- merge_l[9 * merge_stride] = r10;
- merge_l[8 * merge_stride] = r9;
- merge_l[7 * merge_stride] = r8;
- merge_l[6 * merge_stride] = r7;
- merge_l[5 * merge_stride] = r6;
- merge_l[4 * merge_stride] = r5;
- merge_l[3 * merge_stride] = r4;
- merge_l[2 * merge_stride] = r3;
- merge_l[1 * merge_stride] = r2;
- merge_l[0 * merge_stride] = r1;
-}
-
-__kernel __attribute__((intel_reqd_sub_group_size(8))) void
-hs_kernel_hm_15(__global HS_KEY_TYPE* const restrict vout)
-{
- uint const global_id = (uint)get_global_id(0);
- uint const warp_idx = global_id / 8;
- uint const warp_lane_idx = global_id & 7;
-
- uint const merge_idx = (warp_idx / 16) >> 10;
-
- uint const merge_stride = 16 * 8 << 10;
- uint const merge_keys = merge_stride * 32;
-
- uint const merge_base = merge_idx * merge_keys;
- uint const merge_off = (warp_idx - merge_idx * (16 << 10)) * 8;
-
- __global HS_KEY_TYPE* const restrict merge_ptr =
- vout + (merge_base + merge_off + warp_lane_idx);
-
- HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
- HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
- HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
- HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
- HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
- HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
- HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
- HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
- HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
- HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
- HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
- HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
- HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
- HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
- HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
- HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
- HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
- HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
- HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
- HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
- HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
- HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
- HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
- HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
- HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
- HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
- HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
- HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
- HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
- HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
- HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
- HS_CMP_XCHG(r1, r17)
- HS_CMP_XCHG(r9, r25)
- HS_CMP_XCHG(r1, r9)
- HS_CMP_XCHG(r17, r25)
- HS_CMP_XCHG(r5, r21)
- HS_CMP_XCHG(r13, r29)
- HS_CMP_XCHG(r5, r13)
- HS_CMP_XCHG(r21, r29)
- HS_CMP_XCHG(r1, r5)
- HS_CMP_XCHG(r9, r13)
- HS_CMP_XCHG(r17, r21)
- HS_CMP_XCHG(r25, r29)
- HS_CMP_XCHG(r3, r19)
- HS_CMP_XCHG(r11, r27)
- HS_CMP_XCHG(r3, r11)
- HS_CMP_XCHG(r19, r27)
- HS_CMP_XCHG(r7, r23)
- HS_CMP_XCHG(r15, r31)
- HS_CMP_XCHG(r7, r15)
- HS_CMP_XCHG(r23, r31)
- HS_CMP_XCHG(r3, r7)
- HS_CMP_XCHG(r11, r15)
- HS_CMP_XCHG(r19, r23)
- HS_CMP_XCHG(r27, r31)
- HS_CMP_XCHG(r1, r3)
- HS_CMP_XCHG(r5, r7)
- HS_CMP_XCHG(r9, r11)
- HS_CMP_XCHG(r13, r15)
- HS_CMP_XCHG(r17, r19)
- HS_CMP_XCHG(r21, r23)
- HS_CMP_XCHG(r25, r27)
- HS_CMP_XCHG(r29, r31)
- HS_CMP_XCHG(r2, r18)
- HS_CMP_XCHG(r10, r26)
- HS_CMP_XCHG(r2, r10)
- HS_CMP_XCHG(r18, r26)
- HS_CMP_XCHG(r6, r22)
- HS_CMP_XCHG(r14, r30)
- HS_CMP_XCHG(r6, r14)
- HS_CMP_XCHG(r22, r30)
- HS_CMP_XCHG(r2, r6)
- HS_CMP_XCHG(r10, r14)
- HS_CMP_XCHG(r18, r22)
- HS_CMP_XCHG(r26, r30)
- HS_CMP_XCHG(r4, r20)
- HS_CMP_XCHG(r12, r28)
- HS_CMP_XCHG(r4, r12)
- HS_CMP_XCHG(r20, r28)
- HS_CMP_XCHG(r8, r24)
- HS_CMP_XCHG(r16, r32)
- HS_CMP_XCHG(r8, r16)
- HS_CMP_XCHG(r24, r32)
- HS_CMP_XCHG(r4, r8)
- HS_CMP_XCHG(r12, r16)
- HS_CMP_XCHG(r20, r24)
- HS_CMP_XCHG(r28, r32)
- HS_CMP_XCHG(r2, r4)
- HS_CMP_XCHG(r6, r8)
- HS_CMP_XCHG(r10, r12)
- HS_CMP_XCHG(r14, r16)
- HS_CMP_XCHG(r18, r20)
- HS_CMP_XCHG(r22, r24)
- HS_CMP_XCHG(r26, r28)
- HS_CMP_XCHG(r30, r32)
- HS_CMP_XCHG(r1, r2)
- HS_CMP_XCHG(r3, r4)
- HS_CMP_XCHG(r5, r6)
- HS_CMP_XCHG(r7, r8)
- HS_CMP_XCHG(r9, r10)
- HS_CMP_XCHG(r11, r12)
- HS_CMP_XCHG(r13, r14)
- HS_CMP_XCHG(r15, r16)
- HS_CMP_XCHG(r17, r18)
- HS_CMP_XCHG(r19, r20)
- HS_CMP_XCHG(r21, r22)
- HS_CMP_XCHG(r23, r24)
- HS_CMP_XCHG(r25, r26)
- HS_CMP_XCHG(r27, r28)
- HS_CMP_XCHG(r29, r30)
- HS_CMP_XCHG(r31, r32)
- merge_ptr[31 * merge_stride] = r32;
- merge_ptr[30 * merge_stride] = r31;
- merge_ptr[29 * merge_stride] = r30;
- merge_ptr[28 * merge_stride] = r29;
- merge_ptr[27 * merge_stride] = r28;
- merge_ptr[26 * merge_stride] = r27;
- merge_ptr[25 * merge_stride] = r26;
- merge_ptr[24 * merge_stride] = r25;
- merge_ptr[23 * merge_stride] = r24;
- merge_ptr[22 * merge_stride] = r23;
- merge_ptr[21 * merge_stride] = r22;
- merge_ptr[20 * merge_stride] = r21;
- merge_ptr[19 * merge_stride] = r20;
- merge_ptr[18 * merge_stride] = r19;
- merge_ptr[17 * merge_stride] = r18;
- merge_ptr[16 * merge_stride] = r17;
- merge_ptr[15 * merge_stride] = r16;
- merge_ptr[14 * merge_stride] = r15;
- merge_ptr[13 * merge_stride] = r14;
- merge_ptr[12 * merge_stride] = r13;
- merge_ptr[11 * merge_stride] = r12;
- merge_ptr[10 * merge_stride] = r11;
- merge_ptr[9 * merge_stride] = r10;
- merge_ptr[8 * merge_stride] = r9;
- merge_ptr[7 * merge_stride] = r8;
- merge_ptr[6 * merge_stride] = r7;
- merge_ptr[5 * merge_stride] = r6;
- merge_ptr[4 * merge_stride] = r5;
- merge_ptr[3 * merge_stride] = r4;
- merge_ptr[2 * merge_stride] = r3;
- merge_ptr[1 * merge_stride] = r2;
- merge_ptr[0 * merge_stride] = r1;
-}
-
-//
-//
-//
diff --git a/src/compute/hs/cl/gen9/hs_cl_macros.h b/src/compute/hs/cl/gen9/hs_cl_macros.h
deleted file mode 100644
index d314fe88ae..0000000000
--- a/src/compute/hs/cl/gen9/hs_cl_macros.h
+++ /dev/null
@@ -1,199 +0,0 @@
-//
-// Copyright 2016 Google Inc.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-//
-
-#ifndef HS_CL_MACROS_ONCE
-#define HS_CL_MACROS_ONCE
-
-//
-//
-//
-
-#include "hs_cl.h"
-
-//
-// Inter-lane compare exchange
-//
-
-// default
-#define HS_CMP_XCHG_V0(a,b) \
- { \
- HS_KEY_TYPE const t = min(a,b); \
- b = max(a,b); \
- a = t; \
- }
-
-// super slow
-#define HS_CMP_XCHG_V1(a,b) \
- { \
- HS_KEY_TYPE const tmp = a; \
- a = (a < b) ? a : b; \
- b ^= a ^ tmp; \
- }
-
-// best
-#define HS_CMP_XCHG_V2(a,b) \
- if (a >= b) { \
- HS_KEY_TYPE const t = a; \
- a = b; \
- b = t; \
- }
-
-// good
-#define HS_CMP_XCHG_V3(a,b) \
- { \
- int const ge = a >= b; \
- HS_KEY_TYPE const t = a; \
- a = ge ? b : a; \
- b = ge ? t : b; \
- }
-
-//
-//
-//
-
-#if (HS_KEY_WORDS == 1)
-#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b)
-#elif (HS_KEY_WORDS == 2)
-#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b)
-#endif
-
-//
-// Conditional inter-subgroup flip/half compare exchange
-//
-
-#define HS_CMP_FLIP(i,a,b) \
- { \
- HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,flip_lane_idx); \
- HS_KEY_TYPE const tb = intel_sub_group_shuffle(b,flip_lane_idx); \
- a = HS_COND_MIN_MAX(t_lt,a,tb); \
- b = HS_COND_MIN_MAX(t_lt,b,ta); \
- }
-
-#define HS_CMP_HALF(i,a) \
- { \
- HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,half_lane_idx); \
- a = HS_COND_MIN_MAX(t_lt,a,ta); \
- }
-
-//
-// The device's comparison operator might return what we actually
-// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}.
-//
-
-#define HS_CMP_IS_ZERO_ONE
-
-#ifdef HS_CMP_IS_ZERO_ONE
-// OpenCL requires a {true: +1, false: 0} scalar result
-// (a < b) -> { +1, 0 } -> NEGATE -> { 0, 0xFFFFFFFF }
-#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
-#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a)
-#else
-// However, OpenCL requires { -1, 0 } for vectors
-// (a < b) -> { 0xFFFFFFFF, 0 }
-#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
-#define HS_CMP_TO_MASK(a) (a)
-#endif
-
-//
-// The flip/half comparisons rely on a "conditional min/max":
-//
-// - if the flag is false, return min(a,b)
-// - otherwise, return max(a,b)
-//
-// What's a little surprising is that sequence (1) is faster than (2)
-// for 32-bit keys.
-//
-// I suspect either a code generation problem or that the sequence
-// maps well to the GEN instruction set.
-//
-// We mostly care about 64-bit keys and unsurprisingly sequence (2) is
-// fastest for this wider type.
-//
-
-// this is what you would normally use
-#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) ^ lt) ? b : a
-
-// this seems to be faster for 32-bit keys
-#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))
-
-//
-//
-//
-
-#if (HS_KEY_WORDS == 1)
-#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b)
-#elif (HS_KEY_WORDS == 2)
-#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
-#endif
-
-//
-// This snarl of macros is for transposing a "slab" of sorted elements
-// into linear order.
-//
-// This can occur as the last step in hs_sort() or via a custom kernel
-// that inspects the slab and then transposes and stores it to memory.
-//
-// The slab format can be inspected more efficiently than a linear
-// arrangement.
-//
-// The prime example is detecting when adjacent keys (in sort order)
-// have differing high order bits ("key changes"). The index of each
-// change is recorded to an auxilary array.
-//
-// A post-processing step like this needs to be able to navigate the
-// slab and eventually transpose and store the slab in linear order.
-//
-
-#define HS_TRANSPOSE_REG(prefix,row) prefix##row
-#define HS_TRANSPOSE_DECL(prefix,row) HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row)
-
-#define HS_TRANSPOSE_DELTA(level) (HS_LANES_PER_WARP + (1 << (level-1)))
-#define HS_TRANSPOSE_IF(level) ((get_sub_group_local_id() >> (level - 1)) & 1)
-
-#define HS_TRANSPOSE_LL(level) HS_TRANSPOSE_IF(level) ? 0 : HS_TRANSPOSE_DELTA(level)
-#define HS_TRANSPOSE_UR(level) HS_TRANSPOSE_IF(level) ? HS_TRANSPOSE_DELTA(level) : 0
-
-#define HS_TRANSPOSE_DELTA_LL(level) delta_ll_##level
-#define HS_TRANSPOSE_DELTA_UR(level) delta_ur_##level
-
-#define HS_TRANSPOSE_STAGE(level) \
- uint const HS_TRANSPOSE_DELTA_LL(level) = HS_TRANSPOSE_LL(level); \
- uint const HS_TRANSPOSE_DELTA_UR(level) = HS_TRANSPOSE_UR(level);
-
-#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
- HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \
- intel_sub_group_shuffle_down(HS_TRANSPOSE_REG(prefix_prev,row_ll), \
- HS_TRANSPOSE_REG(prefix_prev,row_ur), \
- HS_TRANSPOSE_DELTA_LL(level)); \
- HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \
- intel_sub_group_shuffle_up(HS_TRANSPOSE_REG(prefix_prev,row_ll), \
- HS_TRANSPOSE_REG(prefix_prev,row_ur), \
- HS_TRANSPOSE_DELTA_UR(level)); \
-
-// #define HS_TRANSPOSE_LOAD(row) \
-// HS_TRANSPOSE_DECL(0,row) = (vout + gmem_idx)[(row-1) << HS_LANES_PER_WARP_LOG2];
-
-#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \
- (vout + gmem_idx)[(row_to-1) << HS_LANES_PER_WARP_LOG2] = \
- HS_TRANSPOSE_REG(prefix,row_from);
-
-//
-// undefine these if you want to override
-//
-
-#define HS_TRANSPOSE_PREAMBLE()
-#define HS_TRANSPOSE_BODY()
-
-//
-//
-//
-
-#endif
-
-//
-//
-//
diff --git a/src/compute/hs/cl/gen9/make_all.bat b/src/compute/hs/cl/gen9/make_all.bat
deleted file mode 100644
index fac82b41a0..0000000000
--- a/src/compute/hs/cl/gen9/make_all.bat
+++ /dev/null
@@ -1,16 +0,0 @@
-@ECHO OFF
-
-SET HS_GEN=..\..\..\..\spinel\bin\x64\Debug\hs_gen
-
-REM --- 32-bit keys ---
-
-REM CMD /C %HS_GEN% -a 2 -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
-REM CMD /C %HS_GEN% -a 2 -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
-REM CMD /C %HS_GEN% -a 2 -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
-
-REM --- 64-bit keys
-
-CMD /C %HS_GEN% -a 2 -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
-REM CMD /C %HS_GEN% -a 2 -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
-
-CMD /C make_inl_cl.bat hs_cl.cl
diff --git a/src/compute/hs/cl/hs_cl_launcher.c b/src/compute/hs/cl/hs_cl_launcher.c
index f8a87f1dde..828f59ef63 100644
--- a/src/compute/hs/cl/hs_cl_launcher.c
+++ b/src/compute/hs/cl/hs_cl_launcher.c
@@ -11,126 +11,140 @@
//
#include <stdlib.h>
+#include <string.h>
//
//
//
-#include "hs_cl_launcher.h"
-#include "assert_cl.h"
-#include "macros.h"
-#include "util.h"
+#include "common/cl/assert_cl.h"
+#include "common/macros.h"
+#include "common/util.h"
//
//
//
-typedef uint32_t uint;
-typedef uint64_t ulong;
+#include "hs_cl_launcher.h"
//
//
//
-#include "hs_cl.h"
+struct hs_cl
+{
+ struct hs_cl_target_config config;
+
+ uint32_t key_val_size;
+ uint32_t slab_keys;
+ uint32_t bs_slabs_log2_ru;
+ uint32_t bc_slabs_log2_max;
+
+ struct {
+ uint32_t count;
+ cl_kernel * transpose;
+ cl_kernel * bs;
+ cl_kernel * bc;
+ cl_kernel * fm[3];
+ cl_kernel * hm[3];
+ cl_kernel all[];
+ } kernels;
+};
//
//
//
-#if 0 // #ifndef NDEBUG
-#define HS_KERNEL_SOURCE
-#else
-#define HS_KERNEL_BINARY
+struct hs_state
+{
+#ifndef NDEBUG
+ cl_ulong t_total; // 0
#endif
-//
-// #define HS_KERNEL_SPIRV
-//
+ cl_command_queue cq;
-//
-//
-//
+ // key buffers
+ cl_mem vin;
+ cl_mem vout; // can be vin
-#ifdef NDEBUG
+ // enforces ordering on out-of-order queue
+ cl_event wait_list[3]; // worst case
+ uint32_t wait_list_size;
-#define HS_LAUNCH_TRACE(k,g,l)
+ // bx_ru is number of rounded up warps in vin
+ uint32_t bx_ru;
+};
-#else
+//
+//
+//
-#include <stdio.h>
+static
+void
+hs_state_wait_list_release(struct hs_state * const state)
+{
+ for (uint32_t ii=0; ii<state->wait_list_size; ii++)
+ cl(ReleaseEvent(state->wait_list[ii]));
-#define HS_KERNEL_NAME_MAX 20
+ state->wait_list_size = 0;
+}
static
void
-hs_launch_trace(cl_kernel kernel,
- size_t const global_work_size,
- size_t const local_work_size)
+hs_state_wait_list_update(struct hs_state * const state,
+ uint32_t const wait_list_size,
+ cl_event const * const wait_list)
{
- if (kernel == NULL)
- return;
+ uint32_t const new_size = state->wait_list_size + wait_list_size;
- char name[HS_KERNEL_NAME_MAX];
-
- cl(GetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,HS_KERNEL_NAME_MAX,name,NULL));
+ for (uint32_t ii=state->wait_list_size; ii<new_size; ii++)
+ state->wait_list[ii] = wait_list[ii];
- fprintf(stderr,"%-19s ( %6zu, %4zu )\n",name,global_work_size,local_work_size);
+ state->wait_list_size = new_size;
}
-#define HS_LAUNCH_TRACE(k,g,l) hs_launch_trace(k,g,l)
-
-#endif
-
//
//
//
#ifdef NDEBUG
-#define HS_EVENT_NEXT() NULL
-#define HS_EVENT_PROFILE(cq)
+#define HS_STATE_WAIT_LIST_PROFILE(state)
+#define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list)
#else
-#define HS_EVENTS_MAX 128
+#include <stdio.h>
-static cl_event events[HS_EVENTS_MAX];
-static uint32_t events_count;
+#define HS_STATE_WAIT_LIST_PROFILE(state) \
+ hs_state_wait_list_profile(state, \
+ state->wait_list_size, \
+ state->wait_list)
-static
-cl_event *
-hs_event_next()
-{
- if (events_count + 1 >= HS_EVENTS_MAX) // no events can be recorded?
- {
- return NULL;
- }
- else // return next event slot
- {
- return events + events_count++;
- }
-}
+#define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list) \
+ hs_state_wait_list_profile(state, \
+ wait_list_size, \
+ wait_list)
static
void
-hs_event_profile(cl_command_queue cq)
+hs_state_wait_list_profile(struct hs_state * const state,
+ uint32_t const wait_list_size,
+ cl_event const * const wait_list)
{
- cl(Finish(cq));
+ cl(Finish(state->cq));
cl_command_queue_properties props;
- cl(GetCommandQueueInfo(cq,
+ cl(GetCommandQueueInfo(state->cq,
CL_QUEUE_PROPERTIES,
sizeof(props),
&props,
NULL));
- cl_ulong t_min=UINT64_MAX, t_max=0;
-
- for (uint32_t ee=0; ee<events_count; ee++)
+ for (uint32_t ii=0; ii<wait_list_size; ii++)
{
- cl_event event = events[ee];
+ cl_event event = wait_list[ii];
//
// profiling
@@ -152,8 +166,7 @@ hs_event_profile(cl_command_queue cq)
&t_end,
NULL));
- t_min = MIN_MACRO(t_min,t_start);
- t_max = MAX_MACRO(t_max,t_end);
+ state->t_total += t_end - t_start;
}
//
@@ -164,316 +177,52 @@ hs_event_profile(cl_command_queue cq)
cl_get_event_info(event,&status,&type);
- fprintf(stdout,"%-3u, %-13s, %-28s, %20llu, %20llu, %20llu, %20llu\n",
- ee,
+ fprintf(stdout,"%-13s, %-28s, %20llu, %20llu, %20llu, %20llu\n",
cl_get_event_command_status_string(status),
cl_get_event_command_type_string(type),
- t_start,t_end,t_end-t_start,t_max-t_min);
-
- // release
- cl(ReleaseEvent(event));
+ t_start,t_end,t_end-t_start,state->t_total);
}
}
-#define HS_EVENT_NEXT() hs_event_next()
-#define HS_EVENT_PROFILE(cq) hs_event_profile(cq);
-
-#endif
-
-//
-//
-//
-
-struct hs_state
-{
- cl_mem vin;
- cl_mem vout;
-
- // bx.ru is number of rounded up warps in vin
- struct {
- uint32_t ru;
- } bx;
-
- // these values change on each iteration
- union {
- struct {
- uint32_t full;
- uint32_t frac;
- } bs; // warps
- struct {
- uint32_t full;
- uint32_t na;
- } bc; // warps
- struct {
- uint32_t full;
- uint32_t frac;
- } fm; // rows
- };
-};
-
-//
-//
-//
-
-#define HS_THREADS_PER_BLOCK (HS_BS_WARPS * HS_LANES_PER_WARP)
-#define HS_KEYS_PER_WARP (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
-
-#define HS_BS_KEYS_PER_BLOCK (HS_KEYS_PER_WARP * HS_BS_WARPS)
-#define HS_BS_BLOCK_SIZE (HS_BS_KEYS_PER_BLOCK * sizeof(HS_KEY_TYPE))
-
-#define HS_BC_KEYS_PER_BLOCK (HS_KEYS_PER_WARP << HS_BC_WARPS_LOG2_MAX)
-#define HS_BC_BLOCK_SIZE (HS_BC_KEYS_PER_BLOCK * sizeof(HS_KEY_TYPE))
-
-//
-//
-//
-
-#if defined( HS_KERNEL_SOURCE )
-
-#include "hs_cl.pre.src.inl"
-
-#elif defined( HS_KERNEL_BINARY )
-
-#include "hs_cl.pre.bin.inl"
-
-#elif defined( HS_KERNEL_SPIRV )
-
-#include "hs_cl.pre.spv.inl"
-
#endif
//
//
//
-struct hs_transpose_kernel
-{
- cl_kernel kernel;
- char const * name;
-};
-
-#define HS_TRANSPOSE_KERNEL_DECLARE(n) { .name = #n }
-
-static struct hs_transpose_kernel transpose_kernels[] =
- {
- HS_TRANSPOSE_KERNEL_DECLARE(hs_kernel_transpose)
- };
-
-//
-//
-//
-
-struct hs_bs_kernel
-{
- cl_kernel kernel;
- char const * name;
-};
-
-#define HS_BS_KERNEL_DECLARE(n) { .name = #n }
-
-static struct hs_bs_kernel bs_kernels[] =
- {
-#if 0 <= HS_BS_WARPS_LOG2_RU
- HS_BS_KERNEL_DECLARE(hs_kernel_bs_0),
-#endif
-#if 1 <= HS_BS_WARPS_LOG2_RU
- HS_BS_KERNEL_DECLARE(hs_kernel_bs_1),
-#endif
-#if 2 <= HS_BS_WARPS_LOG2_RU
- HS_BS_KERNEL_DECLARE(hs_kernel_bs_2),
-#endif
-#if 3 <= HS_BS_WARPS_LOG2_RU
- HS_BS_KERNEL_DECLARE(hs_kernel_bs_3),
-#endif
-#if 4 <= HS_BS_WARPS_LOG2_RU
- HS_BS_KERNEL_DECLARE(hs_kernel_bs_4),
-#endif
-#if 5 <= HS_BS_WARPS_LOG2_RU
- HS_BS_KERNEL_DECLARE(hs_kernel_bs_5),
-#endif
-#if 6 <= HS_BS_WARPS_LOG2_RU
- HS_BS_KERNEL_DECLARE(hs_kernel_bs_6),
-#endif
-#if 7 <= HS_BS_WARPS_LOG2_RU
- HS_BS_KERNEL_DECLARE(hs_kernel_bs_7),
-#endif
- };
-
-//
-//
-//
+#ifdef NDEBUG
-struct hs_bc_kernel
-{
- cl_kernel kernel;
- char const * name;
-};
+#define HS_LAUNCH_TRACE(k,g,l)
-#define HS_BC_KERNEL_DECLARE(n) { .name = #n }
+#else
-static struct hs_bc_kernel bc_kernels[] =
- {
-#if (0 >= HS_BC_WARPS_LOG2_MIN) && (0 <= HS_BC_WARPS_LOG2_MAX)
- HS_BC_KERNEL_DECLARE(hs_kernel_bc_0),
-#endif
-#if (1 >= HS_BC_WARPS_LOG2_MIN) && (1 <= HS_BC_WARPS_LOG2_MAX)
- HS_BC_KERNEL_DECLARE(hs_kernel_bc_1),
-#endif
-#if (2 >= HS_BC_WARPS_LOG2_MIN) && (2 <= HS_BC_WARPS_LOG2_MAX)
- HS_BC_KERNEL_DECLARE(hs_kernel_bc_2),
-#endif
-#if (3 >= HS_BC_WARPS_LOG2_MIN) && (3 <= HS_BC_WARPS_LOG2_MAX)
- HS_BC_KERNEL_DECLARE(hs_kernel_bc_3),
-#endif
-#if (4 >= HS_BC_WARPS_LOG2_MIN) && (4 <= HS_BC_WARPS_LOG2_MAX)
- HS_BC_KERNEL_DECLARE(hs_kernel_bc_4),
-#endif
-#if (5 >= HS_BC_WARPS_LOG2_MIN) && (5 <= HS_BC_WARPS_LOG2_MAX)
- HS_BC_KERNEL_DECLARE(hs_kernel_bc_5),
-#endif
-#if (6 >= HS_BC_WARPS_LOG2_MIN) && (6 <= HS_BC_WARPS_LOG2_MAX)
- HS_BC_KERNEL_DECLARE(hs_kernel_bc_6),
-#endif
-#if (7 >= HS_BC_WARPS_LOG2_MIN) && (7 <= HS_BC_WARPS_LOG2_MAX)
- HS_BC_KERNEL_DECLARE(hs_kernel_bc_7),
-#endif
- };
+#include <stdio.h>
-//
-//
-//
+#define HS_KERNEL_NAME_MAX 20
-struct hs_fm_kernel
+static
+void
+hs_launch_trace(cl_kernel kernel,
+ uint32_t const dim,
+ size_t const * const global_work_size)
{
- cl_kernel kernel;
- char const * name;
- uint32_t const log2;
-};
-
-#define HS_FM_KERNEL_DECLARE(n,l) { .name = #n, .log2 = l }
+ if (kernel == NULL)
+ return;
-static struct hs_fm_kernel fm_kernels[] =
- {
-#ifdef HS_FM_BLOCKS_LOG2_0
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_0,HS_FM_BLOCKS_LOG2_0),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_1
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_1,HS_FM_BLOCKS_LOG2_1),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_2
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_2,HS_FM_BLOCKS_LOG2_2),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_3
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_3,HS_FM_BLOCKS_LOG2_3),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_4
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_4,HS_FM_BLOCKS_LOG2_4),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_5
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_5,HS_FM_BLOCKS_LOG2_5),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_6
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_6,HS_FM_BLOCKS_LOG2_6),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_7
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_7,HS_FM_BLOCKS_LOG2_7),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_8
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_8,HS_FM_BLOCKS_LOG2_8),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_9
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_9,HS_FM_BLOCKS_LOG2_9),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_10
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_10,HS_FM_BLOCKS_LOG2_10),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_11
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_11,HS_FM_BLOCKS_LOG2_11),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_12
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_12,HS_FM_BLOCKS_LOG2_12),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_13
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_13,HS_FM_BLOCKS_LOG2_13),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_14
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_14,HS_FM_BLOCKS_LOG2_14),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_15
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_15,HS_FM_BLOCKS_LOG2_15),
-#endif
-#ifdef HS_FM_BLOCKS_LOG2_16
- HS_FM_KERNEL_DECLARE(hs_kernel_fm_16,HS_FM_BLOCKS_LOG2_16),
-#endif
- };
+ char name[HS_KERNEL_NAME_MAX];
-//
-//
-//
+ cl(GetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,HS_KERNEL_NAME_MAX,name,NULL));
-struct hs_hm_kernel
-{
- cl_kernel kernel;
- char const * name;
- uint32_t const log2;
-};
+ fprintf(stderr,"%-19s ( %6zu, %6zu, %6zu )\n",
+ name,
+ global_work_size[0],
+ dim < 2 ? 0 : global_work_size[1],
+ dim < 3 ? 0 : global_work_size[2]);
+}
-#define HS_HM_KERNEL_DECLARE(n,l) { .name = #n, .log2 = l }
+#define HS_LAUNCH_TRACE(k,d,g) hs_launch_trace(k,d,g)
-static struct hs_hm_kernel hm_kernels[] =
- {
-#ifdef HS_HM_BLOCKS_LOG2_0
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_0,HS_HM_BLOCKS_LOG2_0),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_1
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_1,HS_HM_BLOCKS_LOG2_1),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_2
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_2,HS_HM_BLOCKS_LOG2_2),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_3
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_3,HS_HM_BLOCKS_LOG2_3),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_4
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_4,HS_HM_BLOCKS_LOG2_4),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_5
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_5,HS_HM_BLOCKS_LOG2_5),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_6
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_6,HS_HM_BLOCKS_LOG2_6),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_7
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_7,HS_HM_BLOCKS_LOG2_7),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_8
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_8,HS_HM_BLOCKS_LOG2_8),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_9
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_9,HS_HM_BLOCKS_LOG2_9),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_10
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_10,HS_HM_BLOCKS_LOG2_10),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_11
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_11,HS_HM_BLOCKS_LOG2_11),
#endif
-#ifdef HS_HM_BLOCKS_LOG2_12
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_12,HS_HM_BLOCKS_LOG2_12),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_13
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_13,HS_HM_BLOCKS_LOG2_13),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_14
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_14,HS_HM_BLOCKS_LOG2_14),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_15
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_15,HS_HM_BLOCKS_LOG2_15),
-#endif
-#ifdef HS_HM_BLOCKS_LOG2_16
- HS_HM_KERNEL_DECLARE(hs_kernel_hm_16,HS_HM_BLOCKS_LOG2_16),
-#endif
- };
//
//
@@ -481,36 +230,38 @@ static struct hs_hm_kernel hm_kernels[] =
static
void
-hs_barrier(cl_command_queue cq)
+hs_transpose_launcher(struct hs_cl const * const hs,
+ struct hs_state * const state)
{
- cl(EnqueueBarrierWithWaitList(cq,0,NULL,NULL));
-}
+ size_t const size[1] = { state->bx_ru << hs->config.slab.threads_log2 };
+ cl_kernel kernel = hs->kernels.transpose[0];
-//
-//
-//
-
-static
-void
-hs_launch_transpose(struct hs_state const * const state,
- cl_command_queue cq,
- cl_kernel kernel,
- size_t const global_work_size,
- size_t const local_work_size)
-{
- HS_LAUNCH_TRACE(kernel,global_work_size,local_work_size);
+ HS_LAUNCH_TRACE(kernel,1,size);
+ //
+ // The transpose kernel operates on a single slab. For now, let's
+ // rely on the driver to choose a workgroup size.
+ //
+ // size_t local_work_size[1] = { HS_SLAB_THREADS };
+ //
cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));
- cl(EnqueueNDRangeKernel(cq,
+ cl_event wait_list_out[1];
+
+ cl(EnqueueNDRangeKernel(state->cq,
kernel,
1,
NULL,
- &global_work_size,
- &local_work_size,
- 0,
+ size,
NULL,
- HS_EVENT_NEXT()));
+ state->wait_list_size,
+ state->wait_list,
+ wait_list_out));
+
+ hs_state_wait_list_release(state);
+ hs_state_wait_list_update(state,1,wait_list_out);
+
+ HS_STATE_WAIT_LIST_PROFILE(state);
}
//
@@ -519,49 +270,63 @@ hs_launch_transpose(struct hs_state const * const state,
static
void
-hs_launch_bs(struct hs_state const * const state,
- cl_command_queue cq,
- cl_kernel kernel_full,
- cl_kernel kernel_frac,
- size_t const global_work_size_full,
- size_t const local_work_size_full,
- size_t const local_work_size_frac)
-
+hs_launch_bs(struct hs_cl const * const hs,
+ struct hs_state * const state,
+ uint32_t const full,
+ uint32_t const frac,
+ uint32_t const wait_list_size,
+ cl_event * wait_list)
{
- HS_LAUNCH_TRACE(kernel_full,global_work_size_full,local_work_size_full);
- HS_LAUNCH_TRACE(kernel_frac,local_work_size_frac,local_work_size_frac);
+ uint32_t wait_list_out_size = 0;
+ cl_event wait_list_out[2];
- if (kernel_full != NULL)
+ if (full > 0)
{
+ size_t const size_full[1] = { full << hs->config.slab.threads_log2 };
+ cl_kernel kernel_full = hs->kernels.bs[hs->bs_slabs_log2_ru];
+
+ HS_LAUNCH_TRACE(kernel_full,1,size_full);
+
cl(SetKernelArg(kernel_full,0,sizeof(state->vin), &state->vin));
cl(SetKernelArg(kernel_full,1,sizeof(state->vout),&state->vout));
- cl(EnqueueNDRangeKernel(cq,
+ cl(EnqueueNDRangeKernel(state->cq,
kernel_full,
1,
NULL,
- &global_work_size_full,
- &local_work_size_full,
- 0,
+ size_full,
NULL,
- HS_EVENT_NEXT()));
+ wait_list_size,
+ wait_list,
+ wait_list_out+wait_list_out_size++));
}
- if (kernel_frac != NULL)
+ if (frac > 0)
{
+ size_t const offset_frac[1] = { full << hs->config.slab.threads_log2 };
+ size_t const size_frac [1] = { frac << hs->config.slab.threads_log2 };
+ cl_kernel kernel_frac = hs->kernels.bs[msb_idx_u32(frac)];
+
+ HS_LAUNCH_TRACE(kernel_frac,1,size_frac);
+
cl(SetKernelArg(kernel_frac,0,sizeof(state->vin), &state->vin));
cl(SetKernelArg(kernel_frac,1,sizeof(state->vout),&state->vout));
- cl(EnqueueNDRangeKernel(cq,
+ cl(EnqueueNDRangeKernel(state->cq,
kernel_frac,
1,
- &global_work_size_full,
- &local_work_size_frac,
- &local_work_size_frac,
- 0,
+ offset_frac,
+ size_frac,
NULL,
- HS_EVENT_NEXT()));
+ wait_list_size,
+ wait_list,
+ wait_list_out+wait_list_out_size++));
}
+
+ hs_state_wait_list_release(state);
+ hs_state_wait_list_update(state,wait_list_out_size,wait_list_out);
+
+ HS_STATE_WAIT_LIST_PROFILE(state);
}
//
@@ -570,25 +335,34 @@ hs_launch_bs(struct hs_state const * const state,
static
void
-hs_launch_bc(struct hs_state const * const state,
- cl_command_queue cq,
- cl_kernel kernel,
- size_t const global_work_size,
- size_t const local_work_size)
+hs_launch_bc(struct hs_cl const * const hs,
+ struct hs_state * const state,
+ uint32_t const full,
+ uint32_t const clean_slabs_log2)
{
- HS_LAUNCH_TRACE(kernel,global_work_size,local_work_size);
+ size_t const size[1] = { full << hs->config.slab.threads_log2 };
+ cl_kernel kernel = hs->kernels.bc[clean_slabs_log2];
+
+ HS_LAUNCH_TRACE(kernel,1,size);
cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));
- cl(EnqueueNDRangeKernel(cq,
+ cl_event wait_list_out[1];
+
+ cl(EnqueueNDRangeKernel(state->cq,
kernel,
1,
NULL,
- &global_work_size,
- &local_work_size,
- 0,
+ size,
NULL,
- HS_EVENT_NEXT()));
+ state->wait_list_size,
+ state->wait_list,
+ wait_list_out));
+
+ hs_state_wait_list_release(state);
+ hs_state_wait_list_update(state,1,wait_list_out);
+
+ HS_STATE_WAIT_LIST_PROFILE(state);
}
//
@@ -597,26 +371,64 @@ hs_launch_bc(struct hs_state const * const state,
static
void
-hs_launch_fm(struct hs_state const * const state,
- cl_command_queue cq,
- cl_kernel kernel,
- size_t const global_work_size)
+hs_launch_fm(struct hs_cl const * const hs,
+ struct hs_state * const state,
+ uint32_t const scale_log2,
+ uint32_t const fm_full,
+ uint32_t const fm_frac,
+ uint32_t const span_threads)
{
- HS_LAUNCH_TRACE(kernel,global_work_size,0);
+ //
+ // Note that some platforms might need to use .z on large grids
+ //
+ uint32_t wait_list_out_size = 0;
+ cl_event wait_list_out[2];
- cl(SetKernelArg(kernel,0,sizeof(state->vout), &state->vout));
- cl(SetKernelArg(kernel,1,sizeof(state->fm.full),&state->fm.full));
- cl(SetKernelArg(kernel,2,sizeof(state->fm.frac),&state->fm.frac));
+ if (fm_full > 0)
+ {
+ size_t const size_full[3] = { span_threads, fm_full, 1 };
+ cl_kernel kernel_full = hs->kernels.fm[scale_log2][hs->bs_slabs_log2_ru];
- cl(EnqueueNDRangeKernel(cq,
- kernel,
- 1,
- NULL,
- &global_work_size,
- NULL,
- 0,
- NULL,
- HS_EVENT_NEXT()));
+ HS_LAUNCH_TRACE(kernel_full,3,size_full);
+
+ cl(SetKernelArg(kernel_full,0,sizeof(state->vout),&state->vout));
+
+ cl(EnqueueNDRangeKernel(state->cq,
+ kernel_full,
+ 3,
+ NULL,
+ size_full,
+ NULL,
+ state->wait_list_size,
+ state->wait_list,
+ wait_list_out+wait_list_out_size++));
+ }
+
+ if (fm_frac > 0)
+ {
+ size_t const offset_frac[3] = { 0, fm_full, 0 };
+ size_t const size_frac [3] = { span_threads, 1, 1 };
+ cl_kernel kernel_frac = hs->kernels.fm[scale_log2][msb_idx_u32(fm_frac)];
+
+ HS_LAUNCH_TRACE(kernel_frac,3,size_frac);
+
+ cl(SetKernelArg(kernel_frac,0,sizeof(state->vout),&state->vout));
+
+ cl(EnqueueNDRangeKernel(state->cq,
+ kernel_frac,
+ 3,
+ offset_frac,
+ size_frac,
+ NULL,
+ state->wait_list_size,
+ state->wait_list,
+ wait_list_out+wait_list_out_size++));
+ }
+
+ hs_state_wait_list_release(state);
+ hs_state_wait_list_update(state,wait_list_out_size,wait_list_out);
+
+ HS_STATE_WAIT_LIST_PROFILE(state);
}
//
@@ -625,24 +437,38 @@ hs_launch_fm(struct hs_state const * const state,
static
void
-hs_launch_hm(struct hs_state const * const state,
- cl_command_queue cq,
- cl_kernel kernel,
- size_t const global_work_size)
+hs_launch_hm(struct hs_cl const * const hs,
+ struct hs_state * const state,
+ uint32_t const scale_log2,
+ uint32_t const spans,
+ uint32_t const span_threads)
{
- HS_LAUNCH_TRACE(kernel,global_work_size,0);
+ //
+ // Note that some platforms might need to use .z on large grids
+ //
+ size_t const size[3] = { span_threads, spans, 1 };
+ cl_kernel kernel = hs->kernels.hm[scale_log2][0];
+
+ HS_LAUNCH_TRACE(kernel,3,size);
cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));
- cl(EnqueueNDRangeKernel(cq,
+ cl_event wait_list_out[1];
+
+ cl(EnqueueNDRangeKernel(state->cq,
kernel,
- 1,
+ 3,
NULL,
- &global_work_size,
+ size,
NULL,
- 0,
- NULL,
- HS_EVENT_NEXT()));
+ state->wait_list_size,
+ state->wait_list,
+ wait_list_out));
+
+ hs_state_wait_list_release(state);
+ hs_state_wait_list_update(state,1,wait_list_out);
+
+ HS_STATE_WAIT_LIST_PROFILE(state);
}
//
@@ -651,47 +477,54 @@ hs_launch_hm(struct hs_state const * const state,
static
void
-hs_transpose_launcher(struct hs_state * const state,
- cl_command_queue cq)
+hs_keyset_pre_sort(struct hs_cl const * const hs,
+ struct hs_state * const state,
+ uint32_t const count,
+ uint32_t const count_hi,
+ uint32_t const wait_list_size,
+ cl_event * wait_list,
+ cl_event * event)
{
- // transpose each slab
- size_t const global_work_size = state->bx.ru * HS_LANES_PER_WARP;
- size_t const local_work_size = HS_LANES_PER_WARP; // FIXME -- might not always want to specify this
-
- hs_launch_transpose(state,
- cq,
- transpose_kernels[0].kernel,
- global_work_size,
- local_work_size);
-}
+ uint32_t const vin_span = count_hi - count;
+ uint32_t const pattern = UINT32_MAX;
-//
-//
-//
+ cl(EnqueueFillBuffer(state->cq,
+ state->vin,
+ &pattern,
+ sizeof(pattern),
+ count * hs->key_val_size,
+ vin_span * hs->key_val_size,
+ wait_list_size,
+ wait_list,
+ event));
+
+ HS_STATE_WAIT_LIST_PROFILE_EX(state,1,event);
+}
static
void
-hs_bs_launcher(struct hs_state * const state,
- uint32_t const warps_in,
- cl_command_queue cq)
+hs_keyset_pre_merge(struct hs_cl const * const hs,
+ struct hs_state * const state,
+ uint32_t const count_lo,
+ uint32_t const count_hi,
+ uint32_t const wait_list_size,
+ cl_event * wait_list)
{
- // warps_in is already rounded up
- uint32_t const full = (warps_in / HS_BS_WARPS) * HS_BS_WARPS;
- uint32_t const frac = warps_in - full;
+ uint32_t const vout_span = count_hi - count_lo;
+ uint32_t const pattern = UINT32_MAX;
- //
- // FIXME -- launch on different queues
- //
- cl_kernel kernel_full = (full == 0) ? NULL : bs_kernels[HS_BS_WARPS_LOG2_RU].kernel;
- cl_kernel kernel_frac = (frac == 0) ? NULL : bs_kernels[msb_idx_u32(frac)].kernel;
-
- hs_launch_bs(state,
- cq,
- kernel_full,
- kernel_frac,
- full * HS_LANES_PER_WARP,
- HS_BS_WARPS * HS_LANES_PER_WARP,
- frac * HS_LANES_PER_WARP);
+ // appends event to incoming wait list
+ cl(EnqueueFillBuffer(state->cq,
+ state->vout,
+ &pattern,
+ sizeof(pattern),
+ count_lo * hs->key_val_size,
+ vout_span * hs->key_val_size,
+ wait_list_size,
+ wait_list,
+ state->wait_list+state->wait_list_size++));
+
+ HS_STATE_WAIT_LIST_PROFILE(state);
}
//
@@ -700,27 +533,19 @@ hs_bs_launcher(struct hs_state * const state,
static
void
-hs_bc_launcher(struct hs_state * const state,
- uint32_t const down_warps,
- uint32_t const down_warps_log2,
- cl_command_queue cq)
+hs_bs_launcher(struct hs_cl const * const hs,
+ struct hs_state * const state,
+ uint32_t const count_padded_in,
+ uint32_t const wait_list_size,
+ cl_event * wait_list)
{
- // block clean the minimal number of down_warps_log2 spans
- uint32_t const frac_ru = (1u << down_warps_log2) - 1;
- state->bc.full = (down_warps + frac_ru) & ~frac_ru;
-
- // launch block slab sorting grid
- size_t const global_work_size = state->bc.full * HS_LANES_PER_WARP;
- size_t const local_work_size = HS_LANES_PER_WARP << down_warps_log2;
+ uint32_t const slabs_in = count_padded_in / hs->slab_keys;
+ uint32_t const full = (slabs_in / hs->config.block.slabs) * hs->config.block.slabs;
+ uint32_t const frac = slabs_in - full;
- //
- // we better be capable of cleaning at least two warps !!!
- //
- hs_launch_bc(state,
- cq,
- bc_kernels[down_warps_log2].kernel,
- global_work_size,
- local_work_size);
+ hs_launch_bs(hs,state,
+ full,frac,
+ wait_list_size,wait_list);
}
//
@@ -728,30 +553,18 @@ hs_bc_launcher(struct hs_state * const state,
//
static
-uint32_t
-hs_hm_launcher(struct hs_state * const state,
- uint32_t const down_warps,
- uint32_t const down_warps_log2_in,
- cl_command_queue cq)
+void
+hs_bc_launcher(struct hs_cl const * const hs,
+ struct hs_state * const state,
+ uint32_t const down_slabs,
+ uint32_t const clean_slabs_log2)
{
- // how many scaled half-merge spans are there?
- uint32_t const frac_ru = (1 << down_warps_log2_in) - 1;
- uint32_t const spans_ru = (down_warps + frac_ru) >> down_warps_log2_in;
+ // block clean the minimal number of down_slabs_log2 spans
+ uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
+ uint32_t const full = (down_slabs + frac_ru) & ~frac_ru;
- // get the kernel record
- struct hs_hm_kernel const * const hm = hm_kernels + down_warps_log2_in - HS_BC_WARPS_LOG2_MAX - 1;
-
- // how large is the grid?
- size_t const global_work_size = HS_LANES_PER_WARP * HS_KEYS_PER_LANE * (spans_ru << hm->log2);
- size_t const local_work_size = HS_LANES_PER_WARP;
-
- // launch the hm kernel
- hs_launch_hm(state,
- cq,
- hm->kernel,
- global_work_size);
-
- return hm->log2;
+ // we better be capable of cleaning at least two warps !!!
+ hs_launch_bc(hs,state,full,clean_slabs_log2);
}
//
@@ -760,63 +573,74 @@ hs_hm_launcher(struct hs_state * const state,
static
uint32_t
-hs_fm_launcher(struct hs_state * const state,
- uint32_t const up_scale_log2,
- uint32_t * const down_warps,
- cl_command_queue cq)
+hs_fm_launcher(struct hs_cl const * const hs,
+ struct hs_state * const state,
+ uint32_t * const down_slabs,
+ uint32_t const up_scale_log2)
{
- // get the kernel record
- struct hs_fm_kernel const * const fm = fm_kernels + up_scale_log2 - 1;
+ //
+ // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes
+ // a performance win to bias toward launching the smaller flip merge
+ // kernel in order to get more warps in flight (increased
+ // occupancy). This is useful when merging small numbers of slabs.
+ //
+ // Note that HS_FM_SCALE_MIN will always be 0 or 1.
+ //
+ // So, for now, just clamp to the max until there is a reason to
+ // restore the fancier and probably low-impact approach.
+ //
+ uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2);
+ uint32_t const clean_log2 = up_scale_log2 - scale_log2;
- // number of warps in a full-sized scaled flip-merge span
- uint32_t const full_span_warps = HS_BS_WARPS << up_scale_log2;
+ // number of slabs in a full-sized scaled flip-merge span
+ uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2;
// how many full-sized scaled flip-merge spans are there?
- state->fm.full = state->bx.ru / full_span_warps;
- state->fm.frac = 0;
+ uint32_t fm_full = state->bx_ru / full_span_slabs;
+ uint32_t fm_frac = 0;
- // initialize down_warps
- *down_warps = state->fm.full * full_span_warps;
+ // initialize down_slabs
+ *down_slabs = fm_full * full_span_slabs;
// how many half-size scaled + fractional scaled spans are there?
- uint32_t const span_rem = state->bx.ru - state->fm.full * full_span_warps;
- uint32_t const half_span_warps = full_span_warps >> 1;
+ uint32_t const span_rem = state->bx_ru - *down_slabs;
+ uint32_t const half_span_slabs = full_span_slabs >> 1;
- if (span_rem > half_span_warps)
+ // if we have over a half-span then fractionally merge it
+ if (span_rem > half_span_slabs)
{
- uint32_t const frac_rem = span_rem - half_span_warps;
+ // the remaining slabs will be cleaned
+ *down_slabs += span_rem;
+
+ uint32_t const frac_rem = span_rem - half_span_slabs;
uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);
- if (frac_rem_pow2 >= half_span_warps)
+ if (frac_rem_pow2 >= half_span_slabs)
{
- *down_warps += full_span_warps;
- state->fm.full += 1;
+ // bump it up to a full span
+ fm_full += 1;
}
else
{
- uint32_t const frac_interleaved = frac_rem_pow2 >> fm->log2;
-
- *down_warps += half_span_warps + frac_rem_pow2;
- state->fm.frac = MAX_MACRO(1,frac_interleaved);
+ // otherwise, add fractional
+ fm_frac = MAX_MACRO(1,frac_rem_pow2 >> clean_log2);
}
}
// size the grid
- uint32_t const spans_frac = MIN_MACRO(state->fm.frac,1);
- uint32_t const spans_total = state->fm.full + spans_frac;
- uint32_t const scale = spans_total << fm->log2;
- size_t const global_work_size = HS_LANES_PER_WARP * HS_KEYS_PER_LANE * scale;
- size_t const local_work_size = HS_LANES_PER_WARP;
+ uint32_t const span_threads = hs->slab_keys << clean_log2;
//
// launch the fm kernel
//
- hs_launch_fm(state,
- cq,
- fm->kernel,
- global_work_size);
-
- return fm->log2;
+ hs_launch_fm(hs,
+ state,
+ scale_log2,
+ fm_full,
+ fm_frac,
+ span_threads);
+
+ return clean_log2;
}
//
@@ -824,67 +648,196 @@ hs_fm_launcher(struct hs_state * const state,
//
static
+uint32_t
+hs_hm_launcher(struct hs_cl const * const hs,
+ struct hs_state * const state,
+ uint32_t const down_slabs,
+ uint32_t const clean_slabs_log2)
+{
+ // how many scaled half-merge spans are there?
+ uint32_t const frac_ru = (1 << clean_slabs_log2) - 1;
+ uint32_t const spans = (down_slabs + frac_ru) >> clean_slabs_log2;
+
+ // for now, just clamp to the max
+ uint32_t const log2_rem = clean_slabs_log2 - hs->bc_slabs_log2_max;
+ uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem);
+ uint32_t const log2_out = log2_rem - scale_log2;
+
+ // size the grid
+ uint32_t const span_threads = hs->slab_keys << log2_out;
+
+ // launch the hm kernel
+ hs_launch_hm(hs,
+ state,
+ scale_log2,
+ spans,
+ span_threads);
+
+ return log2_out;
+}
+
+//
+//
+//
+
void
-hs_keyset_launcher(cl_mem mem,
- uint32_t const offset,
- uint32_t const span,
- cl_command_queue cq)
+hs_cl_sort(struct hs_cl const * const hs,
+ cl_command_queue cq,
+ uint32_t const wait_list_size,
+ cl_event * wait_list,
+ cl_event * event,
+ cl_mem vin,
+ cl_mem vout,
+ uint32_t const count,
+ uint32_t const count_padded_in,
+ uint32_t const count_padded_out,
+ bool const linearize)
{
+ // is this sort in place?
+ bool const is_in_place = (vout == NULL);
+ // cq, buffers, wait list and slab count
+ struct hs_state state = {
+#ifndef NDEBUG
+ .t_total = 0,
+#endif
+ .cq = cq,
+ .vin = vin,
+ .vout = is_in_place ? vin : vout,
+ .wait_list_size = 0,
+ .bx_ru = (count + hs->slab_keys - 1) / hs->slab_keys
+ };
+
+ // initialize vin
+ uint32_t const count_hi = is_in_place ? count_padded_out : count_padded_in;
+ bool const is_pre_sort_keyset_reqd = count_hi > count;
+ cl_event event_keyset_pre_sort[1];
+
+ // initialize any trailing keys in vin before sorting
+ if (is_pre_sort_keyset_reqd)
+ {
+ hs_keyset_pre_sort(hs,&state,
+ count,count_hi,
+ wait_list_size,wait_list,
+ event_keyset_pre_sort);
+ }
+
+ // initialize any trailing keys in vout before merging
+ if (!is_in_place && (count_padded_out > count_padded_in))
+ {
+ hs_keyset_pre_merge(hs,&state,
+ count_padded_in,count_padded_out,
+ wait_list_size,wait_list);
+ }
//
- // DOES NOT TEST FOR SPAN = 0
+ // sort blocks of slabs
//
- HS_KEY_TYPE const pattern = (HS_KEY_TYPE)-1L;
+ hs_bs_launcher(hs,&state,
+ count_padded_in,
+ is_pre_sort_keyset_reqd ? 1 : wait_list_size,
+ is_pre_sort_keyset_reqd ? event_keyset_pre_sort : wait_list);
- cl(EnqueueFillBuffer(cq,
- mem,
- &pattern,
- sizeof(HS_KEY_TYPE),
- offset * sizeof(HS_KEY_TYPE),
- span * sizeof(HS_KEY_TYPE),
- 0,
- NULL,
- HS_EVENT_NEXT()));
+ // release the event
+ if (is_pre_sort_keyset_reqd)
+ cl(ReleaseEvent(event_keyset_pre_sort[0]));
+
+ //
+ // we're done if this was a single bs block...
+ //
+ // otherwise, merge sorted spans of slabs until done
+ //
+ if (state.bx_ru > hs->config.block.slabs)
+ {
+ int32_t up_scale_log2 = 1;
+
+ while (true)
+ {
+ uint32_t down_slabs;
+
+ // flip merge slabs -- return span of slabs that must be cleaned
+ uint32_t clean_slabs_log2 = hs_fm_launcher(hs,&state,
+ &down_slabs,
+ up_scale_log2);
+
+ // if span is gt largest slab block cleaner then half merge
+ while (clean_slabs_log2 > hs->bc_slabs_log2_max)
+ {
+ clean_slabs_log2 = hs_hm_launcher(hs,&state,
+ down_slabs,
+ clean_slabs_log2);
+ }
+
+ // launch clean slab grid -- is it the final launch?
+ hs_bc_launcher(hs,&state,
+ down_slabs,
+ clean_slabs_log2);
+
+ // was this the final block clean?
+ if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru)
+ break;
+
+ // otherwise, merge twice as many slabs
+ up_scale_log2 += 1;
+ }
+ }
+
+ // slabs or linear?
+ if (linearize) {
+ hs_transpose_launcher(hs,&state);
+ }
+
+ // does the caller want the final event?
+ if (event != NULL) {
+ *event = state.wait_list[0];
+ } else {
+ cl(ReleaseEvent(state.wait_list[0]));
+ }
}
//
-// all grids will be computed as a function of the minimum number of warps
+// all grids will be computed as a function of the minimum number of slabs
//
void
-hs_pad(uint32_t const count,
- uint32_t * const count_padded_in,
- uint32_t * const count_padded_out)
+hs_cl_pad(struct hs_cl const * const hs,
+ uint32_t const count,
+ uint32_t * const count_padded_in,
+ uint32_t * const count_padded_out)
{
//
- // round up the count to warps
+ // round up the count to slabs
//
- uint32_t const warps_ru = (count + HS_KEYS_PER_WARP - 1) / HS_KEYS_PER_WARP;
- uint32_t const blocks = warps_ru / HS_BS_WARPS;
- uint32_t const warps_mod = warps_ru % HS_BS_WARPS;
- uint32_t const warps_mod_ru = MIN_MACRO(pow2_ru_u32(warps_mod),HS_BS_WARPS);
+ uint32_t const slabs_ru = (count + hs->slab_keys - 1) / hs->slab_keys;
+ uint32_t const blocks = slabs_ru / hs->config.block.slabs;
+ uint32_t const block_slabs = blocks * hs->config.block.slabs;
+ uint32_t const slabs_ru_rem = slabs_ru - block_slabs;
+ uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),hs->config.block.slabs);
- *count_padded_in = (blocks * HS_BS_WARPS + warps_mod_ru) * HS_KEYS_PER_WARP;
+ *count_padded_in = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys;
*count_padded_out = *count_padded_in;
//
- // more than a single block sort?
+ // will merging be required?
//
- if (warps_ru > HS_BS_WARPS)
+ if (slabs_ru > hs->config.block.slabs)
{
// more than one block
- uint32_t const blocks_lo = pow2_rd_u32(blocks);
- uint32_t const warps_lo = blocks_lo * HS_BS_WARPS;
- uint32_t const warps_rem = warps_ru - warps_lo;
+ uint32_t const blocks_lo = pow2_rd_u32(blocks);
+ uint32_t const block_slabs_lo = blocks_lo * hs->config.block.slabs;
+ uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo;
- if (warps_rem > 0)
+ if (block_slabs_rem > 0)
{
- uint32_t const warps_rem_ru = pow2_ru_u32(warps_rem);
- uint32_t const warps_hi = MAX_MACRO(warps_rem_ru,blocks_lo << HS_FM_BLOCKS_LOG2_1);
- uint32_t const warps_padded_out = MIN_MACRO(warps_lo+warps_hi,warps_lo*2); // clamp non-pow2 blocks
+ uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem);
+
+ uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru,
+ blocks_lo << (1 - hs->config.merge.fm.scale_min));
+
+ uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi,
+ block_slabs_lo*2); // clamp non-pow2 blocks
- *count_padded_out = warps_padded_out * HS_KEYS_PER_WARP;
+ *count_padded_out = block_slabs_padded_out * hs->slab_keys;
}
}
}
@@ -893,229 +846,291 @@ hs_pad(uint32_t const count,
//
//
+static
void
-hs_sort(cl_command_queue cq, // out-of-order cq
- cl_mem vin,
- cl_mem vout,
- uint32_t const count,
- uint32_t const count_padded_in,
- uint32_t const count_padded_out,
- bool const linearize)
+hs_create_kernel(cl_program program,
+ cl_kernel * const kernel,
+ char const * const name)
{
-#ifndef NDEBUG
- events_count = 0;
-#endif
+ cl_int err;
- //
- // FIXME -- get rid of this vestigial structure
- //
- struct hs_state state = { .vin = vin, .vout = vout };
+ *kernel = clCreateKernel(program,name,&err);
+
+ cl_ok(err);
+}
- // how many rounded-up key slabs are there?
- state.bx.ru = (count + HS_KEYS_PER_WARP - 1) / HS_KEYS_PER_WARP;
+static
+void
+hs_create_kernels(cl_program program,
+ cl_kernel * kernels,
+ char name_template[],
+ size_t const name_template_size,
+ uint32_t const count)
+{
+ char const n_max = '0'+(char)count;
+
+ for (char n = '0'; n<n_max; n++)
+ {
+ cl_int err;
+ name_template[name_template_size-2] = n;
+
+ *kernels++ = clCreateKernel(program,name_template,&err);
+
+ cl_ok(err);
+ }
+}
+
+//
+//
+//
+
+struct hs_cl *
+hs_cl_create(struct hs_cl_target const * const target,
+ cl_context context,
+ cl_device_id device_id)
+{
//
- // init padding with max-valued keys
+ // immediately try to build the OpenCL program
//
- bool const split = state.vout != state.vin; // FIXME -- careful this comparison might not always be correct
- bool keyset = false;
+ bool const is_binary = (target->program[0] == 0);
+ uint32_t const program_size = NPBTOHL_MACRO(target->program+1);
- if (!split)
- {
- uint32_t const vin_span = count_padded_out - count;
+ cl_program program;
- if (vin_span > 0)
- {
- hs_keyset_launcher(state.vin,
- count,vin_span,
- cq);
- keyset = true;
- }
+ if (is_binary) // program is a binary
+ {
+ cl_int status, err;
+
+ size_t const bins_sizeof[] = { program_size };
+ unsigned char const * bins[] = { target->program+5 };
+
+ program = clCreateProgramWithBinary(context,
+ 1,
+ &device_id,
+ bins_sizeof,
+ bins,
+ &status,
+ &err);
+ cl_ok(err);
+
+ cl(BuildProgram(program,
+ 1,
+ &device_id,
+ NULL,
+ NULL,
+ NULL));
}
- else
+ else // program is source code
{
- uint32_t const vin_span = count_padded_in - count;
-
- if (vin_span > 0)
- {
- hs_keyset_launcher(state.vin,
- count,vin_span,
- cq);
- keyset = true;
- }
+ cl_int err;
+
+ size_t const strings_sizeof[] = { program_size };
+ char const * strings[] = { (char*)target->program+5 };
+
+ program = clCreateProgramWithSource(context,
+ 1,
+ strings,
+ strings_sizeof,
+ &err);
+ cl_ok(err);
+
+ char const * const options =
+ "-cl-std=CL1.2 -cl-fast-relaxed-math " // FIXME FIXME FIXME FIXME 1.2
+ "-cl-no-signed-zeros -cl-mad-enable "
+ "-cl-denorms-are-zero "
+ "-cl-kernel-arg-info";
+
+ cl(BuildProgram(program,
+ 1,
+ &device_id,
+ options,
+ NULL,
+ NULL));
+ }
- uint32_t const vout_span = count_padded_out - count_padded_in;
+ //
+ // we reference these values a lot
+ //
+ uint32_t const bs_slabs_log2_ru = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
+ uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));
- if (vout_span > 0)
- {
- hs_keyset_launcher(state.vout,
- count_padded_in,vout_span,
- cq);
- keyset = true;
- }
+ //
+ // how many kernels will be created?
+ //
+ uint32_t const count_bs = bs_slabs_log2_ru + 1;
+ uint32_t const count_bc = bc_slabs_log2_max + 1;
+ uint32_t count_fm[3] = { 0 };
+ uint32_t count_hm[3] = { 0 };
+
+ // guaranteed to be in range [0,2]
+ for (uint32_t scale = target->config.merge.fm.scale_min;
+ scale <= target->config.merge.fm.scale_max;
+ scale++)
+ {
+ count_fm[scale] = msb_idx_u32(pow2_ru_u32(target->config.block.slabs>>(scale-1))) + 1;
}
- if (keyset)
+ // guaranteed to be in range [0,2]
+ for (uint32_t scale = target->config.merge.hm.scale_min;
+ scale <= target->config.merge.hm.scale_max;
+ scale++)
{
- hs_barrier(cq);
+ count_hm[scale] = 1;
}
+ uint32_t const count_all =
+ 1
+ + count_bs
+ + count_bc
+ + count_fm[0] + count_fm[1] + count_fm[2]
+ + count_hm[0] + count_hm[1] + count_hm[2];
+
//
- // sort blocks
+ // allocate hs_cl
//
- uint32_t const warps_in = count_padded_in / HS_KEYS_PER_WARP;
+ struct hs_cl * hs = malloc(sizeof(*hs) + sizeof(cl_kernel) * count_all);
- hs_bs_launcher(&state,warps_in,cq);
+ memcpy(&hs->config,&target->config,sizeof(hs->config));
- hs_barrier(cq);
+ // save some frequently used calculated values
+ hs->key_val_size = (target->config.words.key + target->config.words.val) * 4;
+ hs->slab_keys = target->config.slab.height << target->config.slab.width_log2;
+ hs->bs_slabs_log2_ru = bs_slabs_log2_ru;
+ hs->bc_slabs_log2_max = bc_slabs_log2_max;
+
+ // save kernel count
+ hs->kernels.count = count_all;
//
- // we're done if only a single bs kernel block was required
+ // create all the kernels and release the program
//
- if (state.bx.ru > HS_BS_WARPS)
- {
- //
- // otherwise... merge sorted spans of warps until done
- //
- uint32_t up_scale_log2 = 1;
+ cl_kernel * kernel_next = hs->kernels.all;
- while (true)
- {
- uint32_t down_warps;
+ //
+ // TRANSPOSE
+ //
+ {
+ hs->kernels.transpose = kernel_next;
- // flip merge warps -- return span of warps that must be cleaned
- uint32_t down_warps_log2 = hs_fm_launcher(&state,
- up_scale_log2,
- &down_warps,
- cq);
+ hs_create_kernel(program,
+ kernel_next,
+ "hs_kernel_transpose");
- hs_barrier(cq);
+ kernel_next += 1;
+ }
- // if span is gt largest slab block cleaner then half merge
- while (down_warps_log2 > HS_BC_WARPS_LOG2_MAX)
- {
- down_warps_log2 = hs_hm_launcher(&state,
- down_warps,
- down_warps_log2,
- cq);
+ //
+ // BS
+ //
+ {
+ hs->kernels.bs = kernel_next;
- hs_barrier(cq);
- }
+ char bs_name[] = { "hs_kernel_bs_X" };
- // launch clean slab grid -- is it the final launch?
- hs_bc_launcher(&state,
- down_warps,
- down_warps_log2,
- cq);
+ hs_create_kernels(program,
+ kernel_next,
+ bs_name,sizeof(bs_name),
+ count_bs);
- hs_barrier(cq);
+ kernel_next += count_bs;
+ }
- // was this the final block clean?
- if (((uint32_t)HS_BS_WARPS << up_scale_log2) >= state.bx.ru)
- break;
+ //
+ // BC
+ //
+ {
+ hs->kernels.bc = kernel_next;
- // otherwise, merge twice as many slabs
- up_scale_log2 += 1;
- }
- }
+ char bc_name[] = { "hs_kernel_bc_X" };
+
+ hs_create_kernels(program,
+ kernel_next,
+ bc_name,sizeof(bc_name),
+ count_bc);
+
+ kernel_next += count_bc;
+ }
- if (linearize)
+ //
+ // FM 0
+ //
+ if (count_fm[0] > 0)
{
- // launch linearize;
- hs_transpose_launcher(&state,cq);
+ hs->kernels.fm[0] = kernel_next;
+
+ char fm_0_name[] = { "hs_kernel_fm_0_X" };
- hs_barrier(cq);
+ hs_create_kernels(program,
+ kernel_next,
+ fm_0_name,sizeof(fm_0_name),
+ count_fm[0]);
+
+ kernel_next += count_fm[0];
}
- HS_EVENT_PROFILE(cq);
-}
+ if (count_fm[1] > 0)
+ {
+ hs->kernels.fm[1] = kernel_next;
-//
-//
-//
+ char fm_1_name[] = { "hs_kernel_fm_1_X" };
-void
-hs_create(cl_context context,
- cl_device_id device_id,
- struct hs_info * const info)
-{
- //
- // create and build the program from source or a precompiled binary
- //
- if (info != NULL)
+ hs_create_kernels(program,
+ kernel_next,
+ fm_1_name,sizeof(fm_1_name),
+ count_fm[1]);
+
+ kernel_next += count_fm[1];
+ }
+
+ if (count_fm[2] > 0)
{
- info->words = HS_KEY_WORDS;
- info->keys = HS_KEYS_PER_LANE;
- info->lanes = HS_LANES_PER_WARP;
+ hs->kernels.fm[2] = kernel_next;
+
+ char fm_2_name[] = { "hs_kernel_fm_2_X" };
+
+ hs_create_kernels(program,
+ kernel_next,
+ fm_2_name,sizeof(fm_2_name),
+ count_fm[2]);
+
+ kernel_next += count_fm[2];
}
-#if defined( HS_KERNEL_SOURCE )
+ if (count_hm[0] > 0)
+ {
+ hs->kernels.hm[0] = kernel_next;
- cl_int err;
+ hs_create_kernel(program,
+ kernel_next,
+ "hs_kernel_hm_0");
+
+ kernel_next += count_hm[0];
+ }
- size_t const strings_sizeof[] = { sizeof(hs_cl_pre_cl) };
- char const * strings[] = { (char*)hs_cl_pre_cl };
+ if (count_hm[1] > 0)
+ {
+ hs->kernels.hm[1] = kernel_next;
- cl_program program = clCreateProgramWithSource(context,
- 1,
- strings,
- strings_sizeof,
- &err);
- cl_ok(err);
+ hs_create_kernel(program,
+ kernel_next,
+ "hs_kernel_hm_1");
- char const * const options =
- "-cl-std=CL2.0 -cl-fast-relaxed-math "
- "-cl-no-signed-zeros -cl-mad-enable "
- "-cl-denorms-are-zero "
- "-cl-kernel-arg-info";
-
- cl(BuildProgram(program,
- 1,
- &device_id,
- options,
- NULL,
- NULL));
-
-#elif defined( HS_KERNEL_BINARY )
-
- cl_int status, err;
-
- size_t const bins_sizeof[] = { sizeof(hs_cl_pre_ir) };
- unsigned char const * bins[] = { hs_cl_pre_ir };
-
- cl_program program = clCreateProgramWithBinary(context,
- 1,
- &device_id,
- bins_sizeof,
- bins,
- &status,
- &err);
- cl_ok(err);
+ kernel_next += count_hm[1];
+ }
- cl(BuildProgram(program,
- 1,
- &device_id,
- NULL,
- NULL,
- NULL));
-#endif
+ if (count_hm[2] > 0)
+ {
+ hs->kernels.hm[2] = kernel_next;
- //
- // create all the kernels and release the program
- //
-#define HS_CREATE_KERNELS(ks) \
- for (uint32_t ii=0; ii<ARRAY_LENGTH(ks); ii++) { \
- ks[ii].kernel = clCreateKernel(program,ks[ii].name,&err); \
- cl_ok(err); \
- }
+ hs_create_kernel(program,
+ kernel_next,
+ "hs_kernel_hm_2");
- HS_CREATE_KERNELS(bs_kernels);
- HS_CREATE_KERNELS(bc_kernels);
- HS_CREATE_KERNELS(fm_kernels);
- HS_CREATE_KERNELS(hm_kernels);
- HS_CREATE_KERNELS(transpose_kernels);
+ kernel_next += count_hm[2]; // unnecessary
+ }
- cl(ReleaseProgram(program));
+ return hs;
}
//
@@ -1123,17 +1138,12 @@ hs_create(cl_context context,
//
void
-hs_release()
+hs_cl_release(struct hs_cl * const hs)
{
-#define HS_RELEASE_KERNELS(ks) \
- for (uint32_t ii=0; ii<ARRAY_LENGTH(ks); ii++) \
- cl(ReleaseKernel(ks[ii].kernel))
-
- HS_RELEASE_KERNELS(bs_kernels);
- HS_RELEASE_KERNELS(bc_kernels);
- HS_RELEASE_KERNELS(fm_kernels);
- HS_RELEASE_KERNELS(hm_kernels);
- HS_RELEASE_KERNELS(transpose_kernels);
+ for (uint32_t ii=0; ii<hs->kernels.count; ii++)
+ cl(ReleaseKernel(hs->kernels.all[ii]));
+
+ free(hs);
}
//
diff --git a/src/compute/hs/cl/hs_cl_launcher.h b/src/compute/hs/cl/hs_cl_launcher.h
index 049657cc2f..33f62d9943 100644
--- a/src/compute/hs/cl/hs_cl_launcher.h
+++ b/src/compute/hs/cl/hs_cl_launcher.h
@@ -17,61 +17,65 @@
#include <stdbool.h>
//
-// Returns some useful info about algorithm's configuration for the
-// target architecture.
+//
//
-struct hs_info
-{
- uint32_t words; // words-per-key (1 = uint, 2 = ulong)
- uint32_t keys; // keys-per-lane
- uint32_t lanes; // lanes-per-warp
-};
+#include "hs_cl_target.h"
//
//
//
-void
-hs_create(cl_context context,
- cl_device_id device_id,
- struct hs_info * const info);
+struct hs_cl *
+hs_cl_create(struct hs_cl_target const * const target,
+ cl_context context,
+ cl_device_id device_id);
+
//
//
//
void
-hs_release();
+hs_cl_release(struct hs_cl * const hs);
//
-// Size the buffers.
+// Determine what padding will be applied to the input and output
+// buffers.
+//
+// Always check to see if the allocated buffers are large enough.
+//
+// count : number of keys
+// count + count_padded_in : additional keys required for sorting
+// count + count_padded_out : additional keys required for merging
//
void
-hs_pad(uint32_t const count,
- uint32_t * const count_padded_in,
- uint32_t * const count_padded_out);
+hs_cl_pad(struct hs_cl const * const hs,
+ uint32_t const count,
+ uint32_t * const count_padded_in,
+ uint32_t * const count_padded_out);
//
// Sort the keys in the vin buffer and store them in the vout buffer.
//
-// The vin and vout buffers can be the same buffer.
-//
-// If it is necessary, a barrier should be enqueued before running
-// hs_sort().
+// If vout is NULL then the sort will be performed in place.
//
-// A final barrier will enqueued before returning.
+// The implementation assumes the command queue is out-of-order.
//
void
-hs_sort(cl_command_queue cq, // out-of-order cq
- cl_mem vin,
- cl_mem vout,
- uint32_t const count,
- uint32_t const count_padded_in,
- uint32_t const count_padded_out,
- bool const linearize);
+hs_cl_sort(struct hs_cl const * const hs,
+ cl_command_queue cq,
+ uint32_t const wait_list_size,
+ cl_event * wait_list,
+ cl_event * event,
+ cl_mem vin,
+ cl_mem vout,
+ uint32_t const count,
+ uint32_t const count_padded_in,
+ uint32_t const count_padded_out,
+ bool const linearize);
//
//
diff --git a/src/compute/hs/cl/hs_cl_target.h b/src/compute/hs/cl/hs_cl_target.h
new file mode 100644
index 0000000000..b7bb73e0d3
--- /dev/null
+++ b/src/compute/hs/cl/hs_cl_target.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+//
+//
+//
+
+#include <stdint.h>
+
+//
+// This structure packages all of the parameters and kernels for a
+// target architecture.
+//
+
+struct hs_cl_target_config
+{
+ struct {
+ uint8_t threads_log2;
+ uint8_t width_log2;
+ uint8_t height;
+ } slab;
+
+ struct {
+ uint8_t key;
+ uint8_t val;
+ } words;
+
+ struct {
+ uint8_t slabs;
+ } block;
+
+ struct {
+ struct {
+ uint8_t scale_min;
+ uint8_t scale_max;
+ } fm;
+ struct {
+ uint8_t scale_min;
+ uint8_t scale_max;
+ } hm;
+ } merge;
+};
+
+//
+//
+//
+
+struct hs_cl_target
+{
+ struct hs_cl_target_config config;
+ uint8_t program[];
+};
+
+//
+//
+//
diff --git a/src/compute/hs/cl/intel/gen8/u32/make_all.bat b/src/compute/hs/cl/intel/gen8/u32/make_all.bat
new file mode 100644
index 0000000000..a68057af0e
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen8/u32/make_all.bat
@@ -0,0 +1,16 @@
+@ECHO OFF
+
+SET HS_GEN=..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+CMD /C make_inl_cl.bat hs_cl.cl
diff --git a/src/compute/hs/cl/gen9/make_inl_cl.bat b/src/compute/hs/cl/intel/gen8/u32/make_inl_cl.bat
index 76cb6e569e..54b1aac48f 100644
--- a/src/compute/hs/cl/gen9/make_inl_cl.bat
+++ b/src/compute/hs/cl/intel/gen8/u32/make_inl_cl.bat
@@ -1,4 +1,3 @@
-
@ECHO OFF
::
diff --git a/src/compute/hs/cl/intel/gen8/u32b32/make_all.bat b/src/compute/hs/cl/intel/gen8/u32b32/make_all.bat
new file mode 100644
index 0000000000..a68057af0e
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen8/u32b32/make_all.bat
@@ -0,0 +1,16 @@
+@ECHO OFF
+
+SET HS_GEN=..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+CMD /C make_inl_cl.bat hs_cl.cl
diff --git a/src/compute/hs/cl/intel/gen8/u32b32/make_inl_cl.bat b/src/compute/hs/cl/intel/gen8/u32b32/make_inl_cl.bat
new file mode 100644
index 0000000000..54b1aac48f
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen8/u32b32/make_inl_cl.bat
@@ -0,0 +1,77 @@
+@ECHO OFF
+
+::
+::
+::
+
+SET OPENCL_STD=-cl-std=CL1.2
+SET OPENCL_PRE=__OPENCL_C_VERSION__=120
+
+:: SET OPENCL_STD=-cl-std=CL2.0
+:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200
+
+::
+::
+::
+
+SET IOC=ioc64
+
+::
+::
+::
+
+SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info
+
+SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g
+
+SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT%
+
+::
+::
+::
+
+SET PRE_DIR=%~p1
+
+CD %PRE_DIR%
+
+SET PRE_CL=%~n1
+SET PRE_CL=%PRE_CL%.pre.cl
+
+SET PRE_SRC_INL=%~n1
+SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl
+
+SET PRE_BIN_IR=%~n1
+SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir
+
+SET PRE_BIN_INL=%~n1
+SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl
+
+::
+:: *.pre.cl
+:: *.pre.src.inl
+::
+
+CMD /C clang-format -style=Mozilla -i %1
+CMD /C cl -I . -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%"
+CMD /C clang-format -style=Mozilla -i %PRE_CL%
+CMD /C dos2unix -q %PRE_CL%
+CMD /C xxd -i %PRE_CL% %PRE_SRC_INL%
+
+echo %PRE_CL%
+echo %PRE_SRC_INL%
+
+::
+:: *.pre.cl
+:: *.pre.src.inl
+::
+
+CMD /C touch %PRE_BIN_IR%
+ECHO ON
+@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR%
+@ECHO OFF
+CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL%
+
+echo %PRE_BIN_IR%
+echo %PRE_BIN_INL%
+
+
diff --git a/src/compute/hs/cl/intel/gen8/u64/hs_cl.cl b/src/compute/hs/cl/intel/gen8/u64/hs_cl.cl
new file mode 100644
index 0000000000..b994d8276f
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen8/u64/hs_cl.cl
@@ -0,0 +1,4851 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#include "hs_cl_macros.h"
+
+//
+//
+//
+
+HS_TRANSPOSE_KERNEL_PROTO(8)
+{
+ HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+ HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 8, 0);
+ HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 8, 1);
+ HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 8, 2);
+ HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 8, 3);
+ HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 8, 4);
+ HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 8, 5);
+ HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 8, 6);
+ HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 8, 7);
+ HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8, 8);
+ HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 8, 9);
+ HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 8, 10);
+ HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 8, 11);
+ HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 8, 12);
+ HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 8, 13);
+ HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 8, 14);
+ HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 8, 15);
+ HS_TRANSPOSE_SLAB()
+}
+
+HS_BS_KERNEL_PROTO(8, 16, 4)
+{
+ HS_BLOCK_LOCAL_MEM_DECL(128, 16);
+
+ HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+ HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 8, 0);
+ HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 8, 1);
+ HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 8, 2);
+ HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 8, 3);
+ HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 8, 4);
+ HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 8, 5);
+ HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 8, 6);
+ HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 8, 7);
+ HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8, 8);
+ HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 8, 9);
+ HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 8, 10);
+ HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 8, 11);
+ HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 8, 12);
+ HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 8, 13);
+ HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 8, 14);
+ HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 8, 15);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r6, r11);
+ HS_CMP_XCHG(r7, r10);
+ HS_CMP_XCHG(r4, r13);
+ HS_CMP_XCHG(r14, r15);
+ HS_CMP_XCHG(r8, r12);
+ HS_CMP_XCHG(r2, r3);
+ HS_CMP_XCHG(r5, r9);
+ HS_CMP_XCHG(r2, r5);
+ HS_CMP_XCHG(r8, r14);
+ HS_CMP_XCHG(r3, r9);
+ HS_CMP_XCHG(r12, r15);
+ HS_CMP_XCHG(r3, r5);
+ HS_CMP_XCHG(r6, r7);
+ HS_CMP_XCHG(r10, r11);
+ HS_CMP_XCHG(r12, r14);
+ HS_CMP_XCHG(r4, r9);
+ HS_CMP_XCHG(r8, r13);
+ HS_CMP_XCHG(r7, r9);
+ HS_CMP_XCHG(r11, r13);
+ HS_CMP_XCHG(r4, r6);
+ HS_CMP_XCHG(r8, r10);
+ HS_CMP_XCHG(r4, r5);
+ HS_CMP_XCHG(r6, r7);
+ HS_CMP_XCHG(r8, r9);
+ HS_CMP_XCHG(r10, r11);
+ HS_CMP_XCHG(r12, r13);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ {
+ HS_SLAB_FLIP_PREAMBLE(1);
+ HS_CMP_FLIP(0, r1, r16);
+ HS_CMP_FLIP(1, r2, r15);
+ HS_CMP_FLIP(2, r3, r14);
+ HS_CMP_FLIP(3, r4, r13);
+ HS_CMP_FLIP(4, r5, r12);
+ HS_CMP_FLIP(5, r6, r11);
+ HS_CMP_FLIP(6, r7, r10);
+ HS_CMP_FLIP(7, r8, r9);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ {
+ HS_SLAB_FLIP_PREAMBLE(3);
+ HS_CMP_FLIP(0, r1, r16);
+ HS_CMP_FLIP(1, r2, r15);
+ HS_CMP_FLIP(2, r3, r14);
+ HS_CMP_FLIP(3, r4, r13);
+ HS_CMP_FLIP(4, r5, r12);
+ HS_CMP_FLIP(5, r6, r11);
+ HS_CMP_FLIP(6, r7, r10);
+ HS_CMP_FLIP(7, r8, r9);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ {
+ HS_SLAB_FLIP_PREAMBLE(7);
+ HS_CMP_FLIP(0, r1, r16);
+ HS_CMP_FLIP(1, r2, r15);
+ HS_CMP_FLIP(2, r3, r14);
+ HS_CMP_FLIP(3, r4, r13);
+ HS_CMP_FLIP(4, r5, r12);
+ HS_CMP_FLIP(5, r6, r11);
+ HS_CMP_FLIP(6, r7, r10);
+ HS_CMP_FLIP(7, r8, r9);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_BS_MERGE_H_PREAMBLE(8, 16);
+ HS_BX_LOCAL_V(16 * 8 * 0) = r1;
+ HS_BX_LOCAL_V(16 * 8 * 1) = r16;
+ HS_BX_LOCAL_V(16 * 8 * 2) = r2;
+ HS_BX_LOCAL_V(16 * 8 * 3) = r15;
+ HS_BX_LOCAL_V(16 * 8 * 4) = r3;
+ HS_BX_LOCAL_V(16 * 8 * 5) = r14;
+ HS_BX_LOCAL_V(16 * 8 * 6) = r4;
+ HS_BX_LOCAL_V(16 * 8 * 7) = r13;
+ HS_BX_LOCAL_V(16 * 8 * 8) = r5;
+ HS_BX_LOCAL_V(16 * 8 * 9) = r12;
+ HS_BX_LOCAL_V(16 * 8 * 10) = r6;
+ HS_BX_LOCAL_V(16 * 8 * 11) = r11;
+ HS_BX_LOCAL_V(16 * 8 * 12) = r7;
+ HS_BX_LOCAL_V(16 * 8 * 13) = r10;
+ HS_BX_LOCAL_V(16 * 8 * 14) = r8;
+ HS_BX_LOCAL_V(16 * 8 * 15) = r9;
+ HS_BLOCK_BARRIER();
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_R(8) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_SLAB_LOCAL_L(16) = r1_1;
+ HS_SLAB_LOCAL_R(24) = r1_2;
+ }
+ {
+ HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(32);
+ HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(40);
+ HS_CMP_XCHG(r2_1, r2_2);
+ HS_SLAB_LOCAL_L(32) = r2_1;
+ HS_SLAB_LOCAL_R(40) = r2_2;
+ }
+ {
+ HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(48);
+ HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(56);
+ HS_CMP_XCHG(r3_1, r3_2);
+ HS_SLAB_LOCAL_L(48) = r3_1;
+ HS_SLAB_LOCAL_R(56) = r3_2;
+ }
+ {
+ HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(64);
+ HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(72);
+ HS_CMP_XCHG(r4_1, r4_2);
+ HS_SLAB_LOCAL_L(64) = r4_1;
+ HS_SLAB_LOCAL_R(72) = r4_2;
+ }
+ {
+ HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(80);
+ HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(88);
+ HS_CMP_XCHG(r5_1, r5_2);
+ HS_SLAB_LOCAL_L(80) = r5_1;
+ HS_SLAB_LOCAL_R(88) = r5_2;
+ }
+ {
+ HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(96);
+ HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(104);
+ HS_CMP_XCHG(r6_1, r6_2);
+ HS_SLAB_LOCAL_L(96) = r6_1;
+ HS_SLAB_LOCAL_R(104) = r6_2;
+ }
+ {
+ HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(112);
+ HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(120);
+ HS_CMP_XCHG(r7_1, r7_2);
+ HS_SLAB_LOCAL_L(112) = r7_1;
+ HS_SLAB_LOCAL_R(120) = r7_2;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ r1 = HS_BX_LOCAL_V(16 * 8 * 0);
+ r16 = HS_BX_LOCAL_V(16 * 8 * 1);
+ r2 = HS_BX_LOCAL_V(16 * 8 * 2);
+ r15 = HS_BX_LOCAL_V(16 * 8 * 3);
+ r3 = HS_BX_LOCAL_V(16 * 8 * 4);
+ r14 = HS_BX_LOCAL_V(16 * 8 * 5);
+ r4 = HS_BX_LOCAL_V(16 * 8 * 6);
+ r13 = HS_BX_LOCAL_V(16 * 8 * 7);
+ r5 = HS_BX_LOCAL_V(16 * 8 * 8);
+ r12 = HS_BX_LOCAL_V(16 * 8 * 9);
+ r6 = HS_BX_LOCAL_V(16 * 8 * 10);
+ r11 = HS_BX_LOCAL_V(16 * 8 * 11);
+ r7 = HS_BX_LOCAL_V(16 * 8 * 12);
+ r10 = HS_BX_LOCAL_V(16 * 8 * 13);
+ r8 = HS_BX_LOCAL_V(16 * 8 * 14);
+ r9 = HS_BX_LOCAL_V(16 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ HS_BX_LOCAL_V(16 * 8 * 0) = r1;
+ HS_BX_LOCAL_V(16 * 8 * 1) = r16;
+ HS_BX_LOCAL_V(16 * 8 * 2) = r2;
+ HS_BX_LOCAL_V(16 * 8 * 3) = r15;
+ HS_BX_LOCAL_V(16 * 8 * 4) = r3;
+ HS_BX_LOCAL_V(16 * 8 * 5) = r14;
+ HS_BX_LOCAL_V(16 * 8 * 6) = r4;
+ HS_BX_LOCAL_V(16 * 8 * 7) = r13;
+ HS_BX_LOCAL_V(16 * 8 * 8) = r5;
+ HS_BX_LOCAL_V(16 * 8 * 9) = r12;
+ HS_BX_LOCAL_V(16 * 8 * 10) = r6;
+ HS_BX_LOCAL_V(16 * 8 * 11) = r11;
+ HS_BX_LOCAL_V(16 * 8 * 12) = r7;
+ HS_BX_LOCAL_V(16 * 8 * 13) = r10;
+ HS_BX_LOCAL_V(16 * 8 * 14) = r8;
+ HS_BX_LOCAL_V(16 * 8 * 15) = r9;
+ HS_BLOCK_BARRIER();
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24);
+ HS_CMP_XCHG(r0_2, r0_3);
+ HS_CMP_XCHG(r0_1, r0_4);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_L(8) = r0_2;
+ HS_SLAB_LOCAL_R(16) = r0_3;
+ HS_SLAB_LOCAL_R(24) = r0_4;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(40);
+ HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(48);
+ HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(56);
+ HS_CMP_XCHG(r1_2, r1_3);
+ HS_CMP_XCHG(r1_1, r1_4);
+ HS_CMP_XCHG(r1_3, r1_4);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_SLAB_LOCAL_L(32) = r1_1;
+ HS_SLAB_LOCAL_L(40) = r1_2;
+ HS_SLAB_LOCAL_R(48) = r1_3;
+ HS_SLAB_LOCAL_R(56) = r1_4;
+ }
+ {
+ HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(64);
+ HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(72);
+ HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(80);
+ HS_KEY_TYPE r2_4 = HS_SLAB_LOCAL_R(88);
+ HS_CMP_XCHG(r2_2, r2_3);
+ HS_CMP_XCHG(r2_1, r2_4);
+ HS_CMP_XCHG(r2_3, r2_4);
+ HS_CMP_XCHG(r2_1, r2_2);
+ HS_SLAB_LOCAL_L(64) = r2_1;
+ HS_SLAB_LOCAL_L(72) = r2_2;
+ HS_SLAB_LOCAL_R(80) = r2_3;
+ HS_SLAB_LOCAL_R(88) = r2_4;
+ }
+ {
+ HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(96);
+ HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(104);
+ HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(112);
+ HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(120);
+ HS_CMP_XCHG(r3_2, r3_3);
+ HS_CMP_XCHG(r3_1, r3_4);
+ HS_CMP_XCHG(r3_3, r3_4);
+ HS_CMP_XCHG(r3_1, r3_2);
+ HS_SLAB_LOCAL_L(96) = r3_1;
+ HS_SLAB_LOCAL_L(104) = r3_2;
+ HS_SLAB_LOCAL_R(112) = r3_3;
+ HS_SLAB_LOCAL_R(120) = r3_4;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ r1 = HS_BX_LOCAL_V(16 * 8 * 0);
+ r16 = HS_BX_LOCAL_V(16 * 8 * 1);
+ r2 = HS_BX_LOCAL_V(16 * 8 * 2);
+ r15 = HS_BX_LOCAL_V(16 * 8 * 3);
+ r3 = HS_BX_LOCAL_V(16 * 8 * 4);
+ r14 = HS_BX_LOCAL_V(16 * 8 * 5);
+ r4 = HS_BX_LOCAL_V(16 * 8 * 6);
+ r13 = HS_BX_LOCAL_V(16 * 8 * 7);
+ r5 = HS_BX_LOCAL_V(16 * 8 * 8);
+ r12 = HS_BX_LOCAL_V(16 * 8 * 9);
+ r6 = HS_BX_LOCAL_V(16 * 8 * 10);
+ r11 = HS_BX_LOCAL_V(16 * 8 * 11);
+ r7 = HS_BX_LOCAL_V(16 * 8 * 12);
+ r10 = HS_BX_LOCAL_V(16 * 8 * 13);
+ r8 = HS_BX_LOCAL_V(16 * 8 * 14);
+ r9 = HS_BX_LOCAL_V(16 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ HS_BX_LOCAL_V(16 * 8 * 0) = r1;
+ HS_BX_LOCAL_V(16 * 8 * 1) = r16;
+ HS_BX_LOCAL_V(16 * 8 * 2) = r2;
+ HS_BX_LOCAL_V(16 * 8 * 3) = r15;
+ HS_BX_LOCAL_V(16 * 8 * 4) = r3;
+ HS_BX_LOCAL_V(16 * 8 * 5) = r14;
+ HS_BX_LOCAL_V(16 * 8 * 6) = r4;
+ HS_BX_LOCAL_V(16 * 8 * 7) = r13;
+ HS_BX_LOCAL_V(16 * 8 * 8) = r5;
+ HS_BX_LOCAL_V(16 * 8 * 9) = r12;
+ HS_BX_LOCAL_V(16 * 8 * 10) = r6;
+ HS_BX_LOCAL_V(16 * 8 * 11) = r11;
+ HS_BX_LOCAL_V(16 * 8 * 12) = r7;
+ HS_BX_LOCAL_V(16 * 8 * 13) = r10;
+ HS_BX_LOCAL_V(16 * 8 * 14) = r8;
+ HS_BX_LOCAL_V(16 * 8 * 15) = r9;
+ HS_BLOCK_BARRIER();
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24);
+ HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(32);
+ HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(40);
+ HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(48);
+ HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(56);
+ HS_CMP_XCHG(r0_4, r0_5);
+ HS_CMP_XCHG(r0_3, r0_6);
+ HS_CMP_XCHG(r0_2, r0_7);
+ HS_CMP_XCHG(r0_1, r0_8);
+ HS_CMP_XCHG(r0_5, r0_7);
+ HS_CMP_XCHG(r0_6, r0_8);
+ HS_CMP_XCHG(r0_5, r0_6);
+ HS_CMP_XCHG(r0_7, r0_8);
+ HS_CMP_XCHG(r0_1, r0_3);
+ HS_CMP_XCHG(r0_2, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_L(8) = r0_2;
+ HS_SLAB_LOCAL_L(16) = r0_3;
+ HS_SLAB_LOCAL_L(24) = r0_4;
+ HS_SLAB_LOCAL_R(32) = r0_5;
+ HS_SLAB_LOCAL_R(40) = r0_6;
+ HS_SLAB_LOCAL_R(48) = r0_7;
+ HS_SLAB_LOCAL_R(56) = r0_8;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(72);
+ HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_L(80);
+ HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(88);
+ HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(96);
+ HS_KEY_TYPE r1_6 = HS_SLAB_LOCAL_R(104);
+ HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(112);
+ HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(120);
+ HS_CMP_XCHG(r1_4, r1_5);
+ HS_CMP_XCHG(r1_3, r1_6);
+ HS_CMP_XCHG(r1_2, r1_7);
+ HS_CMP_XCHG(r1_1, r1_8);
+ HS_CMP_XCHG(r1_5, r1_7);
+ HS_CMP_XCHG(r1_6, r1_8);
+ HS_CMP_XCHG(r1_5, r1_6);
+ HS_CMP_XCHG(r1_7, r1_8);
+ HS_CMP_XCHG(r1_1, r1_3);
+ HS_CMP_XCHG(r1_2, r1_4);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_CMP_XCHG(r1_3, r1_4);
+ HS_SLAB_LOCAL_L(64) = r1_1;
+ HS_SLAB_LOCAL_L(72) = r1_2;
+ HS_SLAB_LOCAL_L(80) = r1_3;
+ HS_SLAB_LOCAL_L(88) = r1_4;
+ HS_SLAB_LOCAL_R(96) = r1_5;
+ HS_SLAB_LOCAL_R(104) = r1_6;
+ HS_SLAB_LOCAL_R(112) = r1_7;
+ HS_SLAB_LOCAL_R(120) = r1_8;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ r1 = HS_BX_LOCAL_V(16 * 8 * 0);
+ r16 = HS_BX_LOCAL_V(16 * 8 * 1);
+ r2 = HS_BX_LOCAL_V(16 * 8 * 2);
+ r15 = HS_BX_LOCAL_V(16 * 8 * 3);
+ r3 = HS_BX_LOCAL_V(16 * 8 * 4);
+ r14 = HS_BX_LOCAL_V(16 * 8 * 5);
+ r4 = HS_BX_LOCAL_V(16 * 8 * 6);
+ r13 = HS_BX_LOCAL_V(16 * 8 * 7);
+ r5 = HS_BX_LOCAL_V(16 * 8 * 8);
+ r12 = HS_BX_LOCAL_V(16 * 8 * 9);
+ r6 = HS_BX_LOCAL_V(16 * 8 * 10);
+ r11 = HS_BX_LOCAL_V(16 * 8 * 11);
+ r7 = HS_BX_LOCAL_V(16 * 8 * 12);
+ r10 = HS_BX_LOCAL_V(16 * 8 * 13);
+ r8 = HS_BX_LOCAL_V(16 * 8 * 14);
+ r9 = HS_BX_LOCAL_V(16 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ HS_BX_LOCAL_V(16 * 8 * 0) = r1;
+ HS_BX_LOCAL_V(16 * 8 * 1) = r16;
+ HS_BX_LOCAL_V(16 * 8 * 2) = r2;
+ HS_BX_LOCAL_V(16 * 8 * 3) = r15;
+ HS_BX_LOCAL_V(16 * 8 * 4) = r3;
+ HS_BX_LOCAL_V(16 * 8 * 5) = r14;
+ HS_BX_LOCAL_V(16 * 8 * 6) = r4;
+ HS_BX_LOCAL_V(16 * 8 * 7) = r13;
+ HS_BX_LOCAL_V(16 * 8 * 8) = r5;
+ HS_BX_LOCAL_V(16 * 8 * 9) = r12;
+ HS_BX_LOCAL_V(16 * 8 * 10) = r6;
+ HS_BX_LOCAL_V(16 * 8 * 11) = r11;
+ HS_BX_LOCAL_V(16 * 8 * 12) = r7;
+ HS_BX_LOCAL_V(16 * 8 * 13) = r10;
+ HS_BX_LOCAL_V(16 * 8 * 14) = r8;
+ HS_BX_LOCAL_V(16 * 8 * 15) = r9;
+ HS_BLOCK_BARRIER();
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24);
+ HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(32);
+ HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(40);
+ HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(48);
+ HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(56);
+ HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(64);
+ HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(72);
+ HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(80);
+ HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(88);
+ HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(96);
+ HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(104);
+ HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(112);
+ HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(120);
+ HS_CMP_XCHG(r0_8, r0_9);
+ HS_CMP_XCHG(r0_7, r0_10);
+ HS_CMP_XCHG(r0_6, r0_11);
+ HS_CMP_XCHG(r0_5, r0_12);
+ HS_CMP_XCHG(r0_4, r0_13);
+ HS_CMP_XCHG(r0_3, r0_14);
+ HS_CMP_XCHG(r0_2, r0_15);
+ HS_CMP_XCHG(r0_1, r0_16);
+ HS_CMP_XCHG(r0_9, r0_13);
+ HS_CMP_XCHG(r0_11, r0_15);
+ HS_CMP_XCHG(r0_9, r0_11);
+ HS_CMP_XCHG(r0_13, r0_15);
+ HS_CMP_XCHG(r0_10, r0_14);
+ HS_CMP_XCHG(r0_12, r0_16);
+ HS_CMP_XCHG(r0_10, r0_12);
+ HS_CMP_XCHG(r0_14, r0_16);
+ HS_CMP_XCHG(r0_9, r0_10);
+ HS_CMP_XCHG(r0_11, r0_12);
+ HS_CMP_XCHG(r0_13, r0_14);
+ HS_CMP_XCHG(r0_15, r0_16);
+ HS_CMP_XCHG(r0_1, r0_5);
+ HS_CMP_XCHG(r0_3, r0_7);
+ HS_CMP_XCHG(r0_1, r0_3);
+ HS_CMP_XCHG(r0_5, r0_7);
+ HS_CMP_XCHG(r0_2, r0_6);
+ HS_CMP_XCHG(r0_4, r0_8);
+ HS_CMP_XCHG(r0_2, r0_4);
+ HS_CMP_XCHG(r0_6, r0_8);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_CMP_XCHG(r0_5, r0_6);
+ HS_CMP_XCHG(r0_7, r0_8);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_L(8) = r0_2;
+ HS_SLAB_LOCAL_L(16) = r0_3;
+ HS_SLAB_LOCAL_L(24) = r0_4;
+ HS_SLAB_LOCAL_L(32) = r0_5;
+ HS_SLAB_LOCAL_L(40) = r0_6;
+ HS_SLAB_LOCAL_L(48) = r0_7;
+ HS_SLAB_LOCAL_L(56) = r0_8;
+ HS_SLAB_LOCAL_R(64) = r0_9;
+ HS_SLAB_LOCAL_R(72) = r0_10;
+ HS_SLAB_LOCAL_R(80) = r0_11;
+ HS_SLAB_LOCAL_R(88) = r0_12;
+ HS_SLAB_LOCAL_R(96) = r0_13;
+ HS_SLAB_LOCAL_R(104) = r0_14;
+ HS_SLAB_LOCAL_R(112) = r0_15;
+ HS_SLAB_LOCAL_R(120) = r0_16;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ r1 = HS_BX_LOCAL_V(16 * 8 * 0);
+ r16 = HS_BX_LOCAL_V(16 * 8 * 1);
+ r2 = HS_BX_LOCAL_V(16 * 8 * 2);
+ r15 = HS_BX_LOCAL_V(16 * 8 * 3);
+ r3 = HS_BX_LOCAL_V(16 * 8 * 4);
+ r14 = HS_BX_LOCAL_V(16 * 8 * 5);
+ r4 = HS_BX_LOCAL_V(16 * 8 * 6);
+ r13 = HS_BX_LOCAL_V(16 * 8 * 7);
+ r5 = HS_BX_LOCAL_V(16 * 8 * 8);
+ r12 = HS_BX_LOCAL_V(16 * 8 * 9);
+ r6 = HS_BX_LOCAL_V(16 * 8 * 10);
+ r11 = HS_BX_LOCAL_V(16 * 8 * 11);
+ r7 = HS_BX_LOCAL_V(16 * 8 * 12);
+ r10 = HS_BX_LOCAL_V(16 * 8 * 13);
+ r8 = HS_BX_LOCAL_V(16 * 8 * 14);
+ r9 = HS_BX_LOCAL_V(16 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ HS_SLAB_GLOBAL_STORE(8, 0, r1);
+ HS_SLAB_GLOBAL_STORE(8, 1, r2);
+ HS_SLAB_GLOBAL_STORE(8, 2, r3);
+ HS_SLAB_GLOBAL_STORE(8, 3, r4);
+ HS_SLAB_GLOBAL_STORE(8, 4, r5);
+ HS_SLAB_GLOBAL_STORE(8, 5, r6);
+ HS_SLAB_GLOBAL_STORE(8, 6, r7);
+ HS_SLAB_GLOBAL_STORE(8, 7, r8);
+ HS_SLAB_GLOBAL_STORE(8, 8, r9);
+ HS_SLAB_GLOBAL_STORE(8, 9, r10);
+ HS_SLAB_GLOBAL_STORE(8, 10, r11);
+ HS_SLAB_GLOBAL_STORE(8, 11, r12);
+ HS_SLAB_GLOBAL_STORE(8, 12, r13);
+ HS_SLAB_GLOBAL_STORE(8, 13, r14);
+ HS_SLAB_GLOBAL_STORE(8, 14, r15);
+ HS_SLAB_GLOBAL_STORE(8, 15, r16);
+}
+
+// Generated HotSort "block sort" (BS) kernel variant.  Arguments to the
+// proto macro appear to encode: slab width 8, 8 slabs per workgroup, log2
+// of the slab count (3) -- TODO confirm against HS_BS_KERNEL_PROTO in
+// hs_cl_macros.h.  Each lane loads a 16-key column, sorts it with a fixed
+// compare-exchange network, then the workgroup's slabs are merged through
+// local memory in three barrier-separated rounds of doubling span.
+// NOTE(review): this file is machine-generated -- do not hand-edit the
+// network; regenerate instead (see make_all.bat in this directory).
+HS_BS_KERNEL_PROTO(8, 8, 3)
+{
+ HS_BLOCK_LOCAL_MEM_DECL(64, 16);
+
+ HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+ // Load this lane's 16 keys of the slab from the global input 'vin'.
+ HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 8, 0);
+ HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 8, 1);
+ HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 8, 2);
+ HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 8, 3);
+ HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 8, 4);
+ HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 8, 5);
+ HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 8, 6);
+ HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 8, 7);
+ HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8, 8);
+ HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 8, 9);
+ HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 8, 10);
+ HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 8, 11);
+ HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 8, 12);
+ HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 8, 13);
+ HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 8, 14);
+ HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 8, 15);
+ // Fixed 16-element sorting network over the per-lane registers.  The
+ // exact comparator sequence is generator-chosen; order is significant.
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r6, r11);
+ HS_CMP_XCHG(r7, r10);
+ HS_CMP_XCHG(r4, r13);
+ HS_CMP_XCHG(r14, r15);
+ HS_CMP_XCHG(r8, r12);
+ HS_CMP_XCHG(r2, r3);
+ HS_CMP_XCHG(r5, r9);
+ HS_CMP_XCHG(r2, r5);
+ HS_CMP_XCHG(r8, r14);
+ HS_CMP_XCHG(r3, r9);
+ HS_CMP_XCHG(r12, r15);
+ HS_CMP_XCHG(r3, r5);
+ HS_CMP_XCHG(r6, r7);
+ HS_CMP_XCHG(r10, r11);
+ HS_CMP_XCHG(r12, r14);
+ HS_CMP_XCHG(r4, r9);
+ HS_CMP_XCHG(r8, r13);
+ HS_CMP_XCHG(r7, r9);
+ HS_CMP_XCHG(r11, r13);
+ HS_CMP_XCHG(r4, r6);
+ HS_CMP_XCHG(r8, r10);
+ HS_CMP_XCHG(r4, r5);
+ HS_CMP_XCHG(r6, r7);
+ HS_CMP_XCHG(r8, r9);
+ HS_CMP_XCHG(r10, r11);
+ HS_CMP_XCHG(r12, r13);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ // Cross-lane merge within the slab: "flip" exchanges pair registers
+ // across lanes at span 1 (bitonic-style flip stage -- see
+ // HS_SLAB_FLIP_PREAMBLE / HS_CMP_FLIP in the macros header).
+ {
+ HS_SLAB_FLIP_PREAMBLE(1);
+ HS_CMP_FLIP(0, r1, r16);
+ HS_CMP_FLIP(1, r2, r15);
+ HS_CMP_FLIP(2, r3, r14);
+ HS_CMP_FLIP(3, r4, r13);
+ HS_CMP_FLIP(4, r5, r12);
+ HS_CMP_FLIP(5, r6, r11);
+ HS_CMP_FLIP(6, r7, r10);
+ HS_CMP_FLIP(7, r8, r9);
+ }
+ // In-register merge cleanup after the flip (repeated after each
+ // cross-lane stage below with the identical comparator sequence).
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ // Flip at span 3, then "half" stages at decreasing spans (1) -- the
+ // standard flip-then-half cascade of a bitonic-style merge.
+ {
+ HS_SLAB_FLIP_PREAMBLE(3);
+ HS_CMP_FLIP(0, r1, r16);
+ HS_CMP_FLIP(1, r2, r15);
+ HS_CMP_FLIP(2, r3, r14);
+ HS_CMP_FLIP(3, r4, r13);
+ HS_CMP_FLIP(4, r5, r12);
+ HS_CMP_FLIP(5, r6, r11);
+ HS_CMP_FLIP(6, r7, r10);
+ HS_CMP_FLIP(7, r8, r9);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ // Final cross-lane round for the 8-wide slab: flip at span 7,
+ // then half stages at spans 2 and 1.
+ {
+ HS_SLAB_FLIP_PREAMBLE(7);
+ HS_CMP_FLIP(0, r1, r16);
+ HS_CMP_FLIP(1, r2, r15);
+ HS_CMP_FLIP(2, r3, r14);
+ HS_CMP_FLIP(3, r4, r13);
+ HS_CMP_FLIP(4, r5, r12);
+ HS_CMP_FLIP(5, r6, r11);
+ HS_CMP_FLIP(6, r7, r10);
+ HS_CMP_FLIP(7, r8, r9);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ // --- Horizontal merge of the workgroup's 8 slabs via local memory. ---
+ // Round 1 (span 2): spill registers in the generator's interleaved
+ // order (r1,r16,r2,r15,...) so neighboring slabs form bitonic pairs.
+ HS_BS_MERGE_H_PREAMBLE(8, 8);
+ HS_BX_LOCAL_V(8 * 8 * 0) = r1;
+ HS_BX_LOCAL_V(8 * 8 * 1) = r16;
+ HS_BX_LOCAL_V(8 * 8 * 2) = r2;
+ HS_BX_LOCAL_V(8 * 8 * 3) = r15;
+ HS_BX_LOCAL_V(8 * 8 * 4) = r3;
+ HS_BX_LOCAL_V(8 * 8 * 5) = r14;
+ HS_BX_LOCAL_V(8 * 8 * 6) = r4;
+ HS_BX_LOCAL_V(8 * 8 * 7) = r13;
+ HS_BX_LOCAL_V(8 * 8 * 8) = r5;
+ HS_BX_LOCAL_V(8 * 8 * 9) = r12;
+ HS_BX_LOCAL_V(8 * 8 * 10) = r6;
+ HS_BX_LOCAL_V(8 * 8 * 11) = r11;
+ HS_BX_LOCAL_V(8 * 8 * 12) = r7;
+ HS_BX_LOCAL_V(8 * 8 * 13) = r10;
+ HS_BX_LOCAL_V(8 * 8 * 14) = r8;
+ HS_BX_LOCAL_V(8 * 8 * 15) = r9;
+ HS_BLOCK_BARRIER();
+ // 2-key compare-exchanges between left/right halves in local memory.
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_R(8) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_SLAB_LOCAL_L(16) = r1_1;
+ HS_SLAB_LOCAL_R(24) = r1_2;
+ }
+ {
+ HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(32);
+ HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(40);
+ HS_CMP_XCHG(r2_1, r2_2);
+ HS_SLAB_LOCAL_L(32) = r2_1;
+ HS_SLAB_LOCAL_R(40) = r2_2;
+ }
+ {
+ HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(48);
+ HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(56);
+ HS_CMP_XCHG(r3_1, r3_2);
+ HS_SLAB_LOCAL_L(48) = r3_1;
+ HS_SLAB_LOCAL_R(56) = r3_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(520);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(512) = r0_1;
+ HS_SLAB_LOCAL_R(520) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(528);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(536);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_SLAB_LOCAL_L(528) = r1_1;
+ HS_SLAB_LOCAL_R(536) = r1_2;
+ }
+ {
+ HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(544);
+ HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(552);
+ HS_CMP_XCHG(r2_1, r2_2);
+ HS_SLAB_LOCAL_L(544) = r2_1;
+ HS_SLAB_LOCAL_R(552) = r2_2;
+ }
+ {
+ HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(560);
+ HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(568);
+ HS_CMP_XCHG(r3_1, r3_2);
+ HS_SLAB_LOCAL_L(560) = r3_1;
+ HS_SLAB_LOCAL_R(568) = r3_2;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ // Reload in the same interleaved order used for the spill above.
+ r1 = HS_BX_LOCAL_V(8 * 8 * 0);
+ r16 = HS_BX_LOCAL_V(8 * 8 * 1);
+ r2 = HS_BX_LOCAL_V(8 * 8 * 2);
+ r15 = HS_BX_LOCAL_V(8 * 8 * 3);
+ r3 = HS_BX_LOCAL_V(8 * 8 * 4);
+ r14 = HS_BX_LOCAL_V(8 * 8 * 5);
+ r4 = HS_BX_LOCAL_V(8 * 8 * 6);
+ r13 = HS_BX_LOCAL_V(8 * 8 * 7);
+ r5 = HS_BX_LOCAL_V(8 * 8 * 8);
+ r12 = HS_BX_LOCAL_V(8 * 8 * 9);
+ r6 = HS_BX_LOCAL_V(8 * 8 * 10);
+ r11 = HS_BX_LOCAL_V(8 * 8 * 11);
+ r7 = HS_BX_LOCAL_V(8 * 8 * 12);
+ r10 = HS_BX_LOCAL_V(8 * 8 * 13);
+ r8 = HS_BX_LOCAL_V(8 * 8 * 14);
+ r9 = HS_BX_LOCAL_V(8 * 8 * 15);
+ // In-register cleanup: half stages at spans 4/2/1 plus the standard
+ // xchg cascade (same pattern repeats after each merge round).
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ // Round 2 (span 4): spill, barrier, 4-key local merges, barrier, reload.
+ HS_BX_LOCAL_V(8 * 8 * 0) = r1;
+ HS_BX_LOCAL_V(8 * 8 * 1) = r16;
+ HS_BX_LOCAL_V(8 * 8 * 2) = r2;
+ HS_BX_LOCAL_V(8 * 8 * 3) = r15;
+ HS_BX_LOCAL_V(8 * 8 * 4) = r3;
+ HS_BX_LOCAL_V(8 * 8 * 5) = r14;
+ HS_BX_LOCAL_V(8 * 8 * 6) = r4;
+ HS_BX_LOCAL_V(8 * 8 * 7) = r13;
+ HS_BX_LOCAL_V(8 * 8 * 8) = r5;
+ HS_BX_LOCAL_V(8 * 8 * 9) = r12;
+ HS_BX_LOCAL_V(8 * 8 * 10) = r6;
+ HS_BX_LOCAL_V(8 * 8 * 11) = r11;
+ HS_BX_LOCAL_V(8 * 8 * 12) = r7;
+ HS_BX_LOCAL_V(8 * 8 * 13) = r10;
+ HS_BX_LOCAL_V(8 * 8 * 14) = r8;
+ HS_BX_LOCAL_V(8 * 8 * 15) = r9;
+ HS_BLOCK_BARRIER();
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24);
+ HS_CMP_XCHG(r0_2, r0_3);
+ HS_CMP_XCHG(r0_1, r0_4);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_L(8) = r0_2;
+ HS_SLAB_LOCAL_R(16) = r0_3;
+ HS_SLAB_LOCAL_R(24) = r0_4;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(40);
+ HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(48);
+ HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(56);
+ HS_CMP_XCHG(r1_2, r1_3);
+ HS_CMP_XCHG(r1_1, r1_4);
+ HS_CMP_XCHG(r1_3, r1_4);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_SLAB_LOCAL_L(32) = r1_1;
+ HS_SLAB_LOCAL_L(40) = r1_2;
+ HS_SLAB_LOCAL_R(48) = r1_3;
+ HS_SLAB_LOCAL_R(56) = r1_4;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(520);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(528);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(536);
+ HS_CMP_XCHG(r0_2, r0_3);
+ HS_CMP_XCHG(r0_1, r0_4);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(512) = r0_1;
+ HS_SLAB_LOCAL_L(520) = r0_2;
+ HS_SLAB_LOCAL_R(528) = r0_3;
+ HS_SLAB_LOCAL_R(536) = r0_4;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(544);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(552);
+ HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(560);
+ HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(568);
+ HS_CMP_XCHG(r1_2, r1_3);
+ HS_CMP_XCHG(r1_1, r1_4);
+ HS_CMP_XCHG(r1_3, r1_4);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_SLAB_LOCAL_L(544) = r1_1;
+ HS_SLAB_LOCAL_L(552) = r1_2;
+ HS_SLAB_LOCAL_R(560) = r1_3;
+ HS_SLAB_LOCAL_R(568) = r1_4;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ r1 = HS_BX_LOCAL_V(8 * 8 * 0);
+ r16 = HS_BX_LOCAL_V(8 * 8 * 1);
+ r2 = HS_BX_LOCAL_V(8 * 8 * 2);
+ r15 = HS_BX_LOCAL_V(8 * 8 * 3);
+ r3 = HS_BX_LOCAL_V(8 * 8 * 4);
+ r14 = HS_BX_LOCAL_V(8 * 8 * 5);
+ r4 = HS_BX_LOCAL_V(8 * 8 * 6);
+ r13 = HS_BX_LOCAL_V(8 * 8 * 7);
+ r5 = HS_BX_LOCAL_V(8 * 8 * 8);
+ r12 = HS_BX_LOCAL_V(8 * 8 * 9);
+ r6 = HS_BX_LOCAL_V(8 * 8 * 10);
+ r11 = HS_BX_LOCAL_V(8 * 8 * 11);
+ r7 = HS_BX_LOCAL_V(8 * 8 * 12);
+ r10 = HS_BX_LOCAL_V(8 * 8 * 13);
+ r8 = HS_BX_LOCAL_V(8 * 8 * 14);
+ r9 = HS_BX_LOCAL_V(8 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ // Round 3 (span 8): spill, barrier, 8-key local merges, barrier, reload.
+ HS_BX_LOCAL_V(8 * 8 * 0) = r1;
+ HS_BX_LOCAL_V(8 * 8 * 1) = r16;
+ HS_BX_LOCAL_V(8 * 8 * 2) = r2;
+ HS_BX_LOCAL_V(8 * 8 * 3) = r15;
+ HS_BX_LOCAL_V(8 * 8 * 4) = r3;
+ HS_BX_LOCAL_V(8 * 8 * 5) = r14;
+ HS_BX_LOCAL_V(8 * 8 * 6) = r4;
+ HS_BX_LOCAL_V(8 * 8 * 7) = r13;
+ HS_BX_LOCAL_V(8 * 8 * 8) = r5;
+ HS_BX_LOCAL_V(8 * 8 * 9) = r12;
+ HS_BX_LOCAL_V(8 * 8 * 10) = r6;
+ HS_BX_LOCAL_V(8 * 8 * 11) = r11;
+ HS_BX_LOCAL_V(8 * 8 * 12) = r7;
+ HS_BX_LOCAL_V(8 * 8 * 13) = r10;
+ HS_BX_LOCAL_V(8 * 8 * 14) = r8;
+ HS_BX_LOCAL_V(8 * 8 * 15) = r9;
+ HS_BLOCK_BARRIER();
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24);
+ HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(32);
+ HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(40);
+ HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(48);
+ HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(56);
+ HS_CMP_XCHG(r0_4, r0_5);
+ HS_CMP_XCHG(r0_3, r0_6);
+ HS_CMP_XCHG(r0_2, r0_7);
+ HS_CMP_XCHG(r0_1, r0_8);
+ HS_CMP_XCHG(r0_5, r0_7);
+ HS_CMP_XCHG(r0_6, r0_8);
+ HS_CMP_XCHG(r0_5, r0_6);
+ HS_CMP_XCHG(r0_7, r0_8);
+ HS_CMP_XCHG(r0_1, r0_3);
+ HS_CMP_XCHG(r0_2, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_L(8) = r0_2;
+ HS_SLAB_LOCAL_L(16) = r0_3;
+ HS_SLAB_LOCAL_L(24) = r0_4;
+ HS_SLAB_LOCAL_R(32) = r0_5;
+ HS_SLAB_LOCAL_R(40) = r0_6;
+ HS_SLAB_LOCAL_R(48) = r0_7;
+ HS_SLAB_LOCAL_R(56) = r0_8;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(520);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(528);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(536);
+ HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(544);
+ HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(552);
+ HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(560);
+ HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(568);
+ HS_CMP_XCHG(r0_4, r0_5);
+ HS_CMP_XCHG(r0_3, r0_6);
+ HS_CMP_XCHG(r0_2, r0_7);
+ HS_CMP_XCHG(r0_1, r0_8);
+ HS_CMP_XCHG(r0_5, r0_7);
+ HS_CMP_XCHG(r0_6, r0_8);
+ HS_CMP_XCHG(r0_5, r0_6);
+ HS_CMP_XCHG(r0_7, r0_8);
+ HS_CMP_XCHG(r0_1, r0_3);
+ HS_CMP_XCHG(r0_2, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_SLAB_LOCAL_L(512) = r0_1;
+ HS_SLAB_LOCAL_L(520) = r0_2;
+ HS_SLAB_LOCAL_L(528) = r0_3;
+ HS_SLAB_LOCAL_L(536) = r0_4;
+ HS_SLAB_LOCAL_R(544) = r0_5;
+ HS_SLAB_LOCAL_R(552) = r0_6;
+ HS_SLAB_LOCAL_R(560) = r0_7;
+ HS_SLAB_LOCAL_R(568) = r0_8;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ r1 = HS_BX_LOCAL_V(8 * 8 * 0);
+ r16 = HS_BX_LOCAL_V(8 * 8 * 1);
+ r2 = HS_BX_LOCAL_V(8 * 8 * 2);
+ r15 = HS_BX_LOCAL_V(8 * 8 * 3);
+ r3 = HS_BX_LOCAL_V(8 * 8 * 4);
+ r14 = HS_BX_LOCAL_V(8 * 8 * 5);
+ r4 = HS_BX_LOCAL_V(8 * 8 * 6);
+ r13 = HS_BX_LOCAL_V(8 * 8 * 7);
+ r5 = HS_BX_LOCAL_V(8 * 8 * 8);
+ r12 = HS_BX_LOCAL_V(8 * 8 * 9);
+ r6 = HS_BX_LOCAL_V(8 * 8 * 10);
+ r11 = HS_BX_LOCAL_V(8 * 8 * 11);
+ r7 = HS_BX_LOCAL_V(8 * 8 * 12);
+ r10 = HS_BX_LOCAL_V(8 * 8 * 13);
+ r8 = HS_BX_LOCAL_V(8 * 8 * 14);
+ r9 = HS_BX_LOCAL_V(8 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ // Write the fully merged slab back to global memory in key order.
+ HS_SLAB_GLOBAL_STORE(8, 0, r1);
+ HS_SLAB_GLOBAL_STORE(8, 1, r2);
+ HS_SLAB_GLOBAL_STORE(8, 2, r3);
+ HS_SLAB_GLOBAL_STORE(8, 3, r4);
+ HS_SLAB_GLOBAL_STORE(8, 4, r5);
+ HS_SLAB_GLOBAL_STORE(8, 5, r6);
+ HS_SLAB_GLOBAL_STORE(8, 6, r7);
+ HS_SLAB_GLOBAL_STORE(8, 7, r8);
+ HS_SLAB_GLOBAL_STORE(8, 8, r9);
+ HS_SLAB_GLOBAL_STORE(8, 9, r10);
+ HS_SLAB_GLOBAL_STORE(8, 10, r11);
+ HS_SLAB_GLOBAL_STORE(8, 11, r12);
+ HS_SLAB_GLOBAL_STORE(8, 12, r13);
+ HS_SLAB_GLOBAL_STORE(8, 13, r14);
+ HS_SLAB_GLOBAL_STORE(8, 14, r15);
+ HS_SLAB_GLOBAL_STORE(8, 15, r16);
+}
+
+HS_BS_KERNEL_PROTO(8, 4, 2)
+{
+ HS_BLOCK_LOCAL_MEM_DECL(32, 16);
+
+ HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+ HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 8, 0);
+ HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 8, 1);
+ HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 8, 2);
+ HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 8, 3);
+ HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 8, 4);
+ HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 8, 5);
+ HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 8, 6);
+ HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 8, 7);
+ HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8, 8);
+ HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 8, 9);
+ HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 8, 10);
+ HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 8, 11);
+ HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 8, 12);
+ HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 8, 13);
+ HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 8, 14);
+ HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 8, 15);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r6, r11);
+ HS_CMP_XCHG(r7, r10);
+ HS_CMP_XCHG(r4, r13);
+ HS_CMP_XCHG(r14, r15);
+ HS_CMP_XCHG(r8, r12);
+ HS_CMP_XCHG(r2, r3);
+ HS_CMP_XCHG(r5, r9);
+ HS_CMP_XCHG(r2, r5);
+ HS_CMP_XCHG(r8, r14);
+ HS_CMP_XCHG(r3, r9);
+ HS_CMP_XCHG(r12, r15);
+ HS_CMP_XCHG(r3, r5);
+ HS_CMP_XCHG(r6, r7);
+ HS_CMP_XCHG(r10, r11);
+ HS_CMP_XCHG(r12, r14);
+ HS_CMP_XCHG(r4, r9);
+ HS_CMP_XCHG(r8, r13);
+ HS_CMP_XCHG(r7, r9);
+ HS_CMP_XCHG(r11, r13);
+ HS_CMP_XCHG(r4, r6);
+ HS_CMP_XCHG(r8, r10);
+ HS_CMP_XCHG(r4, r5);
+ HS_CMP_XCHG(r6, r7);
+ HS_CMP_XCHG(r8, r9);
+ HS_CMP_XCHG(r10, r11);
+ HS_CMP_XCHG(r12, r13);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ {
+ HS_SLAB_FLIP_PREAMBLE(1);
+ HS_CMP_FLIP(0, r1, r16);
+ HS_CMP_FLIP(1, r2, r15);
+ HS_CMP_FLIP(2, r3, r14);
+ HS_CMP_FLIP(3, r4, r13);
+ HS_CMP_FLIP(4, r5, r12);
+ HS_CMP_FLIP(5, r6, r11);
+ HS_CMP_FLIP(6, r7, r10);
+ HS_CMP_FLIP(7, r8, r9);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ {
+ HS_SLAB_FLIP_PREAMBLE(3);
+ HS_CMP_FLIP(0, r1, r16);
+ HS_CMP_FLIP(1, r2, r15);
+ HS_CMP_FLIP(2, r3, r14);
+ HS_CMP_FLIP(3, r4, r13);
+ HS_CMP_FLIP(4, r5, r12);
+ HS_CMP_FLIP(5, r6, r11);
+ HS_CMP_FLIP(6, r7, r10);
+ HS_CMP_FLIP(7, r8, r9);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ {
+ HS_SLAB_FLIP_PREAMBLE(7);
+ HS_CMP_FLIP(0, r1, r16);
+ HS_CMP_FLIP(1, r2, r15);
+ HS_CMP_FLIP(2, r3, r14);
+ HS_CMP_FLIP(3, r4, r13);
+ HS_CMP_FLIP(4, r5, r12);
+ HS_CMP_FLIP(5, r6, r11);
+ HS_CMP_FLIP(6, r7, r10);
+ HS_CMP_FLIP(7, r8, r9);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_BS_MERGE_H_PREAMBLE(8, 4);
+ HS_BX_LOCAL_V(4 * 8 * 0) = r1;
+ HS_BX_LOCAL_V(4 * 8 * 1) = r16;
+ HS_BX_LOCAL_V(4 * 8 * 2) = r2;
+ HS_BX_LOCAL_V(4 * 8 * 3) = r15;
+ HS_BX_LOCAL_V(4 * 8 * 4) = r3;
+ HS_BX_LOCAL_V(4 * 8 * 5) = r14;
+ HS_BX_LOCAL_V(4 * 8 * 6) = r4;
+ HS_BX_LOCAL_V(4 * 8 * 7) = r13;
+ HS_BX_LOCAL_V(4 * 8 * 8) = r5;
+ HS_BX_LOCAL_V(4 * 8 * 9) = r12;
+ HS_BX_LOCAL_V(4 * 8 * 10) = r6;
+ HS_BX_LOCAL_V(4 * 8 * 11) = r11;
+ HS_BX_LOCAL_V(4 * 8 * 12) = r7;
+ HS_BX_LOCAL_V(4 * 8 * 13) = r10;
+ HS_BX_LOCAL_V(4 * 8 * 14) = r8;
+ HS_BX_LOCAL_V(4 * 8 * 15) = r9;
+ HS_BLOCK_BARRIER();
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_R(8) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_SLAB_LOCAL_L(16) = r1_1;
+ HS_SLAB_LOCAL_R(24) = r1_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(136);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(128) = r0_1;
+ HS_SLAB_LOCAL_R(136) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(144);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(152);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_SLAB_LOCAL_L(144) = r1_1;
+ HS_SLAB_LOCAL_R(152) = r1_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(264);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(256) = r0_1;
+ HS_SLAB_LOCAL_R(264) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(272);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(280);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_SLAB_LOCAL_L(272) = r1_1;
+ HS_SLAB_LOCAL_R(280) = r1_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(392);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(384) = r0_1;
+ HS_SLAB_LOCAL_R(392) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(400);
+ HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(408);
+ HS_CMP_XCHG(r1_1, r1_2);
+ HS_SLAB_LOCAL_L(400) = r1_1;
+ HS_SLAB_LOCAL_R(408) = r1_2;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ r1 = HS_BX_LOCAL_V(4 * 8 * 0);
+ r16 = HS_BX_LOCAL_V(4 * 8 * 1);
+ r2 = HS_BX_LOCAL_V(4 * 8 * 2);
+ r15 = HS_BX_LOCAL_V(4 * 8 * 3);
+ r3 = HS_BX_LOCAL_V(4 * 8 * 4);
+ r14 = HS_BX_LOCAL_V(4 * 8 * 5);
+ r4 = HS_BX_LOCAL_V(4 * 8 * 6);
+ r13 = HS_BX_LOCAL_V(4 * 8 * 7);
+ r5 = HS_BX_LOCAL_V(4 * 8 * 8);
+ r12 = HS_BX_LOCAL_V(4 * 8 * 9);
+ r6 = HS_BX_LOCAL_V(4 * 8 * 10);
+ r11 = HS_BX_LOCAL_V(4 * 8 * 11);
+ r7 = HS_BX_LOCAL_V(4 * 8 * 12);
+ r10 = HS_BX_LOCAL_V(4 * 8 * 13);
+ r8 = HS_BX_LOCAL_V(4 * 8 * 14);
+ r9 = HS_BX_LOCAL_V(4 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ HS_BX_LOCAL_V(4 * 8 * 0) = r1;
+ HS_BX_LOCAL_V(4 * 8 * 1) = r16;
+ HS_BX_LOCAL_V(4 * 8 * 2) = r2;
+ HS_BX_LOCAL_V(4 * 8 * 3) = r15;
+ HS_BX_LOCAL_V(4 * 8 * 4) = r3;
+ HS_BX_LOCAL_V(4 * 8 * 5) = r14;
+ HS_BX_LOCAL_V(4 * 8 * 6) = r4;
+ HS_BX_LOCAL_V(4 * 8 * 7) = r13;
+ HS_BX_LOCAL_V(4 * 8 * 8) = r5;
+ HS_BX_LOCAL_V(4 * 8 * 9) = r12;
+ HS_BX_LOCAL_V(4 * 8 * 10) = r6;
+ HS_BX_LOCAL_V(4 * 8 * 11) = r11;
+ HS_BX_LOCAL_V(4 * 8 * 12) = r7;
+ HS_BX_LOCAL_V(4 * 8 * 13) = r10;
+ HS_BX_LOCAL_V(4 * 8 * 14) = r8;
+ HS_BX_LOCAL_V(4 * 8 * 15) = r9;
+ HS_BLOCK_BARRIER();
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24);
+ HS_CMP_XCHG(r0_2, r0_3);
+ HS_CMP_XCHG(r0_1, r0_4);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_L(8) = r0_2;
+ HS_SLAB_LOCAL_R(16) = r0_3;
+ HS_SLAB_LOCAL_R(24) = r0_4;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(136);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(144);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(152);
+ HS_CMP_XCHG(r0_2, r0_3);
+ HS_CMP_XCHG(r0_1, r0_4);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(128) = r0_1;
+ HS_SLAB_LOCAL_L(136) = r0_2;
+ HS_SLAB_LOCAL_R(144) = r0_3;
+ HS_SLAB_LOCAL_R(152) = r0_4;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(264);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(272);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(280);
+ HS_CMP_XCHG(r0_2, r0_3);
+ HS_CMP_XCHG(r0_1, r0_4);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(256) = r0_1;
+ HS_SLAB_LOCAL_L(264) = r0_2;
+ HS_SLAB_LOCAL_R(272) = r0_3;
+ HS_SLAB_LOCAL_R(280) = r0_4;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384);
+ HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(392);
+ HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(400);
+ HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(408);
+ HS_CMP_XCHG(r0_2, r0_3);
+ HS_CMP_XCHG(r0_1, r0_4);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(384) = r0_1;
+ HS_SLAB_LOCAL_L(392) = r0_2;
+ HS_SLAB_LOCAL_R(400) = r0_3;
+ HS_SLAB_LOCAL_R(408) = r0_4;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ r1 = HS_BX_LOCAL_V(4 * 8 * 0);
+ r16 = HS_BX_LOCAL_V(4 * 8 * 1);
+ r2 = HS_BX_LOCAL_V(4 * 8 * 2);
+ r15 = HS_BX_LOCAL_V(4 * 8 * 3);
+ r3 = HS_BX_LOCAL_V(4 * 8 * 4);
+ r14 = HS_BX_LOCAL_V(4 * 8 * 5);
+ r4 = HS_BX_LOCAL_V(4 * 8 * 6);
+ r13 = HS_BX_LOCAL_V(4 * 8 * 7);
+ r5 = HS_BX_LOCAL_V(4 * 8 * 8);
+ r12 = HS_BX_LOCAL_V(4 * 8 * 9);
+ r6 = HS_BX_LOCAL_V(4 * 8 * 10);
+ r11 = HS_BX_LOCAL_V(4 * 8 * 11);
+ r7 = HS_BX_LOCAL_V(4 * 8 * 12);
+ r10 = HS_BX_LOCAL_V(4 * 8 * 13);
+ r8 = HS_BX_LOCAL_V(4 * 8 * 14);
+ r9 = HS_BX_LOCAL_V(4 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ HS_SLAB_GLOBAL_STORE(8, 0, r1);
+ HS_SLAB_GLOBAL_STORE(8, 1, r2);
+ HS_SLAB_GLOBAL_STORE(8, 2, r3);
+ HS_SLAB_GLOBAL_STORE(8, 3, r4);
+ HS_SLAB_GLOBAL_STORE(8, 4, r5);
+ HS_SLAB_GLOBAL_STORE(8, 5, r6);
+ HS_SLAB_GLOBAL_STORE(8, 6, r7);
+ HS_SLAB_GLOBAL_STORE(8, 7, r8);
+ HS_SLAB_GLOBAL_STORE(8, 8, r9);
+ HS_SLAB_GLOBAL_STORE(8, 9, r10);
+ HS_SLAB_GLOBAL_STORE(8, 10, r11);
+ HS_SLAB_GLOBAL_STORE(8, 11, r12);
+ HS_SLAB_GLOBAL_STORE(8, 12, r13);
+ HS_SLAB_GLOBAL_STORE(8, 13, r14);
+ HS_SLAB_GLOBAL_STORE(8, 14, r15);
+ HS_SLAB_GLOBAL_STORE(8, 15, r16);
+}
+
+// HotSort "block sort" (BS) kernel: 8-lane slabs, 2 slabs per workgroup.
+// NOTE(review): generated sorting-network code (emitted by the HotSort
+// generator); the HS_* macros are defined elsewhere (hs_cl_macros.h) --
+// do not hand-edit.
+HS_BS_KERNEL_PROTO(8, 2, 1)
+{
+  // scratch local memory used further below to merge the two slabs
+  HS_BLOCK_LOCAL_MEM_DECL(16, 16);
+
+  HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+  // load this lane's 16-key column of the slab from vin
+  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 8, 0);
+  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 8, 1);
+  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 8, 2);
+  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 8, 3);
+  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 8, 4);
+  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 8, 5);
+  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 8, 6);
+  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 8, 7);
+  HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8, 8);
+  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 8, 9);
+  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 8, 10);
+  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 8, 11);
+  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 8, 12);
+  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 8, 13);
+  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 8, 14);
+  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 8, 15);
+  // fixed compare-exchange network sorting the 16 in-register keys;
+  // statement order is significant -- generated, do not reorder
+  HS_CMP_XCHG(r1, r2);
+  HS_CMP_XCHG(r3, r4);
+  HS_CMP_XCHG(r5, r6);
+  HS_CMP_XCHG(r7, r8);
+  HS_CMP_XCHG(r9, r10);
+  HS_CMP_XCHG(r11, r12);
+  HS_CMP_XCHG(r13, r14);
+  HS_CMP_XCHG(r15, r16);
+  HS_CMP_XCHG(r1, r3);
+  HS_CMP_XCHG(r5, r7);
+  HS_CMP_XCHG(r9, r11);
+  HS_CMP_XCHG(r13, r15);
+  HS_CMP_XCHG(r2, r4);
+  HS_CMP_XCHG(r6, r8);
+  HS_CMP_XCHG(r10, r12);
+  HS_CMP_XCHG(r14, r16);
+  HS_CMP_XCHG(r1, r5);
+  HS_CMP_XCHG(r9, r13);
+  HS_CMP_XCHG(r2, r6);
+  HS_CMP_XCHG(r10, r14);
+  HS_CMP_XCHG(r3, r7);
+  HS_CMP_XCHG(r11, r15);
+  HS_CMP_XCHG(r4, r8);
+  HS_CMP_XCHG(r12, r16);
+  HS_CMP_XCHG(r1, r9);
+  HS_CMP_XCHG(r2, r10);
+  HS_CMP_XCHG(r3, r11);
+  HS_CMP_XCHG(r4, r12);
+  HS_CMP_XCHG(r5, r13);
+  HS_CMP_XCHG(r6, r14);
+  HS_CMP_XCHG(r7, r15);
+  HS_CMP_XCHG(r8, r16);
+  HS_CMP_XCHG(r6, r11);
+  HS_CMP_XCHG(r7, r10);
+  HS_CMP_XCHG(r4, r13);
+  HS_CMP_XCHG(r14, r15);
+  HS_CMP_XCHG(r8, r12);
+  HS_CMP_XCHG(r2, r3);
+  HS_CMP_XCHG(r5, r9);
+  HS_CMP_XCHG(r2, r5);
+  HS_CMP_XCHG(r8, r14);
+  HS_CMP_XCHG(r3, r9);
+  HS_CMP_XCHG(r12, r15);
+  HS_CMP_XCHG(r3, r5);
+  HS_CMP_XCHG(r6, r7);
+  HS_CMP_XCHG(r10, r11);
+  HS_CMP_XCHG(r12, r14);
+  HS_CMP_XCHG(r4, r9);
+  HS_CMP_XCHG(r8, r13);
+  HS_CMP_XCHG(r7, r9);
+  HS_CMP_XCHG(r11, r13);
+  HS_CMP_XCHG(r4, r6);
+  HS_CMP_XCHG(r8, r10);
+  HS_CMP_XCHG(r4, r5);
+  HS_CMP_XCHG(r6, r7);
+  HS_CMP_XCHG(r8, r9);
+  HS_CMP_XCHG(r10, r11);
+  HS_CMP_XCHG(r12, r13);
+  HS_CMP_XCHG(r7, r8);
+  HS_CMP_XCHG(r9, r10);
+  // cross-lane merge steps: HS_CMP_FLIP / HS_CMP_HALF compare registers
+  // against a partner lane chosen by the preamble macro (semantics
+  // defined in the hs_cl_macros.h headers -- opaque here)
+  {
+    HS_SLAB_FLIP_PREAMBLE(1);
+    HS_CMP_FLIP(0, r1, r16);
+    HS_CMP_FLIP(1, r2, r15);
+    HS_CMP_FLIP(2, r3, r14);
+    HS_CMP_FLIP(3, r4, r13);
+    HS_CMP_FLIP(4, r5, r12);
+    HS_CMP_FLIP(5, r6, r11);
+    HS_CMP_FLIP(6, r7, r10);
+    HS_CMP_FLIP(7, r8, r9);
+  }
+  HS_CMP_XCHG(r1, r9);
+  HS_CMP_XCHG(r5, r13);
+  HS_CMP_XCHG(r1, r5);
+  HS_CMP_XCHG(r9, r13);
+  HS_CMP_XCHG(r3, r11);
+  HS_CMP_XCHG(r7, r15);
+  HS_CMP_XCHG(r3, r7);
+  HS_CMP_XCHG(r11, r15);
+  HS_CMP_XCHG(r1, r3);
+  HS_CMP_XCHG(r5, r7);
+  HS_CMP_XCHG(r9, r11);
+  HS_CMP_XCHG(r13, r15);
+  HS_CMP_XCHG(r2, r10);
+  HS_CMP_XCHG(r6, r14);
+  HS_CMP_XCHG(r2, r6);
+  HS_CMP_XCHG(r10, r14);
+  HS_CMP_XCHG(r4, r12);
+  HS_CMP_XCHG(r8, r16);
+  HS_CMP_XCHG(r4, r8);
+  HS_CMP_XCHG(r12, r16);
+  HS_CMP_XCHG(r2, r4);
+  HS_CMP_XCHG(r6, r8);
+  HS_CMP_XCHG(r10, r12);
+  HS_CMP_XCHG(r14, r16);
+  HS_CMP_XCHG(r1, r2);
+  HS_CMP_XCHG(r3, r4);
+  HS_CMP_XCHG(r5, r6);
+  HS_CMP_XCHG(r7, r8);
+  HS_CMP_XCHG(r9, r10);
+  HS_CMP_XCHG(r11, r12);
+  HS_CMP_XCHG(r13, r14);
+  HS_CMP_XCHG(r15, r16);
+  {
+    HS_SLAB_FLIP_PREAMBLE(3);
+    HS_CMP_FLIP(0, r1, r16);
+    HS_CMP_FLIP(1, r2, r15);
+    HS_CMP_FLIP(2, r3, r14);
+    HS_CMP_FLIP(3, r4, r13);
+    HS_CMP_FLIP(4, r5, r12);
+    HS_CMP_FLIP(5, r6, r11);
+    HS_CMP_FLIP(6, r7, r10);
+    HS_CMP_FLIP(7, r8, r9);
+  }
+  {
+    HS_SLAB_HALF_PREAMBLE(1);
+    HS_CMP_HALF(0, r1);
+    HS_CMP_HALF(1, r2);
+    HS_CMP_HALF(2, r3);
+    HS_CMP_HALF(3, r4);
+    HS_CMP_HALF(4, r5);
+    HS_CMP_HALF(5, r6);
+    HS_CMP_HALF(6, r7);
+    HS_CMP_HALF(7, r8);
+    HS_CMP_HALF(8, r9);
+    HS_CMP_HALF(9, r10);
+    HS_CMP_HALF(10, r11);
+    HS_CMP_HALF(11, r12);
+    HS_CMP_HALF(12, r13);
+    HS_CMP_HALF(13, r14);
+    HS_CMP_HALF(14, r15);
+    HS_CMP_HALF(15, r16);
+  }
+  HS_CMP_XCHG(r1, r9);
+  HS_CMP_XCHG(r5, r13);
+  HS_CMP_XCHG(r1, r5);
+  HS_CMP_XCHG(r9, r13);
+  HS_CMP_XCHG(r3, r11);
+  HS_CMP_XCHG(r7, r15);
+  HS_CMP_XCHG(r3, r7);
+  HS_CMP_XCHG(r11, r15);
+  HS_CMP_XCHG(r1, r3);
+  HS_CMP_XCHG(r5, r7);
+  HS_CMP_XCHG(r9, r11);
+  HS_CMP_XCHG(r13, r15);
+  HS_CMP_XCHG(r2, r10);
+  HS_CMP_XCHG(r6, r14);
+  HS_CMP_XCHG(r2, r6);
+  HS_CMP_XCHG(r10, r14);
+  HS_CMP_XCHG(r4, r12);
+  HS_CMP_XCHG(r8, r16);
+  HS_CMP_XCHG(r4, r8);
+  HS_CMP_XCHG(r12, r16);
+  HS_CMP_XCHG(r2, r4);
+  HS_CMP_XCHG(r6, r8);
+  HS_CMP_XCHG(r10, r12);
+  HS_CMP_XCHG(r14, r16);
+  HS_CMP_XCHG(r1, r2);
+  HS_CMP_XCHG(r3, r4);
+  HS_CMP_XCHG(r5, r6);
+  HS_CMP_XCHG(r7, r8);
+  HS_CMP_XCHG(r9, r10);
+  HS_CMP_XCHG(r11, r12);
+  HS_CMP_XCHG(r13, r14);
+  HS_CMP_XCHG(r15, r16);
+  {
+    HS_SLAB_FLIP_PREAMBLE(7);
+    HS_CMP_FLIP(0, r1, r16);
+    HS_CMP_FLIP(1, r2, r15);
+    HS_CMP_FLIP(2, r3, r14);
+    HS_CMP_FLIP(3, r4, r13);
+    HS_CMP_FLIP(4, r5, r12);
+    HS_CMP_FLIP(5, r6, r11);
+    HS_CMP_FLIP(6, r7, r10);
+    HS_CMP_FLIP(7, r8, r9);
+  }
+  {
+    HS_SLAB_HALF_PREAMBLE(2);
+    HS_CMP_HALF(0, r1);
+    HS_CMP_HALF(1, r2);
+    HS_CMP_HALF(2, r3);
+    HS_CMP_HALF(3, r4);
+    HS_CMP_HALF(4, r5);
+    HS_CMP_HALF(5, r6);
+    HS_CMP_HALF(6, r7);
+    HS_CMP_HALF(7, r8);
+    HS_CMP_HALF(8, r9);
+    HS_CMP_HALF(9, r10);
+    HS_CMP_HALF(10, r11);
+    HS_CMP_HALF(11, r12);
+    HS_CMP_HALF(12, r13);
+    HS_CMP_HALF(13, r14);
+    HS_CMP_HALF(14, r15);
+    HS_CMP_HALF(15, r16);
+  }
+  {
+    HS_SLAB_HALF_PREAMBLE(1);
+    HS_CMP_HALF(0, r1);
+    HS_CMP_HALF(1, r2);
+    HS_CMP_HALF(2, r3);
+    HS_CMP_HALF(3, r4);
+    HS_CMP_HALF(4, r5);
+    HS_CMP_HALF(5, r6);
+    HS_CMP_HALF(6, r7);
+    HS_CMP_HALF(7, r8);
+    HS_CMP_HALF(8, r9);
+    HS_CMP_HALF(9, r10);
+    HS_CMP_HALF(10, r11);
+    HS_CMP_HALF(11, r12);
+    HS_CMP_HALF(12, r13);
+    HS_CMP_HALF(13, r14);
+    HS_CMP_HALF(14, r15);
+    HS_CMP_HALF(15, r16);
+  }
+  HS_CMP_XCHG(r1, r9);
+  HS_CMP_XCHG(r5, r13);
+  HS_CMP_XCHG(r1, r5);
+  HS_CMP_XCHG(r9, r13);
+  HS_CMP_XCHG(r3, r11);
+  HS_CMP_XCHG(r7, r15);
+  HS_CMP_XCHG(r3, r7);
+  HS_CMP_XCHG(r11, r15);
+  HS_CMP_XCHG(r1, r3);
+  HS_CMP_XCHG(r5, r7);
+  HS_CMP_XCHG(r9, r11);
+  HS_CMP_XCHG(r13, r15);
+  HS_CMP_XCHG(r2, r10);
+  HS_CMP_XCHG(r6, r14);
+  HS_CMP_XCHG(r2, r6);
+  HS_CMP_XCHG(r10, r14);
+  HS_CMP_XCHG(r4, r12);
+  HS_CMP_XCHG(r8, r16);
+  HS_CMP_XCHG(r4, r8);
+  HS_CMP_XCHG(r12, r16);
+  HS_CMP_XCHG(r2, r4);
+  HS_CMP_XCHG(r6, r8);
+  HS_CMP_XCHG(r10, r12);
+  HS_CMP_XCHG(r14, r16);
+  HS_CMP_XCHG(r1, r2);
+  HS_CMP_XCHG(r3, r4);
+  HS_CMP_XCHG(r5, r6);
+  HS_CMP_XCHG(r7, r8);
+  HS_CMP_XCHG(r9, r10);
+  HS_CMP_XCHG(r11, r12);
+  HS_CMP_XCHG(r13, r14);
+  HS_CMP_XCHG(r15, r16);
+  // merge the workgroup's 2 sorted slabs through local memory: registers
+  // are spilled in alternating order (r1, r16, r2, r15, ...), pairwise
+  // compare-exchanged below, then reloaded in the same order
+  HS_BS_MERGE_H_PREAMBLE(8, 2);
+  HS_BX_LOCAL_V(2 * 8 * 0) = r1;
+  HS_BX_LOCAL_V(2 * 8 * 1) = r16;
+  HS_BX_LOCAL_V(2 * 8 * 2) = r2;
+  HS_BX_LOCAL_V(2 * 8 * 3) = r15;
+  HS_BX_LOCAL_V(2 * 8 * 4) = r3;
+  HS_BX_LOCAL_V(2 * 8 * 5) = r14;
+  HS_BX_LOCAL_V(2 * 8 * 6) = r4;
+  HS_BX_LOCAL_V(2 * 8 * 7) = r13;
+  HS_BX_LOCAL_V(2 * 8 * 8) = r5;
+  HS_BX_LOCAL_V(2 * 8 * 9) = r12;
+  HS_BX_LOCAL_V(2 * 8 * 10) = r6;
+  HS_BX_LOCAL_V(2 * 8 * 11) = r11;
+  HS_BX_LOCAL_V(2 * 8 * 12) = r7;
+  HS_BX_LOCAL_V(2 * 8 * 13) = r10;
+  HS_BX_LOCAL_V(2 * 8 * 14) = r8;
+  HS_BX_LOCAL_V(2 * 8 * 15) = r9;
+  HS_BLOCK_BARRIER();
+  // pairwise L/R compare-exchange sweep over the spilled keys
+  {
+    {
+      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
+      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8);
+      HS_CMP_XCHG(r0_1, r0_2);
+      HS_SLAB_LOCAL_L(0) = r0_1;
+      HS_SLAB_LOCAL_R(8) = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(32);
+      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(40);
+      HS_CMP_XCHG(r0_1, r0_2);
+      HS_SLAB_LOCAL_L(32) = r0_1;
+      HS_SLAB_LOCAL_R(40) = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(64);
+      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(72);
+      HS_CMP_XCHG(r0_1, r0_2);
+      HS_SLAB_LOCAL_L(64) = r0_1;
+      HS_SLAB_LOCAL_R(72) = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(96);
+      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(104);
+      HS_CMP_XCHG(r0_1, r0_2);
+      HS_SLAB_LOCAL_L(96) = r0_1;
+      HS_SLAB_LOCAL_R(104) = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128);
+      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(136);
+      HS_CMP_XCHG(r0_1, r0_2);
+      HS_SLAB_LOCAL_L(128) = r0_1;
+      HS_SLAB_LOCAL_R(136) = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(160);
+      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(168);
+      HS_CMP_XCHG(r0_1, r0_2);
+      HS_SLAB_LOCAL_L(160) = r0_1;
+      HS_SLAB_LOCAL_R(168) = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(192);
+      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(200);
+      HS_CMP_XCHG(r0_1, r0_2);
+      HS_SLAB_LOCAL_L(192) = r0_1;
+      HS_SLAB_LOCAL_R(200) = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(224);
+      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(232);
+      HS_CMP_XCHG(r0_1, r0_2);
+      HS_SLAB_LOCAL_L(224) = r0_1;
+      HS_SLAB_LOCAL_R(232) = r0_2;
+    }
+  }
+  HS_BLOCK_BARRIER();
+  r1 = HS_BX_LOCAL_V(2 * 8 * 0);
+  r16 = HS_BX_LOCAL_V(2 * 8 * 1);
+  r2 = HS_BX_LOCAL_V(2 * 8 * 2);
+  r15 = HS_BX_LOCAL_V(2 * 8 * 3);
+  r3 = HS_BX_LOCAL_V(2 * 8 * 4);
+  r14 = HS_BX_LOCAL_V(2 * 8 * 5);
+  r4 = HS_BX_LOCAL_V(2 * 8 * 6);
+  r13 = HS_BX_LOCAL_V(2 * 8 * 7);
+  r5 = HS_BX_LOCAL_V(2 * 8 * 8);
+  r12 = HS_BX_LOCAL_V(2 * 8 * 9);
+  r6 = HS_BX_LOCAL_V(2 * 8 * 10);
+  r11 = HS_BX_LOCAL_V(2 * 8 * 11);
+  r7 = HS_BX_LOCAL_V(2 * 8 * 12);
+  r10 = HS_BX_LOCAL_V(2 * 8 * 13);
+  r8 = HS_BX_LOCAL_V(2 * 8 * 14);
+  r9 = HS_BX_LOCAL_V(2 * 8 * 15);
+  // final clean-up merge of the reloaded keys within each slab
+  {
+    {
+      HS_SLAB_HALF_PREAMBLE(4);
+      HS_CMP_HALF(0, r1);
+      HS_CMP_HALF(1, r2);
+      HS_CMP_HALF(2, r3);
+      HS_CMP_HALF(3, r4);
+      HS_CMP_HALF(4, r5);
+      HS_CMP_HALF(5, r6);
+      HS_CMP_HALF(6, r7);
+      HS_CMP_HALF(7, r8);
+      HS_CMP_HALF(8, r9);
+      HS_CMP_HALF(9, r10);
+      HS_CMP_HALF(10, r11);
+      HS_CMP_HALF(11, r12);
+      HS_CMP_HALF(12, r13);
+      HS_CMP_HALF(13, r14);
+      HS_CMP_HALF(14, r15);
+      HS_CMP_HALF(15, r16);
+    }
+    {
+      HS_SLAB_HALF_PREAMBLE(2);
+      HS_CMP_HALF(0, r1);
+      HS_CMP_HALF(1, r2);
+      HS_CMP_HALF(2, r3);
+      HS_CMP_HALF(3, r4);
+      HS_CMP_HALF(4, r5);
+      HS_CMP_HALF(5, r6);
+      HS_CMP_HALF(6, r7);
+      HS_CMP_HALF(7, r8);
+      HS_CMP_HALF(8, r9);
+      HS_CMP_HALF(9, r10);
+      HS_CMP_HALF(10, r11);
+      HS_CMP_HALF(11, r12);
+      HS_CMP_HALF(12, r13);
+      HS_CMP_HALF(13, r14);
+      HS_CMP_HALF(14, r15);
+      HS_CMP_HALF(15, r16);
+    }
+    {
+      HS_SLAB_HALF_PREAMBLE(1);
+      HS_CMP_HALF(0, r1);
+      HS_CMP_HALF(1, r2);
+      HS_CMP_HALF(2, r3);
+      HS_CMP_HALF(3, r4);
+      HS_CMP_HALF(4, r5);
+      HS_CMP_HALF(5, r6);
+      HS_CMP_HALF(6, r7);
+      HS_CMP_HALF(7, r8);
+      HS_CMP_HALF(8, r9);
+      HS_CMP_HALF(9, r10);
+      HS_CMP_HALF(10, r11);
+      HS_CMP_HALF(11, r12);
+      HS_CMP_HALF(12, r13);
+      HS_CMP_HALF(13, r14);
+      HS_CMP_HALF(14, r15);
+      HS_CMP_HALF(15, r16);
+    }
+    HS_CMP_XCHG(r1, r9);
+    HS_CMP_XCHG(r5, r13);
+    HS_CMP_XCHG(r1, r5);
+    HS_CMP_XCHG(r9, r13);
+    HS_CMP_XCHG(r3, r11);
+    HS_CMP_XCHG(r7, r15);
+    HS_CMP_XCHG(r3, r7);
+    HS_CMP_XCHG(r11, r15);
+    HS_CMP_XCHG(r1, r3);
+    HS_CMP_XCHG(r5, r7);
+    HS_CMP_XCHG(r9, r11);
+    HS_CMP_XCHG(r13, r15);
+    HS_CMP_XCHG(r2, r10);
+    HS_CMP_XCHG(r6, r14);
+    HS_CMP_XCHG(r2, r6);
+    HS_CMP_XCHG(r10, r14);
+    HS_CMP_XCHG(r4, r12);
+    HS_CMP_XCHG(r8, r16);
+    HS_CMP_XCHG(r4, r8);
+    HS_CMP_XCHG(r12, r16);
+    HS_CMP_XCHG(r2, r4);
+    HS_CMP_XCHG(r6, r8);
+    HS_CMP_XCHG(r10, r12);
+    HS_CMP_XCHG(r14, r16);
+    HS_CMP_XCHG(r1, r2);
+    HS_CMP_XCHG(r3, r4);
+    HS_CMP_XCHG(r5, r6);
+    HS_CMP_XCHG(r7, r8);
+    HS_CMP_XCHG(r9, r10);
+    HS_CMP_XCHG(r11, r12);
+    HS_CMP_XCHG(r13, r14);
+    HS_CMP_XCHG(r15, r16);
+  }
+  // store the lane's sorted 16-key column back to global memory
+  HS_SLAB_GLOBAL_STORE(8, 0, r1);
+  HS_SLAB_GLOBAL_STORE(8, 1, r2);
+  HS_SLAB_GLOBAL_STORE(8, 2, r3);
+  HS_SLAB_GLOBAL_STORE(8, 3, r4);
+  HS_SLAB_GLOBAL_STORE(8, 4, r5);
+  HS_SLAB_GLOBAL_STORE(8, 5, r6);
+  HS_SLAB_GLOBAL_STORE(8, 6, r7);
+  HS_SLAB_GLOBAL_STORE(8, 7, r8);
+  HS_SLAB_GLOBAL_STORE(8, 8, r9);
+  HS_SLAB_GLOBAL_STORE(8, 9, r10);
+  HS_SLAB_GLOBAL_STORE(8, 10, r11);
+  HS_SLAB_GLOBAL_STORE(8, 11, r12);
+  HS_SLAB_GLOBAL_STORE(8, 12, r13);
+  HS_SLAB_GLOBAL_STORE(8, 13, r14);
+  HS_SLAB_GLOBAL_STORE(8, 14, r15);
+  HS_SLAB_GLOBAL_STORE(8, 15, r16);
+}
+
+// HotSort "block sort" (BS) kernel: 8-lane slabs, 1 slab per workgroup.
+// Single-slab variant: no local memory and no inter-slab merge phase
+// (contrast with the 2-slab kernel above).  Generated sorting-network
+// code -- do not hand-edit.
+HS_BS_KERNEL_PROTO(8, 1, 0)
+{
+  HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+  // load this lane's 16-key column of the slab from vin
+  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 8, 0);
+  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 8, 1);
+  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 8, 2);
+  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 8, 3);
+  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 8, 4);
+  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 8, 5);
+  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 8, 6);
+  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 8, 7);
+  HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8, 8);
+  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 8, 9);
+  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 8, 10);
+  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 8, 11);
+  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 8, 12);
+  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 8, 13);
+  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 8, 14);
+  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 8, 15);
+  // fixed compare-exchange network sorting the 16 in-register keys;
+  // statement order is significant -- generated, do not reorder
+  HS_CMP_XCHG(r1, r2);
+  HS_CMP_XCHG(r3, r4);
+  HS_CMP_XCHG(r5, r6);
+  HS_CMP_XCHG(r7, r8);
+  HS_CMP_XCHG(r9, r10);
+  HS_CMP_XCHG(r11, r12);
+  HS_CMP_XCHG(r13, r14);
+  HS_CMP_XCHG(r15, r16);
+  HS_CMP_XCHG(r1, r3);
+  HS_CMP_XCHG(r5, r7);
+  HS_CMP_XCHG(r9, r11);
+  HS_CMP_XCHG(r13, r15);
+  HS_CMP_XCHG(r2, r4);
+  HS_CMP_XCHG(r6, r8);
+  HS_CMP_XCHG(r10, r12);
+  HS_CMP_XCHG(r14, r16);
+  HS_CMP_XCHG(r1, r5);
+  HS_CMP_XCHG(r9, r13);
+  HS_CMP_XCHG(r2, r6);
+  HS_CMP_XCHG(r10, r14);
+  HS_CMP_XCHG(r3, r7);
+  HS_CMP_XCHG(r11, r15);
+  HS_CMP_XCHG(r4, r8);
+  HS_CMP_XCHG(r12, r16);
+  HS_CMP_XCHG(r1, r9);
+  HS_CMP_XCHG(r2, r10);
+  HS_CMP_XCHG(r3, r11);
+  HS_CMP_XCHG(r4, r12);
+  HS_CMP_XCHG(r5, r13);
+  HS_CMP_XCHG(r6, r14);
+  HS_CMP_XCHG(r7, r15);
+  HS_CMP_XCHG(r8, r16);
+  HS_CMP_XCHG(r6, r11);
+  HS_CMP_XCHG(r7, r10);
+  HS_CMP_XCHG(r4, r13);
+  HS_CMP_XCHG(r14, r15);
+  HS_CMP_XCHG(r8, r12);
+  HS_CMP_XCHG(r2, r3);
+  HS_CMP_XCHG(r5, r9);
+  HS_CMP_XCHG(r2, r5);
+  HS_CMP_XCHG(r8, r14);
+  HS_CMP_XCHG(r3, r9);
+  HS_CMP_XCHG(r12, r15);
+  HS_CMP_XCHG(r3, r5);
+  HS_CMP_XCHG(r6, r7);
+  HS_CMP_XCHG(r10, r11);
+  HS_CMP_XCHG(r12, r14);
+  HS_CMP_XCHG(r4, r9);
+  HS_CMP_XCHG(r8, r13);
+  HS_CMP_XCHG(r7, r9);
+  HS_CMP_XCHG(r11, r13);
+  HS_CMP_XCHG(r4, r6);
+  HS_CMP_XCHG(r8, r10);
+  HS_CMP_XCHG(r4, r5);
+  HS_CMP_XCHG(r6, r7);
+  HS_CMP_XCHG(r8, r9);
+  HS_CMP_XCHG(r10, r11);
+  HS_CMP_XCHG(r12, r13);
+  HS_CMP_XCHG(r7, r8);
+  HS_CMP_XCHG(r9, r10);
+  // cross-lane merge steps: HS_CMP_FLIP / HS_CMP_HALF pair registers
+  // with a partner lane chosen by the preamble macro (semantics defined
+  // in the hs_cl_macros.h headers -- opaque here)
+  {
+    HS_SLAB_FLIP_PREAMBLE(1);
+    HS_CMP_FLIP(0, r1, r16);
+    HS_CMP_FLIP(1, r2, r15);
+    HS_CMP_FLIP(2, r3, r14);
+    HS_CMP_FLIP(3, r4, r13);
+    HS_CMP_FLIP(4, r5, r12);
+    HS_CMP_FLIP(5, r6, r11);
+    HS_CMP_FLIP(6, r7, r10);
+    HS_CMP_FLIP(7, r8, r9);
+  }
+  HS_CMP_XCHG(r1, r9);
+  HS_CMP_XCHG(r5, r13);
+  HS_CMP_XCHG(r1, r5);
+  HS_CMP_XCHG(r9, r13);
+  HS_CMP_XCHG(r3, r11);
+  HS_CMP_XCHG(r7, r15);
+  HS_CMP_XCHG(r3, r7);
+  HS_CMP_XCHG(r11, r15);
+  HS_CMP_XCHG(r1, r3);
+  HS_CMP_XCHG(r5, r7);
+  HS_CMP_XCHG(r9, r11);
+  HS_CMP_XCHG(r13, r15);
+  HS_CMP_XCHG(r2, r10);
+  HS_CMP_XCHG(r6, r14);
+  HS_CMP_XCHG(r2, r6);
+  HS_CMP_XCHG(r10, r14);
+  HS_CMP_XCHG(r4, r12);
+  HS_CMP_XCHG(r8, r16);
+  HS_CMP_XCHG(r4, r8);
+  HS_CMP_XCHG(r12, r16);
+  HS_CMP_XCHG(r2, r4);
+  HS_CMP_XCHG(r6, r8);
+  HS_CMP_XCHG(r10, r12);
+  HS_CMP_XCHG(r14, r16);
+  HS_CMP_XCHG(r1, r2);
+  HS_CMP_XCHG(r3, r4);
+  HS_CMP_XCHG(r5, r6);
+  HS_CMP_XCHG(r7, r8);
+  HS_CMP_XCHG(r9, r10);
+  HS_CMP_XCHG(r11, r12);
+  HS_CMP_XCHG(r13, r14);
+  HS_CMP_XCHG(r15, r16);
+  {
+    HS_SLAB_FLIP_PREAMBLE(3);
+    HS_CMP_FLIP(0, r1, r16);
+    HS_CMP_FLIP(1, r2, r15);
+    HS_CMP_FLIP(2, r3, r14);
+    HS_CMP_FLIP(3, r4, r13);
+    HS_CMP_FLIP(4, r5, r12);
+    HS_CMP_FLIP(5, r6, r11);
+    HS_CMP_FLIP(6, r7, r10);
+    HS_CMP_FLIP(7, r8, r9);
+  }
+  {
+    HS_SLAB_HALF_PREAMBLE(1);
+    HS_CMP_HALF(0, r1);
+    HS_CMP_HALF(1, r2);
+    HS_CMP_HALF(2, r3);
+    HS_CMP_HALF(3, r4);
+    HS_CMP_HALF(4, r5);
+    HS_CMP_HALF(5, r6);
+    HS_CMP_HALF(6, r7);
+    HS_CMP_HALF(7, r8);
+    HS_CMP_HALF(8, r9);
+    HS_CMP_HALF(9, r10);
+    HS_CMP_HALF(10, r11);
+    HS_CMP_HALF(11, r12);
+    HS_CMP_HALF(12, r13);
+    HS_CMP_HALF(13, r14);
+    HS_CMP_HALF(14, r15);
+    HS_CMP_HALF(15, r16);
+  }
+  HS_CMP_XCHG(r1, r9);
+  HS_CMP_XCHG(r5, r13);
+  HS_CMP_XCHG(r1, r5);
+  HS_CMP_XCHG(r9, r13);
+  HS_CMP_XCHG(r3, r11);
+  HS_CMP_XCHG(r7, r15);
+  HS_CMP_XCHG(r3, r7);
+  HS_CMP_XCHG(r11, r15);
+  HS_CMP_XCHG(r1, r3);
+  HS_CMP_XCHG(r5, r7);
+  HS_CMP_XCHG(r9, r11);
+  HS_CMP_XCHG(r13, r15);
+  HS_CMP_XCHG(r2, r10);
+  HS_CMP_XCHG(r6, r14);
+  HS_CMP_XCHG(r2, r6);
+  HS_CMP_XCHG(r10, r14);
+  HS_CMP_XCHG(r4, r12);
+  HS_CMP_XCHG(r8, r16);
+  HS_CMP_XCHG(r4, r8);
+  HS_CMP_XCHG(r12, r16);
+  HS_CMP_XCHG(r2, r4);
+  HS_CMP_XCHG(r6, r8);
+  HS_CMP_XCHG(r10, r12);
+  HS_CMP_XCHG(r14, r16);
+  HS_CMP_XCHG(r1, r2);
+  HS_CMP_XCHG(r3, r4);
+  HS_CMP_XCHG(r5, r6);
+  HS_CMP_XCHG(r7, r8);
+  HS_CMP_XCHG(r9, r10);
+  HS_CMP_XCHG(r11, r12);
+  HS_CMP_XCHG(r13, r14);
+  HS_CMP_XCHG(r15, r16);
+  {
+    HS_SLAB_FLIP_PREAMBLE(7);
+    HS_CMP_FLIP(0, r1, r16);
+    HS_CMP_FLIP(1, r2, r15);
+    HS_CMP_FLIP(2, r3, r14);
+    HS_CMP_FLIP(3, r4, r13);
+    HS_CMP_FLIP(4, r5, r12);
+    HS_CMP_FLIP(5, r6, r11);
+    HS_CMP_FLIP(6, r7, r10);
+    HS_CMP_FLIP(7, r8, r9);
+  }
+  {
+    HS_SLAB_HALF_PREAMBLE(2);
+    HS_CMP_HALF(0, r1);
+    HS_CMP_HALF(1, r2);
+    HS_CMP_HALF(2, r3);
+    HS_CMP_HALF(3, r4);
+    HS_CMP_HALF(4, r5);
+    HS_CMP_HALF(5, r6);
+    HS_CMP_HALF(6, r7);
+    HS_CMP_HALF(7, r8);
+    HS_CMP_HALF(8, r9);
+    HS_CMP_HALF(9, r10);
+    HS_CMP_HALF(10, r11);
+    HS_CMP_HALF(11, r12);
+    HS_CMP_HALF(12, r13);
+    HS_CMP_HALF(13, r14);
+    HS_CMP_HALF(14, r15);
+    HS_CMP_HALF(15, r16);
+  }
+  {
+    HS_SLAB_HALF_PREAMBLE(1);
+    HS_CMP_HALF(0, r1);
+    HS_CMP_HALF(1, r2);
+    HS_CMP_HALF(2, r3);
+    HS_CMP_HALF(3, r4);
+    HS_CMP_HALF(4, r5);
+    HS_CMP_HALF(5, r6);
+    HS_CMP_HALF(6, r7);
+    HS_CMP_HALF(7, r8);
+    HS_CMP_HALF(8, r9);
+    HS_CMP_HALF(9, r10);
+    HS_CMP_HALF(10, r11);
+    HS_CMP_HALF(11, r12);
+    HS_CMP_HALF(12, r13);
+    HS_CMP_HALF(13, r14);
+    HS_CMP_HALF(14, r15);
+    HS_CMP_HALF(15, r16);
+  }
+  HS_CMP_XCHG(r1, r9);
+  HS_CMP_XCHG(r5, r13);
+  HS_CMP_XCHG(r1, r5);
+  HS_CMP_XCHG(r9, r13);
+  HS_CMP_XCHG(r3, r11);
+  HS_CMP_XCHG(r7, r15);
+  HS_CMP_XCHG(r3, r7);
+  HS_CMP_XCHG(r11, r15);
+  HS_CMP_XCHG(r1, r3);
+  HS_CMP_XCHG(r5, r7);
+  HS_CMP_XCHG(r9, r11);
+  HS_CMP_XCHG(r13, r15);
+  HS_CMP_XCHG(r2, r10);
+  HS_CMP_XCHG(r6, r14);
+  HS_CMP_XCHG(r2, r6);
+  HS_CMP_XCHG(r10, r14);
+  HS_CMP_XCHG(r4, r12);
+  HS_CMP_XCHG(r8, r16);
+  HS_CMP_XCHG(r4, r8);
+  HS_CMP_XCHG(r12, r16);
+  HS_CMP_XCHG(r2, r4);
+  HS_CMP_XCHG(r6, r8);
+  HS_CMP_XCHG(r10, r12);
+  HS_CMP_XCHG(r14, r16);
+  HS_CMP_XCHG(r1, r2);
+  HS_CMP_XCHG(r3, r4);
+  HS_CMP_XCHG(r5, r6);
+  HS_CMP_XCHG(r7, r8);
+  HS_CMP_XCHG(r9, r10);
+  HS_CMP_XCHG(r11, r12);
+  HS_CMP_XCHG(r13, r14);
+  HS_CMP_XCHG(r15, r16);
+  // store the lane's sorted 16-key column back to global memory
+  HS_SLAB_GLOBAL_STORE(8, 0, r1);
+  HS_SLAB_GLOBAL_STORE(8, 1, r2);
+  HS_SLAB_GLOBAL_STORE(8, 2, r3);
+  HS_SLAB_GLOBAL_STORE(8, 3, r4);
+  HS_SLAB_GLOBAL_STORE(8, 4, r5);
+  HS_SLAB_GLOBAL_STORE(8, 5, r6);
+  HS_SLAB_GLOBAL_STORE(8, 6, r7);
+  HS_SLAB_GLOBAL_STORE(8, 7, r8);
+  HS_SLAB_GLOBAL_STORE(8, 8, r9);
+  HS_SLAB_GLOBAL_STORE(8, 9, r10);
+  HS_SLAB_GLOBAL_STORE(8, 10, r11);
+  HS_SLAB_GLOBAL_STORE(8, 11, r12);
+  HS_SLAB_GLOBAL_STORE(8, 12, r13);
+  HS_SLAB_GLOBAL_STORE(8, 13, r14);
+  HS_SLAB_GLOBAL_STORE(8, 14, r15);
+  HS_SLAB_GLOBAL_STORE(8, 15, r16);
+}
+
+// HotSort "block clean" (BC) kernel: 8-lane slabs, 16 slabs per
+// workgroup.  Presumably run after a merge pass to clean each slab back
+// to fully-sorted order -- TODO confirm against hs_cl_launcher.c.
+// Generated sorting-network code -- do not hand-edit.
+HS_BC_KERNEL_PROTO(8, 16, 4)
+{
+  // local memory staging area for redistributing keys between lanes
+  HS_BLOCK_LOCAL_MEM_DECL(128, 16);
+
+  HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+  HS_BC_MERGE_H_PREAMBLE(8, 16, 16);
+  // gather 16 strided keys from global memory, run a 16-key
+  // compare-exchange network on them, and spill the result to local
+  // memory for the per-slab clean below
+  {
+    {
+      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 0);
+      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 16);
+      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 32);
+      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 48);
+      HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(8, 64);
+      HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(8, 80);
+      HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(8, 96);
+      HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(8, 112);
+      HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(8, 128);
+      HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(8, 144);
+      HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(8, 160);
+      HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(8, 176);
+      HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(8, 192);
+      HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(8, 208);
+      HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(8, 224);
+      HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(8, 240);
+      HS_CMP_XCHG(r0_1, r0_9);
+      HS_CMP_XCHG(r0_5, r0_13);
+      HS_CMP_XCHG(r0_1, r0_5);
+      HS_CMP_XCHG(r0_9, r0_13);
+      HS_CMP_XCHG(r0_3, r0_11);
+      HS_CMP_XCHG(r0_7, r0_15);
+      HS_CMP_XCHG(r0_3, r0_7);
+      HS_CMP_XCHG(r0_11, r0_15);
+      HS_CMP_XCHG(r0_1, r0_3);
+      HS_CMP_XCHG(r0_5, r0_7);
+      HS_CMP_XCHG(r0_9, r0_11);
+      HS_CMP_XCHG(r0_13, r0_15);
+      HS_CMP_XCHG(r0_2, r0_10);
+      HS_CMP_XCHG(r0_6, r0_14);
+      HS_CMP_XCHG(r0_2, r0_6);
+      HS_CMP_XCHG(r0_10, r0_14);
+      HS_CMP_XCHG(r0_4, r0_12);
+      HS_CMP_XCHG(r0_8, r0_16);
+      HS_CMP_XCHG(r0_4, r0_8);
+      HS_CMP_XCHG(r0_12, r0_16);
+      HS_CMP_XCHG(r0_2, r0_4);
+      HS_CMP_XCHG(r0_6, r0_8);
+      HS_CMP_XCHG(r0_10, r0_12);
+      HS_CMP_XCHG(r0_14, r0_16);
+      HS_CMP_XCHG(r0_1, r0_2);
+      HS_CMP_XCHG(r0_3, r0_4);
+      HS_CMP_XCHG(r0_5, r0_6);
+      HS_CMP_XCHG(r0_7, r0_8);
+      HS_CMP_XCHG(r0_9, r0_10);
+      HS_CMP_XCHG(r0_11, r0_12);
+      HS_CMP_XCHG(r0_13, r0_14);
+      HS_CMP_XCHG(r0_15, r0_16);
+      HS_SLAB_LOCAL_L(0) = r0_1;
+      HS_SLAB_LOCAL_L(8) = r0_2;
+      HS_SLAB_LOCAL_L(16) = r0_3;
+      HS_SLAB_LOCAL_L(24) = r0_4;
+      HS_SLAB_LOCAL_L(32) = r0_5;
+      HS_SLAB_LOCAL_L(40) = r0_6;
+      HS_SLAB_LOCAL_L(48) = r0_7;
+      HS_SLAB_LOCAL_L(56) = r0_8;
+      HS_SLAB_LOCAL_L(64) = r0_9;
+      HS_SLAB_LOCAL_L(72) = r0_10;
+      HS_SLAB_LOCAL_L(80) = r0_11;
+      HS_SLAB_LOCAL_L(88) = r0_12;
+      HS_SLAB_LOCAL_L(96) = r0_13;
+      HS_SLAB_LOCAL_L(104) = r0_14;
+      HS_SLAB_LOCAL_L(112) = r0_15;
+      HS_SLAB_LOCAL_L(120) = r0_16;
+    }
+  }
+  HS_BLOCK_BARRIER();
+  // reload the staged keys as this lane's 16-register slab column
+  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * 8 * 0);
+  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * 8 * 1);
+  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * 8 * 2);
+  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(16 * 8 * 3);
+  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * 8 * 4);
+  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * 8 * 5);
+  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * 8 * 6);
+  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * 8 * 7);
+  HS_KEY_TYPE r9 = HS_BX_LOCAL_V(16 * 8 * 8);
+  HS_KEY_TYPE r10 = HS_BX_LOCAL_V(16 * 8 * 9);
+  HS_KEY_TYPE r11 = HS_BX_LOCAL_V(16 * 8 * 10);
+  HS_KEY_TYPE r12 = HS_BX_LOCAL_V(16 * 8 * 11);
+  HS_KEY_TYPE r13 = HS_BX_LOCAL_V(16 * 8 * 12);
+  HS_KEY_TYPE r14 = HS_BX_LOCAL_V(16 * 8 * 13);
+  HS_KEY_TYPE r15 = HS_BX_LOCAL_V(16 * 8 * 14);
+  HS_KEY_TYPE r16 = HS_BX_LOCAL_V(16 * 8 * 15);
+  // per-slab clean: cross-lane HS_CMP_HALF merge steps followed by an
+  // in-register compare-exchange network (macro semantics elsewhere)
+  {
+    {
+      HS_SLAB_HALF_PREAMBLE(4);
+      HS_CMP_HALF(0, r1);
+      HS_CMP_HALF(1, r2);
+      HS_CMP_HALF(2, r3);
+      HS_CMP_HALF(3, r4);
+      HS_CMP_HALF(4, r5);
+      HS_CMP_HALF(5, r6);
+      HS_CMP_HALF(6, r7);
+      HS_CMP_HALF(7, r8);
+      HS_CMP_HALF(8, r9);
+      HS_CMP_HALF(9, r10);
+      HS_CMP_HALF(10, r11);
+      HS_CMP_HALF(11, r12);
+      HS_CMP_HALF(12, r13);
+      HS_CMP_HALF(13, r14);
+      HS_CMP_HALF(14, r15);
+      HS_CMP_HALF(15, r16);
+    }
+    {
+      HS_SLAB_HALF_PREAMBLE(2);
+      HS_CMP_HALF(0, r1);
+      HS_CMP_HALF(1, r2);
+      HS_CMP_HALF(2, r3);
+      HS_CMP_HALF(3, r4);
+      HS_CMP_HALF(4, r5);
+      HS_CMP_HALF(5, r6);
+      HS_CMP_HALF(6, r7);
+      HS_CMP_HALF(7, r8);
+      HS_CMP_HALF(8, r9);
+      HS_CMP_HALF(9, r10);
+      HS_CMP_HALF(10, r11);
+      HS_CMP_HALF(11, r12);
+      HS_CMP_HALF(12, r13);
+      HS_CMP_HALF(13, r14);
+      HS_CMP_HALF(14, r15);
+      HS_CMP_HALF(15, r16);
+    }
+    {
+      HS_SLAB_HALF_PREAMBLE(1);
+      HS_CMP_HALF(0, r1);
+      HS_CMP_HALF(1, r2);
+      HS_CMP_HALF(2, r3);
+      HS_CMP_HALF(3, r4);
+      HS_CMP_HALF(4, r5);
+      HS_CMP_HALF(5, r6);
+      HS_CMP_HALF(6, r7);
+      HS_CMP_HALF(7, r8);
+      HS_CMP_HALF(8, r9);
+      HS_CMP_HALF(9, r10);
+      HS_CMP_HALF(10, r11);
+      HS_CMP_HALF(11, r12);
+      HS_CMP_HALF(12, r13);
+      HS_CMP_HALF(13, r14);
+      HS_CMP_HALF(14, r15);
+      HS_CMP_HALF(15, r16);
+    }
+    HS_CMP_XCHG(r1, r9);
+    HS_CMP_XCHG(r5, r13);
+    HS_CMP_XCHG(r1, r5);
+    HS_CMP_XCHG(r9, r13);
+    HS_CMP_XCHG(r3, r11);
+    HS_CMP_XCHG(r7, r15);
+    HS_CMP_XCHG(r3, r7);
+    HS_CMP_XCHG(r11, r15);
+    HS_CMP_XCHG(r1, r3);
+    HS_CMP_XCHG(r5, r7);
+    HS_CMP_XCHG(r9, r11);
+    HS_CMP_XCHG(r13, r15);
+    HS_CMP_XCHG(r2, r10);
+    HS_CMP_XCHG(r6, r14);
+    HS_CMP_XCHG(r2, r6);
+    HS_CMP_XCHG(r10, r14);
+    HS_CMP_XCHG(r4, r12);
+    HS_CMP_XCHG(r8, r16);
+    HS_CMP_XCHG(r4, r8);
+    HS_CMP_XCHG(r12, r16);
+    HS_CMP_XCHG(r2, r4);
+    HS_CMP_XCHG(r6, r8);
+    HS_CMP_XCHG(r10, r12);
+    HS_CMP_XCHG(r14, r16);
+    HS_CMP_XCHG(r1, r2);
+    HS_CMP_XCHG(r3, r4);
+    HS_CMP_XCHG(r5, r6);
+    HS_CMP_XCHG(r7, r8);
+    HS_CMP_XCHG(r9, r10);
+    HS_CMP_XCHG(r11, r12);
+    HS_CMP_XCHG(r13, r14);
+    HS_CMP_XCHG(r15, r16);
+  }
+  // store the lane's sorted 16-key column back to global memory
+  HS_SLAB_GLOBAL_STORE(8, 0, r1);
+  HS_SLAB_GLOBAL_STORE(8, 1, r2);
+  HS_SLAB_GLOBAL_STORE(8, 2, r3);
+  HS_SLAB_GLOBAL_STORE(8, 3, r4);
+  HS_SLAB_GLOBAL_STORE(8, 4, r5);
+  HS_SLAB_GLOBAL_STORE(8, 5, r6);
+  HS_SLAB_GLOBAL_STORE(8, 6, r7);
+  HS_SLAB_GLOBAL_STORE(8, 7, r8);
+  HS_SLAB_GLOBAL_STORE(8, 8, r9);
+  HS_SLAB_GLOBAL_STORE(8, 9, r10);
+  HS_SLAB_GLOBAL_STORE(8, 10, r11);
+  HS_SLAB_GLOBAL_STORE(8, 11, r12);
+  HS_SLAB_GLOBAL_STORE(8, 12, r13);
+  HS_SLAB_GLOBAL_STORE(8, 13, r14);
+  HS_SLAB_GLOBAL_STORE(8, 14, r15);
+  HS_SLAB_GLOBAL_STORE(8, 15, r16);
+}
+
+HS_BC_KERNEL_PROTO(8, 8, 3)
+{
+ HS_BLOCK_LOCAL_MEM_DECL(64, 16);
+
+ HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+ HS_BC_MERGE_H_PREAMBLE(8, 16, 8);
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 0);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 16);
+ HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 32);
+ HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 48);
+ HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(8, 64);
+ HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(8, 80);
+ HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(8, 96);
+ HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(8, 112);
+ HS_CMP_XCHG(r0_1, r0_5);
+ HS_CMP_XCHG(r0_3, r0_7);
+ HS_CMP_XCHG(r0_1, r0_3);
+ HS_CMP_XCHG(r0_5, r0_7);
+ HS_CMP_XCHG(r0_2, r0_6);
+ HS_CMP_XCHG(r0_4, r0_8);
+ HS_CMP_XCHG(r0_2, r0_4);
+ HS_CMP_XCHG(r0_6, r0_8);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_CMP_XCHG(r0_5, r0_6);
+ HS_CMP_XCHG(r0_7, r0_8);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_L(8) = r0_2;
+ HS_SLAB_LOCAL_L(16) = r0_3;
+ HS_SLAB_LOCAL_L(24) = r0_4;
+ HS_SLAB_LOCAL_L(32) = r0_5;
+ HS_SLAB_LOCAL_L(40) = r0_6;
+ HS_SLAB_LOCAL_L(48) = r0_7;
+ HS_SLAB_LOCAL_L(56) = r0_8;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 8);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 24);
+ HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 40);
+ HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 56);
+ HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(8, 72);
+ HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(8, 88);
+ HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(8, 104);
+ HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(8, 120);
+ HS_CMP_XCHG(r0_1, r0_5);
+ HS_CMP_XCHG(r0_3, r0_7);
+ HS_CMP_XCHG(r0_1, r0_3);
+ HS_CMP_XCHG(r0_5, r0_7);
+ HS_CMP_XCHG(r0_2, r0_6);
+ HS_CMP_XCHG(r0_4, r0_8);
+ HS_CMP_XCHG(r0_2, r0_4);
+ HS_CMP_XCHG(r0_6, r0_8);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_CMP_XCHG(r0_5, r0_6);
+ HS_CMP_XCHG(r0_7, r0_8);
+ HS_SLAB_LOCAL_L(512) = r0_1;
+ HS_SLAB_LOCAL_L(520) = r0_2;
+ HS_SLAB_LOCAL_L(528) = r0_3;
+ HS_SLAB_LOCAL_L(536) = r0_4;
+ HS_SLAB_LOCAL_L(544) = r0_5;
+ HS_SLAB_LOCAL_L(552) = r0_6;
+ HS_SLAB_LOCAL_L(560) = r0_7;
+ HS_SLAB_LOCAL_L(568) = r0_8;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * 8 * 0);
+ HS_KEY_TYPE r2 = HS_BX_LOCAL_V(8 * 8 * 1);
+ HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * 8 * 2);
+ HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * 8 * 3);
+ HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * 8 * 4);
+ HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * 8 * 5);
+ HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * 8 * 6);
+ HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * 8 * 7);
+ HS_KEY_TYPE r9 = HS_BX_LOCAL_V(8 * 8 * 8);
+ HS_KEY_TYPE r10 = HS_BX_LOCAL_V(8 * 8 * 9);
+ HS_KEY_TYPE r11 = HS_BX_LOCAL_V(8 * 8 * 10);
+ HS_KEY_TYPE r12 = HS_BX_LOCAL_V(8 * 8 * 11);
+ HS_KEY_TYPE r13 = HS_BX_LOCAL_V(8 * 8 * 12);
+ HS_KEY_TYPE r14 = HS_BX_LOCAL_V(8 * 8 * 13);
+ HS_KEY_TYPE r15 = HS_BX_LOCAL_V(8 * 8 * 14);
+ HS_KEY_TYPE r16 = HS_BX_LOCAL_V(8 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ HS_SLAB_GLOBAL_STORE(8, 0, r1);
+ HS_SLAB_GLOBAL_STORE(8, 1, r2);
+ HS_SLAB_GLOBAL_STORE(8, 2, r3);
+ HS_SLAB_GLOBAL_STORE(8, 3, r4);
+ HS_SLAB_GLOBAL_STORE(8, 4, r5);
+ HS_SLAB_GLOBAL_STORE(8, 5, r6);
+ HS_SLAB_GLOBAL_STORE(8, 6, r7);
+ HS_SLAB_GLOBAL_STORE(8, 7, r8);
+ HS_SLAB_GLOBAL_STORE(8, 8, r9);
+ HS_SLAB_GLOBAL_STORE(8, 9, r10);
+ HS_SLAB_GLOBAL_STORE(8, 10, r11);
+ HS_SLAB_GLOBAL_STORE(8, 11, r12);
+ HS_SLAB_GLOBAL_STORE(8, 12, r13);
+ HS_SLAB_GLOBAL_STORE(8, 13, r14);
+ HS_SLAB_GLOBAL_STORE(8, 14, r15);
+ HS_SLAB_GLOBAL_STORE(8, 15, r16);
+}
+
+HS_BC_KERNEL_PROTO(8, 4, 2)
+{
+ HS_BLOCK_LOCAL_MEM_DECL(32, 16);
+
+ HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+ HS_BC_MERGE_H_PREAMBLE(8, 16, 4);
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 0);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 16);
+ HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 32);
+ HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 48);
+ HS_CMP_XCHG(r0_1, r0_3);
+ HS_CMP_XCHG(r0_2, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_L(8) = r0_2;
+ HS_SLAB_LOCAL_L(16) = r0_3;
+ HS_SLAB_LOCAL_L(24) = r0_4;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 4);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 20);
+ HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 36);
+ HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 52);
+ HS_CMP_XCHG(r0_1, r0_3);
+ HS_CMP_XCHG(r0_2, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_SLAB_LOCAL_L(128) = r0_1;
+ HS_SLAB_LOCAL_L(136) = r0_2;
+ HS_SLAB_LOCAL_L(144) = r0_3;
+ HS_SLAB_LOCAL_L(152) = r0_4;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 8);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 24);
+ HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 40);
+ HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 56);
+ HS_CMP_XCHG(r0_1, r0_3);
+ HS_CMP_XCHG(r0_2, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_SLAB_LOCAL_L(256) = r0_1;
+ HS_SLAB_LOCAL_L(264) = r0_2;
+ HS_SLAB_LOCAL_L(272) = r0_3;
+ HS_SLAB_LOCAL_L(280) = r0_4;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 12);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 28);
+ HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 44);
+ HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 60);
+ HS_CMP_XCHG(r0_1, r0_3);
+ HS_CMP_XCHG(r0_2, r0_4);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_CMP_XCHG(r0_3, r0_4);
+ HS_SLAB_LOCAL_L(384) = r0_1;
+ HS_SLAB_LOCAL_L(392) = r0_2;
+ HS_SLAB_LOCAL_L(400) = r0_3;
+ HS_SLAB_LOCAL_L(408) = r0_4;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * 8 * 0);
+ HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * 8 * 1);
+ HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * 8 * 2);
+ HS_KEY_TYPE r4 = HS_BX_LOCAL_V(4 * 8 * 3);
+ HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * 8 * 4);
+ HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * 8 * 5);
+ HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * 8 * 6);
+ HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * 8 * 7);
+ HS_KEY_TYPE r9 = HS_BX_LOCAL_V(4 * 8 * 8);
+ HS_KEY_TYPE r10 = HS_BX_LOCAL_V(4 * 8 * 9);
+ HS_KEY_TYPE r11 = HS_BX_LOCAL_V(4 * 8 * 10);
+ HS_KEY_TYPE r12 = HS_BX_LOCAL_V(4 * 8 * 11);
+ HS_KEY_TYPE r13 = HS_BX_LOCAL_V(4 * 8 * 12);
+ HS_KEY_TYPE r14 = HS_BX_LOCAL_V(4 * 8 * 13);
+ HS_KEY_TYPE r15 = HS_BX_LOCAL_V(4 * 8 * 14);
+ HS_KEY_TYPE r16 = HS_BX_LOCAL_V(4 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ HS_SLAB_GLOBAL_STORE(8, 0, r1);
+ HS_SLAB_GLOBAL_STORE(8, 1, r2);
+ HS_SLAB_GLOBAL_STORE(8, 2, r3);
+ HS_SLAB_GLOBAL_STORE(8, 3, r4);
+ HS_SLAB_GLOBAL_STORE(8, 4, r5);
+ HS_SLAB_GLOBAL_STORE(8, 5, r6);
+ HS_SLAB_GLOBAL_STORE(8, 6, r7);
+ HS_SLAB_GLOBAL_STORE(8, 7, r8);
+ HS_SLAB_GLOBAL_STORE(8, 8, r9);
+ HS_SLAB_GLOBAL_STORE(8, 9, r10);
+ HS_SLAB_GLOBAL_STORE(8, 10, r11);
+ HS_SLAB_GLOBAL_STORE(8, 11, r12);
+ HS_SLAB_GLOBAL_STORE(8, 12, r13);
+ HS_SLAB_GLOBAL_STORE(8, 13, r14);
+ HS_SLAB_GLOBAL_STORE(8, 14, r15);
+ HS_SLAB_GLOBAL_STORE(8, 15, r16);
+}
+
+HS_BC_KERNEL_PROTO(8, 2, 1)
+{
+ HS_BLOCK_LOCAL_MEM_DECL(16, 16);
+
+ HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+ HS_BC_MERGE_H_PREAMBLE(8, 16, 2);
+ {
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 0);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 16);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(0) = r0_1;
+ HS_SLAB_LOCAL_L(8) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 2);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 18);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(32) = r0_1;
+ HS_SLAB_LOCAL_L(40) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 4);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 20);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(64) = r0_1;
+ HS_SLAB_LOCAL_L(72) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 6);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 22);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(96) = r0_1;
+ HS_SLAB_LOCAL_L(104) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 8);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 24);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(128) = r0_1;
+ HS_SLAB_LOCAL_L(136) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 10);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 26);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(160) = r0_1;
+ HS_SLAB_LOCAL_L(168) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 12);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 28);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(192) = r0_1;
+ HS_SLAB_LOCAL_L(200) = r0_2;
+ }
+ {
+ HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 14);
+ HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 30);
+ HS_CMP_XCHG(r0_1, r0_2);
+ HS_SLAB_LOCAL_L(224) = r0_1;
+ HS_SLAB_LOCAL_L(232) = r0_2;
+ }
+ }
+ HS_BLOCK_BARRIER();
+ HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * 8 * 0);
+ HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * 8 * 1);
+ HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * 8 * 2);
+ HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * 8 * 3);
+ HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * 8 * 4);
+ HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * 8 * 5);
+ HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * 8 * 6);
+ HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * 8 * 7);
+ HS_KEY_TYPE r9 = HS_BX_LOCAL_V(2 * 8 * 8);
+ HS_KEY_TYPE r10 = HS_BX_LOCAL_V(2 * 8 * 9);
+ HS_KEY_TYPE r11 = HS_BX_LOCAL_V(2 * 8 * 10);
+ HS_KEY_TYPE r12 = HS_BX_LOCAL_V(2 * 8 * 11);
+ HS_KEY_TYPE r13 = HS_BX_LOCAL_V(2 * 8 * 12);
+ HS_KEY_TYPE r14 = HS_BX_LOCAL_V(2 * 8 * 13);
+ HS_KEY_TYPE r15 = HS_BX_LOCAL_V(2 * 8 * 14);
+ HS_KEY_TYPE r16 = HS_BX_LOCAL_V(2 * 8 * 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ HS_SLAB_GLOBAL_STORE(8, 0, r1);
+ HS_SLAB_GLOBAL_STORE(8, 1, r2);
+ HS_SLAB_GLOBAL_STORE(8, 2, r3);
+ HS_SLAB_GLOBAL_STORE(8, 3, r4);
+ HS_SLAB_GLOBAL_STORE(8, 4, r5);
+ HS_SLAB_GLOBAL_STORE(8, 5, r6);
+ HS_SLAB_GLOBAL_STORE(8, 6, r7);
+ HS_SLAB_GLOBAL_STORE(8, 7, r8);
+ HS_SLAB_GLOBAL_STORE(8, 8, r9);
+ HS_SLAB_GLOBAL_STORE(8, 9, r10);
+ HS_SLAB_GLOBAL_STORE(8, 10, r11);
+ HS_SLAB_GLOBAL_STORE(8, 11, r12);
+ HS_SLAB_GLOBAL_STORE(8, 12, r13);
+ HS_SLAB_GLOBAL_STORE(8, 13, r14);
+ HS_SLAB_GLOBAL_STORE(8, 14, r15);
+ HS_SLAB_GLOBAL_STORE(8, 15, r16);
+}
+
+HS_BC_KERNEL_PROTO(8, 1, 0)
+{
+ HS_SLAB_GLOBAL_PREAMBLE(8, 16);
+ HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 8, 0);
+ HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 8, 1);
+ HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 8, 2);
+ HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 8, 3);
+ HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 8, 4);
+ HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 8, 5);
+ HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 8, 6);
+ HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 8, 7);
+ HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8, 8);
+ HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 8, 9);
+ HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 8, 10);
+ HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 8, 11);
+ HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 8, 12);
+ HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 8, 13);
+ HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 8, 14);
+ HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 8, 15);
+ {
+ {
+ HS_SLAB_HALF_PREAMBLE(4);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(2);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ {
+ HS_SLAB_HALF_PREAMBLE(1);
+ HS_CMP_HALF(0, r1);
+ HS_CMP_HALF(1, r2);
+ HS_CMP_HALF(2, r3);
+ HS_CMP_HALF(3, r4);
+ HS_CMP_HALF(4, r5);
+ HS_CMP_HALF(5, r6);
+ HS_CMP_HALF(6, r7);
+ HS_CMP_HALF(7, r8);
+ HS_CMP_HALF(8, r9);
+ HS_CMP_HALF(9, r10);
+ HS_CMP_HALF(10, r11);
+ HS_CMP_HALF(11, r12);
+ HS_CMP_HALF(12, r13);
+ HS_CMP_HALF(13, r14);
+ HS_CMP_HALF(14, r15);
+ HS_CMP_HALF(15, r16);
+ }
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ }
+ HS_SLAB_GLOBAL_STORE(8, 0, r1);
+ HS_SLAB_GLOBAL_STORE(8, 1, r2);
+ HS_SLAB_GLOBAL_STORE(8, 2, r3);
+ HS_SLAB_GLOBAL_STORE(8, 3, r4);
+ HS_SLAB_GLOBAL_STORE(8, 4, r5);
+ HS_SLAB_GLOBAL_STORE(8, 5, r6);
+ HS_SLAB_GLOBAL_STORE(8, 6, r7);
+ HS_SLAB_GLOBAL_STORE(8, 7, r8);
+ HS_SLAB_GLOBAL_STORE(8, 8, r9);
+ HS_SLAB_GLOBAL_STORE(8, 9, r10);
+ HS_SLAB_GLOBAL_STORE(8, 10, r11);
+ HS_SLAB_GLOBAL_STORE(8, 11, r12);
+ HS_SLAB_GLOBAL_STORE(8, 12, r13);
+ HS_SLAB_GLOBAL_STORE(8, 13, r14);
+ HS_SLAB_GLOBAL_STORE(8, 14, r15);
+ HS_SLAB_GLOBAL_STORE(8, 15, r16);
+}
+
+HS_FM_KERNEL_PROTO(1, 4)
+{
+ HS_FM_PREAMBLE(16);
+ HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
+ HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
+ HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
+ HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
+ HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
+ HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
+ HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
+ HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
+ HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
+ HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
+ HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
+ HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
+ HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
+ HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
+ HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
+ HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
+ HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0);
+ HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1);
+ HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2);
+ HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3);
+ HS_KEY_TYPE r21 = HS_FM_GLOBAL_LOAD_R(4);
+ HS_KEY_TYPE r22 = HS_FM_GLOBAL_LOAD_R(5);
+ HS_KEY_TYPE r23 = HS_FM_GLOBAL_LOAD_R(6);
+ HS_KEY_TYPE r24 = HS_FM_GLOBAL_LOAD_R(7);
+ HS_KEY_TYPE r25 = HS_FM_GLOBAL_LOAD_R(8);
+ HS_KEY_TYPE r26 = HS_FM_GLOBAL_LOAD_R(9);
+ HS_KEY_TYPE r27 = HS_FM_GLOBAL_LOAD_R(10);
+ HS_KEY_TYPE r28 = HS_FM_GLOBAL_LOAD_R(11);
+ HS_KEY_TYPE r29 = HS_FM_GLOBAL_LOAD_R(12);
+ HS_KEY_TYPE r30 = HS_FM_GLOBAL_LOAD_R(13);
+ HS_KEY_TYPE r31 = HS_FM_GLOBAL_LOAD_R(14);
+ HS_KEY_TYPE r32 = HS_FM_GLOBAL_LOAD_R(15);
+ HS_CMP_XCHG(r16, r17);
+ HS_CMP_XCHG(r15, r18);
+ HS_CMP_XCHG(r14, r19);
+ HS_CMP_XCHG(r13, r20);
+ HS_CMP_XCHG(r12, r21);
+ HS_CMP_XCHG(r11, r22);
+ HS_CMP_XCHG(r10, r23);
+ HS_CMP_XCHG(r9, r24);
+ HS_CMP_XCHG(r8, r25);
+ HS_CMP_XCHG(r7, r26);
+ HS_CMP_XCHG(r6, r27);
+ HS_CMP_XCHG(r5, r28);
+ HS_CMP_XCHG(r4, r29);
+ HS_CMP_XCHG(r3, r30);
+ HS_CMP_XCHG(r2, r31);
+ HS_CMP_XCHG(r1, r32);
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_CMP_XCHG(r17, r25);
+ HS_CMP_XCHG(r21, r29);
+ HS_CMP_XCHG(r17, r21);
+ HS_CMP_XCHG(r25, r29);
+ HS_CMP_XCHG(r19, r27);
+ HS_CMP_XCHG(r23, r31);
+ HS_CMP_XCHG(r19, r23);
+ HS_CMP_XCHG(r27, r31);
+ HS_CMP_XCHG(r17, r19);
+ HS_CMP_XCHG(r21, r23);
+ HS_CMP_XCHG(r25, r27);
+ HS_CMP_XCHG(r29, r31);
+ HS_CMP_XCHG(r18, r26);
+ HS_CMP_XCHG(r22, r30);
+ HS_CMP_XCHG(r18, r22);
+ HS_CMP_XCHG(r26, r30);
+ HS_CMP_XCHG(r20, r28);
+ HS_CMP_XCHG(r24, r32);
+ HS_CMP_XCHG(r20, r24);
+ HS_CMP_XCHG(r28, r32);
+ HS_CMP_XCHG(r18, r20);
+ HS_CMP_XCHG(r22, r24);
+ HS_CMP_XCHG(r26, r28);
+ HS_CMP_XCHG(r30, r32);
+ HS_CMP_XCHG(r17, r18);
+ HS_CMP_XCHG(r19, r20);
+ HS_CMP_XCHG(r21, r22);
+ HS_CMP_XCHG(r23, r24);
+ HS_CMP_XCHG(r25, r26);
+ HS_CMP_XCHG(r27, r28);
+ HS_CMP_XCHG(r29, r30);
+ HS_CMP_XCHG(r31, r32);
+ HS_XM_GLOBAL_STORE_L(0, r1);
+ HS_XM_GLOBAL_STORE_L(1, r2);
+ HS_XM_GLOBAL_STORE_L(2, r3);
+ HS_XM_GLOBAL_STORE_L(3, r4);
+ HS_XM_GLOBAL_STORE_L(4, r5);
+ HS_XM_GLOBAL_STORE_L(5, r6);
+ HS_XM_GLOBAL_STORE_L(6, r7);
+ HS_XM_GLOBAL_STORE_L(7, r8);
+ HS_XM_GLOBAL_STORE_L(8, r9);
+ HS_XM_GLOBAL_STORE_L(9, r10);
+ HS_XM_GLOBAL_STORE_L(10, r11);
+ HS_XM_GLOBAL_STORE_L(11, r12);
+ HS_XM_GLOBAL_STORE_L(12, r13);
+ HS_XM_GLOBAL_STORE_L(13, r14);
+ HS_XM_GLOBAL_STORE_L(14, r15);
+ HS_XM_GLOBAL_STORE_L(15, r16);
+ HS_FM_GLOBAL_STORE_R(0, r17);
+ HS_FM_GLOBAL_STORE_R(1, r18);
+ HS_FM_GLOBAL_STORE_R(2, r19);
+ HS_FM_GLOBAL_STORE_R(3, r20);
+ HS_FM_GLOBAL_STORE_R(4, r21);
+ HS_FM_GLOBAL_STORE_R(5, r22);
+ HS_FM_GLOBAL_STORE_R(6, r23);
+ HS_FM_GLOBAL_STORE_R(7, r24);
+ HS_FM_GLOBAL_STORE_R(8, r25);
+ HS_FM_GLOBAL_STORE_R(9, r26);
+ HS_FM_GLOBAL_STORE_R(10, r27);
+ HS_FM_GLOBAL_STORE_R(11, r28);
+ HS_FM_GLOBAL_STORE_R(12, r29);
+ HS_FM_GLOBAL_STORE_R(13, r30);
+ HS_FM_GLOBAL_STORE_R(14, r31);
+ HS_FM_GLOBAL_STORE_R(15, r32);
+}
+
+HS_FM_KERNEL_PROTO(1, 3)
+{
+ HS_FM_PREAMBLE(16);
+ HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
+ HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
+ HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
+ HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
+ HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
+ HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
+ HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
+ HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
+ HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
+ HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
+ HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
+ HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
+ HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
+ HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
+ HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
+ HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
+ HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0);
+ HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1);
+ HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2);
+ HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3);
+ HS_KEY_TYPE r21 = HS_FM_GLOBAL_LOAD_R(4);
+ HS_KEY_TYPE r22 = HS_FM_GLOBAL_LOAD_R(5);
+ HS_KEY_TYPE r23 = HS_FM_GLOBAL_LOAD_R(6);
+ HS_KEY_TYPE r24 = HS_FM_GLOBAL_LOAD_R(7);
+ HS_CMP_XCHG(r16, r17);
+ HS_CMP_XCHG(r15, r18);
+ HS_CMP_XCHG(r14, r19);
+ HS_CMP_XCHG(r13, r20);
+ HS_CMP_XCHG(r12, r21);
+ HS_CMP_XCHG(r11, r22);
+ HS_CMP_XCHG(r10, r23);
+ HS_CMP_XCHG(r9, r24);
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_CMP_XCHG(r17, r21);
+ HS_CMP_XCHG(r19, r23);
+ HS_CMP_XCHG(r17, r19);
+ HS_CMP_XCHG(r21, r23);
+ HS_CMP_XCHG(r18, r22);
+ HS_CMP_XCHG(r20, r24);
+ HS_CMP_XCHG(r18, r20);
+ HS_CMP_XCHG(r22, r24);
+ HS_CMP_XCHG(r17, r18);
+ HS_CMP_XCHG(r19, r20);
+ HS_CMP_XCHG(r21, r22);
+ HS_CMP_XCHG(r23, r24);
+ HS_XM_GLOBAL_STORE_L(0, r1);
+ HS_XM_GLOBAL_STORE_L(1, r2);
+ HS_XM_GLOBAL_STORE_L(2, r3);
+ HS_XM_GLOBAL_STORE_L(3, r4);
+ HS_XM_GLOBAL_STORE_L(4, r5);
+ HS_XM_GLOBAL_STORE_L(5, r6);
+ HS_XM_GLOBAL_STORE_L(6, r7);
+ HS_XM_GLOBAL_STORE_L(7, r8);
+ HS_XM_GLOBAL_STORE_L(8, r9);
+ HS_XM_GLOBAL_STORE_L(9, r10);
+ HS_XM_GLOBAL_STORE_L(10, r11);
+ HS_XM_GLOBAL_STORE_L(11, r12);
+ HS_XM_GLOBAL_STORE_L(12, r13);
+ HS_XM_GLOBAL_STORE_L(13, r14);
+ HS_XM_GLOBAL_STORE_L(14, r15);
+ HS_XM_GLOBAL_STORE_L(15, r16);
+ HS_FM_GLOBAL_STORE_R(0, r17);
+ HS_FM_GLOBAL_STORE_R(1, r18);
+ HS_FM_GLOBAL_STORE_R(2, r19);
+ HS_FM_GLOBAL_STORE_R(3, r20);
+ HS_FM_GLOBAL_STORE_R(4, r21);
+ HS_FM_GLOBAL_STORE_R(5, r22);
+ HS_FM_GLOBAL_STORE_R(6, r23);
+ HS_FM_GLOBAL_STORE_R(7, r24);
+}
+
+HS_FM_KERNEL_PROTO(1, 2)
+{
+ HS_FM_PREAMBLE(16);
+ HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
+ HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
+ HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
+ HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
+ HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
+ HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
+ HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
+ HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
+ HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
+ HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
+ HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
+ HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
+ HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
+ HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
+ HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
+ HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
+ HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0);
+ HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1);
+ HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2);
+ HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3);
+ HS_CMP_XCHG(r16, r17);
+ HS_CMP_XCHG(r15, r18);
+ HS_CMP_XCHG(r14, r19);
+ HS_CMP_XCHG(r13, r20);
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_CMP_XCHG(r17, r19);
+ HS_CMP_XCHG(r18, r20);
+ HS_CMP_XCHG(r17, r18);
+ HS_CMP_XCHG(r19, r20);
+ HS_XM_GLOBAL_STORE_L(0, r1);
+ HS_XM_GLOBAL_STORE_L(1, r2);
+ HS_XM_GLOBAL_STORE_L(2, r3);
+ HS_XM_GLOBAL_STORE_L(3, r4);
+ HS_XM_GLOBAL_STORE_L(4, r5);
+ HS_XM_GLOBAL_STORE_L(5, r6);
+ HS_XM_GLOBAL_STORE_L(6, r7);
+ HS_XM_GLOBAL_STORE_L(7, r8);
+ HS_XM_GLOBAL_STORE_L(8, r9);
+ HS_XM_GLOBAL_STORE_L(9, r10);
+ HS_XM_GLOBAL_STORE_L(10, r11);
+ HS_XM_GLOBAL_STORE_L(11, r12);
+ HS_XM_GLOBAL_STORE_L(12, r13);
+ HS_XM_GLOBAL_STORE_L(13, r14);
+ HS_XM_GLOBAL_STORE_L(14, r15);
+ HS_XM_GLOBAL_STORE_L(15, r16);
+ HS_FM_GLOBAL_STORE_R(0, r17);
+ HS_FM_GLOBAL_STORE_R(1, r18);
+ HS_FM_GLOBAL_STORE_R(2, r19);
+ HS_FM_GLOBAL_STORE_R(3, r20);
+}
+
+HS_FM_KERNEL_PROTO(1, 1)
+{
+ HS_FM_PREAMBLE(16);
+ HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
+ HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
+ HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
+ HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
+ HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
+ HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
+ HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
+ HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
+ HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
+ HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
+ HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
+ HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
+ HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
+ HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
+ HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
+ HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
+ HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0);
+ HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1);
+ HS_CMP_XCHG(r16, r17);
+ HS_CMP_XCHG(r15, r18);
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_CMP_XCHG(r17, r18);
+ HS_XM_GLOBAL_STORE_L(0, r1);
+ HS_XM_GLOBAL_STORE_L(1, r2);
+ HS_XM_GLOBAL_STORE_L(2, r3);
+ HS_XM_GLOBAL_STORE_L(3, r4);
+ HS_XM_GLOBAL_STORE_L(4, r5);
+ HS_XM_GLOBAL_STORE_L(5, r6);
+ HS_XM_GLOBAL_STORE_L(6, r7);
+ HS_XM_GLOBAL_STORE_L(7, r8);
+ HS_XM_GLOBAL_STORE_L(8, r9);
+ HS_XM_GLOBAL_STORE_L(9, r10);
+ HS_XM_GLOBAL_STORE_L(10, r11);
+ HS_XM_GLOBAL_STORE_L(11, r12);
+ HS_XM_GLOBAL_STORE_L(12, r13);
+ HS_XM_GLOBAL_STORE_L(13, r14);
+ HS_XM_GLOBAL_STORE_L(14, r15);
+ HS_XM_GLOBAL_STORE_L(15, r16);
+ HS_FM_GLOBAL_STORE_R(0, r17);
+ HS_FM_GLOBAL_STORE_R(1, r18);
+}
+
+HS_FM_KERNEL_PROTO(1, 0)
+{
+ HS_FM_PREAMBLE(16);
+ HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
+ HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
+ HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
+ HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
+ HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
+ HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
+ HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
+ HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
+ HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
+ HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
+ HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
+ HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
+ HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
+ HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
+ HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
+ HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
+ HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0);
+ HS_CMP_XCHG(r16, r17);
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_XM_GLOBAL_STORE_L(0, r1);
+ HS_XM_GLOBAL_STORE_L(1, r2);
+ HS_XM_GLOBAL_STORE_L(2, r3);
+ HS_XM_GLOBAL_STORE_L(3, r4);
+ HS_XM_GLOBAL_STORE_L(4, r5);
+ HS_XM_GLOBAL_STORE_L(5, r6);
+ HS_XM_GLOBAL_STORE_L(6, r7);
+ HS_XM_GLOBAL_STORE_L(7, r8);
+ HS_XM_GLOBAL_STORE_L(8, r9);
+ HS_XM_GLOBAL_STORE_L(9, r10);
+ HS_XM_GLOBAL_STORE_L(10, r11);
+ HS_XM_GLOBAL_STORE_L(11, r12);
+ HS_XM_GLOBAL_STORE_L(12, r13);
+ HS_XM_GLOBAL_STORE_L(13, r14);
+ HS_XM_GLOBAL_STORE_L(14, r15);
+ HS_XM_GLOBAL_STORE_L(15, r16);
+ HS_FM_GLOBAL_STORE_R(0, r17);
+}
+
+HS_HM_KERNEL_PROTO(1)
+{
+ HS_HM_PREAMBLE(16);
+ HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
+ HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
+ HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
+ HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
+ HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
+ HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
+ HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
+ HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
+ HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
+ HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
+ HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
+ HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
+ HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
+ HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
+ HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
+ HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
+ HS_KEY_TYPE r17 = HS_XM_GLOBAL_LOAD_L(16);
+ HS_KEY_TYPE r18 = HS_XM_GLOBAL_LOAD_L(17);
+ HS_KEY_TYPE r19 = HS_XM_GLOBAL_LOAD_L(18);
+ HS_KEY_TYPE r20 = HS_XM_GLOBAL_LOAD_L(19);
+ HS_KEY_TYPE r21 = HS_XM_GLOBAL_LOAD_L(20);
+ HS_KEY_TYPE r22 = HS_XM_GLOBAL_LOAD_L(21);
+ HS_KEY_TYPE r23 = HS_XM_GLOBAL_LOAD_L(22);
+ HS_KEY_TYPE r24 = HS_XM_GLOBAL_LOAD_L(23);
+ HS_KEY_TYPE r25 = HS_XM_GLOBAL_LOAD_L(24);
+ HS_KEY_TYPE r26 = HS_XM_GLOBAL_LOAD_L(25);
+ HS_KEY_TYPE r27 = HS_XM_GLOBAL_LOAD_L(26);
+ HS_KEY_TYPE r28 = HS_XM_GLOBAL_LOAD_L(27);
+ HS_KEY_TYPE r29 = HS_XM_GLOBAL_LOAD_L(28);
+ HS_KEY_TYPE r30 = HS_XM_GLOBAL_LOAD_L(29);
+ HS_KEY_TYPE r31 = HS_XM_GLOBAL_LOAD_L(30);
+ HS_KEY_TYPE r32 = HS_XM_GLOBAL_LOAD_L(31);
+ HS_CMP_XCHG(r1, r17);
+ HS_CMP_XCHG(r9, r25);
+ HS_CMP_XCHG(r1, r9);
+ HS_CMP_XCHG(r17, r25);
+ HS_CMP_XCHG(r5, r21);
+ HS_CMP_XCHG(r13, r29);
+ HS_CMP_XCHG(r5, r13);
+ HS_CMP_XCHG(r21, r29);
+ HS_CMP_XCHG(r1, r5);
+ HS_CMP_XCHG(r9, r13);
+ HS_CMP_XCHG(r17, r21);
+ HS_CMP_XCHG(r25, r29);
+ HS_CMP_XCHG(r3, r19);
+ HS_CMP_XCHG(r11, r27);
+ HS_CMP_XCHG(r3, r11);
+ HS_CMP_XCHG(r19, r27);
+ HS_CMP_XCHG(r7, r23);
+ HS_CMP_XCHG(r15, r31);
+ HS_CMP_XCHG(r7, r15);
+ HS_CMP_XCHG(r23, r31);
+ HS_CMP_XCHG(r3, r7);
+ HS_CMP_XCHG(r11, r15);
+ HS_CMP_XCHG(r19, r23);
+ HS_CMP_XCHG(r27, r31);
+ HS_CMP_XCHG(r1, r3);
+ HS_CMP_XCHG(r5, r7);
+ HS_CMP_XCHG(r9, r11);
+ HS_CMP_XCHG(r13, r15);
+ HS_CMP_XCHG(r17, r19);
+ HS_CMP_XCHG(r21, r23);
+ HS_CMP_XCHG(r25, r27);
+ HS_CMP_XCHG(r29, r31);
+ HS_CMP_XCHG(r2, r18);
+ HS_CMP_XCHG(r10, r26);
+ HS_CMP_XCHG(r2, r10);
+ HS_CMP_XCHG(r18, r26);
+ HS_CMP_XCHG(r6, r22);
+ HS_CMP_XCHG(r14, r30);
+ HS_CMP_XCHG(r6, r14);
+ HS_CMP_XCHG(r22, r30);
+ HS_CMP_XCHG(r2, r6);
+ HS_CMP_XCHG(r10, r14);
+ HS_CMP_XCHG(r18, r22);
+ HS_CMP_XCHG(r26, r30);
+ HS_CMP_XCHG(r4, r20);
+ HS_CMP_XCHG(r12, r28);
+ HS_CMP_XCHG(r4, r12);
+ HS_CMP_XCHG(r20, r28);
+ HS_CMP_XCHG(r8, r24);
+ HS_CMP_XCHG(r16, r32);
+ HS_CMP_XCHG(r8, r16);
+ HS_CMP_XCHG(r24, r32);
+ HS_CMP_XCHG(r4, r8);
+ HS_CMP_XCHG(r12, r16);
+ HS_CMP_XCHG(r20, r24);
+ HS_CMP_XCHG(r28, r32);
+ HS_CMP_XCHG(r2, r4);
+ HS_CMP_XCHG(r6, r8);
+ HS_CMP_XCHG(r10, r12);
+ HS_CMP_XCHG(r14, r16);
+ HS_CMP_XCHG(r18, r20);
+ HS_CMP_XCHG(r22, r24);
+ HS_CMP_XCHG(r26, r28);
+ HS_CMP_XCHG(r30, r32);
+ HS_CMP_XCHG(r1, r2);
+ HS_CMP_XCHG(r3, r4);
+ HS_CMP_XCHG(r5, r6);
+ HS_CMP_XCHG(r7, r8);
+ HS_CMP_XCHG(r9, r10);
+ HS_CMP_XCHG(r11, r12);
+ HS_CMP_XCHG(r13, r14);
+ HS_CMP_XCHG(r15, r16);
+ HS_CMP_XCHG(r17, r18);
+ HS_CMP_XCHG(r19, r20);
+ HS_CMP_XCHG(r21, r22);
+ HS_CMP_XCHG(r23, r24);
+ HS_CMP_XCHG(r25, r26);
+ HS_CMP_XCHG(r27, r28);
+ HS_CMP_XCHG(r29, r30);
+ HS_CMP_XCHG(r31, r32);
+ HS_XM_GLOBAL_STORE_L(0, r1);
+ HS_XM_GLOBAL_STORE_L(1, r2);
+ HS_XM_GLOBAL_STORE_L(2, r3);
+ HS_XM_GLOBAL_STORE_L(3, r4);
+ HS_XM_GLOBAL_STORE_L(4, r5);
+ HS_XM_GLOBAL_STORE_L(5, r6);
+ HS_XM_GLOBAL_STORE_L(6, r7);
+ HS_XM_GLOBAL_STORE_L(7, r8);
+ HS_XM_GLOBAL_STORE_L(8, r9);
+ HS_XM_GLOBAL_STORE_L(9, r10);
+ HS_XM_GLOBAL_STORE_L(10, r11);
+ HS_XM_GLOBAL_STORE_L(11, r12);
+ HS_XM_GLOBAL_STORE_L(12, r13);
+ HS_XM_GLOBAL_STORE_L(13, r14);
+ HS_XM_GLOBAL_STORE_L(14, r15);
+ HS_XM_GLOBAL_STORE_L(15, r16);
+ HS_XM_GLOBAL_STORE_L(16, r17);
+ HS_XM_GLOBAL_STORE_L(17, r18);
+ HS_XM_GLOBAL_STORE_L(18, r19);
+ HS_XM_GLOBAL_STORE_L(19, r20);
+ HS_XM_GLOBAL_STORE_L(20, r21);
+ HS_XM_GLOBAL_STORE_L(21, r22);
+ HS_XM_GLOBAL_STORE_L(22, r23);
+ HS_XM_GLOBAL_STORE_L(23, r24);
+ HS_XM_GLOBAL_STORE_L(24, r25);
+ HS_XM_GLOBAL_STORE_L(25, r26);
+ HS_XM_GLOBAL_STORE_L(26, r27);
+ HS_XM_GLOBAL_STORE_L(27, r28);
+ HS_XM_GLOBAL_STORE_L(28, r29);
+ HS_XM_GLOBAL_STORE_L(29, r30);
+ HS_XM_GLOBAL_STORE_L(30, r31);
+ HS_XM_GLOBAL_STORE_L(31, r32);
+}
+
+//
+//
+//
diff --git a/src/compute/hs/cl/gen9/hs_cl.h b/src/compute/hs/cl/intel/gen8/u64/hs_cl.h
index 4926a14fb3..d1c996fce9 100644
--- a/src/compute/hs/cl/gen9/hs_cl.h
+++ b/src/compute/hs/cl/intel/gen8/u64/hs_cl.h
@@ -8,17 +8,23 @@
#ifndef HS_CL_ONCE
#define HS_CL_ONCE
-#define HS_LANES_PER_WARP_LOG2 3
-#define HS_LANES_PER_WARP (1 << HS_LANES_PER_WARP_LOG2)
-#define HS_BS_WARPS 16
-#define HS_BS_WARPS_LOG2_RU 4
-#define HS_BC_WARPS_LOG2_MAX 4
-#define HS_FM_BLOCKS_LOG2_MIN 1
-#define HS_HM_BLOCKS_LOG2_MIN 1
-#define HS_KEYS_PER_LANE 16
+#define HS_SLAB_THREADS_LOG2 3
+#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2)
+#define HS_SLAB_WIDTH_LOG2 3
+#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2)
+#define HS_SLAB_HEIGHT 16
+#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)
#define HS_REG_LAST(c) c##16
-#define HS_KEY_WORDS 2
#define HS_KEY_TYPE ulong
+#define HS_KEY_WORDS 2
+#define HS_VAL_WORDS 0
+#define HS_BS_SLABS 16
+#define HS_BS_SLABS_LOG2_RU 4
+#define HS_BC_SLABS_LOG2_MAX 4
+#define HS_FM_SCALE_MIN 1
+#define HS_FM_SCALE_MAX 1
+#define HS_HM_SCALE_MIN 1
+#define HS_HM_SCALE_MAX 1
#define HS_EMPTY
#define HS_SLAB_ROWS() \
@@ -86,34 +92,6 @@
HS_TRANSPOSE_REMAP( u, 16, 16 ) \
HS_EMPTY
-#define HS_FM_BLOCKS_LOG2_1 0
-#define HS_FM_BLOCKS_LOG2_2 1
-#define HS_FM_BLOCKS_LOG2_3 2
-#define HS_FM_BLOCKS_LOG2_4 3
-#define HS_FM_BLOCKS_LOG2_5 4
-#define HS_FM_BLOCKS_LOG2_6 5
-#define HS_HM_BLOCKS_LOG2_5 0
-#define HS_FM_BLOCKS_LOG2_7 6
-#define HS_HM_BLOCKS_LOG2_6 1
-#define HS_FM_BLOCKS_LOG2_8 7
-#define HS_HM_BLOCKS_LOG2_7 2
-#define HS_FM_BLOCKS_LOG2_9 8
-#define HS_HM_BLOCKS_LOG2_8 3
-#define HS_FM_BLOCKS_LOG2_10 9
-#define HS_HM_BLOCKS_LOG2_9 4
-#define HS_FM_BLOCKS_LOG2_11 10
-#define HS_HM_BLOCKS_LOG2_10 5
-#define HS_FM_BLOCKS_LOG2_12 11
-#define HS_HM_BLOCKS_LOG2_11 6
-#define HS_FM_BLOCKS_LOG2_13 12
-#define HS_HM_BLOCKS_LOG2_12 7
-#define HS_FM_BLOCKS_LOG2_14 13
-#define HS_HM_BLOCKS_LOG2_13 8
-#define HS_FM_BLOCKS_LOG2_15 14
-#define HS_HM_BLOCKS_LOG2_14 9
-#define HS_FM_BLOCKS_LOG2_16 15
-#define HS_HM_BLOCKS_LOG2_15 10
-
#endif
//
diff --git a/src/compute/hs/cl/intel/gen8/u64/hs_cl_macros.h b/src/compute/hs/cl/intel/gen8/u64/hs_cl_macros.h
new file mode 100644
index 0000000000..9406339b36
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen8/u64/hs_cl_macros.h
@@ -0,0 +1,361 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#ifndef HS_CL_MACROS_ONCE
+#define HS_CL_MACROS_ONCE
+
+//
+//
+//
+
+#include "hs_cl.h"
+
+//
+// FYI, restrict shouldn't have any impact on these kernels and
+// benchmarks appear to prove that true
+//
+
+#define HS_RESTRICT restrict
+
+//
+// KERNEL PROTOS
+//
+
+#define HS_TRANSPOSE_KERNEL_PROTO(slab_width) \
+ __kernel \
+ __attribute__((intel_reqd_sub_group_size(slab_width))) \
+ void \
+ hs_kernel_transpose(__global HS_KEY_TYPE * const HS_RESTRICT vout)
+
+#define HS_BS_KERNEL_PROTO(slab_width,slab_count,slab_count_ru_log2) \
+ __kernel \
+ __attribute__((reqd_work_group_size(slab_count*slab_width,1,1))) \
+ __attribute__((intel_reqd_sub_group_size(slab_width))) \
+ void \
+ hs_kernel_bs_##slab_count_ru_log2##(__global HS_KEY_TYPE const * const HS_RESTRICT vin, \
+ __global HS_KEY_TYPE * const HS_RESTRICT vout)
+
+#define HS_BC_KERNEL_PROTO(slab_width,slab_count,slab_count_log2) \
+ __kernel \
+ __attribute__((reqd_work_group_size(slab_count*slab_width,1,1))) \
+ __attribute__((intel_reqd_sub_group_size(slab_width))) \
+ void \
+ hs_kernel_bc_##slab_count_log2##(__global HS_KEY_TYPE * const HS_RESTRICT vout)
+
+#define HS_HM_KERNEL_PROTO(s) \
+ __kernel void \
+ hs_kernel_hm_##s##(__global HS_KEY_TYPE * const HS_RESTRICT vout)
+
+#define HS_FM_KERNEL_PROTO(s,r) \
+ __kernel void \
+ hs_kernel_fm_##s##_##r##(__global HS_KEY_TYPE * const HS_RESTRICT vout)
+
+//
+// BLOCK LOCAL MEMORY DECLARATION
+//
+
+#define HS_BLOCK_LOCAL_MEM_DECL(width,height) \
+ __local struct { \
+ HS_KEY_TYPE m[width * height]; \
+ } shared
+
+//
+// BLOCK BARRIER
+//
+
+#define HS_BLOCK_BARRIER() \
+ barrier(CLK_LOCAL_MEM_FENCE)
+
+//
+// SLAB GLOBAL
+//
+
+#define HS_SLAB_GLOBAL_PREAMBLE(slab_width,slab_height) \
+ uint const gmem_idx = \
+ (get_global_id(0) & ~(slab_width-1)) * slab_height + \
+ get_sub_group_local_id()
+
+#define HS_SLAB_GLOBAL_LOAD(extent,slab_width,row_idx) \
+ extent[gmem_idx + slab_width * row_idx]
+
+#define HS_SLAB_GLOBAL_STORE(slab_width,row_idx,reg) \
+ vout[gmem_idx + slab_width * row_idx] = reg
+
+//
+// SLAB LOCAL
+//
+
+#define HS_SLAB_LOCAL_L(offset) \
+ shared.m[smem_l_idx + (offset)]
+
+#define HS_SLAB_LOCAL_R(offset) \
+ shared.m[smem_r_idx + (offset)]
+
+//
+// SLAB LOCAL VERTICAL LOADS
+//
+
+#define HS_BX_LOCAL_V(offset) \
+ shared.m[get_local_id(0) + (offset)]
+
+//
+// BLOCK SORT MERGE HORIZONTAL
+//
+
+#define HS_BS_MERGE_H_PREAMBLE(slab_width,slab_count) \
+ uint const smem_l_idx = \
+ get_sub_group_id() * (slab_width * slab_count) + \
+ get_sub_group_local_id(); \
+ uint const smem_r_idx = \
+ (get_sub_group_id() ^ 1) * (slab_width * slab_count) + \
+ (get_sub_group_local_id() ^ (slab_width - 1))
+
+//
+// BLOCK CLEAN MERGE HORIZONTAL
+//
+
+#define HS_BC_MERGE_H_PREAMBLE(slab_width,slab_height,slab_count) \
+ uint const gmem_l_idx = \
+ (get_global_id(0) & ~(slab_width*slab_count-1)) * slab_height + \
+ get_local_id(0); \
+ uint const smem_l_idx = \
+ get_sub_group_id() * (slab_width * slab_count) + \
+ get_sub_group_local_id()
+
+#define HS_BC_GLOBAL_LOAD_L(slab_width,slab_idx) \
+ vout[gmem_l_idx + (slab_width * slab_idx)]
+
+//
+// SLAB FLIP AND HALF PREAMBLES
+//
+
+#define HS_SLAB_FLIP_PREAMBLE(mask) \
+ uint const flip_lane_idx = get_sub_group_local_id() ^ mask; \
+ int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+
+#define HS_SLAB_HALF_PREAMBLE(mask) \
+ uint const half_lane_idx = get_sub_group_local_id() ^ mask; \
+ int const t_lt = get_sub_group_local_id() < half_lane_idx;
+
+//
+// Inter-lane compare exchange
+//
+
+// default
+#define HS_CMP_XCHG_V0(a,b) \
+ { \
+ HS_KEY_TYPE const t = min(a,b); \
+ b = max(a,b); \
+ a = t; \
+ }
+
+// super slow
+#define HS_CMP_XCHG_V1(a,b) \
+ { \
+ HS_KEY_TYPE const tmp = a; \
+ a = (a < b) ? a : b; \
+ b ^= a ^ tmp; \
+ }
+
+// best
+#define HS_CMP_XCHG_V2(a,b) \
+ if (a >= b) { \
+ HS_KEY_TYPE const t = a; \
+ a = b; \
+ b = t; \
+ }
+
+// good
+#define HS_CMP_XCHG_V3(a,b) \
+ { \
+ int const ge = a >= b; \
+ HS_KEY_TYPE const t = a; \
+ a = ge ? b : a; \
+ b = ge ? t : b; \
+ }
+
+//
+//
+//
+
+#if (HS_KEY_WORDS == 1)
+#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b)
+#endif
+
+//
+// The flip/half comparisons rely on a "conditional min/max":
+//
+// - if the flag is false, return min(a,b)
+// - otherwise, return max(a,b)
+//
+// What's a little surprising is that sequence (1) is faster than (2)
+// for 32-bit keys.
+//
+// I suspect either a code generation problem or that the sequence
+// maps well to the GEN instruction set.
+//
+// We mostly care about 64-bit keys and unsurprisingly sequence (2) is
+// fastest for this wider type.
+//
+
+// this is what you would normally use
+#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) ^ lt) ? b : a
+
+// this seems to be faster for 32-bit keys
+#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))
+
+//
+//
+//
+
+#if (HS_KEY_WORDS == 1)
+#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
+#endif
+
+//
+// Conditional inter-subgroup flip/half compare exchange
+//
+
+#define HS_CMP_FLIP(i,a,b) \
+ { \
+ HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,flip_lane_idx); \
+ HS_KEY_TYPE const tb = intel_sub_group_shuffle(b,flip_lane_idx); \
+ a = HS_COND_MIN_MAX(t_lt,a,tb); \
+ b = HS_COND_MIN_MAX(t_lt,b,ta); \
+ }
+
+#define HS_CMP_HALF(i,a) \
+ { \
+ HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,half_lane_idx); \
+ a = HS_COND_MIN_MAX(t_lt,a,ta); \
+ }
+
+//
+// The device's comparison operator might return what we actually
+// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}.
+//
+
+#define HS_CMP_IS_ZERO_ONE
+
+#ifdef HS_CMP_IS_ZERO_ONE
+// OpenCL requires a {true: +1, false: 0} scalar result
+// (a < b) -> { +1, 0 } -> NEGATE -> { 0, 0xFFFFFFFF }
+#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
+#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a)
+#else
+// However, OpenCL requires { -1, 0 } for vectors
+// (a < b) -> { 0xFFFFFFFF, 0 }
+#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
+#define HS_CMP_TO_MASK(a) (a)
+#endif
+
+//
+// The "flip-merge" and "half-merge" preambles are very similar
+//
+
+#define HS_HM_PREAMBLE(half_span) \
+ uint const span_idx = get_global_id(2) * get_global_size(1) + get_global_id(1); \
+ uint const span_stride = get_global_size(0); \
+ uint const span_size = span_stride * half_span * 2; \
+ uint const span_base = span_idx * span_size; \
+ uint const span_off = get_global_id(0); \
+ uint const span_l = span_base + span_off
+
+#define HS_FM_PREAMBLE(half_span) \
+ HS_HM_PREAMBLE(half_span); \
+ uint const span_r = span_base + span_stride * (half_span + 1) - span_off - 1
+
+//
+//
+//
+
+#define HS_XM_GLOBAL_L(stride_idx) \
+ vout[span_l + span_stride * stride_idx]
+
+#define HS_XM_GLOBAL_LOAD_L(stride_idx) \
+ HS_XM_GLOBAL_L(stride_idx)
+
+#define HS_XM_GLOBAL_STORE_L(stride_idx,reg) \
+ HS_XM_GLOBAL_L(stride_idx) = reg
+
+#define HS_FM_GLOBAL_R(stride_idx) \
+ vout[span_r + span_stride * stride_idx]
+
+#define HS_FM_GLOBAL_LOAD_R(stride_idx) \
+ HS_FM_GLOBAL_R(stride_idx)
+
+#define HS_FM_GLOBAL_STORE_R(stride_idx,reg) \
+ HS_FM_GLOBAL_R(stride_idx) = reg
+
+//
+// This snarl of macros is for transposing a "slab" of sorted elements
+// into linear order.
+//
+// This can occur as the last step in hs_sort() or via a custom kernel
+// that inspects the slab and then transposes and stores it to memory.
+//
+// The slab format can be inspected more efficiently than a linear
+// arrangement.
+//
+// The prime example is detecting when adjacent keys (in sort order)
+// have differing high order bits ("key changes"). The index of each
+// change is recorded to an auxilary array.
+//
+// A post-processing step like this needs to be able to navigate the
+// slab and eventually transpose and store the slab in linear order.
+//
+
+#define HS_SUBGROUP_SHUFFLE_XOR(v,m) intel_sub_group_shuffle_xor(v,m)
+
+#define HS_TRANSPOSE_REG(prefix,row) prefix##row
+#define HS_TRANSPOSE_DECL(prefix,row) HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row)
+#define HS_TRANSPOSE_PRED(level) is_lo_##level
+
+#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) \
+ prefix_curr##row_ll##_##row_ur
+
+#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) \
+ HS_KEY_TYPE const HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)
+
+#define HS_TRANSPOSE_STAGE(level) \
+ bool const HS_TRANSPOSE_PRED(level) = \
+ (get_sub_group_local_id() & (1 << (level-1))) == 0;
+
+#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
+ HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) = \
+ HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ? \
+ HS_TRANSPOSE_REG(prefix_prev,row_ll) : \
+ HS_TRANSPOSE_REG(prefix_prev,row_ur), \
+ 1<<(level-1)); \
+ \
+ HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \
+ HS_TRANSPOSE_PRED(level) ? \
+ HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) : \
+ HS_TRANSPOSE_REG(prefix_prev,row_ll); \
+ \
+ HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \
+ HS_TRANSPOSE_PRED(level) ? \
+ HS_TRANSPOSE_REG(prefix_prev,row_ur) : \
+ HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur);
+
+#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \
+ vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \
+ HS_TRANSPOSE_REG(prefix,row_from);
+
+//
+//
+//
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/cl/intel/gen8/u64/hs_target.h b/src/compute/hs/cl/intel/gen8/u64/hs_target.h
new file mode 100644
index 0000000000..c543c7b523
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen8/u64/hs_target.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "../../../hs_cl_target.h"
+
+//
+//
+//
+
+#include "hs_cl.h"
+
+//
+//
+//
+
+#ifndef HS_TARGET_NAME
+#define HS_TARGET_NAME hs_target
+#endif
+
+#define HS_TARGET_HELPER(a) a
+
+//
+//
+//
+
+static struct hs_cl_target const HS_TARGET_NAME =
+{
+ .config = {
+ .slab = {
+ .threads_log2 = HS_SLAB_THREADS_LOG2,
+ .width_log2 = HS_SLAB_WIDTH_LOG2,
+ .height = HS_SLAB_HEIGHT
+ },
+
+ .words = {
+ .key = HS_KEY_WORDS,
+ .val = HS_VAL_WORDS
+ },
+
+ .block = {
+ .slabs = HS_BS_SLABS
+ },
+
+ .merge = {
+ .fm = {
+ .scale_min = HS_FM_SCALE_MIN,
+ .scale_max = HS_FM_SCALE_MAX
+ },
+ .hm = {
+ .scale_min = HS_HM_SCALE_MIN,
+ .scale_max = HS_HM_SCALE_MAX,
+ }
+ }
+ },
+
+ .program = {
+#ifndef HS_DUMP_SOURCE
+ 0, // KERNELS ARE BINARIES
+#include "hs_cl.bin.len.xxd"
+ ,
+#include "hs_cl.bin.xxd"
+#else
+ 1, // KERNELS ARE SOURCE
+#include "hs_cl.src.len.xxd"
+ ,
+#include "hs_cl.src.xxd"
+#endif
+ }
+};
+
+//
+//
+//
+
+#ifdef HS_DUMP
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int
+main(int argc, char const * argv[])
+{
+ FILE * fp = fopen("hs_target.bin","wb");
+
+ fwrite(&HS_TARGET_NAME.config,1,sizeof(HS_TARGET_NAME.config),fp);
+
+ size_t progsize =
+ (HS_TARGET_NAME.program[1]<<24) | (HS_TARGET_NAME.program[2]<<16) |
+ (HS_TARGET_NAME.program[3]<< 8) | HS_TARGET_NAME.program[4];
+
+ // fprintf(stderr,"%zu\n",progsize);
+
+ progsize += 1 + sizeof(uint32_t);
+
+ fwrite(HS_TARGET_NAME.program,1,progsize,fp);
+
+ fclose(fp);
+
+ return EXIT_SUCCESS;
+}
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/cl/intel/gen8/u64/make_all.bat b/src/compute/hs/cl/intel/gen8/u64/make_all.bat
new file mode 100644
index 0000000000..ee075b3f92
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen8/u64/make_all.bat
@@ -0,0 +1,26 @@
+@ECHO OFF
+
+SET HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+:: --- 32-bit keys ---
+
+:: %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+:: --- 64-bit keys
+
+%HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: %HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+::
+:: remove trailing whitespace from generated files
+::
+
+sed -i 's/[[:space:]]*$//' hs_cl.h
+
+::
+:: preprocess and build kernels
+::
+
+make_inl_cl.bat hs_cl.cl
diff --git a/src/compute/hs/cl/intel/gen8/u64/make_inl_cl.bat b/src/compute/hs/cl/intel/gen8/u64/make_inl_cl.bat
new file mode 100644
index 0000000000..d7a3c0a951
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen8/u64/make_inl_cl.bat
@@ -0,0 +1,113 @@
+@ECHO OFF
+
+::
+::
+::
+
+SET OPENCL_STD=-cl-std=CL1.2
+SET OPENCL_PRE=__OPENCL_C_VERSION__=120
+
+:: SET OPENCL_STD=-cl-std=CL2.0
+:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200
+
+::
+::
+::
+
+SET IOC=ioc64
+
+::
+::
+::
+
+SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info
+
+SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g
+
+SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT%
+
+::
+::
+::
+
+SET PRE_DIR=%~p1
+
+CD %PRE_DIR%
+
+SET PRE_SRC=%~n1.pre.cl
+SET PRE_SRC_XXD=%~n1.src.xxd
+SET PRE_SRC_LEN_XXD=%~n1.src.len.xxd
+
+SET PRE_BIN=%~n1.bin
+SET PRE_BIN_XXD=%~n1.bin.xxd
+SET PRE_BIN_LEN_XXD=%~n1.bin.len.xxd
+
+::
+:: *.pre.cl
+::
+
+clang-format -style=Mozilla -i %1 || goto :error
+cl -I . -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_SRC%" || goto :error
+clang-format -style=Mozilla -i %PRE_SRC% || goto :error
+dos2unix -q %PRE_SRC% || goto :error
+
+echo %PRE_SRC%
+
+::
+:: *.src.xxd
+:: *.src.len.xxd
+::
+
+xxd -i < %PRE_SRC% > %PRE_SRC_XXD% || goto :error
+
+for /f %%A in ('wc -c %PRE_SRC%') do (
+ echo %PRE_SRC% %%A
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %PRE_SRC_LEN_XXD% || goto :error
+)
+
+echo %PRE_SRC_XXD%
+echo %PRE_SRC_LEN_XXD%
+
+::
+:: *.pre.bin
+::
+
+%IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_SRC% -ir=%PRE_BIN% || goto :error
+
+echo %PRE_BIN%
+
+::
+:: *.bin.xxd
+:: *.bin.len.xxd
+::
+
+xxd -i < %PRE_BIN% > %PRE_BIN_XXD% || goto :error
+
+for /f %%A in ('wc -c %PRE_BIN%') do (
+ echo %PRE_BIN% %%A
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %PRE_BIN_LEN_XXD% || goto :error
+)
+
+echo %PRE_BIN_XXD%
+echo %PRE_BIN_LEN_XXD%
+
+::
+:: dump a binary
+::
+
+cl /DHS_DUMP /Fe:hs_dump.exe /Tchs_target.h
+hs_dump
+
+::
+:: delete temporary files
+::
+
+:: del *.pre.cl
+del *.obj
+del *.exe
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/cl/intel/gen9lp/u32/make_inl_cl.bat b/src/compute/hs/cl/intel/gen9lp/u32/make_inl_cl.bat
new file mode 100644
index 0000000000..54b1aac48f
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen9lp/u32/make_inl_cl.bat
@@ -0,0 +1,77 @@
+@ECHO OFF
+
+::
+::
+::
+
+SET OPENCL_STD=-cl-std=CL1.2
+SET OPENCL_PRE=__OPENCL_C_VERSION__=120
+
+:: SET OPENCL_STD=-cl-std=CL2.0
+:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200
+
+::
+::
+::
+
+SET IOC=ioc64
+
+::
+::
+::
+
+SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info
+
+SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g
+
+SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT%
+
+::
+::
+::
+
+SET PRE_DIR=%~p1
+
+CD %PRE_DIR%
+
+SET PRE_CL=%~n1
+SET PRE_CL=%PRE_CL%.pre.cl
+
+SET PRE_SRC_INL=%~n1
+SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl
+
+SET PRE_BIN_IR=%~n1
+SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir
+
+SET PRE_BIN_INL=%~n1
+SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl
+
+::
+:: *.pre.cl
+:: *.pre.src.inl
+::
+
+CMD /C clang-format -style=Mozilla -i %1
+CMD /C cl -I . -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%"
+CMD /C clang-format -style=Mozilla -i %PRE_CL%
+CMD /C dos2unix -q %PRE_CL%
+CMD /C xxd -i %PRE_CL% %PRE_SRC_INL%
+
+echo %PRE_CL%
+echo %PRE_SRC_INL%
+
+::
+:: *.pre.cl
+:: *.pre.src.inl
+::
+
+CMD /C touch %PRE_BIN_IR%
+ECHO ON
+@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR%
+@ECHO OFF
+CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL%
+
+echo %PRE_BIN_IR%
+echo %PRE_BIN_INL%
+
+
diff --git a/src/compute/hs/cl/intel/gen9lp/u32b32/make_inl_cl.bat b/src/compute/hs/cl/intel/gen9lp/u32b32/make_inl_cl.bat
new file mode 100644
index 0000000000..54b1aac48f
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen9lp/u32b32/make_inl_cl.bat
@@ -0,0 +1,77 @@
+@ECHO OFF
+
+::
+::
+::
+
+SET OPENCL_STD=-cl-std=CL1.2
+SET OPENCL_PRE=__OPENCL_C_VERSION__=120
+
+:: SET OPENCL_STD=-cl-std=CL2.0
+:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200
+
+::
+::
+::
+
+SET IOC=ioc64
+
+::
+::
+::
+
+SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info
+
+SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g
+
+SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT%
+
+::
+::
+::
+
+SET PRE_DIR=%~p1
+
+CD %PRE_DIR%
+
+SET PRE_CL=%~n1
+SET PRE_CL=%PRE_CL%.pre.cl
+
+SET PRE_SRC_INL=%~n1
+SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl
+
+SET PRE_BIN_IR=%~n1
+SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir
+
+SET PRE_BIN_INL=%~n1
+SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl
+
+::
+:: *.pre.cl
+:: *.pre.src.inl
+::
+
+CMD /C clang-format -style=Mozilla -i %1
+CMD /C cl -I . -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%"
+CMD /C clang-format -style=Mozilla -i %PRE_CL%
+CMD /C dos2unix -q %PRE_CL%
+CMD /C xxd -i %PRE_CL% %PRE_SRC_INL%
+
+echo %PRE_CL%
+echo %PRE_SRC_INL%
+
+::
+:: *.pre.cl
+:: *.pre.src.inl
+::
+
+CMD /C touch %PRE_BIN_IR%
+ECHO ON
+@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR%
+@ECHO OFF
+CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL%
+
+echo %PRE_BIN_IR%
+echo %PRE_BIN_INL%
+
+
diff --git a/src/compute/hs/cl/intel/gen9lp/u64/make_inl_cl.bat b/src/compute/hs/cl/intel/gen9lp/u64/make_inl_cl.bat
new file mode 100644
index 0000000000..54b1aac48f
--- /dev/null
+++ b/src/compute/hs/cl/intel/gen9lp/u64/make_inl_cl.bat
@@ -0,0 +1,77 @@
+@ECHO OFF
+
+::
+::
+::
+
+SET OPENCL_STD=-cl-std=CL1.2
+SET OPENCL_PRE=__OPENCL_C_VERSION__=120
+
+:: SET OPENCL_STD=-cl-std=CL2.0
+:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200
+
+::
+::
+::
+
+SET IOC=ioc64
+
+::
+::
+::
+
+SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info
+
+SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g
+
+SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT%
+
+::
+::
+::
+
+SET PRE_DIR=%~p1
+
+CD %PRE_DIR%
+
+SET PRE_CL=%~n1
+SET PRE_CL=%PRE_CL%.pre.cl
+
+SET PRE_SRC_INL=%~n1
+SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl
+
+SET PRE_BIN_IR=%~n1
+SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir
+
+SET PRE_BIN_INL=%~n1
+SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl
+
+::
+:: *.pre.cl
+:: *.pre.src.inl
+::
+
+CMD /C clang-format -style=Mozilla -i %1
+CMD /C cl -I . -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%"
+CMD /C clang-format -style=Mozilla -i %PRE_CL%
+CMD /C dos2unix -q %PRE_CL%
+CMD /C xxd -i %PRE_CL% %PRE_SRC_INL%
+
+echo %PRE_CL%
+echo %PRE_SRC_INL%
+
+::
+:: *.pre.cl
+:: *.pre.src.inl
+::
+
+CMD /C touch %PRE_BIN_IR%
+ECHO ON
+@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR%
+@ECHO OFF
+CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL%
+
+echo %PRE_BIN_IR%
+echo %PRE_BIN_INL%
+
+
diff --git a/src/compute/hs/gen/gen.h b/src/compute/hs/gen/gen.h
index 4043a8df5c..3635d553cf 100644
--- a/src/compute/hs/gen/gen.h
+++ b/src/compute/hs/gen/gen.h
@@ -9,21 +9,20 @@
#pragma once
//
+// TODO:
//
+// Add Key-Val sorting support -- easy.
//
#include <stdio.h>
#include <stdint.h>
//
+// All code generation is driven by the specified architectural
+// details and host platform API.
//
-//
-
-#define MERGE_LEVELS_MAX_LOG2 7 // merge up to 128 warps
-#define MERGE_LEVELS_MAX_SIZE (1 << MERGE_LEVELS_MAX_LOG2) // ((1 << MERGE_MAX_LOG2) - 1) // incorrect debug error
-
-//
-//
+// In general, the warps-per-block and keys-per-thread are the
+// critical knobs for tuning performance.
//
struct hsg_config
@@ -58,6 +57,7 @@ struct hsg_config
struct {
uint32_t lanes;
+ uint32_t lanes_log2;
uint32_t skpw_bs;
} warp;
@@ -72,7 +72,7 @@ struct hsg_config
};
//
-//
+// HotSort can merge non-power-of-two blocks of warps
//
struct hsg_level
@@ -91,6 +91,16 @@ struct hsg_level
} active;
};
+//
+//
+//
+
+#define MERGE_LEVELS_MAX_LOG2 7 // merge up to 128 warps
+#define MERGE_LEVELS_MAX_SIZE (1 << MERGE_LEVELS_MAX_LOG2)
+
+//
+// This is computed
+//
struct hsg_merge
{
@@ -113,6 +123,8 @@ struct hsg_merge
//
//
+#if 0
+
#define HSG_FILE_NAME_SIZE 80
struct hsg_file
@@ -126,18 +138,6 @@ struct hsg_file
//
//
-typedef enum hsg_kernel_type {
-
- HSG_KERNEL_TYPE_SORT_BLOCK,
-
- HSG_KERNEL_TYPE_COUNT
-
-} hsg_kernel_type;
-
-//
-//
-//
-
typedef enum hsg_file_type {
HSG_FILE_TYPE_HEADER,
@@ -147,6 +147,8 @@ typedef enum hsg_file_type {
} hsg_file_type;
+#endif
+
//
//
//
@@ -158,10 +160,8 @@ typedef enum hsg_file_type {
HSG_OP_EXPAND_X(HSG_OP_TYPE_BEGIN) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_ELSE) \
\
- HSG_OP_EXPAND_X(HSG_OP_TYPE_FILE_HEADER) \
- HSG_OP_EXPAND_X(HSG_OP_TYPE_FILE_FOOTER) \
- \
- HSG_OP_EXPAND_X(HSG_OP_TYPE_DUMMY_KERNEL) \
+ HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_BEGIN) \
+ HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_END) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE) \
@@ -186,12 +186,13 @@ typedef enum hsg_file_type {
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT) \
+ HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_LOAD) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_STORE) \
\
- HSG_OP_EXPAND_X(HSG_OP_TYPE_WARP_FLIP) \
- HSG_OP_EXPAND_X(HSG_OP_TYPE_WARP_HALF) \
+ HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_FLIP) \
+ HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_HALF) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_FLIP) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_HALF) \
@@ -221,8 +222,6 @@ typedef enum hsg_file_type {
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_ACTIVE_PRED) \
\
- HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED) \
- \
HSG_OP_EXPAND_X(HSG_OP_TYPE_COUNT)
//
@@ -271,42 +270,63 @@ struct hsg_op
//
//
-typedef void (*hsg_target_pfn)(struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth);
+extern char const * const hsg_op_type_string[];
//
//
//
-extern struct hsg_config hsg_config;
-extern struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2];
+struct hsg_target
+{
+ struct hsg_target_state * state;
+};
//
+// All targets share this prototype
+//
+
+typedef
+void
+(*hsg_target_pfn)(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth);
//
//
+//
+
+extern
+void
+hsg_target_debug(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth);
extern
void
-hsg_target_debug (struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth);
+hsg_target_cuda(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth);
extern
void
-hsg_target_cuda_sm3x(struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth);
+hsg_target_opencl(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth);
extern
void
-hsg_target_igp_genx (struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth);
+hsg_target_glsl(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth);
//
//
//
diff --git a/src/compute/hs/gen/main.c b/src/compute/hs/gen/main.c
index 42f4518bfd..e06e23029b 100644
--- a/src/compute/hs/gen/main.c
+++ b/src/compute/hs/gen/main.c
@@ -20,14 +20,8 @@
//
#include "networks.h"
-#include "macros.h"
-#include "util.h"
-
-//
-//
-//
-
-#define HSG_INDENT 2
+#include "common/util.h"
+#include "common/macros.h"
//
//
@@ -36,7 +30,6 @@
#undef HSG_OP_EXPAND_X
#define HSG_OP_EXPAND_X(t) #t ,
-static
char const * const
hsg_op_type_string[] =
{
@@ -53,10 +46,8 @@ hsg_op_type_string[] =
#define BEGIN() (struct hsg_op){ HSG_OP_TYPE_BEGIN }
#define ELSE() (struct hsg_op){ HSG_OP_TYPE_ELSE }
-#define STORE_SLAB_EARLY_EXIT() (struct hsg_op){ HSG_OP_TYPE_STORE_SLAB_EARLY_EXIT }
-
-#define FILE_HEADER() (struct hsg_op){ HSG_OP_TYPE_FILE_HEADER }
-#define FILE_FOOTER() (struct hsg_op){ HSG_OP_TYPE_FILE_FOOTER }
+#define TARGET_BEGIN() (struct hsg_op){ HSG_OP_TYPE_TARGET_BEGIN }
+#define TARGET_END() (struct hsg_op){ HSG_OP_TYPE_TARGET_END }
#define TRANSPOSE_KERNEL_PROTO() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO }
#define TRANSPOSE_KERNEL_PREAMBLE() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE }
@@ -68,11 +59,11 @@ hsg_op_type_string[] =
#define BC_KERNEL_PROTO(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PROTO, { i } }
#define BC_KERNEL_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PREAMBLE, { i } }
-#define FM_KERNEL_PROTO(l,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO, { l, s } }
-#define FM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE, { w, s } }
+#define FM_KERNEL_PROTO(s,r) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO, { s, r } }
+#define FM_KERNEL_PREAMBLE(h) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE, { h } }
-#define HM_KERNEL_PROTO(d,w) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO, { d, w } }
-#define HM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE, { w, s } }
+#define HM_KERNEL_PROTO(s) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO, { s } }
+#define HM_KERNEL_PREAMBLE(h) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE, { h } }
#define BX_REG_GLOBAL_LOAD(n,v) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_LOAD, { n, v } }
#define BX_REG_GLOBAL_STORE(n) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_STORE, { n } }
@@ -81,12 +72,13 @@ hsg_op_type_string[] =
#define FM_REG_GLOBAL_STORE_LEFT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT, { n, i } }
#define FM_REG_GLOBAL_LOAD_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT, { n, i } }
#define FM_REG_GLOBAL_STORE_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT, { n, i } }
+#define FM_MERGE_RIGHT_PRED(n,s) (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED, { n, s } }
#define HM_REG_GLOBAL_LOAD(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_LOAD, { n, i } }
#define HM_REG_GLOBAL_STORE(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_STORE, { n, i } }
-#define WARP_FLIP(f) (struct hsg_op){ HSG_OP_TYPE_WARP_FLIP, { f } }
-#define WARP_HALF(h) (struct hsg_op){ HSG_OP_TYPE_WARP_HALF, { h } }
+#define SLAB_FLIP(f) (struct hsg_op){ HSG_OP_TYPE_SLAB_FLIP, { f } }
+#define SLAB_HALF(h) (struct hsg_op){ HSG_OP_TYPE_SLAB_HALF, { h } }
#define CMP_FLIP(a,b,c) (struct hsg_op){ HSG_OP_TYPE_CMP_FLIP, { a, b, c } }
#define CMP_HALF(a,b) (struct hsg_op){ HSG_OP_TYPE_CMP_HALF, { a, b } }
@@ -121,13 +113,12 @@ hsg_op_type_string[] =
#define BS_ACTIVE_PRED(m,l) (struct hsg_op){ HSG_OP_TYPE_BS_ACTIVE_PRED, { m, l } }
-#define FM_MERGE_RIGHT_PRED(n,s) (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED, { n, s } }
-
//
// DEFAULTS
//
-struct hsg_config hsg_config = // FIXME -- how useful is this?
+static
+struct hsg_config hsg_config =
{
.merge = {
.flip = {
@@ -138,8 +129,6 @@ struct hsg_config hsg_config = // FIXME -- how useful is this?
.lo = 1,
.hi = 1
},
-
- .max_log2 = 27 // 2^27th = 128m
},
.block = {
@@ -156,6 +145,7 @@ struct hsg_config hsg_config = // FIXME -- how useful is this?
.warp = {
.lanes = 32,
+ .lanes_log2 = 5,
},
.thread = {
@@ -172,45 +162,11 @@ struct hsg_config hsg_config = // FIXME -- how useful is this?
// ZERO HSG_MERGE STRUCT
//
+static
struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2] = { 0 };
//
-//
-//
-
-static const hsg_target_pfn hsg_target_pfns[] =
- {
- hsg_target_debug,
- hsg_target_cuda_sm3x,
- hsg_target_igp_genx,
- // hsg_target_adreno_5xx,
- // hsg_target_amd_gcn,
- // hsg_target_x86_sse,
- // hsg_target_x86_avx2,
- };
-
-static const char * hsg_target_pfn_string[] =
- {
- "hs_debug",
- "hs_cuda",
- "hs_cl"
- };
-
-static const char * hsg_file_type_string[][2] =
- {
- { ".h", ".txt" },
- { ".h", ".cu" },
- { ".h", ".cl" }
- };
-
-//
-//
-//
-
-#define HSG_TARGET_PFN_COUNT ARRAY_LENGTH(hsg_target_pfns)
-
-//
-//
+// STATS ON INSTRUCTIONS
//
static hsg_op_type hsg_op_type_counts[HSG_OP_TYPE_COUNT] = { 0 };
@@ -223,8 +179,18 @@ static
void
hsg_op_debug()
{
+ uint32_t total = 0;
+
for (hsg_op_type t=HSG_OP_TYPE_EXIT; t<HSG_OP_TYPE_COUNT; t++)
- fprintf(stderr,"%-37s : %u\n",hsg_op_type_string[t],hsg_op_type_counts[t]);
+ {
+ uint32_t const count = hsg_op_type_counts[t];
+
+ total += count;
+
+ fprintf(stderr,"%-37s : %u\n",hsg_op_type_string[t],count);
+ }
+
+ fprintf(stderr,"%-37s : %u\n\n\n","TOTAL",total);
}
//
@@ -268,7 +234,7 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge)
//
// The provided smem_bs size will be allocated for each sorting block.
//
- uint32_t const bs_threads = merge->warps * hsg_config.warp.lanes;
+ uint32_t const bs_threads = merge->warps << hsg_config.warp.lanes_log2;
uint32_t const bs_keys = hsg_config.block.smem_bs / (hsg_config.type.words * sizeof(uint32_t));
uint32_t const bs_kpt = bs_keys / bs_threads;
uint32_t const bs_kpt_mod = (bs_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod;
@@ -282,7 +248,7 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge)
}
// clamp to number of registers
- merge->rows_bs = min(bs_rows_even, hsg_config.thread.regs);
+ merge->rows_bs = MIN_MACRO(bs_rows_even, hsg_config.thread.regs);
}
//
@@ -297,19 +263,19 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge)
//
// if merge->warps is not pow2 then we're going to skip creating a bc elsewhere
//
- uint32_t const bc_warps_min = max(merge->warps,hsg_config.block.warps_min);
- uint32_t const bc_threads = bc_warps_min * hsg_config.warp.lanes;
+ uint32_t const bc_warps_min = MAX_MACRO(merge->warps,hsg_config.block.warps_min);
+ uint32_t const bc_threads = bc_warps_min << hsg_config.warp.lanes_log2;
uint32_t const bc_block_rd = (((hsg_config.block.smem_bc * bc_warps_min) / hsg_config.block.warps_max) /
hsg_config.block.smem_quantum) * hsg_config.block.smem_quantum;
- uint32_t const bc_block_max = max(bc_block_rd,hsg_config.block.smem_min);
- uint32_t const bc_block_smem = min(bc_block_max,hsg_config.block.smem_bs);
+ uint32_t const bc_block_max = MAX_MACRO(bc_block_rd,hsg_config.block.smem_min);
+ uint32_t const bc_block_smem = MIN_MACRO(bc_block_max,hsg_config.block.smem_bs);
// what is the max amount of shared in each possible bc block config?
uint32_t const bc_keys = bc_block_smem / (hsg_config.type.words * sizeof(uint32_t));
uint32_t const bc_kpt = bc_keys / bc_threads;
uint32_t const bc_kpt_mod = (bc_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod;
- merge->rows_bc = min(bc_kpt_mod, hsg_config.thread.regs);
+ merge->rows_bc = MIN_MACRO(bc_kpt_mod, hsg_config.thread.regs);
merge->skpw_bc = bc_keys / bc_warps_min;
}
}
@@ -441,7 +407,7 @@ hsg_merge_levels_hint(struct hsg_merge * const merge, bool const autotune)
for (uint32_t level=0; level<MERGE_LEVELS_MAX_LOG2; level++)
{
// max network
- uint32_t const n_max = max(merge->levels[level].networks[0],
+ uint32_t const n_max = MAX_MACRO(merge->levels[level].networks[0],
merge->levels[level].networks[1]);
if (n_max <= (merge->rows_bs + hsg_config.thread.xtra))
@@ -533,7 +499,7 @@ hsg_network_copy(struct hsg_op * ops,
for (uint32_t ii=0; ii<len; ii++)
{
- const struct hsg_op * const cx = cxa + ii;
+ struct hsg_op const * const cx = cxa + ii;
ops = hsg_op(ops,CMP_XCHG(cx->a,cx->b,prefix));
}
@@ -638,7 +604,7 @@ hsg_warp_half_downto(struct hsg_op * ops, uint32_t h)
{
ops = hsg_begin(ops);
- ops = hsg_op(ops,WARP_HALF(h));
+ ops = hsg_op(ops,SLAB_HALF(h));
ops = hsg_warp_half_network(ops);
ops = hsg_end(ops);
@@ -665,7 +631,7 @@ hsg_warp_flip(struct hsg_op * ops, uint32_t f)
{
ops = hsg_begin(ops);
- ops = hsg_op(ops,WARP_FLIP(f));
+ ops = hsg_op(ops,SLAB_FLIP(f));
ops = hsg_warp_flip_network(ops);
ops = hsg_end(ops);
@@ -782,7 +748,7 @@ hsg_bc_half_merge_level(struct hsg_op * ops,
uint32_t const net_even = merge->levels[0].networks[0];
// min of warps in block and remaining horizontal rows
- uint32_t const active = min(s_count, net_even);
+ uint32_t const active = MIN_MACRO(s_count, net_even);
// conditional on blockIdx.x
if (active < merge->warps)
@@ -834,7 +800,7 @@ hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
//
// will only be called with merge->warps >= 2
//
- uint32_t const warps = max(merge->warps,hsg_config.block.warps_min);
+ uint32_t const warps = MAX_MACRO(merge->warps,hsg_config.block.warps_min);
// guaranteed to be an even network
uint32_t const net_even = merge->levels[0].networks[0];
@@ -851,7 +817,7 @@ hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
{
// compute store count
uint32_t const r_rem = hsg_config.thread.regs + 1 - r_lo;
- uint32_t const s_count = min(s_max,r_rem);
+ uint32_t const s_count = MIN_MACRO(s_max,r_rem);
// block sync -- can skip if first
if (r_lo > 1)
@@ -1010,7 +976,7 @@ hsg_bs_flip_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
uint32_t r_hi = hsg_config.thread.regs + 1 - r_lo;
// compute store count
- uint32_t const s_pairs = min(s_pairs_max,r_mid - r_lo);
+ uint32_t const s_pairs = MIN_MACRO(s_pairs_max,r_mid - r_lo);
// store rows to shared
for (uint32_t c=0; c<s_pairs; c++)
@@ -1082,7 +1048,7 @@ hsg_bs_flip_merge_all(struct hsg_op * ops, const struct hsg_merge * const merge)
static
struct hsg_op *
-hsg_bs_sort(struct hsg_op * ops, const struct hsg_merge * const merge)
+hsg_bs_sort(struct hsg_op * ops, struct hsg_merge const * const merge)
{
// func proto
ops = hsg_op(ops,BS_KERNEL_PROTO(merge->index));
@@ -1125,7 +1091,7 @@ hsg_bs_sort_all(struct hsg_op * ops)
{
for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++)
{
- const struct hsg_merge* const m = hsg_merge + merge_idx;
+ struct hsg_merge const * const m = hsg_merge + merge_idx;
if (m->warps == 0)
break;
@@ -1142,7 +1108,7 @@ hsg_bs_sort_all(struct hsg_op * ops)
static
struct hsg_op *
-hsg_bc_clean(struct hsg_op * ops, const struct hsg_merge * const merge)
+hsg_bc_clean(struct hsg_op * ops, struct hsg_merge const * const merge)
{
// func proto
ops = hsg_op(ops,BC_KERNEL_PROTO(merge->index));
@@ -1189,7 +1155,7 @@ hsg_bc_clean_all(struct hsg_op * ops)
{
for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++)
{
- const struct hsg_merge* const m = hsg_merge + merge_idx;
+ struct hsg_merge const * const m = hsg_merge + merge_idx;
if (m->warps == 0)
break;
@@ -1215,9 +1181,7 @@ static
struct hsg_op *
hsg_fm_thread_load_left(struct hsg_op * ops, uint32_t const n)
{
- uint32_t const mid = n/2;
-
- for (uint32_t r=1; r<=mid; r++)
+ for (uint32_t r=1; r<=n; r++)
ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_LEFT(r,r-1));
return ops;
@@ -1227,9 +1191,7 @@ static
struct hsg_op *
hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n)
{
- uint32_t const mid = n/2;
-
- for (uint32_t r=mid; r>=1; r--)
+ for (uint32_t r=1; r<=n; r++)
ops = hsg_op(ops,FM_REG_GLOBAL_STORE_LEFT(r,r-1));
return ops;
@@ -1237,53 +1199,60 @@ hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n)
static
struct hsg_op *
-hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
+hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const half_span, uint32_t const half_case)
{
- uint32_t const mid = n / 2;
- uint32_t const first = mid + 1;
- uint32_t const last = mid + span_right;
-
- for (uint32_t r=first; r<=last; r++)
- ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,r-first));
+ for (uint32_t r=0; r<half_case; r++)
+ ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,half_span+1+r));
return ops;
}
static
struct hsg_op *
-hsg_fm_thread_store_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
+hsg_fm_thread_store_right(struct hsg_op * ops, uint32_t const half_span, uint32_t const half_case)
{
- uint32_t const mid = n / 2;
- uint32_t const first = mid + 1;
- uint32_t const last = mid + span_right;
-
- for (uint32_t r=last; r>=first; r--)
- ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,r-first));
+ for (uint32_t r=0; r<half_case; r++)
+ ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,half_span+1+r));
return ops;
}
static
struct hsg_op *
-hsg_fm_thread_merge_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
+hsg_fm_merge(struct hsg_op * ops,
+ uint32_t const scale_log2,
+ uint32_t const span_left,
+ uint32_t const span_right)
{
- // conditional
- ops = hsg_op(ops,FM_MERGE_RIGHT_PRED(n/2,span_right));
+ // func proto
+ ops = hsg_op(ops,FM_KERNEL_PROTO(scale_log2,msb_idx_u32(pow2_ru_u32(span_right))));
// begin
ops = hsg_begin(ops);
- // load
- ops = hsg_fm_thread_load_right(ops,n,span_right);
+ // preamble for loading/storing
+ ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span_left));
+
+ // load left span
+ ops = hsg_fm_thread_load_left(ops,span_left);
+
+ // load right span
+ ops = hsg_fm_thread_load_right(ops,span_left,span_right);
// compare left and right
- ops = hsg_thread_merge_left_right(ops,n/2,span_right);
+ ops = hsg_thread_merge_left_right(ops,span_left,span_right);
+
+ // left merging network
+ ops = hsg_thread_merge(ops,span_left);
// right merging network
- ops = hsg_thread_merge_offset(ops,n/2,span_right);
+ ops = hsg_thread_merge_offset(ops,span_left,span_right);
+
+ // store
+ ops = hsg_fm_thread_store_left(ops,span_left);
// store
- ops = hsg_fm_thread_store_right(ops,n,span_right);
+ ops = hsg_fm_thread_store_right(ops,span_left,span_right);
// end
ops = hsg_end(ops);
@@ -1293,45 +1262,12 @@ hsg_fm_thread_merge_right(struct hsg_op * ops, uint32_t const n, uint32_t const
static
struct hsg_op *
-hsg_fm_thread_merge_right_all(struct hsg_op * ops, uint32_t const span)
+hsg_fm_merge_all(struct hsg_op * ops, uint32_t const scale_log2, uint32_t const warps)
{
- ops = hsg_fm_thread_merge_right(ops,span,span/2);
-
- for (uint32_t span_pow2 = pow2_ru_u32(span) / 4; span_pow2 >= 1; span_pow2 /= 2)
- {
- ops = hsg_fm_thread_merge_right(ops,span,span_pow2);
- }
-
- return ops;
-}
-
-static
-struct hsg_op *
-hsg_fm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const fm_scale)
-{
- // func proto
- ops = hsg_op(ops,FM_KERNEL_PROTO(level,fm_scale));
-
- // begin
- ops = hsg_begin(ops);
-
- // shared declare
- ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span,fm_scale));
-
- // load
- ops = hsg_fm_thread_load_left(ops,span);
-
- // right merging network
- ops = hsg_fm_thread_merge_right_all(ops,span);
-
- // left merging network
- ops = hsg_thread_merge(ops,span/2);
-
- // store
- ops = hsg_fm_thread_store_left(ops,span);
+ uint32_t const span_left = (warps << scale_log2) / 2;
- // end
- ops = hsg_end(ops);
+ for (uint32_t span_right=span_left; span_right >= 1; span_right=pow2_ru_u32(span_right)/2)
+ ops = hsg_fm_merge(ops,scale_log2,span_left,span_right);
return ops;
}
@@ -1354,7 +1290,7 @@ static
struct hsg_op *
hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n)
{
- for (uint32_t r=n; r>=1; r--)
+ for (uint32_t r=1; r<=n; r++)
ops = hsg_op(ops,HM_REG_GLOBAL_STORE(r,r-1));
return ops;
@@ -1362,16 +1298,18 @@ hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n)
static
struct hsg_op *
-hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const hm_scale)
+hsg_hm_merge(struct hsg_op * ops, uint32_t const scale_log2, uint32_t const warps_pow2)
{
+ uint32_t const span = warps_pow2 << scale_log2;
+
// func proto
- ops = hsg_op(ops,HM_KERNEL_PROTO(level,level-msb_idx_u32(span)));
+ ops = hsg_op(ops,HM_KERNEL_PROTO(scale_log2));
// begin
ops = hsg_begin(ops);
- // declarations
- ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span,hm_scale));
+ // preamble for loading/storing
+ ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span/2));
// load
ops = hsg_hm_thread_load(ops,span);
@@ -1389,55 +1327,6 @@ hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uin
}
//
-//
-//
-
-static
-struct hsg_op *
-hsg_fm_merge_level(struct hsg_op * ops, uint32_t const level)
-{
- uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps);
- uint32_t const bc_max_log2 = msb_idx_u32(bc_max);
-
- uint32_t const fm_level = (level <= bc_max_log2) ? hsg_config.merge.flip.lo : min(level - bc_max_log2,hsg_config.merge.flip.hi);
- uint32_t const fm_scale = level - fm_level;
-
- ops = hsg_fm_merge(ops,
- level,
- hsg_merge[0].warps * (1u << fm_level),
- fm_scale);
-
- return ops;
-}
-
-//
-//
-//
-
-static
-struct hsg_op *
-hsg_hm_merge_level(struct hsg_op * ops, uint32_t const level)
-{
- uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps);
- uint32_t const bc_max_log2 = msb_idx_u32(bc_max);
-
- uint32_t const fm_log2_max = bc_max_log2 + hsg_config.merge.flip.hi;
-
- if (level > fm_log2_max)
- {
- uint32_t const down_warps_log2 = level - fm_log2_max;
- uint32_t const hm_level = max(hsg_config.merge.half.lo,min(hsg_config.merge.half.hi,down_warps_log2));
-
- ops = hsg_hm_merge(ops,
- level - hsg_config.merge.flip.hi,
- bc_max * (1u << hm_level),
- down_warps_log2 - hm_level);
- }
-
- return ops;
-}
-
-//
// GENERATE MERGE KERNELS
//
@@ -1445,23 +1334,20 @@ static
struct hsg_op *
hsg_xm_merge_all(struct hsg_op * ops)
{
- uint32_t const keys_per_block = hsg_merge[0].warps * hsg_config.warp.lanes * hsg_config.thread.regs;
- uint32_t const blocks = ((1U << hsg_config.merge.max_log2) + keys_per_block - 1) / keys_per_block;
- uint32_t const blocks_ru = pow2_ru_u32(blocks);
- uint32_t const blocks_log2 = msb_idx_u32(blocks_ru);
+ uint32_t const warps = hsg_merge[0].warps;
+ uint32_t const warps_pow2 = pow2_rd_u32(warps);
- for (uint32_t level=1; level<=blocks_log2; level+=1)
- {
- //
- // GENERATE FLIP MERGE KERNELS
- //
- ops = hsg_fm_merge_level(ops,level);
+ //
+ // GENERATE FLIP MERGE KERNELS
+ //
+ for (uint32_t scale_log2=hsg_config.merge.flip.lo; scale_log2<=hsg_config.merge.flip.hi; scale_log2++)
+ ops = hsg_fm_merge_all(ops,scale_log2,warps);
- //
- // GENERATE HALF MERGE KERNELS
- //
- ops = hsg_hm_merge_level(ops,level);
- }
+ //
+ // GENERATE HALF MERGE KERNELS
+ //
+ for (uint32_t scale_log2=hsg_config.merge.half.lo; scale_log2<=hsg_config.merge.half.hi; scale_log2++)
+ ops = hsg_hm_merge(ops,scale_log2,warps_pow2);
return ops;
}
@@ -1470,93 +1356,30 @@ hsg_xm_merge_all(struct hsg_op * ops)
//
//
-void
-hsg_target_indent(struct hsg_file * const files, uint32_t const depth)
-{
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%*s",
- depth*HSG_INDENT,"");
-}
-
-void
-hsg_target_debug(struct hsg_file * const files,
- const struct hsg_merge * const merge,
- const struct hsg_op * const ops,
- uint32_t const depth)
-{
-
- hsg_target_indent(files,depth);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s\n",
- hsg_op_type_string[ops->type]);
-}
-
-//
-//
-//
-
-static
-struct hsg_file*
-hsg_files_open(const char * prefix, const char ** suffix)
-{
-#define STR_BUF_SIZE 80
-
- struct hsg_file * files = malloc(sizeof(struct hsg_file) * HSG_FILE_TYPE_COUNT);
-
- for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++)
- {
- char * name = files[ii].name;
-
- // save prefix
- files[ii].prefix = prefix;
-
- // build filename
- strcpy_s(name,STR_BUF_SIZE,prefix);
- strcat_s(name,STR_BUF_SIZE,suffix[ii]);
-
- // open file
- fopen_s(&files[ii].file,name,"w+");
- }
-
- return files;
-}
-
-static
-void
-hsg_files_close(struct hsg_file * files)
-{
- for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++)
- fclose(files[ii].file);
-}
-
-//
-//
-//
-
static
-const struct hsg_op *
-hsg_op_translate_depth(hsg_target_pfn target_pfn,
- struct hsg_file * const files,
- const struct hsg_merge * const merge,
- const struct hsg_op * ops,
- uint32_t const depth)
+struct hsg_op const *
+hsg_op_translate_depth(hsg_target_pfn target_pfn,
+ struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * ops,
+ uint32_t const depth)
{
while (ops->type != HSG_OP_TYPE_EXIT)
{
switch (ops->type)
{
case HSG_OP_TYPE_END:
- target_pfn(files,merge,ops,depth-1);
+ target_pfn(target,config,merge,ops,depth-1);
return ops + 1;
case HSG_OP_TYPE_BEGIN:
- target_pfn(files,merge,ops,depth);
- ops = hsg_op_translate_depth(target_pfn,files,merge,ops+1,depth+1);
+ target_pfn(target,config,merge,ops,depth);
+ ops = hsg_op_translate_depth(target_pfn,target,config,merge,ops+1,depth+1);
break;
default:
- target_pfn(files,merge,ops++,depth);
+ target_pfn(target,config,merge,ops++,depth);
}
}
@@ -1565,12 +1388,13 @@ hsg_op_translate_depth(hsg_target_pfn target_pfn,
static
void
-hsg_op_translate(hsg_target_pfn target_pfn,
- struct hsg_file * const files,
- const struct hsg_merge * const merge,
- const struct hsg_op * ops)
+hsg_op_translate(hsg_target_pfn target_pfn,
+ struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * ops)
{
- hsg_op_translate_depth(target_pfn,files,merge,ops,0);
+ hsg_op_translate_depth(target_pfn,target,config,merge,ops,0);
}
//
@@ -1581,37 +1405,27 @@ int
main(int argc, char * argv[])
{
//
- // INIT
- //
- for (uint32_t ii=0; ii<=MERGE_LEVELS_MAX_LOG2; ii++)
- {
- hsg_merge[ii].index = ii;
- hsg_merge[ii].warps = 32 / (1u << ii);
- }
-
- //
// PROCESS OPTIONS
//
- int32_t arch = 0;
- int32_t opt = 0;
-
- bool quiet = false;
- bool autotune = false;
+ int32_t opt = 0;
+ bool verbose = false;
+ bool autotune = false;
+ char const * arch = "undefined";
- while ((opt = getopt(argc,argv,"hqa:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF)
+ while ((opt = getopt(argc,argv,"hva:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF)
{
switch (opt)
{
case 'h':
fprintf(stderr,"Help goes here...\n");
- return -1;
+ return EXIT_FAILURE;
- case 'q':
- quiet = true;
+ case 'v':
+ verbose = true;
break;
case 'a':
- arch = atoi(optarg);
+ arch = optarg;
break;
case 'g':
@@ -1635,30 +1449,28 @@ main(int argc, char * argv[])
break;
case 'w':
- hsg_config.warp.lanes = atoi(optarg);
+ hsg_config.warp.lanes = atoi(optarg);
+ hsg_config.warp.lanes_log2 = msb_idx_u32(hsg_config.warp.lanes);
break;
case 'b':
// maximum warps in a workgroup / cta / thread block
{
- uint32_t const warps = atoi(optarg);
- uint32_t const warps_ru_pow2 = pow2_ru_u32(warps);
-
- // set warps_max if not already set
- if (hsg_config.block.warps_max == UINT32_MAX)
- hsg_config.block.warps_max = warps_ru_pow2;
+ uint32_t const warps = atoi(optarg);
// must always be even
- if ((warps&1) != 0)
+ if ((warps & 1) != 0)
{
fprintf(stderr,"Error: -b must be even.\n");
- exit(-1);
+ return EXIT_FAILURE;
}
+ hsg_merge[0].index = 0;
hsg_merge[0].warps = warps;
- for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++)
- hsg_merge[ii].warps = warps_ru_pow2 / (1u << ii);
+ // set warps_max if not already set
+ if (hsg_config.block.warps_max == UINT32_MAX)
+ hsg_config.block.warps_max = pow2_ru_u32(warps);
}
break;
@@ -1677,18 +1489,14 @@ main(int argc, char * argv[])
hsg_config.block.warps_mod = atoi(optarg);
break;
- case 'k':
- hsg_config.merge.max_log2 = atoi(optarg);
- break;
-
case 'r':
{
uint32_t const regs = atoi(optarg);
- if ((regs&1) != 0)
+ if ((regs & 1) != 0)
{
fprintf(stderr,"Error: -r must be even.\n");
- exit(-1);
+ return EXIT_FAILURE;
}
hsg_config.thread.regs = regs;
@@ -1726,17 +1534,39 @@ main(int argc, char * argv[])
}
//
- // WHICH ARCH TARGET?
+ // INIT MERGE
//
- hsg_target_pfn hsg_target_pfn = (arch < HSG_TARGET_PFN_COUNT) ? hsg_target_pfns[arch] : hsg_target_debug;
+ uint32_t const warps_ru_pow2 = pow2_ru_u32(hsg_merge[0].warps);
+
+ for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++)
+ {
+ hsg_merge[ii].index = ii;
+ hsg_merge[ii].warps = warps_ru_pow2 >> ii;
+ }
//
- // OPEN FILES
+ // WHICH ARCH TARGET?
//
- struct hsg_file * files = hsg_files_open(hsg_target_pfn_string[arch],hsg_file_type_string[arch]);
+ hsg_target_pfn hsg_target_pfn;
+
+ if (strcmp(arch,"debug") == 0)
+ hsg_target_pfn = hsg_target_debug;
+ else if (strcmp(arch,"cuda") == 0)
+ hsg_target_pfn = hsg_target_cuda;
+ else if (strcmp(arch,"opencl") == 0)
+ hsg_target_pfn = hsg_target_opencl;
+ else if (strcmp(arch,"glsl") == 0)
+ hsg_target_pfn = hsg_target_glsl;
+ else {
+ fprintf(stderr,"Invalid arch: %s\n",arch);
+ exit(EXIT_FAILURE);
+ }
+
+ if (verbose)
+ fprintf(stderr,"Target: %s\n",arch);
//
- // INIT F_KEYS
+ // INIT SMEM KEY ALLOCATION
//
hsg_config_init_shared();
@@ -1766,27 +1596,26 @@ main(int argc, char * argv[])
//
// THESE ARE FOR DEBUG/INSPECTION
//
-
- if (!quiet)
+ if (verbose)
{
hsg_merge_levels_debug(merge);
}
}
- if (!quiet)
+ if (verbose)
fprintf(stderr,"\n\n");
//
+ // GENERATE THE OPCODES
//
- //
- uint32_t const op_count = 1024*1024; // 2^20 ops for now!
- struct hsg_op * const ops_begin = malloc(op_count * sizeof(*ops_begin));
+ uint32_t const op_count = 1<<17;
+ struct hsg_op * const ops_begin = malloc(sizeof(*ops_begin) * op_count);
struct hsg_op * ops = ops_begin;
//
- // APPEND HEADER
+ // OPEN INITIAL FILES AND APPEND HEADER
//
- ops = hsg_op(ops,FILE_HEADER());
+ ops = hsg_op(ops,TARGET_BEGIN());
//
// GENERATE TRANSPOSE KERNEL
@@ -1809,9 +1638,9 @@ main(int argc, char * argv[])
ops = hsg_xm_merge_all(ops);
//
- // APPEND FOOTER
+ // APPEND FOOTER AND CLOSE INITIAL FILES
//
- ops = hsg_op(ops,FILE_FOOTER());
+ ops = hsg_op(ops,TARGET_END());
//
// ... WE'RE DONE!
@@ -1821,20 +1650,17 @@ main(int argc, char * argv[])
//
// APPLY TARGET TRANSLATOR TO ACCUMULATED OPS
//
- hsg_op_translate(hsg_target_pfn,files,hsg_merge,ops_begin);
+ struct hsg_target target;
- //
- //
- //
- if (!quiet)
- hsg_op_debug();
+ hsg_op_translate(hsg_target_pfn,&target,&hsg_config,hsg_merge,ops_begin);
//
+ // DUMP INSTRUCTION COUNTS
//
- //
- hsg_files_close(files);
+ if (verbose)
+ hsg_op_debug();
- return 0;
+ return EXIT_SUCCESS;
}
//
diff --git a/src/compute/hs/gen/networks_merging.c b/src/compute/hs/gen/networks_merging.c
index 90dca03c21..f93958c842 100644
--- a/src/compute/hs/gen/networks_merging.c
+++ b/src/compute/hs/gen/networks_merging.c
@@ -11,7 +11,7 @@
//
#include "networks.h"
-#include "macros.h"
+#include "common/macros.h"
//
//
@@ -24,7 +24,7 @@
//
//
-#define LM(n) { ARRAY_LENGTH(mn##n), mn##n }
+#define LM(n) { ARRAY_LENGTH_MACRO(mn##n), mn##n }
//
//
diff --git a/src/compute/hs/gen/networks_sorting.c b/src/compute/hs/gen/networks_sorting.c
index c7beb6b45e..3d8d364399 100644
--- a/src/compute/hs/gen/networks_sorting.c
+++ b/src/compute/hs/gen/networks_sorting.c
@@ -14,7 +14,7 @@
//
#include "networks.h"
-#include "macros.h"
+#include "common/macros.h"
//
//
@@ -27,7 +27,7 @@
//
//
-#define LS(n) { ARRAY_LENGTH(sn##n), sn##n }
+#define LS(n) { ARRAY_LENGTH_MACRO(sn##n), sn##n }
//
//
diff --git a/src/compute/hs/gen/target_cuda.c b/src/compute/hs/gen/target_cuda.c
new file mode 100644
index 0000000000..e140c4be4c
--- /dev/null
+++ b/src/compute/hs/gen/target_cuda.c
@@ -0,0 +1,600 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+#include "transpose.h"
+
+#include "common/util.h"
+#include "common/macros.h"
+
+//
+//
+//
+
+struct hsg_transpose_state
+{
+ FILE * header;
+ struct hsg_config const * config;
+};
+
+static
+char
+hsg_transpose_reg_prefix(uint32_t const cols_log2)
+{
+ return 'a' + (('r' + cols_log2 - 'a') % 26);
+}
+
+static
+void
+hsg_transpose_blend(uint32_t const cols_log2,
+ uint32_t const row_ll, // lower-left
+ uint32_t const row_ur, // upper-right
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(cols_log2-1),
+ hsg_transpose_reg_prefix(cols_log2),
+ cols_log2,row_ll+1,row_ur+1);
+}
+
+static
+void
+hsg_transpose_remap(uint32_t const row_from,
+ uint32_t const row_to,
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(state->config->warp.lanes_log2),
+ row_from+1,row_to+1);
+}
+
+//
+//
+//
+
+static
+void
+hsg_copyright(FILE * file)
+{
+ fprintf(file,
+ "// \n"
+ "// Copyright 2016 Google Inc. \n"
+ "// \n"
+ "// Use of this source code is governed by a BSD-style \n"
+ "// license that can be found in the LICENSE file. \n"
+ "// \n"
+ "\n");
+}
+
+//
+//
+//
+
+struct hsg_target_state
+{
+ FILE * header;
+ FILE * source;
+};
+
+//
+//
+//
+
+void
+hsg_target_cuda(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth)
+{
+ switch (ops->type)
+ {
+ case HSG_OP_TYPE_END:
+ fprintf(target->state->source,
+ "}\n");
+ break;
+
+ case HSG_OP_TYPE_BEGIN:
+ fprintf(target->state->source,
+ "{\n");
+ break;
+
+ case HSG_OP_TYPE_ELSE:
+ fprintf(target->state->source,
+ "else\n");
+ break;
+
+ case HSG_OP_TYPE_TARGET_BEGIN:
+ {
+ // allocate state
+ target->state = malloc(sizeof(*target->state));
+
+ // allocate files
+ fopen_s(&target->state->header,"hs_cuda.h", "wb");
+ fopen_s(&target->state->source,"hs_cuda.cu","wb");
+
+ // initialize header
+ uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
+
+ hsg_copyright(target->state->header);
+
+ fprintf(target->state->header,
+ "#ifndef HS_CUDA_ONCE \n"
+ "#define HS_CUDA_ONCE \n"
+ " \n"
+ "#define HS_SLAB_THREADS_LOG2 %u \n"
+ "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n"
+ "#define HS_SLAB_WIDTH_LOG2 %u \n"
+ "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n"
+ "#define HS_SLAB_HEIGHT %u \n"
+ "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n"
+ "#define HS_REG_LAST(c) c##%u \n"
+ "#define HS_KEY_TYPE %s \n"
+ "#define HS_KEY_WORDS %u \n"
+ "#define HS_VAL_WORDS 0 \n"
+ "#define HS_BS_SLABS %u \n"
+ "#define HS_BS_SLABS_LOG2_RU %u \n"
+ "#define HS_BC_SLABS_LOG2_MAX %u \n"
+ "#define HS_FM_SCALE_MIN %u \n"
+ "#define HS_FM_SCALE_MAX %u \n"
+ "#define HS_HM_SCALE_MIN %u \n"
+ "#define HS_HM_SCALE_MAX %u \n"
+ "#define HS_EMPTY \n"
+ " \n",
+ config->warp.lanes_log2,
+ config->warp.lanes_log2,
+ config->thread.regs,
+ config->thread.regs,
+ (config->type.words == 2) ? "ulong" : "uint",
+ config->type.words,
+ merge->warps,
+ msb_idx_u32(pow2_ru_u32(merge->warps)),
+ bc_max,
+ config->merge.flip.lo,
+ config->merge.flip.hi,
+ config->merge.half.lo,
+ config->merge.half.hi);
+
+ fprintf(target->state->header,
+ "#define HS_SLAB_ROWS() \\\n");
+
+ for (uint32_t ii=1; ii<=config->thread.regs; ii++)
+ fprintf(target->state->header,
+ " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+ fprintf(target->state->header,
+ "#define HS_TRANSPOSE_SLAB() \\\n");
+
+ for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++)
+ fprintf(target->state->header,
+ " HS_TRANSPOSE_STAGE( %u ) \\\n",ii);
+
+ struct hsg_transpose_state state[1] =
+ {
+ { .header = target->state->header,
+ .config = config
+ }
+ };
+
+ hsg_transpose(config->warp.lanes_log2,
+ config->thread.regs,
+ hsg_transpose_blend,state,
+ hsg_transpose_remap,state);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+ hsg_copyright(target->state->source);
+
+ fprintf(target->state->source,
+ "#include \"hs_cuda_macros.h\" \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n");
+ }
+ break;
+
+ case HSG_OP_TYPE_TARGET_END:
+ // decorate the files
+ fprintf(target->state->header,
+ "#endif \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ " \n");
+ fprintf(target->state->source,
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ " \n");
+
+ // close files
+ fclose(target->state->header);
+ fclose(target->state->source);
+
+ // free state
+ free(target->state);
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
+ {
+ fprintf(target->state->source,
+ "\nHS_TRANSPOSE_KERNEL_PROTO(%u)\n",
+ config->warp.lanes);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
+ {
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
+ {
+ fprintf(target->state->source,
+ "HS_TRANSPOSE_SLAB()\n");
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const bs = pow2_ru_u32(m->warps);
+ uint32_t const msb = msb_idx_u32(bs);
+
+ fprintf(target->state->source,
+ "\nHS_BS_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bs);
+ }
+
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const msb = msb_idx_u32(m->warps);
+
+ fprintf(target->state->source,
+ "\nHS_BC_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bc);
+ }
+
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PROTO:
+ fprintf(target->state->source,
+ "\nHS_FM_KERNEL_PROTO(%u,%u)\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_FM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PROTO:
+ {
+ fprintf(target->state->source,
+ "\nHS_HM_KERNEL_PROTO(%u)\n",
+ ops->a);
+ }
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_HM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
+ {
+ static char const * const vstr[] = { "vin", "vout" };
+
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n",
+ ops->n,vstr[ops->v],config->warp.lanes,ops->n-1);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n",
+ config->warp.lanes,ops->n-1,ops->n);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
+ {
+ if (ops->a <= ops->b)
+ {
+ fprintf(target->state->source,
+ "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n");
+ }
+ else if (ops->b > 1)
+ {
+ fprintf(target->state->source,
+ "else if (fm_frac == %u)\n",
+ ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else\n");
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_SLAB_FLIP:
+ fprintf(target->state->source,
+ "HS_SLAB_FLIP_PREAMBLE(%u);\n",
+ ops->n-1);
+ break;
+
+ case HSG_OP_TYPE_SLAB_HALF:
+ fprintf(target->state->source,
+ "HS_SLAB_HALF_PREAMBLE(%u);\n",
+ ops->n / 2);
+ break;
+
+ case HSG_OP_TYPE_CMP_FLIP:
+ fprintf(target->state->source,
+ "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c);
+ break;
+
+ case HSG_OP_TYPE_CMP_HALF:
+ fprintf(target->state->source,
+ "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_CMP_XCHG:
+ if (ops->c == UINT32_MAX)
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%-3u,r%-3u);\n",
+ ops->a,ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%u_%u,r%u_%u);\n",
+ ops->c,ops->a,ops->c,ops->b);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
+ fprintf(target->state->source,
+ "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n",
+ merge[ops->a].warps,config->warp.lanes,ops->c,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,merge[ops->a].warps,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,ops->a,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n",
+ ops->c,
+ ops->a,
+ config->warp.lanes,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BLOCK_SYNC:
+ fprintf(target->state->source,
+ "HS_BLOCK_BARRIER();\n");
+ //
+ // FIXME - Named barriers to allow coordinating warps to proceed?
+ //
+ break;
+
+ case HSG_OP_TYPE_BS_FRAC_PRED:
+ {
+ if (ops->m == 0)
+ {
+ fprintf(target->state->source,
+ "if (warp_idx < bs_full)\n");
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else if (bs_frac == %u)\n",
+ ops->w);
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n",
+ config->warp.lanes,config->thread.regs,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_MERGE_H_PRED:
+ fprintf(target->state->source,
+ "if (get_sub_group_id() < %u)\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_ACTIVE_PRED:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps <= 32)
+ {
+ fprintf(target->state->source,
+ "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
+ m->levels[ops->b].active.b32a2[1],
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ }
+ break;
+
+ default:
+ fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]);
+ exit(EXIT_FAILURE);
+ break;
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/target_cuda_sm3x.c b/src/compute/hs/gen/target_cuda_sm3x.c
deleted file mode 100644
index 6369aa33b0..0000000000
--- a/src/compute/hs/gen/target_cuda_sm3x.c
+++ /dev/null
@@ -1,776 +0,0 @@
-/*
- * Copyright 2016 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-#include <stdio.h>
-
-//
-//
-//
-
-#include "gen.h"
-#include "util.h"
-
-//
-//
-//
-
-void
-hsg_target_cuda_sm3x(struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth)
-{
- const char* const type = (hsg_config.type.words == 2) ? "uint64_t" : "uint32_t";
- const char* const type_max = (hsg_config.type.words == 2) ? "UINT64_MAX" : "UINT32_MAX";
-
- switch (ops->type)
- {
- case HSG_OP_TYPE_END:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "}\n");
- break;
-
- case HSG_OP_TYPE_BEGIN:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "{\n");
- break;
-
- case HSG_OP_TYPE_ELSE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else\n");
- break;
-
- case HSG_OP_TYPE_FILE_HEADER:
- {
- uint32_t const bc_min = msb_idx_u32(hsg_config.block.warps_min);
- uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps));
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "// \n"
- "// Copyright 2016 Google Inc. \n"
- "// \n"
- "// Use of this source code is governed by a BSD-style \n"
- "// license that can be found in the LICENSE file. \n"
- "// \n"
- " \n"
- "#pragma once \n"
- " \n"
- "#include <stdint.h> \n"
- " \n"
- "#define HS_LANES_PER_WARP %u \n"
- "#define HS_BS_WARPS_PER_BLOCK %u \n"
- "#define HS_BC_WARPS_LOG2_MIN %u \n"
- "#define HS_BC_WARPS_LOG2_MAX %u \n"
- "#define HS_KEYS_PER_THREAD %u \n"
- "#define HS_KEY_WORDS %u \n"
- "#define HS_KEY_TYPE %s \n"
- " \n"
- "#include <%s_args.h> \n"
- " \n",
- hsg_config.warp.lanes,
- merge->warps,
- bc_min,
- bc_max,
- hsg_config.thread.regs,
- hsg_config.type.words,
- type,
- files[HSG_FILE_TYPE_SOURCE].prefix);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "// -*- compile-command: \"nvcc -arch sm_52 -Xptxas=-v,-abi=no -cubin -I. %s\"; -*-\n",
- files[HSG_FILE_TYPE_SOURCE].name);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "// \n"
- "// Copyright 2016 Google Inc. \n"
- "// \n"
- "// Use of this source code is governed by a BSD-style \n"
- "// license that can be found in the LICENSE file. \n"
- "// \n"
- " \n"
- "#ifdef __cplusplus \n"
- "extern \"C\" { \n"
- "#endif \n"
- " \n"
- "#include \"%s_launcher.h\" \n"
- " \n"
- "#ifdef __cplusplus \n"
- "} \n"
- "#endif \n"
- " \n"
- "#include \"%s_launch_bounds.h\" \n"
- "#include <%s_finalize.inl> \n"
- " \n"
- "// \n"
- "// \n"
- "// \n",
- files[HSG_FILE_TYPE_HEADER].prefix,
- files[HSG_FILE_TYPE_SOURCE].prefix,
- files[HSG_FILE_TYPE_SOURCE].prefix);
- }
- break;
-
- case HSG_OP_TYPE_FILE_FOOTER:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "// \n"
- "// \n"
- "// \n"
- " \n"
- "#include \"%s_launcher.inl\" \n"
- " \n"
- "// \n"
- "// \n"
- "// \n",
- files[HSG_FILE_TYPE_SOURCE].prefix);
- break;
-
- case HSG_OP_TYPE_BS_KERNEL_PROTO:
- {
- const uint32_t tpb = merge->warps * hsg_config.warp.lanes;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "extern \"C\" \n"
- "__global__ \n"
- "__launch_bounds__(%u,%u) \n"
- "void \n"
- "hs_bs_kernel(const struct hs_args args) \n",
- tpb,1);
- }
- break;
-
- case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "__shared__ union { \n");
-
- for (uint32_t ii=0; ii<MERGE_LEVELS_MAX_LOG2; ii++)
- {
- const struct hsg_merge* const m = merge + ii;
-
- if (m->warps < 2)
- break;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " %s m%u[%u][%u];\n",
- type,
- ii,
- m->rows_bs,
- m->warps * hsg_config.warp.lanes);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " struct { \n"
- " %s f[%u][%u]; \n"
- " %s l[%u]; \n"
- " }; \n",
- type,
- merge[0].warps,
- hsg_config.warp.skpw_bs - 1,
- type,
- merge[0].warps);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "} shared; \n"
- " \n");
-
- const uint32_t kpw = hsg_config.warp.lanes * hsg_config.thread.regs;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t block_warp_idx = threadIdx.x / %u; \n"
- "const int32_t warp_lane_idx = threadIdx.x & %u; \n"
- "const int32_t warp_idx = blockIdx.x * %u + block_warp_idx; \n"
- "const int32_t warp_gmem_idx = warp_idx * %u + warp_lane_idx; \n"
- " \n"
- "%s const * const vin_ptr = args.vin + warp_gmem_idx; \n"
- "%s * const vout_ptr = args.vout + warp_gmem_idx; \n"
- " \n",
-
- hsg_config.warp.lanes,
- hsg_config.warp.lanes - 1,
- merge[0].warps,
- kpw,
- type,
- type);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (warp_idx >= args.bs.full + args.bs.frac) \n"
- " return; \n"
- " \n");
- }
- break;
-
- case HSG_OP_TYPE_BC_KERNEL_PROTO:
- {
- uint32_t const bc_warps = merge[ops->a].warps;
- uint32_t const tpb = bc_warps * hsg_config.warp.lanes;
- uint32_t const bpm = hsg_config.block.warps_max / bc_warps;
- uint32_t const msb = msb_idx_u32(bc_warps);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "extern \"C\" \n"
- "__global__ \n"
- "__launch_bounds__(%u,%u) \n"
- "void \n"
- "hs_bc_%u_kernel(const struct hs_args args) \n",
- tpb,bpm,
- msb);
- }
- break;
-
- case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
- {
- const struct hsg_merge* const m = merge + ops->a;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "__shared__ union { \n");
-
- if (m->warps >= 2)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " %s m%u[%u][%u]; \n",
- type,
- ops->a,
- m->rows_bc,
- m->warps * hsg_config.warp.lanes);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " struct { \n"
- " %s f[%u][%u]; \n"
- " %s l[%u]; \n"
- " }; \n"
- "} shared; \n"
- " \n",
- type,m->warps,m->skpw_bc - 1,
- type,m->warps);
-
- const uint32_t kpw = hsg_config.warp.lanes * hsg_config.thread.regs;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t block_warp_idx = threadIdx.x / %u; \n"
- "const int32_t warp_lane_idx = threadIdx.x & %u; \n"
- "const int32_t warp_gmem_base = blockIdx.x * %u * %u + warp_lane_idx; \n"
- "const int32_t warp_gmem_idx = warp_gmem_base + block_warp_idx * %u; \n"
- " \n"
- "%s * const vout_ptr = args.vout + warp_gmem_idx; \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes - 1,
- m->warps,kpw,
- kpw,
- type);
-
-#if 0
- //
- // NO LONGER NEED THIS TEST
- //
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (warp_idx >= args.bc.full) \n"
- " return; \n"
- " \n");
-#endif
- }
- break;
-
- case HSG_OP_TYPE_FM_KERNEL_PROTO:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "#define HS_FM_WARPS_LOG2_%u %u \n"
- "extern \"C\" \n"
- "__global__ \n"
- "HS_FM_LAUNCH_BOUNDS_%u \n"
- "void \n"
- "hs_fm_%u_kernel(const struct hs_args args) \n",
- ops->a,
- ops->b,
- ops->a - ops->b,
- ops->a);
- break;
-
- case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t warp_idx = (blockDim.x * blockIdx.x + threadIdx.x) / %u; \n"
- "const int32_t warp_lane_idx = threadIdx.x & %u; \n"
- " \n"
- "const int32_t merge_idx = warp_idx / %u >> %u; \n"
- " \n"
- "const int32_t merge_stride = %u * %u << %u; \n"
- "const int32_t merge_keys = merge_stride * %u; \n"
- " \n"
- "const int32_t merge_base = merge_idx * merge_keys; \n"
- " \n"
- "const int32_t merge_l_off = (warp_idx - merge_idx * (%u << %u)) * %u + warp_lane_idx; \n"
- "const int32_t merge_l_end = merge_l_off + merge_stride * (%u / 2 - 1); \n"
- "%s * const merge_l = args.vout + merge_base + merge_l_off; \n"
- " \n"
- "const int32_t merge_r_off = merge_keys - merge_l_end - 1; \n"
- "%s * const merge_r = args.vout + merge_base + merge_r_off; \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes-1,
- hsg_config.thread.regs,ops->b,
- hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
- ops->a,
- hsg_config.thread.regs,ops->b,hsg_config.warp.lanes,
- ops->a,
- type,
- type);
- break;
-
- case HSG_OP_TYPE_HM_KERNEL_PROTO:
- {
- const uint32_t bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps));
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "#define HS_HM_WARPS_LOG2_%u %u \n"
- "extern \"C\" \n"
- "__global__ \n"
- "HS_HM_LAUNCH_BOUNDS_%u \n"
- "void \n"
- "hs_hm_%u_kernel(const struct hs_args args) \n",
- ops->a,
- ops->b,
- ops->a - ops->b - bc_max,
- ops->a);
- }
- break;
-
- case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t warp_idx = (blockDim.x * blockIdx.x + threadIdx.x) / %u; \n"
- "const int32_t warp_lane_idx = threadIdx.x & %u; \n"
- " \n"
- "const int32_t merge_idx = (warp_idx / %u) >> %u; \n"
- " \n"
- "const int32_t merge_stride = %u * %u << %u; \n"
- "const int32_t merge_keys = merge_stride * %u; \n"
- " \n"
- "const int32_t merge_base = merge_idx * merge_keys; \n"
- " \n"
- "const int32_t merge_off = (warp_idx - merge_idx * (%u << %u)) * %u; \n"
- "%s * const merge_ptr = args.vout + merge_base + merge_off + warp_lane_idx; \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes-1,
- hsg_config.thread.regs,ops->b,
- hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
- ops->a,
- hsg_config.thread.regs,ops->b,hsg_config.warp.lanes,
- type);
- break;
-
- case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
- {
- static const char* const vstr[] = { "vin_ptr", "vout_ptr" };
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%-3u = %s[%-3u * %u]; \n",
- type,ops->n,vstr[ops->v],ops->n-1,hsg_config.warp.lanes);
- }
- break;
-
- case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "vout_ptr[%-3u * %u] = r%u; \n",
- ops->n-1,hsg_config.warp.lanes,ops->n);
- break;
-
-#if 0
- case HSG_OP_TYPE_BX_WARP_STORE_PRED:
- if (ops->a == 1)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (!args.is_final) \n");
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (blockIdx.x * %u + block_warp_idx >= args.bx.ru) \n"
- "{ \n"
- " return; \n"
- "} \n"
- "else if (!args.is_final) \n",
- ops->a);
- }
- break;
-#endif
-
- case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%-3u = merge_ptr[%-3u * merge_stride];\n",
- type,ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_ptr[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%-3u = merge_l[%-3u * merge_stride];\n",
- type,ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_l[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%-3u = merge_r[%-3u * merge_stride];\n",
- type,ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_r[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_WARP_FLIP:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t flip_lane_mask = %u; \n"
- "const int32_t flip_lane_idx = warp_lane_idx ^ flip_lane_mask; \n"
- "const bool t_lt = warp_lane_idx < flip_lane_idx; \n",
- ops->n-1);
- }
- break;
-
- case HSG_OP_TYPE_WARP_HALF:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t half_lane_mask = %u; \n"
- "const int32_t half_lane_idx = warp_lane_idx ^ half_lane_mask; \n"
- "const bool t_lt = warp_lane_idx < half_lane_idx; \n",
- ops->n / 2);
- }
- break;
-
- case HSG_OP_TYPE_CMP_FLIP:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_FLIP(r%-3u,r%-3u,r%-3u)\n",ops->a,ops->b,ops->c);
- break;
-
- case HSG_OP_TYPE_CMP_HALF:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_HALF(r%-3u,r%-3u)\n",ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_CMP_XCHG:
- if (ops->c == UINT32_MAX)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_XCHG(r%-3u,r%-3u)\n",
- ops->a,ops->b);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_XCHG(r%u_%u,r%u_%u)\n",
- ops->c,ops->a,ops->c,ops->b);
- }
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "smem_v[%-3u * %-2u * %-3u] = r%u;\n",
- ops->a,hsg_config.warp.lanes,ops->c,ops->b);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "r%-3u = smem_v[%-3u * %-2u * %-3u];\n",
- ops->b,ops->a,hsg_config.warp.lanes,ops->c);
- break;
-
- case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%-3u = smem_v[%-3u * %-2u * %-3u];\n",
- type,ops->b,ops->a,hsg_config.warp.lanes,ops->c);
- break;
-
- case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "smem_l[%5u] = r%u_%u;\n",
- ops->b * hsg_config.warp.lanes,
- ops->c,
- ops->a);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "smem_r[%5u] = r%u_%u;\n",
- ops->b * hsg_config.warp.lanes,
- ops->c,
- ops->a);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%u_%-3u = smem_l[%u];\n",
- type,
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%u_%-3u = smem_r[%u];\n",
- type,
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%u_%-3u = gmem_l[%u];\n",
- type,
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
-#if 0
- case HSG_OP_TYPE_REG_F_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s* const f_%u_smem_st_ptr = &shared.f[block_warp_idx]",
- type,
- ops->a);
-
- if (ops->a >= (int32_t)hsg_config.warp.lanes)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "[warp_lane_idx * %u];\n",
- (ops->a / hsg_config.warp.lanes) * hsg_config.warp.lanes + 1);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "[(warp_lane_idx & 0x%X) * %u + (warp_lane_idx & ~0x%X)];\n",
- ops->a-1,
- hsg_config.warp.lanes + 1,
- ops->a-1);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const %s* const f_%u_smem_ld_ptr = &shared.f[block_warp_idx][warp_lane_idx];\n",
- type,
- ops->a);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s* const f_%u_gmem_st_ptr = args.vout + warp_gmem_idx",
- type,
- ops->a);
-
- if (ops->a >= (int32_t)hsg_config.warp.lanes)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,";\n");
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " - warp_lane_idx + (warp_lane_idx & ~0x%X) * %u + (warp_lane_idx & 0x%X);\n",
- ops->a-1,
- hsg_config.thread.regs,
- ops->a-1);
- }
- break;
-
- case HSG_OP_TYPE_REG_SHARED_STORE_F:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "f_%u_smem_st_ptr[%-3u] = r%u;\n",
- ops->c,
- ops->b,
- ops->a);
- break;
-
- case HSG_OP_TYPE_REG_SHARED_LOAD_F:
- if (ops->c >= (int32_t)hsg_config.warp.lanes)
- {
- uint32_t const adjacent = ops->c / hsg_config.warp.lanes;
- uint32_t const stride = adjacent * hsg_config.warp.lanes + 1;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "r%-3u = f_%u_smem_ld_ptr[%-3u];\n",
- ops->a,
- ops->c,
- (ops->b / adjacent) * stride + (ops->b % adjacent) * hsg_config.warp.lanes);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "r%-3u = f_%u_smem_ld_ptr[%-3u];\n",
- ops->a,
- ops->c,
- ops->b * (hsg_config.warp.lanes + 1));
- }
- break;
-
- case HSG_OP_TYPE_REG_GLOBAL_STORE_F:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "f_%u_gmem_st_ptr[%-3u * %u + %-3u] = r%u;\n",
- ops->c,
- ops->b,
- hsg_config.thread.regs, // hsg_config.warp.lanes,
- (ops->a - 1) & ~(ops->c - 1),
- ops->a);
- break;
-#endif
-
-#if 0
- case HSG_OP_TYPE_FINALIZE:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_FINALIZE(%s,args,shared.f[block_warp_idx],shared.l,\n"
- " block_warp_idx,warp_lane_idx,warp_gmem_idx,\n"
- " r%-3u",
- ops->a == 1 ? "true" : "false",
- 1);
-
-#define HS_WARP_FINALIZE_PRETTY_PRINT 8
-
- for (uint32_t r=2; r<=hsg_config.thread.regs; r++)
- {
- if (r % HS_WARP_FINALIZE_PRETTY_PRINT == 1)
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,",\n");
- else
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,",");
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,"r%-3u",r);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,");\n");
- }
- break;
-#endif
-
- case HSG_OP_TYPE_BLOCK_SYNC:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "__syncthreads();\n");
- break;
-
- case HSG_OP_TYPE_BS_FRAC_PRED:
- {
- if (ops->m == 0)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (warp_idx < args.bs.full)\n");
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else if (args.bs.frac == %u)\n",
- ops->w);
- }
- }
- break;
-
-#if 0 // DELETED
- case HSG_OP_TYPE_BX_MERGE_V_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s * const smem_v = shared.m%u[0] + threadIdx.x; \n",
- type,ops->a);
- break;
-#endif
-
- case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
- if (ops->c == 0)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s * smem_l = shared.m%u[block_warp_idx ] + warp_lane_idx; \n"
- "%s * smem_r = shared.m%u[block_warp_idx ^ 1] + (warp_lane_idx ^ %u); \n",
- type,ops->a,
- type,ops->a,hsg_config.warp.lanes-1);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "smem_l = shared.m%u[block_warp_idx ] + warp_lane_idx; \n"
- "smem_r = shared.m%u[block_warp_idx ^ 1] + (warp_lane_idx ^ %u); \n",
- ops->a,
- ops->a,hsg_config.warp.lanes-1);
- }
- break;
-
- case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s const * const gmem_l = args.vout + (warp_gmem_base + block_warp_idx * %u); \n"
- "%s * const smem_l = shared.m%u[block_warp_idx] + warp_lane_idx; \n"
- "%s * const smem_v = shared.m%u[0] + threadIdx.x; \n",
- type,hsg_config.warp.lanes,
- type,ops->a,
- type,ops->a);
- break;
-
- case HSG_OP_TYPE_BX_MERGE_H_PRED:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (threadIdx.x < %u)\n",
- ops->a * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BS_ACTIVE_PRED:
- {
- const struct hsg_merge* const m = merge + ops->a;
-
- if (m->warps <= 32)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (((1u << block_warp_idx) & 0x%08X) != 0)\n",
- m->levels[ops->b].active.b32a2[0]);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (((1UL << block_warp_idx) & 0x%08X%08XL) != 0L)\n",
- m->levels[ops->b].active.b32a2[1],
- m->levels[ops->b].active.b32a2[0]);
- }
- }
- break;
-
- case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
- {
- if (ops->a == ops->b)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (merge_idx < args.fm.full) \n");
- }
- else if (ops->b > 1)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else if (args.fm.frac == %u) \n",
- ops->b);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else\n");
- }
- }
- break;
-
- default:
- hsg_target_debug(files,merge,ops,depth);
- break;
- }
-}
-
-//
-//
-//
diff --git a/src/compute/hs/gen/target_debug.c b/src/compute/hs/gen/target_debug.c
new file mode 100644
index 0000000000..1481ca8041
--- /dev/null
+++ b/src/compute/hs/gen/target_debug.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+
+//
+//
+//
+
+#define HSG_INDENT 2
+
+//
+//
+//
+
+struct hsg_target_state
+{
+ FILE * txt;
+};
+
+//
+//
+//
+
+void
+hsg_target_indent(struct hsg_target * const target, uint32_t const depth)
+{
+ fprintf(target->state->txt,
+ "%*s",
+ depth*HSG_INDENT,"");
+}
+
+void
+hsg_target_debug(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth)
+{
+ if (ops->type == HSG_OP_TYPE_TARGET_BEGIN)
+ {
+ target->state = malloc(sizeof(*target->state));
+ fopen_s(&target->state->txt,"hs_debug.txt","wb");
+ }
+
+ hsg_target_indent(target,depth);
+
+ fprintf(target->state->txt,
+ "%s\n",
+ hsg_op_type_string[ops->type]);
+
+ if (ops->type == HSG_OP_TYPE_TARGET_END)
+ {
+ fclose(target->state->txt);
+ free(target->state);
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/target_glsl.c b/src/compute/hs/gen/target_glsl.c
new file mode 100644
index 0000000000..2bb75797ab
--- /dev/null
+++ b/src/compute/hs/gen/target_glsl.c
@@ -0,0 +1,674 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+#include "transpose.h"
+
+#include "common/util.h"
+#include "common/macros.h"
+
+//
+//
+//
+
+struct hsg_transpose_state
+{
+ FILE * header;
+ struct hsg_config const * config;
+};
+
+static
+char
+hsg_transpose_reg_prefix(uint32_t const cols_log2)
+{
+ return 'a' + (('r' + cols_log2 - 'a') % 26);
+}
+
+static
+void
+hsg_transpose_blend(uint32_t const cols_log2,
+ uint32_t const row_ll, // lower-left
+ uint32_t const row_ur, // upper-right
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(cols_log2-1),
+ hsg_transpose_reg_prefix(cols_log2),
+ cols_log2,row_ll+1,row_ur+1);
+}
+
+static
+void
+hsg_transpose_remap(uint32_t const row_from,
+ uint32_t const row_to,
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(state->config->warp.lanes_log2),
+ row_from+1,row_to+1);
+}
+
+//
+//
+//
+
+static
+void
+hsg_copyright(FILE * file)
+{
+ fprintf(file,
+ "// \n"
+ "// Copyright 2016 Google Inc. \n"
+ "// \n"
+ "// Use of this source code is governed by a BSD-style \n"
+ "// license that can be found in the LICENSE file. \n"
+ "// \n"
+ "\n");
+}
+
+static
+void
+hsg_macros(FILE * file)
+{
+ fprintf(file,
+ "#include \"hs_glsl_macros.h\" \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ "\n");
+}
+
+//
+//
+//
+
+struct hsg_target_state
+{
+ FILE * header;
+ FILE * embeds;
+ FILE * source;
+};
+
+//
+//
+//
+
+void
+hsg_target_glsl(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth)
+{
+ switch (ops->type)
+ {
+ case HSG_OP_TYPE_END:
+ fprintf(target->state->source,
+ "}\n");
+
+ if (depth == 0) {
+ fclose(target->state->source);
+ target->state->source = NULL;
+ }
+ break;
+
+ case HSG_OP_TYPE_BEGIN:
+ fprintf(target->state->source,
+ "{\n");
+ break;
+
+ case HSG_OP_TYPE_ELSE:
+ fprintf(target->state->source,
+ "else\n");
+ break;
+
+ case HSG_OP_TYPE_TARGET_BEGIN:
+ {
+ // allocate state
+ target->state = malloc(sizeof(*target->state));
+
+ // allocate files
+ fopen_s(&target->state->header,"hs_glsl.h", "wb");
+ fopen_s(&target->state->embeds,"hs_kernels.h","wb");
+
+ hsg_copyright(target->state->header);
+ hsg_copyright(target->state->embeds);
+
+ // initialize header
+ uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
+
+ fprintf(target->state->header,
+ "#ifndef HS_GLSL_ONCE \n"
+ "#define HS_GLSL_ONCE \n"
+ " \n"
+ "#define HS_SLAB_THREADS_LOG2 %u \n"
+ "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n"
+ "#define HS_SLAB_WIDTH_LOG2 %u \n"
+ "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n"
+ "#define HS_SLAB_HEIGHT %u \n"
+ "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n"
+ "#define HS_REG_LAST(c) c##%u \n"
+ "#define HS_KEY_TYPE %s \n"
+ "#define HS_KEY_WORDS %u \n"
+ "#define HS_VAL_WORDS 0 \n"
+ "#define HS_BS_SLABS %u \n"
+ "#define HS_BS_SLABS_LOG2_RU %u \n"
+ "#define HS_BC_SLABS_LOG2_MAX %u \n"
+ "#define HS_FM_SCALE_MIN %u \n"
+ "#define HS_FM_SCALE_MAX %u \n"
+ "#define HS_HM_SCALE_MIN %u \n"
+ "#define HS_HM_SCALE_MAX %u \n"
+ "#define HS_EMPTY \n"
+ " \n",
+ config->warp.lanes_log2,
+ config->warp.lanes_log2,
+ config->thread.regs,
+ config->thread.regs,
+ (config->type.words == 2) ? "uint64_t" : "uint32_t",
+ config->type.words,
+ merge->warps,
+ msb_idx_u32(pow2_ru_u32(merge->warps)),
+ bc_max,
+ config->merge.flip.lo,
+ config->merge.flip.hi,
+ config->merge.half.lo,
+ config->merge.half.hi);
+
+ fprintf(target->state->header,
+ "#define HS_SLAB_ROWS() \\\n");
+
+ for (uint32_t ii=1; ii<=config->thread.regs; ii++)
+ fprintf(target->state->header,
+ " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+ fprintf(target->state->header,
+ "#define HS_TRANSPOSE_SLAB() \\\n");
+
+ for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++)
+ fprintf(target->state->header,
+ " HS_TRANSPOSE_STAGE( %u ) \\\n",ii);
+
+ struct hsg_transpose_state state[1] =
+ {
+ { .header = target->state->header,
+ .config = config
+ }
+ };
+
+ hsg_transpose(config->warp.lanes_log2,
+ config->thread.regs,
+ hsg_transpose_blend,state,
+ hsg_transpose_remap,state);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+#if 0
+ fprintf(target->state->source,
+ "#include <hs_glsl_macros.h> \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n");
+#endif
+ }
+ break;
+
+ case HSG_OP_TYPE_TARGET_END:
+ // decorate the files
+ fprintf(target->state->header,
+ "#endif \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ " \n");
+
+ // close files
+ fclose(target->state->header);
+ fclose(target->state->embeds);
+
+ // free state
+ free(target->state);
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
+ {
+ fprintf(target->state->embeds,
+ "#include \"hs_transpose.len.xxd\"\n,\n"
+ "#include \"hs_transpose.spv.xxd\"\n,\n");
+
+ fopen_s(&target->state->source,"hs_transpose.comp","w+");
+
+ hsg_copyright(target->state->source);
+
+ hsg_macros(target->state->source);
+
+ fprintf(target->state->source,
+ "HS_TRANSPOSE_KERNEL_PROTO(%u)\n",
+ config->warp.lanes);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
+ {
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
+ {
+ fprintf(target->state->source,
+ "HS_TRANSPOSE_SLAB()\n");
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const bs = pow2_ru_u32(m->warps);
+ uint32_t const msb = msb_idx_u32(bs);
+
+ fprintf(target->state->embeds,
+ "#include \"hs_bs_%u.len.xxd\"\n,\n"
+ "#include \"hs_bs_%u.spv.xxd\"\n,\n",
+ msb,
+ msb);
+
+ char filename[] = { "hs_bs_123.comp" };
+ sprintf_s(filename,sizeof(filename),"hs_bs_%u.comp",msb);
+ fopen_s(&target->state->source,filename,"w+");
+
+ hsg_copyright(target->state->source);
+
+ hsg_macros(target->state->source);
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bs);
+ }
+
+ fprintf(target->state->source,
+ "HS_BS_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const msb = msb_idx_u32(m->warps);
+
+ fprintf(target->state->embeds,
+ "#include \"hs_bc_%u.len.xxd\"\n,\n"
+ "#include \"hs_bc_%u.spv.xxd\"\n,\n",
+ msb,
+ msb);
+
+ char filename[] = { "hs_bc_123.comp" };
+ sprintf_s(filename,sizeof(filename),"hs_bc_%u.comp",msb);
+ fopen_s(&target->state->source,filename,"w+");
+
+ hsg_copyright(target->state->source);
+
+ hsg_macros(target->state->source);
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bc);
+ }
+
+ fprintf(target->state->source,
+ "HS_BC_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PROTO:
+ {
+ fprintf(target->state->embeds,
+ "#include \"hs_fm_%u_%u.len.xxd\"\n,\n"
+ "#include \"hs_fm_%u_%u.spv.xxd\"\n,\n",
+ ops->a,ops->b,
+ ops->a,ops->b);
+
+ char filename[] = { "hs_fm_123_123.comp" };
+ sprintf_s(filename,sizeof(filename),"hs_fm_%u_%u.comp",ops->a,ops->b);
+ fopen_s(&target->state->source,filename,"w+");
+
+ hsg_copyright(target->state->source);
+
+ hsg_macros(target->state->source);
+
+ fprintf(target->state->source,
+ "HS_FM_KERNEL_PROTO(%u,%u)\n",
+ ops->a,ops->b);
+ }
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_FM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PROTO:
+ {
+ fprintf(target->state->embeds,
+ "#include \"hs_hm_%u_%u.len.xxd\"\n,\n"
+ "#include \"hs_hm_%u_%u.spv.xxd\"\n,\n",
+ ops->a,ops->b,
+ ops->a,ops->b);
+
+ char filename[] = { "hs_hm_123_123.comp" };
+ sprintf_s(filename,sizeof(filename),"hs_hm_%u_%u.comp",ops->a,ops->b);
+ fopen_s(&target->state->source,filename,"w+");
+
+ hsg_copyright(target->state->source);
+
+ hsg_macros(target->state->source);
+
+ fprintf(target->state->source,
+ "HS_HM_KERNEL_PROTO(%u)\n",
+ ops->a);
+ }
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_HM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
+ {
+ static char const * const vstr[] = { "vin", "vout" };
+
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n",
+ ops->n,vstr[ops->v],config->warp.lanes,ops->n-1);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n",
+ config->warp.lanes,ops->n-1,ops->n);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
+ {
+ if (ops->a <= ops->b)
+ {
+ fprintf(target->state->source,
+ "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n");
+ }
+ else if (ops->b > 1)
+ {
+ fprintf(target->state->source,
+ "else if (fm_frac == %u)\n",
+ ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else\n");
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_SLAB_FLIP:
+ fprintf(target->state->source,
+ "HS_SLAB_FLIP_PREAMBLE(%u);\n",
+ ops->n-1);
+ break;
+
+ case HSG_OP_TYPE_SLAB_HALF:
+ fprintf(target->state->source,
+ "HS_SLAB_HALF_PREAMBLE(%u);\n",
+ ops->n / 2);
+ break;
+
+ case HSG_OP_TYPE_CMP_FLIP:
+ fprintf(target->state->source,
+ "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c);
+ break;
+
+ case HSG_OP_TYPE_CMP_HALF:
+ fprintf(target->state->source,
+ "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_CMP_XCHG:
+ if (ops->c == UINT32_MAX)
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%-3u,r%-3u);\n",
+ ops->a,ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%u_%u,r%u_%u);\n",
+ ops->c,ops->a,ops->c,ops->b);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
+ fprintf(target->state->source,
+ "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n",
+ merge[ops->a].warps,config->warp.lanes,ops->c,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,merge[ops->a].warps,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,ops->a,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n",
+ ops->c,
+ ops->a,
+ config->warp.lanes,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BLOCK_SYNC:
+ fprintf(target->state->source,
+ "HS_BLOCK_BARRIER();\n");
+ //
+ // FIXME - Named barriers to allow coordinating warps to proceed?
+ //
+ break;
+
+ case HSG_OP_TYPE_BS_FRAC_PRED:
+ {
+ if (ops->m == 0)
+ {
+ fprintf(target->state->source,
+ "if (warp_idx < bs_full)\n");
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else if (bs_frac == %u)\n",
+ ops->w);
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n",
+ config->warp.lanes,config->thread.regs,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_MERGE_H_PRED:
+ fprintf(target->state->source,
+ "if (get_sub_group_id() < %u)\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_ACTIVE_PRED:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps <= 32)
+ {
+ fprintf(target->state->source,
+ "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
+ m->levels[ops->b].active.b32a2[1],
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ }
+ break;
+
+ default:
+ fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]);
+ exit(EXIT_FAILURE);
+ break;
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/target_igp_genx.c b/src/compute/hs/gen/target_igp_genx.c
deleted file mode 100644
index 3d0f2bc1b8..0000000000
--- a/src/compute/hs/gen/target_igp_genx.c
+++ /dev/null
@@ -1,672 +0,0 @@
-/*
- * Copyright 2016 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-#include <stdio.h>
-
-//
-//
-//
-
-#include "gen.h"
-#include "util.h"
-#include "macros.h"
-#include "transpose.h"
-
-//
-//
-//
-
-static
-char
-hsg_transpose_reg_prefix(uint32_t const cols_log2)
-{
- return 'a' + (('r' + cols_log2 - 'a') % 26);
-}
-
-static
-void
-hsg_transpose_blend(uint32_t const cols_log2,
- uint32_t const row_ll, // lower-left
- uint32_t const row_ur, // upper-right
- FILE * file)
-{
- // we're starting register names at '1' for now
- fprintf(file,
- " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
- hsg_transpose_reg_prefix(cols_log2-1),
- hsg_transpose_reg_prefix(cols_log2),
- cols_log2,row_ll+1,row_ur+1);
-}
-
-static
-void
-hsg_transpose_remap(uint32_t const row_from,
- uint32_t const row_to,
- FILE * file)
-{
- // we're starting register names at '1' for now
- fprintf(file,
- " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n",
- hsg_transpose_reg_prefix(msb_idx_u32(hsg_config.warp.lanes)),
- row_from+1,row_to+1);
-}
-
-//
-//
-//
-
-void
-hsg_target_igp_genx(struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth)
-{
- switch (ops->type)
- {
- case HSG_OP_TYPE_END:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "}\n");
- break;
-
- case HSG_OP_TYPE_BEGIN:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "{\n");
- break;
-
- case HSG_OP_TYPE_ELSE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else\n");
- break;
-
- case HSG_OP_TYPE_FILE_HEADER:
- {
- uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
- uint32_t const warp_lanes_log2 = msb_idx_u32(hsg_config.warp.lanes);
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "// \n"
- "// Copyright 2016 Google Inc. \n"
- "// \n"
- "// Use of this source code is governed by a BSD-style \n"
- "// license that can be found in the LICENSE file. \n"
- "// \n"
- " \n"
- "#ifndef HS_CL_ONCE \n"
- "#define HS_CL_ONCE \n"
- " \n"
- "#define HS_LANES_PER_WARP_LOG2 %u \n"
- "#define HS_LANES_PER_WARP (1 << HS_LANES_PER_WARP_LOG2) \n"
- "#define HS_BS_WARPS %u \n"
- "#define HS_BS_WARPS_LOG2_RU %u \n"
- "#define HS_BC_WARPS_LOG2_MAX %u \n"
- "#define HS_FM_BLOCKS_LOG2_MIN %u \n"
- "#define HS_HM_BLOCKS_LOG2_MIN %u \n"
- "#define HS_KEYS_PER_LANE %u \n"
- "#define HS_REG_LAST(c) c##%u \n"
- "#define HS_KEY_WORDS %u \n"
- "#define HS_KEY_TYPE %s \n"
- "#define HS_EMPTY \n"
- " \n",
- warp_lanes_log2,
- merge->warps,
- msb_idx_u32(pow2_ru_u32(merge->warps)),
- bc_max,
- hsg_config.merge.flip.lo,
- hsg_config.merge.half.lo,
- hsg_config.thread.regs,
- hsg_config.thread.regs,
- hsg_config.type.words,
- (hsg_config.type.words == 2) ? "ulong" : "uint");
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "#define HS_SLAB_ROWS() \\\n");
-
- for (uint32_t ii=1; ii<=hsg_config.thread.regs; ii++)
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- " HS_EMPTY\n"
- " \n");
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "#define HS_TRANSPOSE_SLAB() \\\n");
-
- for (uint32_t ii=1; ii<=warp_lanes_log2; ii++)
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- " HS_TRANSPOSE_STAGE( %u ) \\\n",ii);
-
- hsg_transpose(msb_idx_u32(hsg_config.warp.lanes),
- hsg_config.thread.regs,
- files[HSG_FILE_TYPE_HEADER].file,
- files[HSG_FILE_TYPE_HEADER].file,
- hsg_transpose_blend,
- hsg_transpose_remap);
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- " HS_EMPTY\n"
- " \n");
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "// \n"
- "// Copyright 2016 Google Inc. \n"
- "// \n"
- "// Use of this source code is governed by a BSD-style \n"
- "// license that can be found in the LICENSE file. \n"
- "// \n"
- " \n"
- "#include <%s_macros.h> \n"
- " \n"
- "// \n"
- "// \n"
- "// \n",
- files[HSG_FILE_TYPE_SOURCE].prefix);
- }
- break;
-
- case HSG_OP_TYPE_FILE_FOOTER:
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- " \n"
- "#endif \n"
- " \n"
- "// \n"
- "// \n"
- "// \n"
- " \n");
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "// \n"
- "// \n"
- "// \n"
- " \n");
- break;
-
- case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "__kernel \n"
- "__attribute__((intel_reqd_sub_group_size(%u))) \n"
- "void hs_kernel_transpose(__global HS_KEY_TYPE * const restrict vout) \n",
- hsg_config.warp.lanes);
- }
- break;
-
- case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const global_id = get_global_id(0); \n"
- "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes * hsg_config.thread.regs,
- hsg_config.warp.lanes-1);
- }
- break;
-
- case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_TRANSPOSE_SLAB()\n");
- }
- break;
-
- case HSG_OP_TYPE_BS_KERNEL_PROTO:
- {
- struct hsg_merge const * const m = merge + ops->a;
-
- uint32_t const tpb = m->warps * hsg_config.warp.lanes;
- uint32_t const bs = pow2_ru_u32(m->warps);
- uint32_t const msb = msb_idx_u32(bs);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "__kernel \n"
- "__attribute__((reqd_work_group_size(%u,1,1))) \n"
- "__attribute__((intel_reqd_sub_group_size(%u))) \n"
- "void hs_kernel_bs_%u(__global HS_KEY_TYPE const * const restrict vin, \n"
- " __global HS_KEY_TYPE * const restrict vout) \n",
- tpb,
- hsg_config.warp.lanes,
- msb);
- }
- break;
-
- case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "__local union { \n");
-
- struct hsg_merge const * const m = merge + ops->a;
-
- if (m->warps > 1)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " HS_KEY_TYPE m[%u * %u];\n",
- m->rows_bs,
- m->warps * hsg_config.warp.lanes);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "} shared; \n"
- " \n");
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const global_id = get_global_id(0); \n"
- "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes * hsg_config.thread.regs,
- hsg_config.warp.lanes-1);
- }
- break;
-
- case HSG_OP_TYPE_BC_KERNEL_PROTO:
- {
- uint32_t const bc_max = pow2_rd_u32(merge[0].warps);
- uint32_t const tpb = bc_max * hsg_config.warp.lanes;
- uint32_t const msb = msb_idx_u32(merge[ops->a].warps);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "__kernel \n"
- "__attribute__((intel_reqd_sub_group_size(%u))) \n"
- "void hs_kernel_bc_%u(__global HS_KEY_TYPE * const restrict vout) \n",
- hsg_config.warp.lanes,msb);
- }
- break;
-
- case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
- {
- struct hsg_merge const * const m = merge + ops->a;
- uint32_t const bc_max = pow2_rd_u32(merge[0].warps);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "__local union { \n");
-
- if (m->warps > 1)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " HS_KEY_TYPE m[%-3u * %u];\n",
- m->rows_bc,
- m->warps * hsg_config.warp.lanes);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "} shared; \n"
- " \n");
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const global_id = get_global_id(0); \n"
- "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes * hsg_config.thread.regs,
- hsg_config.warp.lanes-1);
- }
- break;
-
- case HSG_OP_TYPE_FM_KERNEL_PROTO:
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "#define HS_FM_BLOCKS_LOG2_%-2u %u \n",
- ops->a,ops->b);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "__kernel \n"
- "__attribute__((intel_reqd_sub_group_size(%u))) \n"
- "void hs_kernel_fm_%u(__global HS_KEY_TYPE * const restrict vout, \n"
- " uint const fm_full, \n"
- " uint const fm_frac) \n",
- hsg_config.warp.lanes,ops->a);
- break;
-
- case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const global_id = (uint)get_global_id(0); \n"
- "uint const warp_idx = global_id / %u; \n"
- "uint const warp_lane_idx = global_id & %u; \n"
- " \n"
- "uint const merge_idx = warp_idx / %u >> %u; \n"
- " \n"
- "uint const merge_stride = %u * %u << %u; \n"
- "uint const merge_keys = merge_stride * %u; \n"
- " \n"
- "uint const merge_base = merge_idx * merge_keys; \n"
- " \n"
- "uint const merge_l_off = (warp_idx - merge_idx * (%u << %u)) * %u + warp_lane_idx; \n"
- "uint const merge_l_end = merge_stride * (%u / 2 - 1) + merge_l_off; \n"
- " \n"
- "int const merge_r_off = merge_keys - merge_l_end - 1; \n"
- " \n"
- "__global HS_KEY_TYPE * const restrict merge_l = vout + (merge_base + merge_l_off); \n"
- "__global HS_KEY_TYPE * const restrict merge_r = vout + (merge_base + merge_r_off); \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes-1,
- hsg_config.thread.regs,ops->b,
- hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
- ops->a,
- hsg_config.thread.regs,ops->b,hsg_config.warp.lanes,
- ops->a);
- break;
-
- case HSG_OP_TYPE_HM_KERNEL_PROTO:
- {
- uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps));
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "#define HS_HM_BLOCKS_LOG2_%-2u %u \n",
- ops->a,ops->b);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "__kernel \n"
- "__attribute__((intel_reqd_sub_group_size(%u))) \n"
- "void hs_kernel_hm_%u(__global HS_KEY_TYPE * const restrict vout) \n",
- hsg_config.warp.lanes,ops->a);
- }
- break;
-
- case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const global_id = (uint)get_global_id(0); \n"
- "uint const warp_idx = global_id / %u; \n"
- "uint const warp_lane_idx = global_id & %u; \n"
- " \n"
- "uint const merge_idx = (warp_idx / %u) >> %u; \n"
- " \n"
- "uint const merge_stride = %u * %u << %u; \n"
- "uint const merge_keys = merge_stride * %u; \n"
- " \n"
- "uint const merge_base = merge_idx * merge_keys; \n"
- "uint const merge_off = (warp_idx - merge_idx * (%u << %u)) * %u; \n"
- " \n"
- "__global HS_KEY_TYPE * const restrict merge_ptr = vout + (merge_base + merge_off + warp_lane_idx); \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes-1,
- hsg_config.thread.regs,ops->b,
- hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
- ops->a,
- hsg_config.thread.regs,ops->b,hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
- {
- static char const * const vstr[] = { "vin", "vout" };
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%-3u = (%s + gmem_idx)[%-3u * %u]; \n",
- ops->n,vstr[ops->v],ops->n-1,hsg_config.warp.lanes);
- }
- break;
-
- case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "(vout + gmem_idx)[%-3u * %u] = r%u; \n",
- ops->n-1,hsg_config.warp.lanes,ops->n);
- break;
-
- case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%-3u = merge_ptr[%-3u * merge_stride];\n",
- ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_ptr[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%-3u = merge_l[%-3u * merge_stride];\n",
- ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_l[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%-3u = merge_r[%-3u * merge_stride];\n",
- ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_r[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_WARP_FLIP:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const flip_lane_mask = %u; \n"
- "uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; \n"
- "int const t_lt = get_sub_group_local_id() < flip_lane_idx; \n",
- ops->n-1);
- break;
-
- case HSG_OP_TYPE_WARP_HALF:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const half_lane_mask = %u; \n"
- "uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; \n"
- "int const t_lt = get_sub_group_local_id() < half_lane_idx; \n",
- ops->n / 2);
- break;
-
- case HSG_OP_TYPE_CMP_FLIP:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_FLIP(%-3u,r%-3u,r%-3u)\n",ops->a,ops->b,ops->c);
- break;
-
- case HSG_OP_TYPE_CMP_HALF:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_HALF(%-3u,r%-3u)\n",ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_CMP_XCHG:
- if (ops->c == UINT32_MAX)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_XCHG(r%-3u,r%-3u)\n",
- ops->a,ops->b);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_XCHG(r%u_%u,r%u_%u)\n",
- ops->c,ops->a,ops->c,ops->b);
- }
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "(shared.m + get_local_id(0))[%-3u * %-2u * %-3u] = r%u;\n",
- merge[ops->a].warps,hsg_config.warp.lanes,ops->c,ops->b);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "r%-3u = (shared.m + get_local_id(0))[%-3u * %-2u * %-3u];\n",
- ops->b,merge[ops->a].warps,hsg_config.warp.lanes,ops->c);
- break;
-
- case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%-3u = (shared.m + get_local_id(0))[%-3u * %-2u * %-3u];\n",
- ops->b,ops->a,hsg_config.warp.lanes,ops->c);
- break;
-
- case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "(shared.m + smem_l_idx)[%5u] = r%u_%u;\n",
- ops->b * hsg_config.warp.lanes,
- ops->c,
- ops->a);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "(shared.m + smem_r_idx)[%5u] = r%u_%u;\n",
- ops->b * hsg_config.warp.lanes,
- ops->c,
- ops->a);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%u_%-3u = (shared.m + smem_l_idx)[%u];\n",
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%u_%-3u = (shared.m + smem_r_idx)[%u];\n",
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%u_%-3u = (vout + gmem_l_idx)[%u];\n",
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BLOCK_SYNC:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "barrier(CLK_LOCAL_MEM_FENCE);\n"); // OpenCL 2.0+: work_group_barrier
- break;
-
- case HSG_OP_TYPE_BS_FRAC_PRED:
- {
- if (ops->m == 0)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (warp_idx < bs_full)\n");
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else if (bs_frac == %u)\n",
- ops->w);
- }
- }
- break;
-
- case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
- {
- struct hsg_merge const * const m = merge + ops->a;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n"
- "uint const smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n",
- m->warps * hsg_config.warp.lanes,
- m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1);
-#if 0
- if (ops->b == true)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n"
- "uint smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n",
- m->warps * hsg_config.warp.lanes,
- m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1);
- }
- else // update
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n"
- "smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n",
- m->warps * hsg_config.warp.lanes,
- m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1);
- }
-#endif
- }
- break;
-
- case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
- {
- struct hsg_merge const * const m = merge + ops->a;
- uint32_t const b = m->warps * hsg_config.warp.lanes;
- uint32_t const k = b * hsg_config.thread.regs;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const gmem_l_idx = (global_id / %u) * %u + (global_id & %u); \n"
- "uint const smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n",
- b,k,b-1,
- b);
-
- }
- break;
-
- case HSG_OP_TYPE_BX_MERGE_H_PRED:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (get_sub_group_id() < %u)\n",
- ops->a);
- break;
-
- case HSG_OP_TYPE_BS_ACTIVE_PRED:
- {
- struct hsg_merge const * const m = merge + ops->a;
-
- if (m->warps <= 32)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
- m->levels[ops->b].active.b32a2[0]);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
- m->levels[ops->b].active.b32a2[1],
- m->levels[ops->b].active.b32a2[0]);
- }
- }
- break;
-
- case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
- {
- if (ops->a == ops->b)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (merge_idx < fm_full) \n");
- }
- else if (ops->b > 1)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else if (fm_frac == %u) \n",
- ops->b);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else\n");
- }
- }
- break;
-
- default:
- hsg_target_debug(files,merge,ops,depth);
- break;
- }
-}
-
-//
-//
-//
diff --git a/src/compute/hs/gen/target_opencl.c b/src/compute/hs/gen/target_opencl.c
new file mode 100644
index 0000000000..fe7343ba5d
--- /dev/null
+++ b/src/compute/hs/gen/target_opencl.c
@@ -0,0 +1,600 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+#include "transpose.h"
+
+#include "common/util.h"
+#include "common/macros.h"
+
+//
+//
+//
+
+struct hsg_transpose_state
+{
+ FILE * header;
+ struct hsg_config const * config;
+};
+
+static
+char
+hsg_transpose_reg_prefix(uint32_t const cols_log2)
+{
+ return 'a' + (('r' + cols_log2 - 'a') % 26);
+}
+
+static
+void
+hsg_transpose_blend(uint32_t const cols_log2,
+ uint32_t const row_ll, // lower-left
+ uint32_t const row_ur, // upper-right
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(cols_log2-1),
+ hsg_transpose_reg_prefix(cols_log2),
+ cols_log2,row_ll+1,row_ur+1);
+}
+
+static
+void
+hsg_transpose_remap(uint32_t const row_from,
+ uint32_t const row_to,
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(state->config->warp.lanes_log2),
+ row_from+1,row_to+1);
+}
+
+//
+//
+//
+
+static
+void
+hsg_copyright(FILE * file)
+{
+ fprintf(file,
+ "// \n"
+ "// Copyright 2016 Google Inc. \n"
+ "// \n"
+ "// Use of this source code is governed by a BSD-style \n"
+ "// license that can be found in the LICENSE file. \n"
+ "// \n"
+ "\n");
+}
+
+//
+//
+//
+
+struct hsg_target_state
+{
+ FILE * header;
+ FILE * source;
+};
+
+//
+//
+//
+
+void
+hsg_target_opencl(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth)
+{
+ switch (ops->type)
+ {
+ case HSG_OP_TYPE_END:
+ fprintf(target->state->source,
+ "}\n");
+ break;
+
+ case HSG_OP_TYPE_BEGIN:
+ fprintf(target->state->source,
+ "{\n");
+ break;
+
+ case HSG_OP_TYPE_ELSE:
+ fprintf(target->state->source,
+ "else\n");
+ break;
+
+ case HSG_OP_TYPE_TARGET_BEGIN:
+ {
+ // allocate state
+ target->state = malloc(sizeof(*target->state));
+
+ // allocate files
+ fopen_s(&target->state->header,"hs_cl.h", "wb");
+ fopen_s(&target->state->source,"hs_cl.cl","wb");
+
+ // initialize header
+ uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
+
+ hsg_copyright(target->state->header);
+
+ fprintf(target->state->header,
+ "#ifndef HS_CL_ONCE \n"
+ "#define HS_CL_ONCE \n"
+ " \n"
+ "#define HS_SLAB_THREADS_LOG2 %u \n"
+ "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n"
+ "#define HS_SLAB_WIDTH_LOG2 %u \n"
+ "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n"
+ "#define HS_SLAB_HEIGHT %u \n"
+ "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n"
+ "#define HS_REG_LAST(c) c##%u \n"
+ "#define HS_KEY_TYPE %s \n"
+ "#define HS_KEY_WORDS %u \n"
+ "#define HS_VAL_WORDS 0 \n"
+ "#define HS_BS_SLABS %u \n"
+ "#define HS_BS_SLABS_LOG2_RU %u \n"
+ "#define HS_BC_SLABS_LOG2_MAX %u \n"
+ "#define HS_FM_SCALE_MIN %u \n"
+ "#define HS_FM_SCALE_MAX %u \n"
+ "#define HS_HM_SCALE_MIN %u \n"
+ "#define HS_HM_SCALE_MAX %u \n"
+ "#define HS_EMPTY \n"
+ " \n",
+ config->warp.lanes_log2,
+ config->warp.lanes_log2,
+ config->thread.regs,
+ config->thread.regs,
+ (config->type.words == 2) ? "ulong" : "uint",
+ config->type.words,
+ merge->warps,
+ msb_idx_u32(pow2_ru_u32(merge->warps)),
+ bc_max,
+ config->merge.flip.lo,
+ config->merge.flip.hi,
+ config->merge.half.lo,
+ config->merge.half.hi);
+
+ fprintf(target->state->header,
+ "#define HS_SLAB_ROWS() \\\n");
+
+ for (uint32_t ii=1; ii<=config->thread.regs; ii++)
+ fprintf(target->state->header,
+ " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+ fprintf(target->state->header,
+ "#define HS_TRANSPOSE_SLAB() \\\n");
+
+ for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++)
+ fprintf(target->state->header,
+ " HS_TRANSPOSE_STAGE( %u ) \\\n",ii);
+
+ struct hsg_transpose_state state[1] =
+ {
+ { .header = target->state->header,
+ .config = config
+ }
+ };
+
+ hsg_transpose(config->warp.lanes_log2,
+ config->thread.regs,
+ hsg_transpose_blend,state,
+ hsg_transpose_remap,state);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+ hsg_copyright(target->state->source);
+
+ fprintf(target->state->source,
+ "#include \"hs_cl_macros.h\" \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n");
+ }
+ break;
+
+ case HSG_OP_TYPE_TARGET_END:
+ // decorate the files
+ fprintf(target->state->header,
+ "#endif \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ " \n");
+ fprintf(target->state->source,
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ " \n");
+
+ // close files
+ fclose(target->state->header);
+ fclose(target->state->source);
+
+ // free state
+ free(target->state);
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
+ {
+ fprintf(target->state->source,
+ "\nHS_TRANSPOSE_KERNEL_PROTO(%u)\n",
+ config->warp.lanes);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
+ {
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
+ {
+ fprintf(target->state->source,
+ "HS_TRANSPOSE_SLAB()\n");
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const bs = pow2_ru_u32(m->warps);
+ uint32_t const msb = msb_idx_u32(bs);
+
+ fprintf(target->state->source,
+ "\nHS_BS_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bs);
+ }
+
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const msb = msb_idx_u32(m->warps);
+
+ fprintf(target->state->source,
+ "\nHS_BC_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bc);
+ }
+
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PROTO:
+ fprintf(target->state->source,
+ "\nHS_FM_KERNEL_PROTO(%u,%u)\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_FM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PROTO:
+ {
+ fprintf(target->state->source,
+ "\nHS_HM_KERNEL_PROTO(%u)\n",
+ ops->a);
+ }
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_HM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
+ {
+ static char const * const vstr[] = { "vin", "vout" };
+
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n",
+ ops->n,vstr[ops->v],config->warp.lanes,ops->n-1);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n",
+ config->warp.lanes,ops->n-1,ops->n);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
+ {
+ if (ops->a <= ops->b)
+ {
+ fprintf(target->state->source,
+ "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n");
+ }
+ else if (ops->b > 1)
+ {
+ fprintf(target->state->source,
+ "else if (fm_frac == %u)\n",
+ ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else\n");
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_SLAB_FLIP:
+ fprintf(target->state->source,
+ "HS_SLAB_FLIP_PREAMBLE(%u);\n",
+ ops->n-1);
+ break;
+
+ case HSG_OP_TYPE_SLAB_HALF:
+ fprintf(target->state->source,
+ "HS_SLAB_HALF_PREAMBLE(%u);\n",
+ ops->n / 2);
+ break;
+
+ case HSG_OP_TYPE_CMP_FLIP:
+ fprintf(target->state->source,
+ "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c);
+ break;
+
+ case HSG_OP_TYPE_CMP_HALF:
+ fprintf(target->state->source,
+ "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_CMP_XCHG:
+ if (ops->c == UINT32_MAX)
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%-3u,r%-3u);\n",
+ ops->a,ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%u_%u,r%u_%u);\n",
+ ops->c,ops->a,ops->c,ops->b);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
+ fprintf(target->state->source,
+ "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n",
+ merge[ops->a].warps,config->warp.lanes,ops->c,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,merge[ops->a].warps,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,ops->a,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n",
+ ops->c,
+ ops->a,
+ config->warp.lanes,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BLOCK_SYNC:
+ fprintf(target->state->source,
+ "HS_BLOCK_BARRIER();\n");
+ //
+ // FIXME - Named barriers to allow coordinating warps to proceed?
+ //
+ break;
+
+ case HSG_OP_TYPE_BS_FRAC_PRED:
+ {
+ if (ops->m == 0)
+ {
+ fprintf(target->state->source,
+ "if (warp_idx < bs_full)\n");
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else if (bs_frac == %u)\n",
+ ops->w);
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n",
+ config->warp.lanes,config->thread.regs,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_MERGE_H_PRED:
+ fprintf(target->state->source,
+ "if (get_sub_group_id() < %u)\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_ACTIVE_PRED:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps <= 32)
+ {
+ fprintf(target->state->source,
+ "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
+ m->levels[ops->b].active.b32a2[1],
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ }
+ break;
+
+ default:
+ fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]);
+ exit(EXIT_FAILURE);
+ break;
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/transpose.c b/src/compute/hs/gen/transpose.c
index de15c62631..095f53d330 100644
--- a/src/compute/hs/gen/transpose.c
+++ b/src/compute/hs/gen/transpose.c
@@ -11,7 +11,7 @@
//
#include "transpose.h"
-#include "macros.h"
+#include "common/macros.h"
//
// Rows must be an even number. This is enforced elsewhere.
@@ -21,19 +21,19 @@
void
hsg_transpose(uint32_t const cols_log2,
uint32_t const rows,
- void * blend,
- void * remap,
void (*pfn_blend)(uint32_t const cols_log2,
uint32_t const row_ll, // lower-left
uint32_t const row_ur, // upper-right
void * blend),
+ void * blend,
void (*pfn_remap)(uint32_t const row_from,
uint32_t const row_to,
- void * remap))
+ void * remap),
+ void * remap)
{
// get mapping array
- uint32_t * map_curr = ALLOCA(rows * sizeof(*map_curr));
- uint32_t * map_next = ALLOCA(rows * sizeof(*map_next));
+ uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr));
+ uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next));
// init the mapping array
for (uint32_t ii=0; ii<rows; ii++)
@@ -89,35 +89,35 @@ static uint32_t cols; // implicit on SIMD/GPU
static
void
-hsg_debug_remap(uint32_t const row_from,
- uint32_t const row_to,
- uint32_t * const r)
-{
- fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);
-
- r[row_to] = row_from;
-}
-
-static
-void
hsg_debug_blend(uint32_t const cols_log2,
uint32_t const row_ll, // lower-left
uint32_t const row_ur, // upper-right
- uint32_t * m)
+ uint32_t * b)
{
fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur);
- uint32_t * const ll = ALLOCA(cols * sizeof(*m));
- uint32_t * const ur = ALLOCA(cols * sizeof(*m));
+ uint32_t * const ll = ALLOCA(cols * sizeof(*b));
+ uint32_t * const ur = ALLOCA(cols * sizeof(*b));
- memcpy(ll,m+row_ll*cols,cols * sizeof(*m));
- memcpy(ur,m+row_ur*cols,cols * sizeof(*m));
+ memcpy(ll,b+row_ll*cols,cols * sizeof(*b));
+ memcpy(ur,b+row_ur*cols,cols * sizeof(*b));
for (uint32_t ii=0; ii<cols; ii++)
- m[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];
+ b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];
for (uint32_t ii=0; ii<cols; ii++)
- m[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
+ b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
+}
+
+static
+void
+hsg_debug_remap(uint32_t const row_from,
+ uint32_t const row_to,
+ uint32_t * const r)
+{
+ fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);
+
+ r[row_to] = row_from;
}
static
@@ -144,23 +144,22 @@ main(int argc, char * argv[])
cols = 1 << cols_log2;
- uint32_t * const m = ALLOCA(cols * rows * sizeof(*m));
+ uint32_t * const b = ALLOCA(cols * rows * sizeof(*b));
uint32_t * const r = ALLOCA( rows * sizeof(*r));
for (uint32_t rr=0; rr<rows; rr++) {
r[rr] = rr;
for (uint32_t cc=0; cc<cols; cc++)
- m[rr*cols+cc] = cc*rows+rr;
+ b[rr*cols+cc] = cc*rows+rr;
}
- hsg_debug_print(rows,m,r);
+ hsg_debug_print(rows,b,r);
hsg_transpose(cols_log2,rows,
- m,r,
- hsg_debug_blend,
- hsg_debug_remap);
+ hsg_debug_blend,b,
+ hsg_debug_remap,r);
- hsg_debug_print(rows,m,r);
+ hsg_debug_print(rows,b,r);
return 0;
}
diff --git a/src/compute/hs/gen/transpose.h b/src/compute/hs/gen/transpose.h
index 83f6fc4e42..380210970d 100644
--- a/src/compute/hs/gen/transpose.h
+++ b/src/compute/hs/gen/transpose.h
@@ -23,15 +23,15 @@
void
hsg_transpose(uint32_t const cols_log2,
uint32_t const rows,
- void * blend,
- void * remap,
void (*pfn_blend)(uint32_t const cols_log2,
uint32_t const row_ll, // lower-left
uint32_t const row_ur, // upper-right
void * blend),
+ void * blend,
void (*pfn_remap)(uint32_t const row_from,
uint32_t const row_to,
- void * remap));
+ void * remap),
+ void * remap);
//
//
diff --git a/src/compute/hs/vk/hs_spirv_target.h b/src/compute/hs/vk/hs_spirv_target.h
new file mode 100644
index 0000000000..aa711efc6d
--- /dev/null
+++ b/src/compute/hs/vk/hs_spirv_target.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+//
+//
+//
+
+#include <stdint.h>
+
+//
+// This structure packages all of the parameters and SPIR-V kernels
+// for a target architecture.
+//
+
+struct hs_spirv_target_config
+{
+ struct {
+ uint8_t threads_log2;
+ uint8_t width_log2;
+ uint8_t height;
+ } slab;
+
+ struct {
+ uint8_t key;
+ uint8_t val;
+ } words;
+
+ struct {
+ uint8_t slabs;
+ } block;
+
+ struct {
+ struct {
+ uint8_t scale_min;
+ uint8_t scale_max;
+ } fm;
+ struct {
+ uint8_t scale_min;
+ uint8_t scale_max;
+ } hm;
+ } merge;
+
+ uint8_t pad[2];
+};
+
+static_assert(sizeof(struct hs_spirv_target_config) == 12,
+ "modules.words[] must start on a 32-bit boundary");
+
+//
+// For now, kernels are appended end-to-end with a leading big-endian
+// length followed by a SPIR-V binary.
+//
+// The entry point for each kernel is "main".
+//
+// When the tools support packaging multiple named compute shaders in
+// one SPIR-V module then reevaluate this encoding.
+//
+
+struct hs_spirv_target
+{
+ struct hs_spirv_target_config config;
+ union {
+ uint8_t bytes[];
+ uint32_t words[];
+ } modules;
+};
+
+//
+//
+//
diff --git a/src/compute/hs/vk/hs_vk_launcher.c b/src/compute/hs/vk/hs_vk_launcher.c
new file mode 100644
index 0000000000..e1080a0e8b
--- /dev/null
+++ b/src/compute/hs/vk/hs_vk_launcher.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/vk/assert_vk.h"
+#include "common/util.h"
+
+#include "hs_vk_launcher.h"
+#include "hs_spirv_target.h"
+
+//
+//
+//
+
+struct hs_vk
+{
+ struct hs_spirv_target_config config;
+
+ uint32_t key_val_size;
+ uint32_t slab_keys;
+ uint32_t bs_slabs_log2_ru;
+ uint32_t bc_slabs_log2_max;
+
+ VkDevice device;
+ VkAllocationCallbacks const * allocator;
+
+ struct {
+ uint32_t count;
+ VkPipeline * transpose;
+ VkPipeline * bs;
+ VkPipeline * bc;
+ VkPipeline * fm[3];
+ VkPipeline * hm[3];
+ VkPipeline all[];
+ } pipelines;
+};
+
+//
+//
+//
+
+struct hs_vk *
+hs_vk_create(struct hs_spirv_target const * const target,
+ VkDevice device,
+ VkAllocationCallbacks const * allocator,
+ VkPipelineCache pipeline_cache)
+{
+ //
+ // we reference these values a lot
+ //
+ uint32_t const bs_slabs_log2_ru = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
+ uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));
+
+ //
+ // how many kernels will be created?
+ //
+ uint32_t const count_bs = bs_slabs_log2_ru + 1;
+ uint32_t const count_bc = bc_slabs_log2_max + 1;
+ uint32_t count_fm[3] = { 0 };
+ uint32_t count_hm[3] = { 0 };
+
+ // guaranteed to be in range [0,2]
+ for (uint32_t scale = target->config.merge.fm.scale_min;
+ scale <= target->config.merge.fm.scale_max;
+ scale++)
+ {
+ count_fm[scale] = msb_idx_u32(pow2_ru_u32(target->config.block.slabs>>(scale-1))) + 1;
+ }
+
+ // guaranteed to be in range [0,2]
+ for (uint32_t scale = target->config.merge.hm.scale_min;
+ scale <= target->config.merge.hm.scale_max;
+ scale++)
+ {
+ count_hm[scale] = 1;
+ }
+
+ uint32_t const count_all =
+ 1
+ + count_bs
+ + count_bc
+ + count_fm[0] + count_fm[1] + count_fm[2]
+ + count_hm[0] + count_hm[1] + count_hm[2];
+
+ //
+ // allocate hs_vk
+ //
+ struct hs_vk * hs;
+
+ if (allocator == NULL)
+ {
+ hs = malloc(sizeof(*hs) + sizeof(VkPipeline*) * count_all);
+ }
+ else
+ {
+ hs = NULL;
+ }
+
+ // save the config
+ memcpy(&hs->config,&target->config,sizeof(hs->config));
+
+ // save some frequently used calculated values
+ hs->key_val_size = (target->config.words.key + target->config.words.val) * 4;
+ hs->slab_keys = target->config.slab.height << target->config.slab.width_log2;
+ hs->bs_slabs_log2_ru = bs_slabs_log2_ru;
+ hs->bc_slabs_log2_max = bc_slabs_log2_max;
+
+ // save device & allocator
+ hs->device = device;
+ hs->allocator = allocator;
+
+ // save kernel count
+ hs->pipelines.count = count_all;
+
+ //
+ // create all the compute pipelines
+ //
+ VkComputePipelineCreateInfo cpci = {
+ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+ .pNext = NULL,
+ .flags = VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT,
+ .stage = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .pNext = NULL,
+ .flags = 0,
+ .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+ .module = VK_NULL_HANDLE,
+ .pName = "main",
+ .pSpecializationInfo = NULL
+ },
+ .basePipelineHandle = VK_NULL_HANDLE,
+ .basePipelineIndex = -1
+ };
+
+ //
+ // Create a shader module, use it to create a pipeline... and
+ // dispose of the shader module.
+ //
+ uint32_t const * modules = target->modules.words;
+
+ for (uint32_t ii=0; ii<count_all; ii++)
+ {
+ size_t const module_size = *modules++;
+
+ VkShaderModuleCreateInfo const smci = {
+ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+ .pNext = NULL,
+ .flags = 0,
+ .codeSize = module_size,
+ .pCode = modules
+ };
+
+ modules += module_size;
+
+ vk(CreateShaderModule(device,
+ &smci,
+ allocator,
+ &cpci.stage.module));
+
+
+ vk(CreateComputePipelines(device,
+ pipeline_cache,
+ count_all,
+ &cpci,
+ allocator,
+ hs->pipelines.all+ii));
+
+ vkDestroyShaderModule(device,
+ cpci.stage.module,
+ allocator);
+ }
+
+ //
+ // initialize pointers to pipeline handles
+ //
+ VkPipeline * pipeline_next = hs->pipelines.all;
+
+ // TRANSPOSE
+ hs->pipelines.transpose = pipeline_next;
+ pipeline_next += 1;
+
+ // BS
+ hs->pipelines.bs = pipeline_next;
+ pipeline_next += count_bs;
+
+ // BC
+ hs->pipelines.bc = pipeline_next;
+ pipeline_next += count_bc;
+
+ // FM[0]
+ hs->pipelines.fm[0] = count_fm[0] ? pipeline_next : NULL;
+ pipeline_next += count_fm[0];
+
+ // FM[1]
+ hs->pipelines.fm[1] = count_fm[1] ? pipeline_next : NULL;
+ pipeline_next += count_fm[1];
+
+ // FM[2]
+ hs->pipelines.fm[2] = count_fm[2] ? pipeline_next : NULL;
+ pipeline_next += count_fm[2];
+
+ // HM[0]
+ hs->pipelines.hm[0] = count_hm[0] ? pipeline_next : NULL;
+ pipeline_next += count_hm[0];
+
+ // HM[1]
+ hs->pipelines.hm[1] = count_hm[1] ? pipeline_next : NULL;
+ pipeline_next += count_hm[1];
+
+ // HM[2]
+ hs->pipelines.hm[2] = count_hm[2] ? pipeline_next : NULL;
+ pipeline_next += count_hm[2];
+
+ return hs;
+}
+
+//
+//
+//
+
+void
+hs_vk_release(struct hs_vk * const hs)
+{
+ for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
+ vkDestroyPipeline(hs->device,
+ hs->pipelines.all[ii],
+ hs->allocator);
+
+ if (hs->allocator == NULL)
+ {
+ free(hs);
+ }
+ else
+ {
+ ;
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/vk/hs_vk_launcher.h b/src/compute/hs/vk/hs_vk_launcher.h
new file mode 100644
index 0000000000..a549666985
--- /dev/null
+++ b/src/compute/hs/vk/hs_vk_launcher.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+//
+//
+//
+
+#include <vulkan/vulkan.h>
+
+//
+//
+//
+
+#include <stdint.h>
+#include <stdbool.h>
+
+//
+//
+//
+
+#include "hs_spirv_target.h"
+
+//
+//
+//
+
+struct hs_vk *
+hs_vk_create(struct hs_spirv_target const * const target,
+ VkDevice device,
+ VkAllocationCallbacks const * allocator,
+ VkPipelineCache pipeline_cache);
+
+//
+// Resources will be disposed of with the same device and allocator
+// used for creation.
+//
+
+void
+hs_vk_release(struct hs_vk * const hs);
+
+//
+// Determine what padding will be applied to the input and output
+// buffers.
+//
+// Always check to see if the allocated buffers are large enough.
+//
+// count : number of keys
+// count + count_padded_in : additional keys required for sorting
+// count + count_padded_out : additional keys required for merging
+//
+
+void
+hs_vk_pad(struct hs_vk const * const hs,
+ uint32_t const count,
+ uint32_t * const count_padded_in,
+ uint32_t * const count_padded_out);
+
+//
+// Sort the keys in the vin buffer and store them in the vout buffer.
+//
+// If vout is NULL then the sort will be performed in place.
+//
+
+#if 0
+void
+hs_vk_sort(struct hs_vk const * const hs,
+ vk_command_queue cq,
+ uint32_t const wait_list_size,
+ vk_event * wait_list,
+ vk_event * event,
+ vk_mem vin,
+ vk_mem vout,
+ uint32_t const count,
+ uint32_t const count_padded_in,
+ uint32_t const count_padded_out,
+ bool const linearize);
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd
+ )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd
+ )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h
new file mode 100644
index 0000000000..d4376114e5
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h
@@ -0,0 +1,100 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#ifndef HS_GLSL_ONCE
+#define HS_GLSL_ONCE
+
+#define HS_SLAB_THREADS_LOG2 3
+#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2)
+#define HS_SLAB_WIDTH_LOG2 3
+#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2)
+#define HS_SLAB_HEIGHT 16
+#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)
+#define HS_REG_LAST(c) c##16
+#define HS_KEY_TYPE uint64_t
+#define HS_KEY_WORDS 2
+#define HS_VAL_WORDS 0
+#define HS_BS_SLABS 16
+#define HS_BS_SLABS_LOG2_RU 4
+#define HS_BC_SLABS_LOG2_MAX 4
+#define HS_FM_SCALE_MIN 1
+#define HS_FM_SCALE_MAX 1
+#define HS_HM_SCALE_MIN 1
+#define HS_HM_SCALE_MAX 1
+#define HS_EMPTY
+
+#define HS_SLAB_ROWS() \
+ HS_SLAB_ROW( 1, 0 ) \
+ HS_SLAB_ROW( 2, 1 ) \
+ HS_SLAB_ROW( 3, 2 ) \
+ HS_SLAB_ROW( 4, 3 ) \
+ HS_SLAB_ROW( 5, 4 ) \
+ HS_SLAB_ROW( 6, 5 ) \
+ HS_SLAB_ROW( 7, 6 ) \
+ HS_SLAB_ROW( 8, 7 ) \
+ HS_SLAB_ROW( 9, 8 ) \
+ HS_SLAB_ROW( 10, 9 ) \
+ HS_SLAB_ROW( 11, 10 ) \
+ HS_SLAB_ROW( 12, 11 ) \
+ HS_SLAB_ROW( 13, 12 ) \
+ HS_SLAB_ROW( 14, 13 ) \
+ HS_SLAB_ROW( 15, 14 ) \
+ HS_SLAB_ROW( 16, 15 ) \
+ HS_EMPTY
+
+#define HS_TRANSPOSE_SLAB() \
+ HS_TRANSPOSE_STAGE( 1 ) \
+ HS_TRANSPOSE_STAGE( 2 ) \
+ HS_TRANSPOSE_STAGE( 3 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 2, 1 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 4, 3 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 6, 5 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 8, 7 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 10, 9 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 3, 1 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 4, 2 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 7, 5 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 8, 6 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 11, 9 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 5, 1 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 6, 2 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 7, 3 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 8, 4 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 13, 9 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 ) \
+ HS_TRANSPOSE_REMAP( u, 1, 1 ) \
+ HS_TRANSPOSE_REMAP( u, 2, 3 ) \
+ HS_TRANSPOSE_REMAP( u, 3, 5 ) \
+ HS_TRANSPOSE_REMAP( u, 4, 7 ) \
+ HS_TRANSPOSE_REMAP( u, 5, 9 ) \
+ HS_TRANSPOSE_REMAP( u, 6, 11 ) \
+ HS_TRANSPOSE_REMAP( u, 7, 13 ) \
+ HS_TRANSPOSE_REMAP( u, 8, 15 ) \
+ HS_TRANSPOSE_REMAP( u, 9, 2 ) \
+ HS_TRANSPOSE_REMAP( u, 10, 4 ) \
+ HS_TRANSPOSE_REMAP( u, 11, 6 ) \
+ HS_TRANSPOSE_REMAP( u, 12, 8 ) \
+ HS_TRANSPOSE_REMAP( u, 13, 10 ) \
+ HS_TRANSPOSE_REMAP( u, 14, 12 ) \
+ HS_TRANSPOSE_REMAP( u, 15, 14 ) \
+ HS_TRANSPOSE_REMAP( u, 16, 16 ) \
+ HS_EMPTY
+
+#endif
+
+//
+//
+//
+
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h
new file mode 100644
index 0000000000..c67dffa3a0
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h
@@ -0,0 +1,417 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#ifndef HS_GLSL_MACROS_ONCE
+#define HS_GLSL_MACROS_ONCE
+
+//
+//
+//
+
+#define HS_HASH #
+#define HS_EVAL(a) a
+#define HS_GLSL_EXT() HS_EVAL(HS_HASH)##extension
+#define HS_GLSL_EXT_ENABLE(name) HS_GLSL_EXT() name : enable
+#define HS_GLSL_VERSION(ver) HS_EVAL(HS_HASH)##version ver
+
+//
+//
+//
+
+// HS_GLSL_VERSION(460)
+
+HS_GLSL_EXT_ENABLE(GL_ARB_gpu_shader_int64)
+HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_shuffle)
+HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_basic)
+
+//
+//
+//
+
+#include "hs_glsl.h"
+
+//
+//
+//
+
+#if (HS_KEY_WORDS == 1)
+#define HS_SHUFFLE_CAST_TO(v) v
+#define HS_SHUFFLE_CAST_FROM(v) v
+#elif (HS_KEY_WORDS == 2)
+#define HS_SHUFFLE_CAST_TO(v) uint64BitsToDouble(v)
+#define HS_SHUFFLE_CAST_FROM(v) doubleBitsToUint64(v)
+#endif
+
+#define HS_SUBGROUP_SHUFFLE(v,i) HS_SHUFFLE_CAST_FROM(subgroupShuffle(HS_SHUFFLE_CAST_TO(v),i))
+#define HS_SUBGROUP_SHUFFLE_XOR(v,m) HS_SHUFFLE_CAST_FROM(subgroupShuffleXor(HS_SHUFFLE_CAST_TO(v),m))
+#define HS_SUBGROUP_SHUFFLE_UP(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleUp(HS_SHUFFLE_CAST_TO(v),d))
+#define HS_SUBGROUP_SHUFFLE_DOWN(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleDown(HS_SHUFFLE_CAST_TO(v),d))
+
+//
+// This up/down shuffle has defined values for [0,subgroup size)
+//
+
+#define HS_SUBGROUP_SHUFFLE_UP_2(prev,curr,delta)
+
+#define HS_SUBGROUP_SHUFFLE_DOWN_2(curr,next,delta)
+
+//
+// FYI, restrict shouldn't have any impact on these kernels and
+// benchmarks appear to prove that true
+//
+
+#define HS_RESTRICT restrict
+
+//
+//
+//
+
+#define HS_GLSL_WORKGROUP_SIZE(x,y,z) \
+ layout (local_size_x = x, \
+ local_size_y = y, \
+ local_size_z = z) in
+
+#define HS_GLSL_SUBGROUP_SIZE(x)
+
+//
+// KERNEL PROTOS
+//
+
+#define HS_TRANSPOSE_KERNEL_PROTO(slab_width) \
+ buffer _vout { HS_KEY_TYPE vout[]; }; \
+ HS_GLSL_WORKGROUP_SIZE(slab_width,1,1); \
+ HS_GLSL_SUBGROUP_SIZE(slab_width) \
+ void main()
+
+#define HS_BS_KERNEL_PROTO(slab_width,slab_count,slab_count_ru_log2) \
+ buffer readonly _vin { HS_KEY_TYPE vin[]; }; \
+ buffer writeonly _vout { HS_KEY_TYPE vout[]; }; \
+ HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1); \
+ HS_GLSL_SUBGROUP_SIZE(slab_width) \
+ void main()
+
+#define HS_BC_KERNEL_PROTO(slab_width,slab_count,slab_count_log2) \
+ buffer _vout { HS_KEY_TYPE vout[]; }; \
+ HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1); \
+ HS_GLSL_SUBGROUP_SIZE(slab_width) \
+ void main()
+
+#define HS_HM_KERNEL_PROTO(s) \
+ buffer _vout { HS_KEY_TYPE vout[]; }; \
+ HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1); \
+ void main()
+
+#define HS_FM_KERNEL_PROTO(s,r) \
+ buffer _vout { HS_KEY_TYPE vout[]; }; \
+ HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1); \
+ void main()
+
+//
+// BLOCK LOCAL MEMORY DECLARATION
+//
+
+#define HS_BLOCK_LOCAL_MEM_DECL(width,height) \
+ shared struct { \
+ HS_KEY_TYPE m[width * height]; \
+ } smem
+
+//
+// BLOCK BARRIER
+//
+
+#define HS_BLOCK_BARRIER() \
+ barrier()
+
+//
+// SLAB GLOBAL
+//
+
+#define HS_SLAB_GLOBAL_PREAMBLE(slab_width,slab_height) \
+ const uint gmem_idx = \
+ (gl_GlobalInvocationID.x & ~(slab_width-1)) * slab_height + \
+ gl_SubgroupInvocationID
+
+#define HS_SLAB_GLOBAL_LOAD(extent,slab_width,row_idx) \
+ extent[gmem_idx + slab_width * row_idx]
+
+#define HS_SLAB_GLOBAL_STORE(slab_width,row_idx,reg) \
+ vout[gmem_idx + slab_width * row_idx] = reg
+
+//
+// SLAB LOCAL
+//
+
+#define HS_SLAB_LOCAL_L(offset) \
+ smem.m[smem_l_idx + (offset)]
+
+#define HS_SLAB_LOCAL_R(offset) \
+ smem.m[smem_r_idx + (offset)]
+
+//
+// SLAB LOCAL VERTICAL LOADS
+//
+
+#define HS_BX_LOCAL_V(offset) \
+ smem.m[gl_LocalInvocationID.x + (offset)]
+
+//
+// BLOCK SORT MERGE HORIZONTAL
+//
+
+#define HS_BS_MERGE_H_PREAMBLE(slab_width,slab_count) \
+ const uint smem_l_idx = \
+ gl_SubgroupID * (slab_width * slab_count) + \
+ gl_SubgroupInvocationID; \
+ const uint smem_r_idx = \
+ (gl_SubgroupID ^ 1) * (slab_width * slab_count) + \
+ (gl_SubgroupInvocationID ^ (slab_width - 1))
+
+//
+// BLOCK CLEAN MERGE HORIZONTAL
+//
+
+#define HS_BC_MERGE_H_PREAMBLE(slab_width,slab_height,slab_count) \
+ const uint gmem_l_idx = \
+ (gl_GlobalInvocationID.x & ~(slab_width*slab_count-1)) * slab_height + \
+ gl_LocalInvocationID.x; \
+ const uint smem_l_idx = \
+ gl_SubgroupID * (slab_width * slab_count) + \
+ gl_SubgroupInvocationID
+
+#define HS_BC_GLOBAL_LOAD_L(slab_width,slab_idx) \
+ vout[gmem_l_idx + (slab_width * slab_idx)]
+
+//
+// SLAB FLIP AND HALF PREAMBLES
+//
+
+#define HS_SLAB_FLIP_PREAMBLE(mask) \
+ const uint flip_lane_idx = gl_SubgroupInvocationID ^ mask; \
+ const bool t_lt = gl_SubgroupInvocationID < flip_lane_idx;
+
+#define HS_SLAB_HALF_PREAMBLE(mask) \
+ const uint half_lane_idx = gl_SubgroupInvocationID ^ mask; \
+ const bool t_lt = gl_SubgroupInvocationID < half_lane_idx;
+
+//
+// Inter-lane compare exchange
+//
+
+// default
+#define HS_CMP_XCHG_V0(a,b) \
+ { \
+ const HS_KEY_TYPE t = min(a,b); \
+ b = max(a,b); \
+ a = t; \
+ }
+
+// super slow
+#define HS_CMP_XCHG_V1(a,b) \
+ { \
+ const HS_KEY_TYPE tmp = a; \
+ a = (a < b) ? a : b; \
+ b ^= a ^ tmp; \
+ }
+
+// best
+#define HS_CMP_XCHG_V2(a,b) \
+ if (a >= b) { \
+ const HS_KEY_TYPE t = a; \
+ a = b; \
+ b = t; \
+ }
+
+// good
+#define HS_CMP_XCHG_V3(a,b) \
+ { \
+ const bool ge = a >= b; \
+ const HS_KEY_TYPE t = a; \
+ a = ge ? b : a; \
+ b = ge ? t : b; \
+ }
+
+//
+//
+//
+
+#if (HS_KEY_WORDS == 1)
+#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b)
+#endif
+
+//
+// The flip/half comparisons rely on a "conditional min/max":
+//
+// - if the flag is false, return min(a,b)
+// - otherwise, return max(a,b)
+//
+// What's a little surprising is that sequence (1) is faster than (2)
+// for 32-bit keys.
+//
+// I suspect either a code generation problem or that the sequence
+// maps well to the GEN instruction set.
+//
+// We mostly care about 64-bit keys and unsurprisingly sequence (2) is
+// fastest for this wider type.
+//
+
+#define HS_LOGICAL_XOR() !=
+
+// this is what you would normally use
+#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) HS_LOGICAL_XOR() lt) ? b : a
+
+// this seems to be faster for 32-bit keys
+#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))
+
+//
+//
+//
+
+#if (HS_KEY_WORDS == 1)
+#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
+#endif
+
+//
+// Conditional inter-subgroup flip/half compare exchange
+//
+
+#define HS_CMP_FLIP(i,a,b) \
+ { \
+ const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,flip_lane_idx); \
+ const HS_KEY_TYPE tb = HS_SUBGROUP_SHUFFLE(b,flip_lane_idx); \
+ a = HS_COND_MIN_MAX(t_lt,a,tb); \
+ b = HS_COND_MIN_MAX(t_lt,b,ta); \
+ }
+
+#define HS_CMP_HALF(i,a) \
+ { \
+ const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,half_lane_idx); \
+ a = HS_COND_MIN_MAX(t_lt,a,ta); \
+ }
+
+//
+// The device's comparison operator might return what we actually
+// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}.
+//
+
+#define HS_CMP_IS_ZERO_ONE
+
+#ifdef HS_CMP_IS_ZERO_ONE
+// OpenCL requires a {true: +1, false: 0} scalar result
+// (a < b) -> { +1, 0 } -> NEGATE -> { 0, 0xFFFFFFFF }
+#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
+#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a)
+#else
+// However, OpenCL requires { -1, 0 } for vectors
+// (a < b) -> { 0xFFFFFFFF, 0 }
+#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
+#define HS_CMP_TO_MASK(a) (a)
+#endif
+
+//
+// The "flip-merge" and "half-merge" preambles are very similar
+//
+
+#define HS_HM_PREAMBLE(half_span) \
+ const uint span_idx = gl_GlobalInvocationID.z * gl_NumWorkGroups.y + gl_GlobalInvocationID.y; \
+ const uint span_stride = gl_NumWorkGroups.x * gl_WorkGroupSize.x; \
+ const uint span_size = span_stride * half_span * 2; \
+ const uint span_base = span_idx * span_size; \
+ const uint span_off = gl_GlobalInvocationID.x; \
+ const uint span_l = span_base + span_off
+
+#define HS_FM_PREAMBLE(half_span) \
+ HS_HM_PREAMBLE(half_span); \
+ const uint span_r = span_base + span_stride * (half_span + 1) - span_off - 1
+
+//
+//
+//
+
+#define HS_XM_GLOBAL_L(stride_idx) \
+ vout[span_l + span_stride * stride_idx]
+
+#define HS_XM_GLOBAL_LOAD_L(stride_idx) \
+ HS_XM_GLOBAL_L(stride_idx)
+
+#define HS_XM_GLOBAL_STORE_L(stride_idx,reg) \
+ HS_XM_GLOBAL_L(stride_idx) = reg
+
+#define HS_FM_GLOBAL_R(stride_idx) \
+ vout[span_r + span_stride * stride_idx]
+
+#define HS_FM_GLOBAL_LOAD_R(stride_idx) \
+ HS_FM_GLOBAL_R(stride_idx)
+
+#define HS_FM_GLOBAL_STORE_R(stride_idx,reg) \
+ HS_FM_GLOBAL_R(stride_idx) = reg
+
+//
+// This snarl of macros is for transposing a "slab" of sorted elements
+// into linear order.
+//
+// This can occur as the last step in hs_sort() or via a custom kernel
+// that inspects the slab and then transposes and stores it to memory.
+//
+// The slab format can be inspected more efficiently than a linear
+// arrangement.
+//
+// The prime example is detecting when adjacent keys (in sort order)
+// have differing high order bits ("key changes"). The index of each
+// change is recorded to an auxiliary array.
+//
+// A post-processing step like this needs to be able to navigate the
+// slab and eventually transpose and store the slab in linear order.
+//
+
+#define HS_TRANSPOSE_REG(prefix,row) prefix##row
+#define HS_TRANSPOSE_DECL(prefix,row) const HS_KEY_TYPE HS_TRANSPOSE_REG(prefix,row)
+#define HS_TRANSPOSE_PRED(level) is_lo_##level
+
+#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) \
+ prefix_curr##row_ll##_##row_ur
+
+#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) \
+ const HS_KEY_TYPE HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)
+
+#define HS_TRANSPOSE_STAGE(level) \
+ const bool HS_TRANSPOSE_PRED(level) = \
+ (gl_SubgroupInvocationID & (1 << (level-1))) == 0;
+
+#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
+ HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) = \
+ HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ? \
+ HS_TRANSPOSE_REG(prefix_prev,row_ll) : \
+ HS_TRANSPOSE_REG(prefix_prev,row_ur), \
+ 1<<(level-1)); \
+ \
+ HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \
+ HS_TRANSPOSE_PRED(level) ? \
+ HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) : \
+ HS_TRANSPOSE_REG(prefix_prev,row_ll); \
+ \
+ HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \
+ HS_TRANSPOSE_PRED(level) ? \
+ HS_TRANSPOSE_REG(prefix_prev,row_ur) : \
+ HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur);
+
+#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \
+ vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \
+ HS_TRANSPOSE_REG(prefix,row_from);
+
+//
+//
+//
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h
new file mode 100644
index 0000000000..551fc52180
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h
@@ -0,0 +1,75 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#include "hs_transpose.len.xxd"
+,
+#include "hs_transpose.spv.xxd"
+,
+#include "hs_bs_4.len.xxd"
+,
+#include "hs_bs_4.spv.xxd"
+,
+#include "hs_bs_3.len.xxd"
+,
+#include "hs_bs_3.spv.xxd"
+,
+#include "hs_bs_2.len.xxd"
+,
+#include "hs_bs_2.spv.xxd"
+,
+#include "hs_bs_1.len.xxd"
+,
+#include "hs_bs_1.spv.xxd"
+,
+#include "hs_bs_0.len.xxd"
+,
+#include "hs_bs_0.spv.xxd"
+,
+#include "hs_bc_4.len.xxd"
+,
+#include "hs_bc_4.spv.xxd"
+,
+#include "hs_bc_3.len.xxd"
+,
+#include "hs_bc_3.spv.xxd"
+,
+#include "hs_bc_2.len.xxd"
+,
+#include "hs_bc_2.spv.xxd"
+,
+#include "hs_bc_1.len.xxd"
+,
+#include "hs_bc_1.spv.xxd"
+,
+#include "hs_bc_0.len.xxd"
+,
+#include "hs_bc_0.spv.xxd"
+,
+#include "hs_fm_1_4.len.xxd"
+,
+#include "hs_fm_1_4.spv.xxd"
+,
+#include "hs_fm_1_3.len.xxd"
+,
+#include "hs_fm_1_3.spv.xxd"
+,
+#include "hs_fm_1_2.len.xxd"
+,
+#include "hs_fm_1_2.spv.xxd"
+,
+#include "hs_fm_1_1.len.xxd"
+,
+#include "hs_fm_1_1.spv.xxd"
+,
+#include "hs_fm_1_0.len.xxd"
+,
+#include "hs_fm_1_0.spv.xxd"
+,
+#include "hs_hm_1_0.len.xxd"
+,
+#include "hs_hm_1_0.spv.xxd"
+,
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_target.h b/src/compute/hs/vk/intel/gen8/u64/hs_target.h
new file mode 100644
index 0000000000..f379c23066
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_target.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "../../../hs_spirv_target.h"
+
+//
+//
+//
+
+#include "hs_glsl.h"
+
+//
+//
+//
+
+#ifndef HS_TARGET_NAME
+#define HS_TARGET_NAME hs_target
+#endif
+
+#define HS_TARGET_HELPER(a) a
+
+//
+//
+//
+
+static struct hs_spirv_target const HS_TARGET_NAME =
+{
+ .config = {
+ .slab = {
+ .threads_log2 = HS_SLAB_THREADS_LOG2,
+ .width_log2 = HS_SLAB_WIDTH_LOG2,
+ .height = HS_SLAB_HEIGHT
+ },
+
+ .words = {
+ .key = HS_KEY_WORDS,
+ .val = HS_VAL_WORDS
+ },
+
+ .block = {
+ .slabs = HS_BS_SLABS
+ },
+
+ .merge = {
+ .fm = {
+ .scale_min = HS_FM_SCALE_MIN,
+ .scale_max = HS_FM_SCALE_MAX
+ },
+ .hm = {
+ .scale_min = HS_HM_SCALE_MIN,
+ .scale_max = HS_HM_SCALE_MAX,
+ }
+ },
+
+ .pad = { 0 }
+ },
+
+ .modules.bytes = {
+
+#include "hs_kernels.h"
+
+#ifdef HS_DUMP
+ 0,0,0,0
+#endif
+ }
+};
+
+//
+//
+//
+
+#ifdef HS_DUMP
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int
+main(int argc, char const * argv[])
+{
+ FILE * fp = fopen("hs_target.bin","wb");
+
+ fwrite(&HS_TARGET_NAME.config,1,sizeof(HS_TARGET_NAME.config),fp);
+
+ uint8_t const * modules = HS_TARGET_NAME.modules.bytes;
+ size_t modsize = (modules[0]<<24) | (modules[1]<<16) | (modules[2]<<8) | modules[3];
+
+ while (modsize > 0) {
+ // fprintf(stderr,"%zu\n",modsize);
+ modsize += sizeof(uint32_t);
+ fwrite(modules,1,modsize,fp);
+ modules += modsize;
+ modsize = (modules[0]<<24) | (modules[1]<<16) | (modules[2]<<8) | modules[3];
+ }
+
+ fclose(fp);
+
+ return EXIT_SUCCESS;
+}
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u64/make_all.bat b/src/compute/hs/vk/intel/gen8/u64/make_all.bat
new file mode 100644
index 0000000000..d148ef0113
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/make_all.bat
@@ -0,0 +1,79 @@
+@ECHO OFF
+
+::
+:: delete the previous images
+::
+
+del *.pre.comp
+del *.comp
+del *.spv
+del *.xxd
+
+::
+::
+::
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+:: --- 32-bit keys ---
+
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+:: --- 64-bit keys
+
+%HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+::
+:: remove trailing whitespace from generated files
+::
+
+sed -i 's/[[:space:]]*$//' hs_glsl.h
+sed -i 's/[[:space:]]*$//' hs_kernels.h
+
+::
+:: FIXME -- convert this to a bash script
+::
+:: Note that we can use xargs instead of the cmd for/do
+::
+
+for %%f in (*.comp) do (
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+:: glslangValidator -V110 -o %%~nf.spv %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=460 -I . -o %%~nf.spv %%~nf.pre.comp || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+:: spirv-remap ... || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ echo %%~nf.spv %%A
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+ )
+)
+
+::
+:: dump a binary
+::
+
+cl /DHS_DUMP /Fe:hs_dump.exe /Tchs_target.h
+hs_dump
+
+::
+:: delete temporary files
+::
+
+:: del *.pre.comp
+del *.comp
+del *.spv
+del *.obj
+del *.exe
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd
+ )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd
+ )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd
+ )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/skc/extent_ring.c b/src/compute/skc/extent_ring.c
index ecb41e6fcd..251b4208b2 100644
--- a/src/compute/skc/extent_ring.c
+++ b/src/compute/skc/extent_ring.c
@@ -202,4 +202,3 @@ skc_extent_ring_snap_to(struct skc_extent_ring_snap const * const snap)
//
//
//
-
diff --git a/src/compute/skc/main.c b/src/compute/skc/main.c
index 067d24c773..2af8ebb6fc 100644
--- a/src/compute/skc/main.c
+++ b/src/compute/skc/main.c
@@ -69,7 +69,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context);
//
int
-main(int argc, char** argv)
+main(int argc, char const * argv[])
{
//
//
@@ -242,7 +242,8 @@ main(int argc, char** argv)
skc_composition_seal(composition);
}
- uint32_t const clip[] = { 0, 0, 65535, 65535 }; // tile clip is <= 9 bits (512)
+ uint32_t const clip[] = { 0, 0, 65535, 65535 };
+ int32_t const txty[] = { 0, 0 };
// render the styled composition to the surface
skc_surface_render(surface,
@@ -250,6 +251,7 @@ main(int argc, char** argv)
composition,
skc_interop_get_framebuffer(interop),
clip,
+ txty,
NULL,
NULL);
diff --git a/src/compute/skc/path_builder.h b/src/compute/skc/path_builder.h
index a956475f49..1c32d6a166 100644
--- a/src/compute/skc/path_builder.h
+++ b/src/compute/skc/path_builder.h
@@ -82,4 +82,3 @@ struct skc_path_builder
//
//
//
-
diff --git a/src/compute/skc/platforms/cl_12/allocator_device_cl.c b/src/compute/skc/platforms/cl_12/allocator_device_cl.c
index 8003504706..425952d09a 100644
--- a/src/compute/skc/platforms/cl_12/allocator_device_cl.c
+++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.c
@@ -133,4 +133,3 @@ skc_allocator_device_dispose(struct skc_runtime * const runtime)
//
//
//
-
diff --git a/src/compute/skc/platforms/cl_12/allocator_device_cl.h b/src/compute/skc/platforms/cl_12/allocator_device_cl.h
index 08c4518a6a..17011b7233 100644
--- a/src/compute/skc/platforms/cl_12/allocator_device_cl.h
+++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.h
@@ -51,4 +51,3 @@ skc_allocator_device_dispose(struct skc_runtime * const runtime);
//
//
//
-
diff --git a/src/compute/skc/platforms/cl_12/composition_cl_12.c b/src/compute/skc/platforms/cl_12/composition_cl_12.c
index 5db86762f3..96e7834b03 100644
--- a/src/compute/skc/platforms/cl_12/composition_cl_12.c
+++ b/src/compute/skc/platforms/cl_12/composition_cl_12.c
@@ -380,7 +380,8 @@ static
void
skc_composition_sort_grid_pfn_execute(skc_grid_t const grid)
{
- struct skc_composition_impl * const impl = skc_grid_get_data(grid);
+ struct skc_composition_impl * const impl = skc_grid_get_data(grid);
+ struct skc_runtime * const runtime = impl->runtime;
// we should be sealing
assert(impl->state == SKC_COMPOSITION_STATE_SEALING);
@@ -395,22 +396,24 @@ skc_composition_sort_grid_pfn_execute(skc_grid_t const grid)
{
uint32_t keys_padded_in, keys_padded_out;
- hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out);
+ hs_cl_pad(runtime->hs,atomics->keys,&keys_padded_in,&keys_padded_out);
- hs_sort(impl->cq,
- impl->keys.drw,
- impl->keys.drw,
- atomics->keys,
- keys_padded_in,
- keys_padded_out,
- false);
+ hs_cl_sort(impl->runtime->hs,
+ impl->cq,
+ 0,NULL,NULL,
+ impl->keys.drw,
+ NULL,
+ atomics->keys,
+ keys_padded_in,
+ keys_padded_out,
+ false);
cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(impl->keys.drw)));
cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(impl->offsets.drw)));
cl(SetKernelArg(impl->kernels.segment,2,SKC_CL_ARG(impl->atomics.drw)));
// find start of each tile
- skc_device_enqueue_kernel(impl->runtime->device,
+ skc_device_enqueue_kernel(runtime->device,
SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK,
impl->cq,
impl->kernels.segment,
diff --git a/src/compute/skc/platforms/cl_12/export_cl_12.h b/src/compute/skc/platforms/cl_12/export_cl_12.h
index 23ff2343e6..244a5282f6 100644
--- a/src/compute/skc/platforms/cl_12/export_cl_12.h
+++ b/src/compute/skc/platforms/cl_12/export_cl_12.h
@@ -60,4 +60,3 @@ skc_surface_cl_12_create(struct skc_context * const context,
//
//
//
-
diff --git a/src/compute/skc/platforms/cl_12/extent_cl_12.c b/src/compute/skc/platforms/cl_12/extent_cl_12.c
index e145d979c2..2d90d0ecfa 100644
--- a/src/compute/skc/platforms/cl_12/extent_cl_12.c
+++ b/src/compute/skc/platforms/cl_12/extent_cl_12.c
@@ -166,7 +166,7 @@ skc_extent_thr_tdrw_alloc(struct skc_runtime * const runtime,
{
extent->size = size;
extent->hr = skc_runtime_host_temp_alloc(runtime,
- SKC_MEM_FLAGS_READ_WRITE,
+ SKC_MEM_FLAGS_READ_ONLY,
size,&extent->id.hr,NULL);
extent->drw = skc_runtime_device_temp_alloc(runtime,
CL_MEM_READ_WRITE,
diff --git a/src/compute/skc/platforms/cl_12/interop/interop_glfw.c b/src/compute/skc/platforms/cl_12/interop/interop_glfw.c
index 8f94100552..f3c11ee9f1 100644
--- a/src/compute/skc/platforms/cl_12/interop/interop_glfw.c
+++ b/src/compute/skc/platforms/cl_12/interop/interop_glfw.c
@@ -354,7 +354,7 @@ skc_interop_create()
.interop = interop,
.post_render = skc_interop_blit },
- .is_msecs = true,
+ .is_msecs = false,
.is_srgb = true,
.is_vsync_on = false,
.is_fullscreen = false,
@@ -747,5 +747,3 @@ skc_interop_get_size(struct skc_interop * interop,
//
//
//
-
-
diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c
index 0be97235f3..9ff0ba53b7 100644
--- a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c
+++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c
@@ -23,7 +23,7 @@
#include "device_cl_12.h"
#include "hs/cl/hs_cl_launcher.h"
-#include "hs/cl/gen9/hs_cl.h"
+#include "hs/cl/intel/gen8/u64/hs_target.h"
//
//
@@ -500,11 +500,11 @@ skc_device_shaper_segment_ttrk(size_t const work_size,
size_t * const work_local)
{
// work_size is number of keys -- round up to a whole slab
- size_t keys_ru = SKC_ROUND_UP(work_size,HS_LANES_PER_WARP*HS_KEYS_PER_LANE);
+ size_t keys_ru = SKC_ROUND_UP(work_size,HS_SLAB_WIDTH*HS_SLAB_HEIGHT);
work_dim [0] = 1;
- work_global[0] = keys_ru / HS_KEYS_PER_LANE;
- work_local [0] = HS_LANES_PER_WARP; // or just return NULL
+ work_global[0] = keys_ru / HS_SLAB_HEIGHT;
+ work_local [0] = HS_SLAB_WIDTH; // or just return NULL
return work_local;
}
@@ -517,11 +517,11 @@ skc_device_shaper_segment_ttck(size_t const work_size,
size_t * const work_local)
{
// work_size is number of keys -- round up to a whole slab
- size_t keys_ru = SKC_ROUND_UP(work_size,HS_LANES_PER_WARP*HS_KEYS_PER_LANE);
+ size_t keys_ru = SKC_ROUND_UP(work_size,HS_SLAB_WIDTH*HS_SLAB_HEIGHT);
work_dim [0] = 1;
- work_global[0] = keys_ru / HS_KEYS_PER_LANE;
- work_local [0] = HS_LANES_PER_WARP; // or just return NULL
+ work_global[0] = keys_ru / HS_SLAB_HEIGHT;
+ work_local [0] = HS_SLAB_WIDTH; // or just return NULL
return work_local;
}
@@ -894,8 +894,10 @@ skc_device_create(struct skc_runtime * const runtime)
SKC_DEVICE_BUILD_PROGRAM(paths_reclaim);
SKC_DEVICE_BUILD_PROGRAM(rasters_reclaim);
- // create HotSort instance -- FIXME -- how this occurs needs to be cleaned up
- hs_create(runtime->cl.context,runtime->cl.device_id,NULL);
+ // create HotSort instance
+ runtime->hs = hs_cl_create(&hs_target,
+ runtime->cl.context,
+ runtime->cl.device_id);
}
void
@@ -906,6 +908,8 @@ skc_device_dispose(struct skc_runtime * const runtime)
//
skc_runtime_host_perm_free(runtime,runtime->device);
+
+ // dispose of hotsort etc.
}
//
diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h
index 0cac2261e7..224d5c9d91 100644
--- a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h
+++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h
@@ -18,12 +18,6 @@
#include "block.h"
//
-//
-//
-
-#include <hs/cl/gen9/hs_cl_macros.h>
-
-//
// HOW TO SELECT A SUBBLOCK AND BLOCK SIZES:
//
// 1) The subblock size should match the natural SIMT/SIMD width of
diff --git a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl
index f20f6456b3..a879c99b00 100644
--- a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl
@@ -1029,8 +1029,8 @@ skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT c
{
return max(1.0f,
ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *
- SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x),
- fabs(t2y - 2.0f * t1y + t0y)))));
+ SKC_WANG_LENGTH(t2x - 2.0f * t1x + t0x,
+ t2y - 2.0f * t1y + t0y))));
}
//
diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl
index 7f48978782..a6a2df661c 100644
--- a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl
@@ -15,13 +15,13 @@
#include "tile.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"
+#include "hs/cl/intel/gen8/u64/hs_cl_macros.h"
//
//
//
-#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
-#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
+#define HS_LANE_MASK (HS_SLAB_WIDTH - 1)
//
//
@@ -35,23 +35,23 @@
//
__kernel
-__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
+__attribute__((intel_reqd_sub_group_size(HS_SLAB_WIDTH)))
void
skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const indices,
__global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
{
uint const global_id = get_global_id(0);
- uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
+ uint const gmem_base = (global_id >> HS_SLAB_WIDTH_LOG2) * HS_SLAB_KEYS;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
- uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
+ uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_SLAB_HEIGHT;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
- HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
+ HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_SLAB_WIDTH];
HS_SLAB_ROWS();
@@ -103,11 +103,11 @@ skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const v
//
uint next = 0;
- if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
+ if (get_sub_group_local_id() == HS_SLAB_WIDTH-1)
next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
// distribute base across subgroup
- next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
+ next = exclusive + sub_group_broadcast(next,HS_SLAB_WIDTH-1);
//
// STORE THE INDICES
diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
index 9db82d5f98..c4ace0b2a1 100644
--- a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
@@ -13,15 +13,15 @@
//
#include "tile.h"
-#include "raster_builder_cl_12.h" // need meta_in structure
#include "kernel_cl_12.h"
+#include "raster_builder_cl_12.h" // need meta_in structure
+#include "hs/cl/intel/gen8/u64/hs_cl_macros.h"
//
//
//
-#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
-#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
+#define HS_LANE_MASK (HS_SLAB_WIDTH - 1)
//
// THE BEST TYPE TO ZERO SMEM
@@ -39,7 +39,7 @@
// 3: rk
//
-#if (HS_KEYS_PER_SLAB < 256)
+#if (HS_SLAB_KEYS < 256)
#define SKC_META_TYPE uint
#define SKC_META_WORDS 1
@@ -96,7 +96,7 @@
#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
-#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
+#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_SLAB_WIDTH_LOG2))
#define SKC_META_COMPONENTS 4
#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
@@ -106,7 +106,7 @@
//
__kernel
-__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
+__attribute__((intel_reqd_sub_group_size(HS_SLAB_WIDTH)))
void
skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const metas)
@@ -119,16 +119,16 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
} shared;
uint const global_id = get_global_id(0);
- uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
+ uint const gmem_base = (global_id >> HS_SLAB_WIDTH_LOG2) * HS_SLAB_KEYS;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
- uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
+ uint const gmem_off = (global_id & HS_LANE_MASK) * HS_SLAB_HEIGHT;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
- HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
+ HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_SLAB_WIDTH];
HS_SLAB_ROWS();
@@ -169,7 +169,7 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
// DEBUG
//
#if 0
- if (gmem_base == HS_KEYS_PER_SLAB * 7)
+ if (gmem_base == HS_SLAB_KEYS * 7)
{
if (get_sub_group_local_id() == 0)
printf("\n%llX ",as_ulong(r0));
@@ -267,14 +267,14 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
// the min cohort is the first key in the slab
uint const c_min = sub_group_broadcast(c1,0);
-
+
// the max cohort is the max across all lanes
c_max = sub_group_reduce_max(c_max);
#if 0 // REMOVE ME LATER
if (get_sub_group_local_id() == 0)
printf("%3u : ( %3u , %3u )\n",
- get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
+ get_global_id(0)>>HS_SLAB_WIDTH_LOG2,c_min,c_max);
#endif
//
@@ -286,7 +286,7 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
- for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
+ for (; zz<=zz_max; zz+=HS_SLAB_WIDTH)
shared.z[zz] = 0;
#else
// ERROR -- it's highly unlikely that the zero type is smaller than
@@ -348,7 +348,7 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
// ATOMICALLY ADD THE CARRIED OUT METAS
//
#if 0 // BUG
- if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
+ if ((valid & (1<<(HS_SLAB_HEIGHT-1))) && (meta != 0))
SKC_META_LOCAL_ADD(meta);
#else
if (meta != 0)
@@ -378,9 +378,9 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
atomic_add(metas+cc,c+adjust);
}
- cc += HS_LANES_PER_WARP;
+ cc += HS_SLAB_WIDTH;
- for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
+ for (; cc<=cc_max; cc+=HS_SLAB_WIDTH)
{
uint const c = shared.c[cc];
diff --git a/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c
index d84b92bfd7..507e1bf077 100644
--- a/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c
+++ b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c
@@ -566,15 +566,17 @@ skc_raster_cohort_sort_prefix(skc_grid_t const grid)
//
uint32_t keys_padded_in, keys_padded_out;
- hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out);
-
- hs_sort(cohort->cq,
- cohort->keys.drw,
- cohort->keys.drw,
- atomics->keys,
- keys_padded_in,
- keys_padded_out,
- false);
+ hs_cl_pad(runtime->hs,atomics->keys,&keys_padded_in,&keys_padded_out);
+
+ hs_cl_sort(runtime->hs,
+ cohort->cq,
+ 0,NULL,NULL,
+ cohort->keys.drw,
+ NULL,
+ atomics->keys,
+ keys_padded_in,
+ keys_padded_out,
+ false);
cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(cohort->keys.drw)));
cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(cohort->metas.drw)));
diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.c b/src/compute/skc/platforms/cl_12/runtime_cl_12.c
index 81f3aba02f..55b2854c4d 100644
--- a/src/compute/skc/platforms/cl_12/runtime_cl_12.c
+++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.c
@@ -277,4 +277,3 @@ skc_runtime_cl_12_debug(struct skc_context * const context)
//
//
//
-
diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.h b/src/compute/skc/platforms/cl_12/runtime_cl_12.h
index beb924f3ca..1cfd51161e 100644
--- a/src/compute/skc/platforms/cl_12/runtime_cl_12.h
+++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.h
@@ -58,6 +58,8 @@ struct skc_runtime
struct skc_config const * config; // FIXME: config will be determined by device with some opportunities to resize
struct skc_device * device; // opaque bundle of kernels
+
+ struct hs_cl const * hs; // opaque hotsort
};
//
@@ -185,4 +187,3 @@ skc_runtime_cl_12_debug(struct skc_context * const context);
//
//
//
-
diff --git a/src/compute/skc/platforms/cl_12/surface_cl_12.c b/src/compute/skc/platforms/cl_12/surface_cl_12.c
index c4e205a04b..e5f79c2d53 100644
--- a/src/compute/skc/platforms/cl_12/surface_cl_12.c
+++ b/src/compute/skc/platforms/cl_12/surface_cl_12.c
@@ -56,6 +56,7 @@ struct skc_surface_impl
struct skc_surface_render
{
skc_uint clip[4];
+ skc_uint txty[2];
struct skc_surface_impl * impl;
struct skc_styling * styling;
@@ -329,6 +330,7 @@ skc_surface_pfn_render(struct skc_surface_impl * const impl,
skc_composition_t composition,
skc_framebuffer_t fb,
uint32_t const clip[4],
+ int32_t const txty[2],
skc_surface_render_notify notify,
void * data)
{
@@ -359,6 +361,9 @@ skc_surface_pfn_render(struct skc_surface_impl * const impl,
render->clip[2] = clip[2];
render->clip[3] = clip[3];
+ render->txty[0] = txty[0];
+ render->txty[1] = txty[1];
+
render->impl = impl;
render->styling = styling;
render->composition = composition;
diff --git a/src/compute/skc/skc.h b/src/compute/skc/skc.h
index a5e81fb2ff..53d5f273af 100644
--- a/src/compute/skc/skc.h
+++ b/src/compute/skc/skc.h
@@ -323,6 +323,7 @@ skc_surface_render(skc_surface_t surface,
skc_composition_t composition,
skc_framebuffer_t fb,
uint32_t const clip[4],
+ int32_t const txty[2],
skc_surface_render_notify notify,
void * data);
diff --git a/src/compute/skc/styling.h b/src/compute/skc/styling.h
index 310a739a07..b5326f6090 100644
--- a/src/compute/skc/styling.h
+++ b/src/compute/skc/styling.h
@@ -52,4 +52,3 @@ struct skc_styling
//
//
//
-
diff --git a/src/compute/skc/styling_types.h b/src/compute/skc/styling_types.h
index ee0e7aa7e3..10442e8f05 100644
--- a/src/compute/skc/styling_types.h
+++ b/src/compute/skc/styling_types.h
@@ -246,5 +246,3 @@ SKC_STATIC_ASSERT(sizeof(union skc_gradient_vector) == sizeof(skc_float4));
//
//
//
-
-
diff --git a/src/compute/skc/surface.c b/src/compute/skc/surface.c
index 3d96bb65ac..107c02dd84 100644
--- a/src/compute/skc/surface.c
+++ b/src/compute/skc/surface.c
@@ -44,6 +44,7 @@ skc_surface_render(skc_surface_t surface,
skc_composition_t composition,
skc_framebuffer_t fb,
uint32_t const clip[4],
+ int32_t const txty[2],
skc_surface_render_notify notify,
void * data)
{
@@ -70,7 +71,10 @@ skc_surface_render(skc_surface_t surface,
// non-overlapping clips. This is fairly easy but at this point
// doesn't seem like a common use case.
//
- surface->render(surface->impl,styling,composition,fb,clip,notify,data);
+ surface->render(surface->impl,
+ styling,composition,
+ fb,clip,txty,
+ notify,data);
return SKC_ERR_SUCCESS;
}
diff --git a/src/compute/skc/surface.h b/src/compute/skc/surface.h
index 94f9128841..8d363569cb 100644
--- a/src/compute/skc/surface.h
+++ b/src/compute/skc/surface.h
@@ -33,6 +33,7 @@ struct skc_surface
skc_composition_t composition,
skc_framebuffer_t fb,
uint32_t const clip[4],
+ int32_t const txty[2],
skc_surface_render_notify notify,
void * data);
};
diff --git a/src/compute/skc/weakref.h b/src/compute/skc/weakref.h
index c6ce6490f8..d239b7e9f7 100644
--- a/src/compute/skc/weakref.h
+++ b/src/compute/skc/weakref.h
@@ -46,5 +46,3 @@ skc_weakref_index(skc_weakref_t const * const weakref);
//
//
//
-
-