From 9e0d7e4072e43495a3907bb2bac7824e8e60c368 Mon Sep 17 00:00:00 2001 From: Allan MacKinnon Date: Mon, 16 Jul 2018 15:57:05 -0700 Subject: Bug fixes and improvements to SKC and HotSort. Vulkan is WIP. Bug: skia: Change-Id: Iffc75a5b4dfcbfa4a6c23d972bb9798c2f550335 Reviewed-on: https://skia-review.googlesource.com/141582 Reviewed-by: Mike Reed Reviewed-by: Allan MacKinnon Commit-Queue: Allan MacKinnon --- src/compute/common/cl/assert_cl.c | 2 +- src/compute/common/cl/assert_cl.h | 1 - src/compute/common/cl/find_cl.c | 11 +- src/compute/common/cl/find_cl.h | 1 - src/compute/common/macros.h | 41 +- src/compute/common/util.c | 4 +- src/compute/common/util.h | 2 - src/compute/hs/cl/bench/main.c | 262 +- src/compute/hs/cl/gen9/hs_cl.cl | 10082 ------------------- src/compute/hs/cl/gen9/hs_cl.h | 122 - src/compute/hs/cl/gen9/hs_cl_macros.h | 199 - src/compute/hs/cl/gen9/make_all.bat | 16 - src/compute/hs/cl/gen9/make_inl_cl.bat | 78 - src/compute/hs/cl/hs_cl_launcher.c | 1524 +-- src/compute/hs/cl/hs_cl_launcher.h | 62 +- src/compute/hs/cl/hs_cl_target.h | 63 + src/compute/hs/cl/intel/gen8/u32/make_all.bat | 16 + src/compute/hs/cl/intel/gen8/u32/make_inl_cl.bat | 77 + src/compute/hs/cl/intel/gen8/u32b32/make_all.bat | 16 + .../hs/cl/intel/gen8/u32b32/make_inl_cl.bat | 77 + src/compute/hs/cl/intel/gen8/u64/hs_cl.cl | 4851 +++++++++ src/compute/hs/cl/intel/gen8/u64/hs_cl.h | 100 + src/compute/hs/cl/intel/gen8/u64/hs_cl_macros.h | 361 + src/compute/hs/cl/intel/gen8/u64/hs_target.h | 115 + src/compute/hs/cl/intel/gen8/u64/make_all.bat | 26 + src/compute/hs/cl/intel/gen8/u64/make_inl_cl.bat | 113 + src/compute/hs/cl/intel/gen9lp/u32/make_inl_cl.bat | 77 + .../hs/cl/intel/gen9lp/u32b32/make_inl_cl.bat | 77 + src/compute/hs/cl/intel/gen9lp/u64/make_inl_cl.bat | 77 + src/compute/hs/gen/gen.h | 112 +- src/compute/hs/gen/main.c | 532 +- src/compute/hs/gen/networks_merging.c | 4 +- src/compute/hs/gen/networks_sorting.c | 4 +- src/compute/hs/gen/target_cuda.c | 600 ++ 
src/compute/hs/gen/target_cuda_sm3x.c | 776 -- src/compute/hs/gen/target_debug.c | 73 + src/compute/hs/gen/target_glsl.c | 674 ++ src/compute/hs/gen/target_igp_genx.c | 672 -- src/compute/hs/gen/target_opencl.c | 600 ++ src/compute/hs/gen/transpose.c | 61 +- src/compute/hs/gen/transpose.h | 6 +- src/compute/hs/vk/hs_spirv_target.h | 77 + src/compute/hs/vk/hs_vk_launcher.c | 248 + src/compute/hs/vk/hs_vk_launcher.h | 88 + src/compute/hs/vk/intel/gen8/u32/make_all.bat | 48 + src/compute/hs/vk/intel/gen8/u32b32/make_all.bat | 48 + src/compute/hs/vk/intel/gen8/u64/hs_glsl.h | 100 + src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h | 417 + src/compute/hs/vk/intel/gen8/u64/hs_kernels.h | 75 + src/compute/hs/vk/intel/gen8/u64/hs_target.h | 113 + src/compute/hs/vk/intel/gen8/u64/make_all.bat | 79 + src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat | 48 + src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat | 48 + src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat | 48 + src/compute/skc/extent_ring.c | 1 - src/compute/skc/main.c | 6 +- src/compute/skc/path_builder.h | 1 - .../skc/platforms/cl_12/allocator_device_cl.c | 1 - .../skc/platforms/cl_12/allocator_device_cl.h | 1 - .../skc/platforms/cl_12/composition_cl_12.c | 23 +- src/compute/skc/platforms/cl_12/export_cl_12.h | 1 - src/compute/skc/platforms/cl_12/extent_cl_12.c | 2 +- .../skc/platforms/cl_12/interop/interop_glfw.c | 4 +- .../cl_12/kernels/devices/gen9/device_cl_12.c | 22 +- .../cl_12/kernels/devices/gen9/kernel_cl_12.h | 6 - .../skc/platforms/cl_12/kernels/rasterize.cl | 4 +- .../skc/platforms/cl_12/kernels/segment_ttck.cl | 16 +- .../skc/platforms/cl_12/kernels/segment_ttrk.cl | 32 +- .../skc/platforms/cl_12/raster_builder_cl_12.c | 20 +- src/compute/skc/platforms/cl_12/runtime_cl_12.c | 1 - src/compute/skc/platforms/cl_12/runtime_cl_12.h | 3 +- src/compute/skc/platforms/cl_12/surface_cl_12.c | 5 + src/compute/skc/skc.h | 1 + src/compute/skc/styling.h | 1 - src/compute/skc/styling_types.h | 2 - 
src/compute/skc/surface.c | 6 +- src/compute/skc/surface.h | 1 + src/compute/skc/weakref.h | 2 - 78 files changed, 10796 insertions(+), 13370 deletions(-) delete mode 100644 src/compute/hs/cl/gen9/hs_cl.cl delete mode 100644 src/compute/hs/cl/gen9/hs_cl.h delete mode 100644 src/compute/hs/cl/gen9/hs_cl_macros.h delete mode 100644 src/compute/hs/cl/gen9/make_all.bat delete mode 100644 src/compute/hs/cl/gen9/make_inl_cl.bat create mode 100644 src/compute/hs/cl/hs_cl_target.h create mode 100644 src/compute/hs/cl/intel/gen8/u32/make_all.bat create mode 100644 src/compute/hs/cl/intel/gen8/u32/make_inl_cl.bat create mode 100644 src/compute/hs/cl/intel/gen8/u32b32/make_all.bat create mode 100644 src/compute/hs/cl/intel/gen8/u32b32/make_inl_cl.bat create mode 100644 src/compute/hs/cl/intel/gen8/u64/hs_cl.cl create mode 100644 src/compute/hs/cl/intel/gen8/u64/hs_cl.h create mode 100644 src/compute/hs/cl/intel/gen8/u64/hs_cl_macros.h create mode 100644 src/compute/hs/cl/intel/gen8/u64/hs_target.h create mode 100644 src/compute/hs/cl/intel/gen8/u64/make_all.bat create mode 100644 src/compute/hs/cl/intel/gen8/u64/make_inl_cl.bat create mode 100644 src/compute/hs/cl/intel/gen9lp/u32/make_inl_cl.bat create mode 100644 src/compute/hs/cl/intel/gen9lp/u32b32/make_inl_cl.bat create mode 100644 src/compute/hs/cl/intel/gen9lp/u64/make_inl_cl.bat create mode 100644 src/compute/hs/gen/target_cuda.c delete mode 100644 src/compute/hs/gen/target_cuda_sm3x.c create mode 100644 src/compute/hs/gen/target_debug.c create mode 100644 src/compute/hs/gen/target_glsl.c delete mode 100644 src/compute/hs/gen/target_igp_genx.c create mode 100644 src/compute/hs/gen/target_opencl.c create mode 100644 src/compute/hs/vk/hs_spirv_target.h create mode 100644 src/compute/hs/vk/hs_vk_launcher.c create mode 100644 src/compute/hs/vk/hs_vk_launcher.h create mode 100644 src/compute/hs/vk/intel/gen8/u32/make_all.bat create mode 100644 src/compute/hs/vk/intel/gen8/u32b32/make_all.bat create mode 100644 
src/compute/hs/vk/intel/gen8/u64/hs_glsl.h create mode 100644 src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h create mode 100644 src/compute/hs/vk/intel/gen8/u64/hs_kernels.h create mode 100644 src/compute/hs/vk/intel/gen8/u64/hs_target.h create mode 100644 src/compute/hs/vk/intel/gen8/u64/make_all.bat create mode 100644 src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat create mode 100644 src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat create mode 100644 src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat (limited to 'src/compute') diff --git a/src/compute/common/cl/assert_cl.c b/src/compute/common/cl/assert_cl.c index 5d420586b3..944256daec 100644 --- a/src/compute/common/cl/assert_cl.c +++ b/src/compute/common/cl/assert_cl.c @@ -129,7 +129,7 @@ assert_cl(cl_int const code, char const * const file, int const line, bool const char const * const cl_err_str = cl_get_error_string(code); fprintf(stderr, - "\"%s\", line %d: cl_assert (%d) = \"%s\"", + "\"%s\", line %d: assert_cl( %d ) = \"%s\"", file,line,code,cl_err_str); if (abort) diff --git a/src/compute/common/cl/assert_cl.h b/src/compute/common/cl/assert_cl.h index 517ada8d37..efe698f29e 100644 --- a/src/compute/common/cl/assert_cl.h +++ b/src/compute/common/cl/assert_cl.h @@ -53,4 +53,3 @@ cl_get_event_command_type_string(cl_command_type const type); // // // - diff --git a/src/compute/common/cl/find_cl.c b/src/compute/common/cl/find_cl.c index a04d9ebd69..6c500c0865 100644 --- a/src/compute/common/cl/find_cl.c +++ b/src/compute/common/cl/find_cl.c @@ -45,7 +45,7 @@ clFindIdsByName(char const * const target_platform_substring, cl(GetPlatformIDs(0,NULL,&platform_count)); - cl_platform_id * const platform_ids = ALLOCA(sizeof(*platform_ids) * platform_count); + cl_platform_id * const platform_ids = ALLOCA_MACRO(sizeof(*platform_ids) * platform_count); cl(GetPlatformIDs(platform_count,platform_ids,NULL)); @@ -62,7 +62,7 @@ clFindIdsByName(char const * const target_platform_substring, NULL, &platform_name_size)); - 
char * const platform_name = ALLOCA(platform_name_size); + char * const platform_name = ALLOCA_MACRO(platform_name_size); cl(GetPlatformInfo(platform_ids[ii], CL_PLATFORM_NAME, @@ -93,7 +93,7 @@ clFindIdsByName(char const * const target_platform_substring, NULL, &device_count); - cl_device_id * const device_ids = ALLOCA(sizeof(*device_ids) * device_count); + cl_device_id * const device_ids = ALLOCA_MACRO(sizeof(*device_ids) * device_count); cl_err = clGetDeviceIDs(platform_ids[ii], CL_DEVICE_TYPE_ALL, @@ -121,8 +121,8 @@ clFindIdsByName(char const * const target_platform_substring, NULL, &driver_version_size)); - char * const device_name = ALLOCA(device_name_size); - char * const driver_version = ALLOCA(driver_version_size); + char * const device_name = ALLOCA_MACRO(device_name_size); + char * const driver_version = ALLOCA_MACRO(driver_version_size); cl(GetDeviceInfo(device_ids[jj], CL_DEVICE_NAME, @@ -207,4 +207,3 @@ clFindIdsByName(char const * const target_platform_substring, // // // - diff --git a/src/compute/common/cl/find_cl.h b/src/compute/common/cl/find_cl.h index 5143e39f85..6dbfe10838 100644 --- a/src/compute/common/cl/find_cl.h +++ b/src/compute/common/cl/find_cl.h @@ -32,4 +32,3 @@ clFindIdsByName(char const * const target_platform_substring, // // // - diff --git a/src/compute/common/macros.h b/src/compute/common/macros.h index 52dc8689fc..266b58f108 100644 --- a/src/compute/common/macros.h +++ b/src/compute/common/macros.h @@ -12,16 +12,35 @@ // // -#define ARRAY_LENGTH(x) (sizeof(x)/sizeof(x[0])) +#include // // // -#define MAX_MACRO(a,b) (((a) > (b)) ? (a) : (b)) -#define MIN_MACRO(a,b) (((a) < (b)) ? (a) : (b)) -#define GTE_MACRO(a,b) ((a) >= (b)) -#define LT_MACRO(a,b) ((a) < (b)) +#define ARRAY_LENGTH_MACRO(x) (sizeof(x)/sizeof(x[0])) +#define OFFSET_OF_MACRO(t,m) ((size_t)&(((t*)0)->m)) +#define MEMBER_SIZE_MACRO(t,m) sizeof(((t*)0)->m) + + +// +// +// + +#define MAX_MACRO(a,b) (((a) > (b)) ? (a) : (b)) +#define MIN_MACRO(a,b) (((a) < (b)) ? 
(a) : (b)) +#define GTE_MACRO(a,b) ((a) >= (b)) +#define LT_MACRO(a,b) ((a) < (b)) + +// +// +// + +#if defined(_MSC_VER) +#define ALLOCA_MACRO(n) _alloca(n) +#else +#define ALLOCA_MACRO(n) alloca(n) +#endif // // @@ -34,14 +53,14 @@ #define BITS_TO_MASK_AT_64(n,b) (BITS_TO_MASK_64(n)<<(b)) // -// +// Convert 4 byte pointer to network order dword to a host order. // -#if defined(_MSC_VER) -#define ALLOCA(n) _alloca(n) -#else -#define ALLOCA(n) alloca(n) -#endif +#define NPBTOHL_MACRO(pb4) ((((pb4)[0])<<24) | (((pb4)[1])<<16) | \ + (((pb4)[2])<< 8) | (pb4)[3]) + +#define NTOHL_MACRO(nl) ntohl(nl) + // // // diff --git a/src/compute/common/util.c b/src/compute/common/util.c index eb05d91a9f..51a8e0128e 100644 --- a/src/compute/common/util.c +++ b/src/compute/common/util.c @@ -59,12 +59,11 @@ pow2_rd_u32(uint32_t n) uint32_t msb_idx_u32(uint32_t n) { - #ifdef _MSC_VER uint32_t index; - _BitScanReverse(&index,n); + _BitScanReverse((unsigned long *)&index,n); return index; @@ -78,7 +77,6 @@ msb_idx_u32(uint32_t n) #error "No msb_index()" #endif - } // diff --git a/src/compute/common/util.h b/src/compute/common/util.h index 7d5a7b4600..113e26d789 100644 --- a/src/compute/common/util.h +++ b/src/compute/common/util.h @@ -27,5 +27,3 @@ uint32_t msb_idx_u32(uint32_t n); // 0-based bit position // // // - - diff --git a/src/compute/hs/cl/bench/main.c b/src/compute/hs/cl/bench/main.c index 3b9ef6e1c7..bfa7c1da38 100644 --- a/src/compute/hs/cl/bench/main.c +++ b/src/compute/hs/cl/bench/main.c @@ -32,9 +32,10 @@ #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #endif -#include "macros.h" -#include "assert_cl.h" -#include "find_cl.h" +#include "common/macros.h" +#include "common/cl/assert_cl.h" +#include "common/cl/find_cl.h" + #include "hs_cl_launcher.h" // @@ -90,10 +91,10 @@ char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count); static char const * -hs_cpu_sort(void * sorted_h, - uint32_t const count, - struct hs_info const * const info, - double * const cpu_ns) 
+hs_cpu_sort(uint32_t const hs_words, + void * sorted_h, + uint32_t const count, + double * const cpu_ns) { char const * algo; @@ -101,7 +102,7 @@ hs_cpu_sort(void * sorted_h, QueryPerformanceCounter(&t0); - if (info->words == 1) + if (hs_words == 1) algo = hs_cpu_sort_u32(sorted_h,count); else algo = hs_cpu_sort_u64(sorted_h,count); @@ -117,27 +118,34 @@ hs_cpu_sort(void * sorted_h, static bool -hs_verify_linear(void * sorted_h, void * vout_h, const uint32_t count, struct hs_info const * const info) +hs_verify_linear(uint32_t const hs_words, + void * sorted_h, + void * vout_h, + uint32_t const count) { - return memcmp(sorted_h, vout_h, sizeof(uint32_t) * info->words * count) == 0; + return memcmp(sorted_h, vout_h, sizeof(uint32_t) * hs_words * count) == 0; } static void -hs_transpose_slabs_u32(uint32_t * vout_h, const uint32_t count, struct hs_info const * const info) +hs_transpose_slabs_u32(uint32_t const hs_words, + uint32_t const hs_width, + uint32_t const hs_height, + uint32_t * vout_h, + uint32_t const count) { - uint32_t const slab_keys = info->keys * info->lanes; - size_t const slab_size = sizeof(uint32_t) * info->words * slab_keys; - uint32_t * const slab = _alloca(slab_size); + uint32_t const slab_keys = hs_width * hs_height; + size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys; + uint32_t * const slab = ALLOCA_MACRO(slab_size); uint32_t slab_count = count / slab_keys; while (slab_count-- > 0) { memcpy(slab,vout_h,slab_size); - for (uint32_t row=0; rowkeys; row++) - for (uint32_t col=0; collanes; col++) - vout_h[col * info->keys + row] = slab[row * info->lanes + col]; + for (uint32_t row=0; rowkeys * info->lanes; - size_t const slab_size = sizeof(uint32_t) * info->words * slab_keys; - uint64_t * const slab = _alloca(slab_size); + uint32_t const slab_keys = hs_width * hs_height; + size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys; + uint64_t * const slab = ALLOCA_MACRO(slab_size); uint32_t slab_count = count / slab_keys; while 
(slab_count-- > 0) { memcpy(slab,vout_h,slab_size); - for (uint32_t row=0; rowkeys; row++) - for (uint32_t col=0; collanes; col++) - vout_h[col * info->keys + row] = slab[row * info->lanes + col]; + for (uint32_t row=0; rowwords == 1) - hs_transpose_slabs_u32(vout_h,count,info); + if (hs_words == 1) + hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count); else - hs_transpose_slabs_u64(vout_h,count,info); + hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count); } // @@ -180,18 +196,18 @@ hs_transpose_slabs(void * vout_h, const uint32_t count, struct hs_info const * c static void -hs_debug_u32( - uint32_t const * vout_h, - uint32_t const count, - struct hs_info const * const info) +hs_debug_u32(uint32_t const hs_width, + uint32_t const hs_height, + uint32_t const * vout_h, + uint32_t const count) { - uint32_t const slab = info->keys * info->lanes; - uint32_t const slabs = (count + slab - 1) / slab; + uint32_t const slab_keys = hs_width * hs_height; + uint32_t const slabs = (count + slab_keys - 1) / slab_keys; for (uint32_t ss=0; sskeys; cc++) { - for (uint32_t rr=0; rrlanes; rr++) + for (uint32_t cc=0; cckeys * info->lanes; - uint32_t const slabs = (count + slab - 1) / slab; + uint32_t const slab_keys = hs_width * hs_height; + uint32_t const slabs = (count + slab_keys - 1) / slab_keys; for (uint32_t ss=0; sskeys; cc++) { - for (uint32_t rr=0; rrlanes; rr++) + for (uint32_t cc=0; ccwords; + size_t const key_size = sizeof(uint32_t) * hs_words; - size_t const size_hi = count_hi * key_size; size_t const size_hi_in = count_hi_padded_in * key_size; size_t const size_hi_out = count_hi_padded_out * key_size; @@ -363,7 +386,7 @@ hs_bench(cl_context context, &cl_err); cl_ok(cl_err); // fill with random numbers - hs_fill_rand(random_h,count_hi,info->words); + hs_fill_rand(random_h,count_hi,hs_words); // // UNMAP @@ -379,16 +402,14 @@ hs_bench(cl_context context, // compute padding before sorting uint32_t count_padded_in, count_padded_out; - 
hs_pad(count,&count_padded_in,&count_padded_out); + hs_cl_pad(hs,count,&count_padded_in,&count_padded_out); cl_ulong elapsed_ns_min = ULONG_MAX; cl_ulong elapsed_ns_max = 0; cl_ulong elapsed_ns_sum = 0; -#if 1 - cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL)); - cl(Finish(cq)); -#endif + cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL)); + cl(Finish(cq)); for (uint32_t ii=0; iiwords == 1) - hs_debug_u32(vout_h,count,info); + if (hs_words == 1) + hs_debug_u32(hs_width,hs_height,vout_h,count); else // ulong - hs_debug_u64(vout_h,count,info); + hs_debug_u64(hs_width,hs_height,vout_h,count); } #endif @@ -519,7 +542,7 @@ hs_bench(cl_context context, // fprintf(stdout,"%s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n", device_name, - (info->words == 1) ? "uint" : "ulong", + (hs_words == 1) ? "uint" : "ulong", linearize ? "linear" : "slab", verified ? " OK " : "*FAIL*", count, @@ -555,8 +578,15 @@ hs_bench(cl_context context, // // +#define HS_TARGET_NAME hs_target +#include "intel/gen8/u64/hs_target.h" + +// +// +// + int -main(int argc, char** argv) +main(int argc, char const * argv[]) { char const * const target_platform_substring = "Intel"; char const * const target_device_substring = "Graphics"; @@ -601,42 +631,63 @@ main(int argc, char** argv) // // create command queue // - cl_command_queue_properties const props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE; - #if 0 // OPENCL 2.0 - cl_queue_properties queue_properties[] = - { - CL_QUEUE_PROPERTIES, (cl_queue_properties)props, - 0 - }; + + cl_queue_properties props[] = { + CL_QUEUE_PROPERTIES, + (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, +#ifndef NDEBUG + (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, +#endif + 0 + }; + + cl_queue_properties props_profile[] = { + CL_QUEUE_PROPERTIES, + (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, + 0 + }; cl_command_queue cq = 
clCreateCommandQueueWithProperties(context, device_id, - queue_properties, + props, &cl_err); cl_ok(cl_err); + + cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context, + device_id, + props_profile, + &cl_err); cl_ok(cl_err); #else // OPENCL 1.2 + cl_command_queue cq = clCreateCommandQueue(context, device_id, - props, +#ifndef NDEBUG + CL_QUEUE_PROFILING_ENABLE | +#endif + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &cl_err); cl_ok(cl_err); + + cl_command_queue cq_profile = clCreateCommandQueue(context, + device_id, + CL_QUEUE_PROFILING_ENABLE, + &cl_err); cl_ok(cl_err); #endif + // + // Intel GEN workaround -- create dummy kernel for semi-accurate + // profiling on an out-of-order queue. + // + hs_dummy_kernel_create(context,device_id); + // // create kernels // fprintf(stdout,"Creating... "); - struct hs_info info; - - hs_create(context,device_id,&info); + struct hs_cl * const hs = hs_cl_create(&hs_target,context,device_id); fprintf(stdout,"done.\n"); - // - // create dummy kernel for profiling - // - hs_dummy_kernel_create(context,device_id); - // // // @@ -651,7 +702,7 @@ main(int argc, char** argv) // // sort sizes and loops // - uint32_t const kpb = info.keys * info.lanes; + uint32_t const kpb = hs_target.config.slab.height << hs_target.config.slab.width_log2; uint32_t const count_lo = (argc <= 1) ? kpb : strtoul(argv[1],NULL,0); uint32_t const count_hi = (argc <= 2) ? 
count_lo : strtoul(argv[2],NULL,0); @@ -663,15 +714,30 @@ main(int argc, char** argv) // // benchmark // - hs_bench(context,cq,device_name,&info,count_lo,count_hi,count_step,loops,warmup,linearize); + hs_bench(context, + cq,cq_profile, + device_name, + hs_target.config.words.key + hs_target.config.words.val, + 1 << hs_target.config.slab.width_log2, + hs_target.config.slab.height, + hs, + count_lo, + count_hi, + count_step, + loops, + warmup, + linearize); // // release everything // + hs_cl_release(hs); + hs_dummy_kernel_release(); - hs_release(); cl(ReleaseCommandQueue(cq)); + cl(ReleaseCommandQueue(cq_profile)); + cl(ReleaseContext(context)); return 0; diff --git a/src/compute/hs/cl/gen9/hs_cl.cl b/src/compute/hs/cl/gen9/hs_cl.cl deleted file mode 100644 index 63627ad068..0000000000 --- a/src/compute/hs/cl/gen9/hs_cl.cl +++ /dev/null @@ -1,10082 +0,0 @@ -// -// Copyright 2016 Google Inc. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
-// - -#include - -// -// -// - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_transpose(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - HS_KEY_TYPE r1 = (vout + gmem_idx)[0 * 8]; - HS_KEY_TYPE r2 = (vout + gmem_idx)[1 * 8]; - HS_KEY_TYPE r3 = (vout + gmem_idx)[2 * 8]; - HS_KEY_TYPE r4 = (vout + gmem_idx)[3 * 8]; - HS_KEY_TYPE r5 = (vout + gmem_idx)[4 * 8]; - HS_KEY_TYPE r6 = (vout + gmem_idx)[5 * 8]; - HS_KEY_TYPE r7 = (vout + gmem_idx)[6 * 8]; - HS_KEY_TYPE r8 = (vout + gmem_idx)[7 * 8]; - HS_KEY_TYPE r9 = (vout + gmem_idx)[8 * 8]; - HS_KEY_TYPE r10 = (vout + gmem_idx)[9 * 8]; - HS_KEY_TYPE r11 = (vout + gmem_idx)[10 * 8]; - HS_KEY_TYPE r12 = (vout + gmem_idx)[11 * 8]; - HS_KEY_TYPE r13 = (vout + gmem_idx)[12 * 8]; - HS_KEY_TYPE r14 = (vout + gmem_idx)[13 * 8]; - HS_KEY_TYPE r15 = (vout + gmem_idx)[14 * 8]; - HS_KEY_TYPE r16 = (vout + gmem_idx)[15 * 8]; - HS_TRANSPOSE_SLAB() -} - -__kernel __attribute__((reqd_work_group_size(128, 1, 1))) -__attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_bs_4(__global HS_KEY_TYPE const* const restrict vin, - __global HS_KEY_TYPE* const restrict vout) -{ - __local union - { - HS_KEY_TYPE m[16 * 128]; - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8]; - HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8]; - HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8]; - HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8]; - HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8]; - HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8]; - HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8]; - HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8]; - HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8]; - HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8]; - HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8]; - HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8]; - HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 
8]; - HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8]; - HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8]; - HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8]; - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r6, r11) - HS_CMP_XCHG(r7, r10) - HS_CMP_XCHG(r4, r13) - HS_CMP_XCHG(r14, r15) - HS_CMP_XCHG(r8, r12) - HS_CMP_XCHG(r2, r3) - HS_CMP_XCHG(r5, r9) - HS_CMP_XCHG(r2, r5) - HS_CMP_XCHG(r8, r14) - HS_CMP_XCHG(r3, r9) - HS_CMP_XCHG(r12, r15) - HS_CMP_XCHG(r3, r5) - HS_CMP_XCHG(r6, r7) - HS_CMP_XCHG(r10, r11) - HS_CMP_XCHG(r12, r14) - HS_CMP_XCHG(r4, r9) - HS_CMP_XCHG(r8, r13) - HS_CMP_XCHG(r7, r9) - HS_CMP_XCHG(r11, r13) - HS_CMP_XCHG(r4, r6) - HS_CMP_XCHG(r8, r10) - HS_CMP_XCHG(r4, r5) - HS_CMP_XCHG(r6, r7) - HS_CMP_XCHG(r8, r9) - HS_CMP_XCHG(r10, r11) - HS_CMP_XCHG(r12, r13) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - { - uint const flip_lane_mask = 1; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - 
HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - { - uint const flip_lane_mask = 3; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - { - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, 
r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - { - uint const flip_lane_mask = 7; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - { - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - { - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, 
r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - uint const smem_l_idx = get_sub_group_id() * 128 + get_sub_group_local_id(); - uint const smem_r_idx = - (get_sub_group_id() ^ 1) * 128 + (get_sub_group_local_id() ^ 7); - (shared.m + get_local_id(0))[16 * 8 * 0] = r1; - (shared.m + get_local_id(0))[16 * 8 * 1] = r16; - (shared.m + get_local_id(0))[16 * 8 * 2] = r2; - (shared.m + get_local_id(0))[16 * 8 * 3] = r15; - (shared.m + get_local_id(0))[16 * 8 * 4] = r3; - (shared.m + get_local_id(0))[16 * 8 * 5] = r14; - (shared.m + get_local_id(0))[16 * 8 * 6] = r4; - (shared.m + get_local_id(0))[16 * 8 * 7] = r13; - (shared.m + get_local_id(0))[16 * 8 * 8] = r5; - (shared.m + get_local_id(0))[16 * 8 * 9] = r12; - (shared.m + get_local_id(0))[16 * 8 * 10] = r6; - (shared.m + get_local_id(0))[16 * 8 * 11] = r11; - (shared.m + get_local_id(0))[16 * 8 * 12] = r7; - (shared.m + get_local_id(0))[16 * 8 * 13] = r10; - (shared.m + get_local_id(0))[16 * 8 * 14] = r8; - (shared.m + get_local_id(0))[16 * 8 * 15] = r9; - barrier(CLK_LOCAL_MEM_FENCE); - { - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_r_idx)[8] = r0_2; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16]; - HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24]; - HS_CMP_XCHG(r1_1, r1_2) - (shared.m + smem_l_idx)[16] = r1_1; - (shared.m + smem_r_idx)[24] = r1_2; - } - { - HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[32]; - HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[40]; - HS_CMP_XCHG(r2_1, r2_2) - 
(shared.m + smem_l_idx)[32] = r2_1; - (shared.m + smem_r_idx)[40] = r2_2; - } - { - HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[48]; - HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[56]; - HS_CMP_XCHG(r3_1, r3_2) - (shared.m + smem_l_idx)[48] = r3_1; - (shared.m + smem_r_idx)[56] = r3_2; - } - { - HS_KEY_TYPE r4_1 = (shared.m + smem_l_idx)[64]; - HS_KEY_TYPE r4_2 = (shared.m + smem_r_idx)[72]; - HS_CMP_XCHG(r4_1, r4_2) - (shared.m + smem_l_idx)[64] = r4_1; - (shared.m + smem_r_idx)[72] = r4_2; - } - { - HS_KEY_TYPE r5_1 = (shared.m + smem_l_idx)[80]; - HS_KEY_TYPE r5_2 = (shared.m + smem_r_idx)[88]; - HS_CMP_XCHG(r5_1, r5_2) - (shared.m + smem_l_idx)[80] = r5_1; - (shared.m + smem_r_idx)[88] = r5_2; - } - { - HS_KEY_TYPE r6_1 = (shared.m + smem_l_idx)[96]; - HS_KEY_TYPE r6_2 = (shared.m + smem_r_idx)[104]; - HS_CMP_XCHG(r6_1, r6_2) - (shared.m + smem_l_idx)[96] = r6_1; - (shared.m + smem_r_idx)[104] = r6_2; - } - { - HS_KEY_TYPE r7_1 = (shared.m + smem_l_idx)[112]; - HS_KEY_TYPE r7_2 = (shared.m + smem_r_idx)[120]; - HS_CMP_XCHG(r7_1, r7_2) - (shared.m + smem_l_idx)[112] = r7_1; - (shared.m + smem_r_idx)[120] = r7_2; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - r1 = (shared.m + get_local_id(0))[16 * 8 * 0]; - r16 = (shared.m + get_local_id(0))[16 * 8 * 1]; - r2 = (shared.m + get_local_id(0))[16 * 8 * 2]; - r15 = (shared.m + get_local_id(0))[16 * 8 * 3]; - r3 = (shared.m + get_local_id(0))[16 * 8 * 4]; - r14 = (shared.m + get_local_id(0))[16 * 8 * 5]; - r4 = (shared.m + get_local_id(0))[16 * 8 * 6]; - r13 = (shared.m + get_local_id(0))[16 * 8 * 7]; - r5 = (shared.m + get_local_id(0))[16 * 8 * 8]; - r12 = (shared.m + get_local_id(0))[16 * 8 * 9]; - r6 = (shared.m + get_local_id(0))[16 * 8 * 10]; - r11 = (shared.m + get_local_id(0))[16 * 8 * 11]; - r7 = (shared.m + get_local_id(0))[16 * 8 * 12]; - r10 = (shared.m + get_local_id(0))[16 * 8 * 13]; - r8 = (shared.m + get_local_id(0))[16 * 8 * 14]; - r9 = (shared.m + get_local_id(0))[16 * 8 * 15]; - { { uint const half_lane_mask 
= 4; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) 
-HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(shared.m + get_local_id(0))[16 * 8 * 0] = r1; -(shared.m + get_local_id(0))[16 * 8 * 1] = r16; -(shared.m + get_local_id(0))[16 * 8 * 2] = r2; -(shared.m + get_local_id(0))[16 * 8 * 3] = r15; -(shared.m + get_local_id(0))[16 * 8 * 4] = r3; -(shared.m + get_local_id(0))[16 * 8 * 5] = r14; -(shared.m + get_local_id(0))[16 * 8 * 6] = r4; -(shared.m + get_local_id(0))[16 * 8 * 7] = r13; -(shared.m + get_local_id(0))[16 * 8 * 8] = r5; -(shared.m + get_local_id(0))[16 * 8 * 9] = r12; -(shared.m + get_local_id(0))[16 * 8 * 10] = r6; -(shared.m + get_local_id(0))[16 * 8 * 11] = r11; -(shared.m + get_local_id(0))[16 * 8 * 12] = r7; -(shared.m + get_local_id(0))[16 * 8 * 13] = r10; -(shared.m + get_local_id(0))[16 * 8 * 14] = r8; -(shared.m + get_local_id(0))[16 * 8 * 15] = r9; -barrier(CLK_LOCAL_MEM_FENCE); -{ - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; - HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16]; - HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24]; - HS_CMP_XCHG(r0_2, r0_3) - HS_CMP_XCHG(r0_1, r0_4) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_l_idx)[8] = r0_2; - (shared.m + smem_r_idx)[16] = r0_3; - (shared.m + smem_r_idx)[24] = r0_4; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[32]; - HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[40]; - HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[48]; - HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[56]; - HS_CMP_XCHG(r1_2, r1_3) - HS_CMP_XCHG(r1_1, r1_4) - HS_CMP_XCHG(r1_3, r1_4) - HS_CMP_XCHG(r1_1, r1_2) - (shared.m + smem_l_idx)[32] = r1_1; - (shared.m + smem_l_idx)[40] = r1_2; - (shared.m + smem_r_idx)[48] = r1_3; - (shared.m + smem_r_idx)[56] = r1_4; - } - { - 
HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[64]; - HS_KEY_TYPE r2_2 = (shared.m + smem_l_idx)[72]; - HS_KEY_TYPE r2_3 = (shared.m + smem_r_idx)[80]; - HS_KEY_TYPE r2_4 = (shared.m + smem_r_idx)[88]; - HS_CMP_XCHG(r2_2, r2_3) - HS_CMP_XCHG(r2_1, r2_4) - HS_CMP_XCHG(r2_3, r2_4) - HS_CMP_XCHG(r2_1, r2_2) - (shared.m + smem_l_idx)[64] = r2_1; - (shared.m + smem_l_idx)[72] = r2_2; - (shared.m + smem_r_idx)[80] = r2_3; - (shared.m + smem_r_idx)[88] = r2_4; - } - { - HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[96]; - HS_KEY_TYPE r3_2 = (shared.m + smem_l_idx)[104]; - HS_KEY_TYPE r3_3 = (shared.m + smem_r_idx)[112]; - HS_KEY_TYPE r3_4 = (shared.m + smem_r_idx)[120]; - HS_CMP_XCHG(r3_2, r3_3) - HS_CMP_XCHG(r3_1, r3_4) - HS_CMP_XCHG(r3_3, r3_4) - HS_CMP_XCHG(r3_1, r3_2) - (shared.m + smem_l_idx)[96] = r3_1; - (shared.m + smem_l_idx)[104] = r3_2; - (shared.m + smem_r_idx)[112] = r3_3; - (shared.m + smem_r_idx)[120] = r3_4; - } -} -barrier(CLK_LOCAL_MEM_FENCE); -r1 = (shared.m + get_local_id(0))[16 * 8 * 0]; -r16 = (shared.m + get_local_id(0))[16 * 8 * 1]; -r2 = (shared.m + get_local_id(0))[16 * 8 * 2]; -r15 = (shared.m + get_local_id(0))[16 * 8 * 3]; -r3 = (shared.m + get_local_id(0))[16 * 8 * 4]; -r14 = (shared.m + get_local_id(0))[16 * 8 * 5]; -r4 = (shared.m + get_local_id(0))[16 * 8 * 6]; -r13 = (shared.m + get_local_id(0))[16 * 8 * 7]; -r5 = (shared.m + get_local_id(0))[16 * 8 * 8]; -r12 = (shared.m + get_local_id(0))[16 * 8 * 9]; -r6 = (shared.m + get_local_id(0))[16 * 8 * 10]; -r11 = (shared.m + get_local_id(0))[16 * 8 * 11]; -r7 = (shared.m + get_local_id(0))[16 * 8 * 12]; -r10 = (shared.m + get_local_id(0))[16 * 8 * 13]; -r8 = (shared.m + get_local_id(0))[16 * 8 * 14]; -r9 = (shared.m + get_local_id(0))[16 * 8 * 15]; -{ { uint const half_lane_mask = 4; -uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; -int const t_lt = get_sub_group_local_id() < half_lane_idx; -HS_CMP_HALF(0, r1) -HS_CMP_HALF(1, r2) -HS_CMP_HALF(2, r3) -HS_CMP_HALF(3, r4) 
-HS_CMP_HALF(4, r5) -HS_CMP_HALF(5, r6) -HS_CMP_HALF(6, r7) -HS_CMP_HALF(7, r8) -HS_CMP_HALF(8, r9) -HS_CMP_HALF(9, r10) -HS_CMP_HALF(10, r11) -HS_CMP_HALF(11, r12) -HS_CMP_HALF(12, r13) -HS_CMP_HALF(13, r14) -HS_CMP_HALF(14, r15) -HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) 
-} -(shared.m + get_local_id(0))[16 * 8 * 0] = r1; -(shared.m + get_local_id(0))[16 * 8 * 1] = r16; -(shared.m + get_local_id(0))[16 * 8 * 2] = r2; -(shared.m + get_local_id(0))[16 * 8 * 3] = r15; -(shared.m + get_local_id(0))[16 * 8 * 4] = r3; -(shared.m + get_local_id(0))[16 * 8 * 5] = r14; -(shared.m + get_local_id(0))[16 * 8 * 6] = r4; -(shared.m + get_local_id(0))[16 * 8 * 7] = r13; -(shared.m + get_local_id(0))[16 * 8 * 8] = r5; -(shared.m + get_local_id(0))[16 * 8 * 9] = r12; -(shared.m + get_local_id(0))[16 * 8 * 10] = r6; -(shared.m + get_local_id(0))[16 * 8 * 11] = r11; -(shared.m + get_local_id(0))[16 * 8 * 12] = r7; -(shared.m + get_local_id(0))[16 * 8 * 13] = r10; -(shared.m + get_local_id(0))[16 * 8 * 14] = r8; -(shared.m + get_local_id(0))[16 * 8 * 15] = r9; -barrier(CLK_LOCAL_MEM_FENCE); -{ - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; - HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16]; - HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24]; - HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[32]; - HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[40]; - HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[48]; - HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[56]; - HS_CMP_XCHG(r0_4, r0_5) - HS_CMP_XCHG(r0_3, r0_6) - HS_CMP_XCHG(r0_2, r0_7) - HS_CMP_XCHG(r0_1, r0_8) - HS_CMP_XCHG(r0_5, r0_7) - HS_CMP_XCHG(r0_6, r0_8) - HS_CMP_XCHG(r0_5, r0_6) - HS_CMP_XCHG(r0_7, r0_8) - HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_l_idx)[8] = r0_2; - (shared.m + smem_l_idx)[16] = r0_3; - (shared.m + smem_l_idx)[24] = r0_4; - (shared.m + smem_r_idx)[32] = r0_5; - (shared.m + smem_r_idx)[40] = r0_6; - (shared.m + smem_r_idx)[48] = r0_7; - (shared.m + smem_r_idx)[56] = r0_8; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[64]; - HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[72]; - HS_KEY_TYPE r1_3 = (shared.m + smem_l_idx)[80]; - 
HS_KEY_TYPE r1_4 = (shared.m + smem_l_idx)[88]; - HS_KEY_TYPE r1_5 = (shared.m + smem_r_idx)[96]; - HS_KEY_TYPE r1_6 = (shared.m + smem_r_idx)[104]; - HS_KEY_TYPE r1_7 = (shared.m + smem_r_idx)[112]; - HS_KEY_TYPE r1_8 = (shared.m + smem_r_idx)[120]; - HS_CMP_XCHG(r1_4, r1_5) - HS_CMP_XCHG(r1_3, r1_6) - HS_CMP_XCHG(r1_2, r1_7) - HS_CMP_XCHG(r1_1, r1_8) - HS_CMP_XCHG(r1_5, r1_7) - HS_CMP_XCHG(r1_6, r1_8) - HS_CMP_XCHG(r1_5, r1_6) - HS_CMP_XCHG(r1_7, r1_8) - HS_CMP_XCHG(r1_1, r1_3) - HS_CMP_XCHG(r1_2, r1_4) - HS_CMP_XCHG(r1_1, r1_2) - HS_CMP_XCHG(r1_3, r1_4) - (shared.m + smem_l_idx)[64] = r1_1; - (shared.m + smem_l_idx)[72] = r1_2; - (shared.m + smem_l_idx)[80] = r1_3; - (shared.m + smem_l_idx)[88] = r1_4; - (shared.m + smem_r_idx)[96] = r1_5; - (shared.m + smem_r_idx)[104] = r1_6; - (shared.m + smem_r_idx)[112] = r1_7; - (shared.m + smem_r_idx)[120] = r1_8; - } -} -barrier(CLK_LOCAL_MEM_FENCE); -r1 = (shared.m + get_local_id(0))[16 * 8 * 0]; -r16 = (shared.m + get_local_id(0))[16 * 8 * 1]; -r2 = (shared.m + get_local_id(0))[16 * 8 * 2]; -r15 = (shared.m + get_local_id(0))[16 * 8 * 3]; -r3 = (shared.m + get_local_id(0))[16 * 8 * 4]; -r14 = (shared.m + get_local_id(0))[16 * 8 * 5]; -r4 = (shared.m + get_local_id(0))[16 * 8 * 6]; -r13 = (shared.m + get_local_id(0))[16 * 8 * 7]; -r5 = (shared.m + get_local_id(0))[16 * 8 * 8]; -r12 = (shared.m + get_local_id(0))[16 * 8 * 9]; -r6 = (shared.m + get_local_id(0))[16 * 8 * 10]; -r11 = (shared.m + get_local_id(0))[16 * 8 * 11]; -r7 = (shared.m + get_local_id(0))[16 * 8 * 12]; -r10 = (shared.m + get_local_id(0))[16 * 8 * 13]; -r8 = (shared.m + get_local_id(0))[16 * 8 * 14]; -r9 = (shared.m + get_local_id(0))[16 * 8 * 15]; -{ { uint const half_lane_mask = 4; -uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; -int const t_lt = get_sub_group_local_id() < half_lane_idx; -HS_CMP_HALF(0, r1) -HS_CMP_HALF(1, r2) -HS_CMP_HALF(2, r3) -HS_CMP_HALF(3, r4) -HS_CMP_HALF(4, r5) -HS_CMP_HALF(5, r6) -HS_CMP_HALF(6, r7) 
-HS_CMP_HALF(7, r8) -HS_CMP_HALF(8, r9) -HS_CMP_HALF(9, r10) -HS_CMP_HALF(10, r11) -HS_CMP_HALF(11, r12) -HS_CMP_HALF(12, r13) -HS_CMP_HALF(13, r14) -HS_CMP_HALF(14, r15) -HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(shared.m + get_local_id(0))[16 * 8 * 0] = r1; -(shared.m 
+ get_local_id(0))[16 * 8 * 1] = r16; -(shared.m + get_local_id(0))[16 * 8 * 2] = r2; -(shared.m + get_local_id(0))[16 * 8 * 3] = r15; -(shared.m + get_local_id(0))[16 * 8 * 4] = r3; -(shared.m + get_local_id(0))[16 * 8 * 5] = r14; -(shared.m + get_local_id(0))[16 * 8 * 6] = r4; -(shared.m + get_local_id(0))[16 * 8 * 7] = r13; -(shared.m + get_local_id(0))[16 * 8 * 8] = r5; -(shared.m + get_local_id(0))[16 * 8 * 9] = r12; -(shared.m + get_local_id(0))[16 * 8 * 10] = r6; -(shared.m + get_local_id(0))[16 * 8 * 11] = r11; -(shared.m + get_local_id(0))[16 * 8 * 12] = r7; -(shared.m + get_local_id(0))[16 * 8 * 13] = r10; -(shared.m + get_local_id(0))[16 * 8 * 14] = r8; -(shared.m + get_local_id(0))[16 * 8 * 15] = r9; -barrier(CLK_LOCAL_MEM_FENCE); -{ - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; - HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16]; - HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24]; - HS_KEY_TYPE r0_5 = (shared.m + smem_l_idx)[32]; - HS_KEY_TYPE r0_6 = (shared.m + smem_l_idx)[40]; - HS_KEY_TYPE r0_7 = (shared.m + smem_l_idx)[48]; - HS_KEY_TYPE r0_8 = (shared.m + smem_l_idx)[56]; - HS_KEY_TYPE r0_9 = (shared.m + smem_r_idx)[64]; - HS_KEY_TYPE r0_10 = (shared.m + smem_r_idx)[72]; - HS_KEY_TYPE r0_11 = (shared.m + smem_r_idx)[80]; - HS_KEY_TYPE r0_12 = (shared.m + smem_r_idx)[88]; - HS_KEY_TYPE r0_13 = (shared.m + smem_r_idx)[96]; - HS_KEY_TYPE r0_14 = (shared.m + smem_r_idx)[104]; - HS_KEY_TYPE r0_15 = (shared.m + smem_r_idx)[112]; - HS_KEY_TYPE r0_16 = (shared.m + smem_r_idx)[120]; - HS_CMP_XCHG(r0_8, r0_9) - HS_CMP_XCHG(r0_7, r0_10) - HS_CMP_XCHG(r0_6, r0_11) - HS_CMP_XCHG(r0_5, r0_12) - HS_CMP_XCHG(r0_4, r0_13) - HS_CMP_XCHG(r0_3, r0_14) - HS_CMP_XCHG(r0_2, r0_15) - HS_CMP_XCHG(r0_1, r0_16) - HS_CMP_XCHG(r0_9, r0_13) - HS_CMP_XCHG(r0_11, r0_15) - HS_CMP_XCHG(r0_9, r0_11) - HS_CMP_XCHG(r0_13, r0_15) - HS_CMP_XCHG(r0_10, r0_14) - HS_CMP_XCHG(r0_12, r0_16) - HS_CMP_XCHG(r0_10, r0_12) - HS_CMP_XCHG(r0_14, 
r0_16) - HS_CMP_XCHG(r0_9, r0_10) - HS_CMP_XCHG(r0_11, r0_12) - HS_CMP_XCHG(r0_13, r0_14) - HS_CMP_XCHG(r0_15, r0_16) - HS_CMP_XCHG(r0_1, r0_5) - HS_CMP_XCHG(r0_3, r0_7) - HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_5, r0_7) - HS_CMP_XCHG(r0_2, r0_6) - HS_CMP_XCHG(r0_4, r0_8) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_6, r0_8) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_5, r0_6) - HS_CMP_XCHG(r0_7, r0_8) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_l_idx)[8] = r0_2; - (shared.m + smem_l_idx)[16] = r0_3; - (shared.m + smem_l_idx)[24] = r0_4; - (shared.m + smem_l_idx)[32] = r0_5; - (shared.m + smem_l_idx)[40] = r0_6; - (shared.m + smem_l_idx)[48] = r0_7; - (shared.m + smem_l_idx)[56] = r0_8; - (shared.m + smem_r_idx)[64] = r0_9; - (shared.m + smem_r_idx)[72] = r0_10; - (shared.m + smem_r_idx)[80] = r0_11; - (shared.m + smem_r_idx)[88] = r0_12; - (shared.m + smem_r_idx)[96] = r0_13; - (shared.m + smem_r_idx)[104] = r0_14; - (shared.m + smem_r_idx)[112] = r0_15; - (shared.m + smem_r_idx)[120] = r0_16; - } -} -barrier(CLK_LOCAL_MEM_FENCE); -r1 = (shared.m + get_local_id(0))[16 * 8 * 0]; -r16 = (shared.m + get_local_id(0))[16 * 8 * 1]; -r2 = (shared.m + get_local_id(0))[16 * 8 * 2]; -r15 = (shared.m + get_local_id(0))[16 * 8 * 3]; -r3 = (shared.m + get_local_id(0))[16 * 8 * 4]; -r14 = (shared.m + get_local_id(0))[16 * 8 * 5]; -r4 = (shared.m + get_local_id(0))[16 * 8 * 6]; -r13 = (shared.m + get_local_id(0))[16 * 8 * 7]; -r5 = (shared.m + get_local_id(0))[16 * 8 * 8]; -r12 = (shared.m + get_local_id(0))[16 * 8 * 9]; -r6 = (shared.m + get_local_id(0))[16 * 8 * 10]; -r11 = (shared.m + get_local_id(0))[16 * 8 * 11]; -r7 = (shared.m + get_local_id(0))[16 * 8 * 12]; -r10 = (shared.m + get_local_id(0))[16 * 8 * 13]; -r8 = (shared.m + get_local_id(0))[16 * 8 * 14]; -r9 = (shared.m + get_local_id(0))[16 * 8 * 15]; -{ { uint const half_lane_mask = 4; -uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; -int const t_lt = 
get_sub_group_local_id() < half_lane_idx; -HS_CMP_HALF(0, r1) -HS_CMP_HALF(1, r2) -HS_CMP_HALF(2, r3) -HS_CMP_HALF(3, r4) -HS_CMP_HALF(4, r5) -HS_CMP_HALF(5, r6) -HS_CMP_HALF(6, r7) -HS_CMP_HALF(7, r8) -HS_CMP_HALF(8, r9) -HS_CMP_HALF(9, r10) -HS_CMP_HALF(10, r11) -HS_CMP_HALF(11, r12) -HS_CMP_HALF(12, r13) -HS_CMP_HALF(13, r14) -HS_CMP_HALF(14, r15) -HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) 
-HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(vout + gmem_idx)[0 * 8] = r1; -(vout + gmem_idx)[1 * 8] = r2; -(vout + gmem_idx)[2 * 8] = r3; -(vout + gmem_idx)[3 * 8] = r4; -(vout + gmem_idx)[4 * 8] = r5; -(vout + gmem_idx)[5 * 8] = r6; -(vout + gmem_idx)[6 * 8] = r7; -(vout + gmem_idx)[7 * 8] = r8; -(vout + gmem_idx)[8 * 8] = r9; -(vout + gmem_idx)[9 * 8] = r10; -(vout + gmem_idx)[10 * 8] = r11; -(vout + gmem_idx)[11 * 8] = r12; -(vout + gmem_idx)[12 * 8] = r13; -(vout + gmem_idx)[13 * 8] = r14; -(vout + gmem_idx)[14 * 8] = r15; -(vout + gmem_idx)[15 * 8] = r16; -} - -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) -__attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_bs_3(__global HS_KEY_TYPE const* const restrict vin, - __global HS_KEY_TYPE* const restrict vout) -{ - __local union - { - HS_KEY_TYPE m[16 * 64]; - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8]; - HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8]; - HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8]; - HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8]; - HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8]; - HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8]; - HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8]; - HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8]; - HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8]; - HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8]; - HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8]; - HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8]; - HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8]; - HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8]; - HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8]; - HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8]; - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, 
r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r6, r11) - HS_CMP_XCHG(r7, r10) - HS_CMP_XCHG(r4, r13) - HS_CMP_XCHG(r14, r15) - HS_CMP_XCHG(r8, r12) - HS_CMP_XCHG(r2, r3) - HS_CMP_XCHG(r5, r9) - HS_CMP_XCHG(r2, r5) - HS_CMP_XCHG(r8, r14) - HS_CMP_XCHG(r3, r9) - HS_CMP_XCHG(r12, r15) - HS_CMP_XCHG(r3, r5) - HS_CMP_XCHG(r6, r7) - HS_CMP_XCHG(r10, r11) - HS_CMP_XCHG(r12, r14) - HS_CMP_XCHG(r4, r9) - HS_CMP_XCHG(r8, r13) - HS_CMP_XCHG(r7, r9) - HS_CMP_XCHG(r11, r13) - HS_CMP_XCHG(r4, r6) - HS_CMP_XCHG(r8, r10) - HS_CMP_XCHG(r4, r5) - HS_CMP_XCHG(r6, r7) - HS_CMP_XCHG(r8, r9) - HS_CMP_XCHG(r10, r11) - HS_CMP_XCHG(r12, r13) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - { - uint const flip_lane_mask = 1; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - 
HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - { - uint const flip_lane_mask = 3; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - { - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - { - uint const flip_lane_mask = 7; - uint const flip_lane_idx = get_sub_group_local_id() 
^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - { - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - { - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - 
HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - uint const smem_l_idx = get_sub_group_id() * 64 + get_sub_group_local_id(); - uint const smem_r_idx = - (get_sub_group_id() ^ 1) * 64 + (get_sub_group_local_id() ^ 7); - (shared.m + get_local_id(0))[8 * 8 * 0] = r1; - (shared.m + get_local_id(0))[8 * 8 * 1] = r16; - (shared.m + get_local_id(0))[8 * 8 * 2] = r2; - (shared.m + get_local_id(0))[8 * 8 * 3] = r15; - (shared.m + get_local_id(0))[8 * 8 * 4] = r3; - (shared.m + get_local_id(0))[8 * 8 * 5] = r14; - (shared.m + get_local_id(0))[8 * 8 * 6] = r4; - (shared.m + get_local_id(0))[8 * 8 * 7] = r13; - (shared.m + get_local_id(0))[8 * 8 * 8] = r5; - (shared.m + get_local_id(0))[8 * 8 * 9] = r12; - (shared.m + get_local_id(0))[8 * 8 * 10] = r6; - (shared.m + get_local_id(0))[8 * 8 * 11] = r11; - (shared.m + get_local_id(0))[8 * 8 * 12] = r7; - (shared.m + get_local_id(0))[8 * 8 * 13] = r10; - (shared.m + get_local_id(0))[8 * 8 * 14] = r8; - (shared.m + get_local_id(0))[8 * 8 * 15] = r9; - barrier(CLK_LOCAL_MEM_FENCE); - { - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_r_idx)[8] = r0_2; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16]; - HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24]; - HS_CMP_XCHG(r1_1, r1_2) - (shared.m + smem_l_idx)[16] = r1_1; - (shared.m + smem_r_idx)[24] = r1_2; - } - { - HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[32]; - HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[40]; - HS_CMP_XCHG(r2_1, r2_2) - (shared.m + smem_l_idx)[32] = r2_1; - (shared.m + smem_r_idx)[40] = r2_2; - } - { - HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[48]; - HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[56]; - HS_CMP_XCHG(r3_1, r3_2) - (shared.m + smem_l_idx)[48] = r3_1; - (shared.m + smem_r_idx)[56] = r3_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[520]; 
- HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[512] = r0_1; - (shared.m + smem_r_idx)[520] = r0_2; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[528]; - HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[536]; - HS_CMP_XCHG(r1_1, r1_2) - (shared.m + smem_l_idx)[528] = r1_1; - (shared.m + smem_r_idx)[536] = r1_2; - } - { - HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[544]; - HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[552]; - HS_CMP_XCHG(r2_1, r2_2) - (shared.m + smem_l_idx)[544] = r2_1; - (shared.m + smem_r_idx)[552] = r2_2; - } - { - HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[560]; - HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[568]; - HS_CMP_XCHG(r3_1, r3_2) - (shared.m + smem_l_idx)[560] = r3_1; - (shared.m + smem_r_idx)[568] = r3_2; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - r1 = (shared.m + get_local_id(0))[8 * 8 * 0]; - r16 = (shared.m + get_local_id(0))[8 * 8 * 1]; - r2 = (shared.m + get_local_id(0))[8 * 8 * 2]; - r15 = (shared.m + get_local_id(0))[8 * 8 * 3]; - r3 = (shared.m + get_local_id(0))[8 * 8 * 4]; - r14 = (shared.m + get_local_id(0))[8 * 8 * 5]; - r4 = (shared.m + get_local_id(0))[8 * 8 * 6]; - r13 = (shared.m + get_local_id(0))[8 * 8 * 7]; - r5 = (shared.m + get_local_id(0))[8 * 8 * 8]; - r12 = (shared.m + get_local_id(0))[8 * 8 * 9]; - r6 = (shared.m + get_local_id(0))[8 * 8 * 10]; - r11 = (shared.m + get_local_id(0))[8 * 8 * 11]; - r7 = (shared.m + get_local_id(0))[8 * 8 * 12]; - r10 = (shared.m + get_local_id(0))[8 * 8 * 13]; - r8 = (shared.m + get_local_id(0))[8 * 8 * 14]; - r9 = (shared.m + get_local_id(0))[8 * 8 * 15]; - { { uint const half_lane_mask = 4; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - 
HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(shared.m + get_local_id(0))[8 * 8 * 0] = r1; -(shared.m + get_local_id(0))[8 * 8 * 1] = r16; -(shared.m + get_local_id(0))[8 * 8 * 2] = r2; -(shared.m + 
get_local_id(0))[8 * 8 * 3] = r15; -(shared.m + get_local_id(0))[8 * 8 * 4] = r3; -(shared.m + get_local_id(0))[8 * 8 * 5] = r14; -(shared.m + get_local_id(0))[8 * 8 * 6] = r4; -(shared.m + get_local_id(0))[8 * 8 * 7] = r13; -(shared.m + get_local_id(0))[8 * 8 * 8] = r5; -(shared.m + get_local_id(0))[8 * 8 * 9] = r12; -(shared.m + get_local_id(0))[8 * 8 * 10] = r6; -(shared.m + get_local_id(0))[8 * 8 * 11] = r11; -(shared.m + get_local_id(0))[8 * 8 * 12] = r7; -(shared.m + get_local_id(0))[8 * 8 * 13] = r10; -(shared.m + get_local_id(0))[8 * 8 * 14] = r8; -(shared.m + get_local_id(0))[8 * 8 * 15] = r9; -barrier(CLK_LOCAL_MEM_FENCE); -{ - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; - HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16]; - HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24]; - HS_CMP_XCHG(r0_2, r0_3) - HS_CMP_XCHG(r0_1, r0_4) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_l_idx)[8] = r0_2; - (shared.m + smem_r_idx)[16] = r0_3; - (shared.m + smem_r_idx)[24] = r0_4; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[32]; - HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[40]; - HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[48]; - HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[56]; - HS_CMP_XCHG(r1_2, r1_3) - HS_CMP_XCHG(r1_1, r1_4) - HS_CMP_XCHG(r1_3, r1_4) - HS_CMP_XCHG(r1_1, r1_2) - (shared.m + smem_l_idx)[32] = r1_1; - (shared.m + smem_l_idx)[40] = r1_2; - (shared.m + smem_r_idx)[48] = r1_3; - (shared.m + smem_r_idx)[56] = r1_4; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[520]; - HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[528]; - HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[536]; - HS_CMP_XCHG(r0_2, r0_3) - HS_CMP_XCHG(r0_1, r0_4) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[512] = r0_1; - (shared.m + smem_l_idx)[520] = r0_2; - (shared.m + smem_r_idx)[528] = r0_3; - 
(shared.m + smem_r_idx)[536] = r0_4; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[544]; - HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[552]; - HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[560]; - HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[568]; - HS_CMP_XCHG(r1_2, r1_3) - HS_CMP_XCHG(r1_1, r1_4) - HS_CMP_XCHG(r1_3, r1_4) - HS_CMP_XCHG(r1_1, r1_2) - (shared.m + smem_l_idx)[544] = r1_1; - (shared.m + smem_l_idx)[552] = r1_2; - (shared.m + smem_r_idx)[560] = r1_3; - (shared.m + smem_r_idx)[568] = r1_4; - } -} -barrier(CLK_LOCAL_MEM_FENCE); -r1 = (shared.m + get_local_id(0))[8 * 8 * 0]; -r16 = (shared.m + get_local_id(0))[8 * 8 * 1]; -r2 = (shared.m + get_local_id(0))[8 * 8 * 2]; -r15 = (shared.m + get_local_id(0))[8 * 8 * 3]; -r3 = (shared.m + get_local_id(0))[8 * 8 * 4]; -r14 = (shared.m + get_local_id(0))[8 * 8 * 5]; -r4 = (shared.m + get_local_id(0))[8 * 8 * 6]; -r13 = (shared.m + get_local_id(0))[8 * 8 * 7]; -r5 = (shared.m + get_local_id(0))[8 * 8 * 8]; -r12 = (shared.m + get_local_id(0))[8 * 8 * 9]; -r6 = (shared.m + get_local_id(0))[8 * 8 * 10]; -r11 = (shared.m + get_local_id(0))[8 * 8 * 11]; -r7 = (shared.m + get_local_id(0))[8 * 8 * 12]; -r10 = (shared.m + get_local_id(0))[8 * 8 * 13]; -r8 = (shared.m + get_local_id(0))[8 * 8 * 14]; -r9 = (shared.m + get_local_id(0))[8 * 8 * 15]; -{ { uint const half_lane_mask = 4; -uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; -int const t_lt = get_sub_group_local_id() < half_lane_idx; -HS_CMP_HALF(0, r1) -HS_CMP_HALF(1, r2) -HS_CMP_HALF(2, r3) -HS_CMP_HALF(3, r4) -HS_CMP_HALF(4, r5) -HS_CMP_HALF(5, r6) -HS_CMP_HALF(6, r7) -HS_CMP_HALF(7, r8) -HS_CMP_HALF(8, r9) -HS_CMP_HALF(9, r10) -HS_CMP_HALF(10, r11) -HS_CMP_HALF(11, r12) -HS_CMP_HALF(12, r13) -HS_CMP_HALF(13, r14) -HS_CMP_HALF(14, r15) -HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - 
HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(shared.m + get_local_id(0))[8 * 8 * 0] = r1; -(shared.m + get_local_id(0))[8 * 8 * 1] = r16; -(shared.m + get_local_id(0))[8 * 8 * 2] = r2; -(shared.m + get_local_id(0))[8 * 8 * 3] = r15; -(shared.m + get_local_id(0))[8 * 8 * 4] = r3; -(shared.m + get_local_id(0))[8 * 8 * 5] = r14; -(shared.m + get_local_id(0))[8 * 8 * 6] = r4; -(shared.m + get_local_id(0))[8 * 8 * 7] = r13; -(shared.m + get_local_id(0))[8 * 8 * 8] = 
r5; -(shared.m + get_local_id(0))[8 * 8 * 9] = r12; -(shared.m + get_local_id(0))[8 * 8 * 10] = r6; -(shared.m + get_local_id(0))[8 * 8 * 11] = r11; -(shared.m + get_local_id(0))[8 * 8 * 12] = r7; -(shared.m + get_local_id(0))[8 * 8 * 13] = r10; -(shared.m + get_local_id(0))[8 * 8 * 14] = r8; -(shared.m + get_local_id(0))[8 * 8 * 15] = r9; -barrier(CLK_LOCAL_MEM_FENCE); -{ - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; - HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16]; - HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24]; - HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[32]; - HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[40]; - HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[48]; - HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[56]; - HS_CMP_XCHG(r0_4, r0_5) - HS_CMP_XCHG(r0_3, r0_6) - HS_CMP_XCHG(r0_2, r0_7) - HS_CMP_XCHG(r0_1, r0_8) - HS_CMP_XCHG(r0_5, r0_7) - HS_CMP_XCHG(r0_6, r0_8) - HS_CMP_XCHG(r0_5, r0_6) - HS_CMP_XCHG(r0_7, r0_8) - HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_l_idx)[8] = r0_2; - (shared.m + smem_l_idx)[16] = r0_3; - (shared.m + smem_l_idx)[24] = r0_4; - (shared.m + smem_r_idx)[32] = r0_5; - (shared.m + smem_r_idx)[40] = r0_6; - (shared.m + smem_r_idx)[48] = r0_7; - (shared.m + smem_r_idx)[56] = r0_8; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[520]; - HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[528]; - HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[536]; - HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[544]; - HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[552]; - HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[560]; - HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[568]; - HS_CMP_XCHG(r0_4, r0_5) - HS_CMP_XCHG(r0_3, r0_6) - HS_CMP_XCHG(r0_2, r0_7) - HS_CMP_XCHG(r0_1, r0_8) - HS_CMP_XCHG(r0_5, r0_7) - HS_CMP_XCHG(r0_6, r0_8) - HS_CMP_XCHG(r0_5, r0_6) - 
HS_CMP_XCHG(r0_7, r0_8) - HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - (shared.m + smem_l_idx)[512] = r0_1; - (shared.m + smem_l_idx)[520] = r0_2; - (shared.m + smem_l_idx)[528] = r0_3; - (shared.m + smem_l_idx)[536] = r0_4; - (shared.m + smem_r_idx)[544] = r0_5; - (shared.m + smem_r_idx)[552] = r0_6; - (shared.m + smem_r_idx)[560] = r0_7; - (shared.m + smem_r_idx)[568] = r0_8; - } -} -barrier(CLK_LOCAL_MEM_FENCE); -r1 = (shared.m + get_local_id(0))[8 * 8 * 0]; -r16 = (shared.m + get_local_id(0))[8 * 8 * 1]; -r2 = (shared.m + get_local_id(0))[8 * 8 * 2]; -r15 = (shared.m + get_local_id(0))[8 * 8 * 3]; -r3 = (shared.m + get_local_id(0))[8 * 8 * 4]; -r14 = (shared.m + get_local_id(0))[8 * 8 * 5]; -r4 = (shared.m + get_local_id(0))[8 * 8 * 6]; -r13 = (shared.m + get_local_id(0))[8 * 8 * 7]; -r5 = (shared.m + get_local_id(0))[8 * 8 * 8]; -r12 = (shared.m + get_local_id(0))[8 * 8 * 9]; -r6 = (shared.m + get_local_id(0))[8 * 8 * 10]; -r11 = (shared.m + get_local_id(0))[8 * 8 * 11]; -r7 = (shared.m + get_local_id(0))[8 * 8 * 12]; -r10 = (shared.m + get_local_id(0))[8 * 8 * 13]; -r8 = (shared.m + get_local_id(0))[8 * 8 * 14]; -r9 = (shared.m + get_local_id(0))[8 * 8 * 15]; -{ { uint const half_lane_mask = 4; -uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; -int const t_lt = get_sub_group_local_id() < half_lane_idx; -HS_CMP_HALF(0, r1) -HS_CMP_HALF(1, r2) -HS_CMP_HALF(2, r3) -HS_CMP_HALF(3, r4) -HS_CMP_HALF(4, r5) -HS_CMP_HALF(5, r6) -HS_CMP_HALF(6, r7) -HS_CMP_HALF(7, r8) -HS_CMP_HALF(8, r9) -HS_CMP_HALF(9, r10) -HS_CMP_HALF(10, r11) -HS_CMP_HALF(11, r12) -HS_CMP_HALF(12, r13) -HS_CMP_HALF(13, r14) -HS_CMP_HALF(14, r15) -HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) 
- HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(vout + gmem_idx)[0 * 8] = r1; -(vout + gmem_idx)[1 * 8] = r2; -(vout + gmem_idx)[2 * 8] = r3; -(vout + gmem_idx)[3 * 8] = r4; -(vout + gmem_idx)[4 * 8] = r5; -(vout + gmem_idx)[5 * 8] = r6; -(vout + gmem_idx)[6 * 8] = r7; -(vout + gmem_idx)[7 * 8] = r8; -(vout + gmem_idx)[8 * 8] = r9; -(vout + gmem_idx)[9 * 8] = r10; -(vout + gmem_idx)[10 * 8] = r11; -(vout + gmem_idx)[11 * 8] = r12; -(vout + gmem_idx)[12 * 8] = r13; -(vout + gmem_idx)[13 * 8] = r14; -(vout + gmem_idx)[14 * 8] = r15; -(vout + 
gmem_idx)[15 * 8] = r16; -} - -__kernel __attribute__((reqd_work_group_size(32, 1, 1))) -__attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_bs_2(__global HS_KEY_TYPE const* const restrict vin, - __global HS_KEY_TYPE* const restrict vout) -{ - __local union - { - HS_KEY_TYPE m[16 * 32]; - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8]; - HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8]; - HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8]; - HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8]; - HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8]; - HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8]; - HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8]; - HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8]; - HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8]; - HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8]; - HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8]; - HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8]; - HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8]; - HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8]; - HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8]; - HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8]; - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r6, r11) - HS_CMP_XCHG(r7, r10) - HS_CMP_XCHG(r4, r13) - HS_CMP_XCHG(r14, r15) - HS_CMP_XCHG(r8, r12) - 
HS_CMP_XCHG(r2, r3) - HS_CMP_XCHG(r5, r9) - HS_CMP_XCHG(r2, r5) - HS_CMP_XCHG(r8, r14) - HS_CMP_XCHG(r3, r9) - HS_CMP_XCHG(r12, r15) - HS_CMP_XCHG(r3, r5) - HS_CMP_XCHG(r6, r7) - HS_CMP_XCHG(r10, r11) - HS_CMP_XCHG(r12, r14) - HS_CMP_XCHG(r4, r9) - HS_CMP_XCHG(r8, r13) - HS_CMP_XCHG(r7, r9) - HS_CMP_XCHG(r11, r13) - HS_CMP_XCHG(r4, r6) - HS_CMP_XCHG(r8, r10) - HS_CMP_XCHG(r4, r5) - HS_CMP_XCHG(r6, r7) - HS_CMP_XCHG(r8, r9) - HS_CMP_XCHG(r10, r11) - HS_CMP_XCHG(r12, r13) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - { - uint const flip_lane_mask = 1; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - { - uint const flip_lane_mask = 3; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - 
HS_CMP_FLIP(7, r8, r9) - } - { - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - { - uint const flip_lane_mask = 7; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - { - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, 
r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - { - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - uint const smem_l_idx = get_sub_group_id() * 32 + get_sub_group_local_id(); - uint const smem_r_idx = - (get_sub_group_id() ^ 1) * 32 + (get_sub_group_local_id() ^ 7); - (shared.m + get_local_id(0))[4 * 8 * 0] = r1; - (shared.m + get_local_id(0))[4 * 8 * 1] = r16; - (shared.m + get_local_id(0))[4 * 8 * 2] = r2; - (shared.m + get_local_id(0))[4 * 8 * 3] = r15; - (shared.m + get_local_id(0))[4 * 8 * 4] = r3; - (shared.m + get_local_id(0))[4 * 8 * 5] = r14; - (shared.m + get_local_id(0))[4 * 8 * 6] = r4; - (shared.m + get_local_id(0))[4 * 8 * 7] = 
r13; - (shared.m + get_local_id(0))[4 * 8 * 8] = r5; - (shared.m + get_local_id(0))[4 * 8 * 9] = r12; - (shared.m + get_local_id(0))[4 * 8 * 10] = r6; - (shared.m + get_local_id(0))[4 * 8 * 11] = r11; - (shared.m + get_local_id(0))[4 * 8 * 12] = r7; - (shared.m + get_local_id(0))[4 * 8 * 13] = r10; - (shared.m + get_local_id(0))[4 * 8 * 14] = r8; - (shared.m + get_local_id(0))[4 * 8 * 15] = r9; - barrier(CLK_LOCAL_MEM_FENCE); - { - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_r_idx)[8] = r0_2; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16]; - HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24]; - HS_CMP_XCHG(r1_1, r1_2) - (shared.m + smem_l_idx)[16] = r1_1; - (shared.m + smem_r_idx)[24] = r1_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[136]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[128] = r0_1; - (shared.m + smem_r_idx)[136] = r0_2; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[144]; - HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[152]; - HS_CMP_XCHG(r1_1, r1_2) - (shared.m + smem_l_idx)[144] = r1_1; - (shared.m + smem_r_idx)[152] = r1_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[256]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[264]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[256] = r0_1; - (shared.m + smem_r_idx)[264] = r0_2; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[272]; - HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[280]; - HS_CMP_XCHG(r1_1, r1_2) - (shared.m + smem_l_idx)[272] = r1_1; - (shared.m + smem_r_idx)[280] = r1_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[384]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[392]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[384] = r0_1; - (shared.m + smem_r_idx)[392] = r0_2; - } - { - HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[400]; - HS_KEY_TYPE r1_2 = 
(shared.m + smem_r_idx)[408]; - HS_CMP_XCHG(r1_1, r1_2) - (shared.m + smem_l_idx)[400] = r1_1; - (shared.m + smem_r_idx)[408] = r1_2; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - r1 = (shared.m + get_local_id(0))[4 * 8 * 0]; - r16 = (shared.m + get_local_id(0))[4 * 8 * 1]; - r2 = (shared.m + get_local_id(0))[4 * 8 * 2]; - r15 = (shared.m + get_local_id(0))[4 * 8 * 3]; - r3 = (shared.m + get_local_id(0))[4 * 8 * 4]; - r14 = (shared.m + get_local_id(0))[4 * 8 * 5]; - r4 = (shared.m + get_local_id(0))[4 * 8 * 6]; - r13 = (shared.m + get_local_id(0))[4 * 8 * 7]; - r5 = (shared.m + get_local_id(0))[4 * 8 * 8]; - r12 = (shared.m + get_local_id(0))[4 * 8 * 9]; - r6 = (shared.m + get_local_id(0))[4 * 8 * 10]; - r11 = (shared.m + get_local_id(0))[4 * 8 * 11]; - r7 = (shared.m + get_local_id(0))[4 * 8 * 12]; - r10 = (shared.m + get_local_id(0))[4 * 8 * 13]; - r8 = (shared.m + get_local_id(0))[4 * 8 * 14]; - r9 = (shared.m + get_local_id(0))[4 * 8 * 15]; - { { uint const half_lane_mask = 4; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} 
-{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(shared.m + get_local_id(0))[4 * 8 * 0] = r1; -(shared.m + get_local_id(0))[4 * 8 * 1] = r16; -(shared.m + get_local_id(0))[4 * 8 * 2] = r2; -(shared.m + get_local_id(0))[4 * 8 * 3] = r15; -(shared.m + get_local_id(0))[4 * 8 * 4] = r3; -(shared.m + get_local_id(0))[4 * 8 * 5] = r14; -(shared.m + get_local_id(0))[4 * 8 * 6] = r4; -(shared.m + get_local_id(0))[4 * 8 * 7] = r13; -(shared.m + get_local_id(0))[4 * 8 * 8] = r5; -(shared.m + get_local_id(0))[4 * 8 * 9] = r12; -(shared.m + get_local_id(0))[4 * 8 * 10] = r6; -(shared.m + get_local_id(0))[4 * 8 * 11] = r11; -(shared.m + get_local_id(0))[4 * 8 * 12] = r7; -(shared.m + get_local_id(0))[4 * 8 * 13] = r10; -(shared.m + get_local_id(0))[4 * 8 * 14] = r8; -(shared.m + get_local_id(0))[4 * 8 * 15] = r9; 
-barrier(CLK_LOCAL_MEM_FENCE); -{ - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8]; - HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16]; - HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24]; - HS_CMP_XCHG(r0_2, r0_3) - HS_CMP_XCHG(r0_1, r0_4) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_l_idx)[8] = r0_2; - (shared.m + smem_r_idx)[16] = r0_3; - (shared.m + smem_r_idx)[24] = r0_4; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[136]; - HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[144]; - HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[152]; - HS_CMP_XCHG(r0_2, r0_3) - HS_CMP_XCHG(r0_1, r0_4) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[128] = r0_1; - (shared.m + smem_l_idx)[136] = r0_2; - (shared.m + smem_r_idx)[144] = r0_3; - (shared.m + smem_r_idx)[152] = r0_4; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[256]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[264]; - HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[272]; - HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[280]; - HS_CMP_XCHG(r0_2, r0_3) - HS_CMP_XCHG(r0_1, r0_4) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[256] = r0_1; - (shared.m + smem_l_idx)[264] = r0_2; - (shared.m + smem_r_idx)[272] = r0_3; - (shared.m + smem_r_idx)[280] = r0_4; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[384]; - HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[392]; - HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[400]; - HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[408]; - HS_CMP_XCHG(r0_2, r0_3) - HS_CMP_XCHG(r0_1, r0_4) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[384] = r0_1; - (shared.m + smem_l_idx)[392] = r0_2; - (shared.m + smem_r_idx)[400] = r0_3; - (shared.m + smem_r_idx)[408] = r0_4; - } -} -barrier(CLK_LOCAL_MEM_FENCE); -r1 = (shared.m + get_local_id(0))[4 * 8 * 0]; -r16 = 
(shared.m + get_local_id(0))[4 * 8 * 1]; -r2 = (shared.m + get_local_id(0))[4 * 8 * 2]; -r15 = (shared.m + get_local_id(0))[4 * 8 * 3]; -r3 = (shared.m + get_local_id(0))[4 * 8 * 4]; -r14 = (shared.m + get_local_id(0))[4 * 8 * 5]; -r4 = (shared.m + get_local_id(0))[4 * 8 * 6]; -r13 = (shared.m + get_local_id(0))[4 * 8 * 7]; -r5 = (shared.m + get_local_id(0))[4 * 8 * 8]; -r12 = (shared.m + get_local_id(0))[4 * 8 * 9]; -r6 = (shared.m + get_local_id(0))[4 * 8 * 10]; -r11 = (shared.m + get_local_id(0))[4 * 8 * 11]; -r7 = (shared.m + get_local_id(0))[4 * 8 * 12]; -r10 = (shared.m + get_local_id(0))[4 * 8 * 13]; -r8 = (shared.m + get_local_id(0))[4 * 8 * 14]; -r9 = (shared.m + get_local_id(0))[4 * 8 * 15]; -{ { uint const half_lane_mask = 4; -uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; -int const t_lt = get_sub_group_local_id() < half_lane_idx; -HS_CMP_HALF(0, r1) -HS_CMP_HALF(1, r2) -HS_CMP_HALF(2, r3) -HS_CMP_HALF(3, r4) -HS_CMP_HALF(4, r5) -HS_CMP_HALF(5, r6) -HS_CMP_HALF(6, r7) -HS_CMP_HALF(7, r8) -HS_CMP_HALF(8, r9) -HS_CMP_HALF(9, r10) -HS_CMP_HALF(10, r11) -HS_CMP_HALF(11, r12) -HS_CMP_HALF(12, r13) -HS_CMP_HALF(13, r14) -HS_CMP_HALF(14, r15) -HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - 
HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(vout + gmem_idx)[0 * 8] = r1; -(vout + gmem_idx)[1 * 8] = r2; -(vout + gmem_idx)[2 * 8] = r3; -(vout + gmem_idx)[3 * 8] = r4; -(vout + gmem_idx)[4 * 8] = r5; -(vout + gmem_idx)[5 * 8] = r6; -(vout + gmem_idx)[6 * 8] = r7; -(vout + gmem_idx)[7 * 8] = r8; -(vout + gmem_idx)[8 * 8] = r9; -(vout + gmem_idx)[9 * 8] = r10; -(vout + gmem_idx)[10 * 8] = r11; -(vout + gmem_idx)[11 * 8] = r12; -(vout + gmem_idx)[12 * 8] = r13; -(vout + gmem_idx)[13 * 8] = r14; -(vout + gmem_idx)[14 * 8] = r15; -(vout + gmem_idx)[15 * 8] = r16; -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) -__attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_bs_1(__global HS_KEY_TYPE const* const restrict vin, - __global HS_KEY_TYPE* const restrict vout) -{ - __local union - { - HS_KEY_TYPE m[16 * 16]; - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8]; - HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8]; - HS_KEY_TYPE r3 = 
(vin + gmem_idx)[2 * 8]; - HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8]; - HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8]; - HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8]; - HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8]; - HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8]; - HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8]; - HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8]; - HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8]; - HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8]; - HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8]; - HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8]; - HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8]; - HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8]; - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r6, r11) - HS_CMP_XCHG(r7, r10) - HS_CMP_XCHG(r4, r13) - HS_CMP_XCHG(r14, r15) - HS_CMP_XCHG(r8, r12) - HS_CMP_XCHG(r2, r3) - HS_CMP_XCHG(r5, r9) - HS_CMP_XCHG(r2, r5) - HS_CMP_XCHG(r8, r14) - HS_CMP_XCHG(r3, r9) - HS_CMP_XCHG(r12, r15) - HS_CMP_XCHG(r3, r5) - HS_CMP_XCHG(r6, r7) - HS_CMP_XCHG(r10, r11) - HS_CMP_XCHG(r12, r14) - HS_CMP_XCHG(r4, r9) - HS_CMP_XCHG(r8, r13) - HS_CMP_XCHG(r7, r9) - HS_CMP_XCHG(r11, r13) - HS_CMP_XCHG(r4, r6) - HS_CMP_XCHG(r8, r10) - HS_CMP_XCHG(r4, r5) - HS_CMP_XCHG(r6, r7) - HS_CMP_XCHG(r8, r9) - HS_CMP_XCHG(r10, r11) - HS_CMP_XCHG(r12, r13) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - { - uint const 
flip_lane_mask = 1; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - { - uint const flip_lane_mask = 3; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - { - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } 
- HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - { - uint const flip_lane_mask = 7; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - { - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - { - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - 
HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - uint const smem_l_idx = get_sub_group_id() * 16 + get_sub_group_local_id(); - uint const smem_r_idx = - (get_sub_group_id() ^ 1) * 16 + (get_sub_group_local_id() ^ 7); - (shared.m + get_local_id(0))[2 * 8 * 0] = r1; - (shared.m + get_local_id(0))[2 * 8 * 1] = r16; - (shared.m + get_local_id(0))[2 * 8 * 2] = r2; - (shared.m + get_local_id(0))[2 * 8 * 3] = r15; - (shared.m + get_local_id(0))[2 * 8 * 4] = r3; - (shared.m + get_local_id(0))[2 * 8 * 5] = r14; - (shared.m + get_local_id(0))[2 * 8 * 6] = r4; - (shared.m + get_local_id(0))[2 * 8 * 7] = r13; - (shared.m + get_local_id(0))[2 * 8 * 8] = r5; - (shared.m + get_local_id(0))[2 * 8 * 9] = r12; - (shared.m + get_local_id(0))[2 * 8 * 10] = r6; - (shared.m + get_local_id(0))[2 * 8 * 11] = r11; - (shared.m + get_local_id(0))[2 * 8 * 12] = r7; - (shared.m + get_local_id(0))[2 * 8 * 13] = r10; - (shared.m + get_local_id(0))[2 * 8 * 14] = r8; - (shared.m + get_local_id(0))[2 * 8 * 15] = r9; - barrier(CLK_LOCAL_MEM_FENCE); - { - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8]; - 
HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_r_idx)[8] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[32]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[40]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[32] = r0_1; - (shared.m + smem_r_idx)[40] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[64]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[72]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[64] = r0_1; - (shared.m + smem_r_idx)[72] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[96]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[104]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[96] = r0_1; - (shared.m + smem_r_idx)[104] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[136]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[128] = r0_1; - (shared.m + smem_r_idx)[136] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[160]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[168]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[160] = r0_1; - (shared.m + smem_r_idx)[168] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[192]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[200]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[192] = r0_1; - (shared.m + smem_r_idx)[200] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[224]; - HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[232]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[224] = r0_1; - (shared.m + smem_r_idx)[232] = r0_2; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - r1 = (shared.m + get_local_id(0))[2 * 8 * 0]; - r16 = (shared.m + get_local_id(0))[2 * 8 * 1]; - r2 = (shared.m + get_local_id(0))[2 * 8 * 2]; - r15 = (shared.m + get_local_id(0))[2 * 8 * 3]; - r3 = (shared.m + get_local_id(0))[2 * 8 * 4]; - r14 = (shared.m + get_local_id(0))[2 * 8 * 5]; - r4 = (shared.m + get_local_id(0))[2 * 8 * 6]; - r13 = (shared.m + 
get_local_id(0))[2 * 8 * 7]; - r5 = (shared.m + get_local_id(0))[2 * 8 * 8]; - r12 = (shared.m + get_local_id(0))[2 * 8 * 9]; - r6 = (shared.m + get_local_id(0))[2 * 8 * 10]; - r11 = (shared.m + get_local_id(0))[2 * 8 * 11]; - r7 = (shared.m + get_local_id(0))[2 * 8 * 12]; - r10 = (shared.m + get_local_id(0))[2 * 8 * 13]; - r8 = (shared.m + get_local_id(0))[2 * 8 * 14]; - r9 = (shared.m + get_local_id(0))[2 * 8 * 15]; - { { uint const half_lane_mask = 4; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} 
-HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(vout + gmem_idx)[0 * 8] = r1; -(vout + gmem_idx)[1 * 8] = r2; -(vout + gmem_idx)[2 * 8] = r3; -(vout + gmem_idx)[3 * 8] = r4; -(vout + gmem_idx)[4 * 8] = r5; -(vout + gmem_idx)[5 * 8] = r6; -(vout + gmem_idx)[6 * 8] = r7; -(vout + gmem_idx)[7 * 8] = r8; -(vout + gmem_idx)[8 * 8] = r9; -(vout + gmem_idx)[9 * 8] = r10; -(vout + gmem_idx)[10 * 8] = r11; -(vout + gmem_idx)[11 * 8] = r12; -(vout + gmem_idx)[12 * 8] = r13; -(vout + gmem_idx)[13 * 8] = r14; -(vout + gmem_idx)[14 * 8] = r15; -(vout + gmem_idx)[15 * 8] = r16; -} - -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) -__attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_bs_0(__global HS_KEY_TYPE const* const restrict vin, - __global HS_KEY_TYPE* const restrict vout) -{ - __local union - { - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8]; - HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8]; - HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8]; - HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8]; - HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8]; - HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8]; - HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8]; - HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8]; - HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8]; - 
HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8]; - HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8]; - HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8]; - HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8]; - HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8]; - HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8]; - HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8]; - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r6, r11) - HS_CMP_XCHG(r7, r10) - HS_CMP_XCHG(r4, r13) - HS_CMP_XCHG(r14, r15) - HS_CMP_XCHG(r8, r12) - HS_CMP_XCHG(r2, r3) - HS_CMP_XCHG(r5, r9) - HS_CMP_XCHG(r2, r5) - HS_CMP_XCHG(r8, r14) - HS_CMP_XCHG(r3, r9) - HS_CMP_XCHG(r12, r15) - HS_CMP_XCHG(r3, r5) - HS_CMP_XCHG(r6, r7) - HS_CMP_XCHG(r10, r11) - HS_CMP_XCHG(r12, r14) - HS_CMP_XCHG(r4, r9) - HS_CMP_XCHG(r8, r13) - HS_CMP_XCHG(r7, r9) - HS_CMP_XCHG(r11, r13) - HS_CMP_XCHG(r4, r6) - HS_CMP_XCHG(r8, r10) - HS_CMP_XCHG(r4, r5) - HS_CMP_XCHG(r6, r7) - HS_CMP_XCHG(r8, r9) - HS_CMP_XCHG(r10, r11) - HS_CMP_XCHG(r12, r13) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - { - uint const flip_lane_mask = 1; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - 
HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - { - uint const flip_lane_mask = 3; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - { - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - 
HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - { - uint const flip_lane_mask = 7; - uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; - int const t_lt = get_sub_group_local_id() < flip_lane_idx; - HS_CMP_FLIP(0, r1, r16) - HS_CMP_FLIP(1, r2, r15) - HS_CMP_FLIP(2, r3, r14) - HS_CMP_FLIP(3, r4, r13) - HS_CMP_FLIP(4, r5, r12) - HS_CMP_FLIP(5, r6, r11) - HS_CMP_FLIP(6, r7, r10) - HS_CMP_FLIP(7, r8, r9) - } - { - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - { - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - 
HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - (vout + gmem_idx)[0 * 8] = r1; - (vout + gmem_idx)[1 * 8] = r2; - (vout + gmem_idx)[2 * 8] = r3; - (vout + gmem_idx)[3 * 8] = r4; - (vout + gmem_idx)[4 * 8] = r5; - (vout + gmem_idx)[5 * 8] = r6; - (vout + gmem_idx)[6 * 8] = r7; - (vout + gmem_idx)[7 * 8] = r8; - (vout + gmem_idx)[8 * 8] = r9; - (vout + gmem_idx)[9 * 8] = r10; - (vout + gmem_idx)[10 * 8] = r11; - (vout + gmem_idx)[11 * 8] = r12; - (vout + gmem_idx)[12 * 8] = r13; - (vout + gmem_idx)[13 * 8] = r14; - (vout + gmem_idx)[14 * 8] = r15; - (vout + gmem_idx)[15 * 8] = r16; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_bc_4(__global HS_KEY_TYPE* const restrict vout) -{ - __local union - { - HS_KEY_TYPE m[16 * 128]; - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - uint const gmem_l_idx = (global_id / 128) * 2048 + (global_id & 127); - uint const smem_l_idx = get_sub_group_id() * 128 + get_sub_group_local_id(); - { - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128]; - HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256]; - HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384]; - HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[512]; - HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[640]; - HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[768]; - HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[896]; - 
HS_KEY_TYPE r0_9 = (vout + gmem_l_idx)[1024]; - HS_KEY_TYPE r0_10 = (vout + gmem_l_idx)[1152]; - HS_KEY_TYPE r0_11 = (vout + gmem_l_idx)[1280]; - HS_KEY_TYPE r0_12 = (vout + gmem_l_idx)[1408]; - HS_KEY_TYPE r0_13 = (vout + gmem_l_idx)[1536]; - HS_KEY_TYPE r0_14 = (vout + gmem_l_idx)[1664]; - HS_KEY_TYPE r0_15 = (vout + gmem_l_idx)[1792]; - HS_KEY_TYPE r0_16 = (vout + gmem_l_idx)[1920]; - HS_CMP_XCHG(r0_1, r0_9) - HS_CMP_XCHG(r0_5, r0_13) - HS_CMP_XCHG(r0_1, r0_5) - HS_CMP_XCHG(r0_9, r0_13) - HS_CMP_XCHG(r0_3, r0_11) - HS_CMP_XCHG(r0_7, r0_15) - HS_CMP_XCHG(r0_3, r0_7) - HS_CMP_XCHG(r0_11, r0_15) - HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_5, r0_7) - HS_CMP_XCHG(r0_9, r0_11) - HS_CMP_XCHG(r0_13, r0_15) - HS_CMP_XCHG(r0_2, r0_10) - HS_CMP_XCHG(r0_6, r0_14) - HS_CMP_XCHG(r0_2, r0_6) - HS_CMP_XCHG(r0_10, r0_14) - HS_CMP_XCHG(r0_4, r0_12) - HS_CMP_XCHG(r0_8, r0_16) - HS_CMP_XCHG(r0_4, r0_8) - HS_CMP_XCHG(r0_12, r0_16) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_6, r0_8) - HS_CMP_XCHG(r0_10, r0_12) - HS_CMP_XCHG(r0_14, r0_16) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_5, r0_6) - HS_CMP_XCHG(r0_7, r0_8) - HS_CMP_XCHG(r0_9, r0_10) - HS_CMP_XCHG(r0_11, r0_12) - HS_CMP_XCHG(r0_13, r0_14) - HS_CMP_XCHG(r0_15, r0_16) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_l_idx)[8] = r0_2; - (shared.m + smem_l_idx)[16] = r0_3; - (shared.m + smem_l_idx)[24] = r0_4; - (shared.m + smem_l_idx)[32] = r0_5; - (shared.m + smem_l_idx)[40] = r0_6; - (shared.m + smem_l_idx)[48] = r0_7; - (shared.m + smem_l_idx)[56] = r0_8; - (shared.m + smem_l_idx)[64] = r0_9; - (shared.m + smem_l_idx)[72] = r0_10; - (shared.m + smem_l_idx)[80] = r0_11; - (shared.m + smem_l_idx)[88] = r0_12; - (shared.m + smem_l_idx)[96] = r0_13; - (shared.m + smem_l_idx)[104] = r0_14; - (shared.m + smem_l_idx)[112] = r0_15; - (shared.m + smem_l_idx)[120] = r0_16; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[16 * 8 * 0]; - HS_KEY_TYPE r2 = (shared.m + 
get_local_id(0))[16 * 8 * 1]; - HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[16 * 8 * 2]; - HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[16 * 8 * 3]; - HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[16 * 8 * 4]; - HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[16 * 8 * 5]; - HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[16 * 8 * 6]; - HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[16 * 8 * 7]; - HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[16 * 8 * 8]; - HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[16 * 8 * 9]; - HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[16 * 8 * 10]; - HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[16 * 8 * 11]; - HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[16 * 8 * 12]; - HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[16 * 8 * 13]; - HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[16 * 8 * 14]; - HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[16 * 8 * 15]; - { { uint const half_lane_mask = 4; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const 
half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(vout + gmem_idx)[0 * 8] = r1; -(vout + gmem_idx)[1 * 8] = r2; -(vout + gmem_idx)[2 * 8] = r3; -(vout + gmem_idx)[3 * 8] = r4; -(vout + gmem_idx)[4 * 8] = r5; -(vout + gmem_idx)[5 * 8] = r6; -(vout + gmem_idx)[6 * 8] = r7; -(vout + gmem_idx)[7 * 8] = r8; -(vout + gmem_idx)[8 * 8] = r9; -(vout + gmem_idx)[9 * 8] = r10; -(vout + gmem_idx)[10 * 8] = r11; -(vout + gmem_idx)[11 * 8] = r12; -(vout + gmem_idx)[12 * 8] = r13; -(vout + gmem_idx)[13 * 8] = r14; -(vout + gmem_idx)[14 * 8] = r15; -(vout + gmem_idx)[15 * 8] = r16; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_bc_3(__global HS_KEY_TYPE* const restrict vout) -{ - __local union - { - HS_KEY_TYPE m[16 * 64]; - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - 
uint const gmem_l_idx = (global_id / 64) * 1024 + (global_id & 63); - uint const smem_l_idx = get_sub_group_id() * 64 + get_sub_group_local_id(); - { - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128]; - HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256]; - HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384]; - HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[512]; - HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[640]; - HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[768]; - HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[896]; - HS_CMP_XCHG(r0_1, r0_5) - HS_CMP_XCHG(r0_3, r0_7) - HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_5, r0_7) - HS_CMP_XCHG(r0_2, r0_6) - HS_CMP_XCHG(r0_4, r0_8) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_6, r0_8) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_5, r0_6) - HS_CMP_XCHG(r0_7, r0_8) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_l_idx)[8] = r0_2; - (shared.m + smem_l_idx)[16] = r0_3; - (shared.m + smem_l_idx)[24] = r0_4; - (shared.m + smem_l_idx)[32] = r0_5; - (shared.m + smem_l_idx)[40] = r0_6; - (shared.m + smem_l_idx)[48] = r0_7; - (shared.m + smem_l_idx)[56] = r0_8; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192]; - HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[320]; - HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[448]; - HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[576]; - HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[704]; - HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[832]; - HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[960]; - HS_CMP_XCHG(r0_1, r0_5) - HS_CMP_XCHG(r0_3, r0_7) - HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_5, r0_7) - HS_CMP_XCHG(r0_2, r0_6) - HS_CMP_XCHG(r0_4, r0_8) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_6, r0_8) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - HS_CMP_XCHG(r0_5, r0_6) - HS_CMP_XCHG(r0_7, r0_8) - (shared.m + smem_l_idx)[512] = r0_1; - (shared.m + smem_l_idx)[520] = r0_2; - (shared.m + smem_l_idx)[528] = r0_3; - (shared.m + smem_l_idx)[536] = r0_4; - 
(shared.m + smem_l_idx)[544] = r0_5; - (shared.m + smem_l_idx)[552] = r0_6; - (shared.m + smem_l_idx)[560] = r0_7; - (shared.m + smem_l_idx)[568] = r0_8; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[8 * 8 * 0]; - HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[8 * 8 * 1]; - HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[8 * 8 * 2]; - HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[8 * 8 * 3]; - HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[8 * 8 * 4]; - HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[8 * 8 * 5]; - HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[8 * 8 * 6]; - HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[8 * 8 * 7]; - HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[8 * 8 * 8]; - HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[8 * 8 * 9]; - HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[8 * 8 * 10]; - HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[8 * 8 * 11]; - HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[8 * 8 * 12]; - HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[8 * 8 * 13]; - HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[8 * 8 * 14]; - HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[8 * 8 * 15]; - { { uint const half_lane_mask = 4; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - 
HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(vout + gmem_idx)[0 * 8] = r1; -(vout + gmem_idx)[1 * 8] = r2; -(vout + gmem_idx)[2 * 8] = r3; -(vout + gmem_idx)[3 * 8] = r4; -(vout + gmem_idx)[4 * 8] = r5; -(vout + gmem_idx)[5 * 8] = r6; -(vout + gmem_idx)[6 * 8] = r7; -(vout + gmem_idx)[7 * 8] = r8; -(vout + gmem_idx)[8 * 8] = r9; -(vout + gmem_idx)[9 * 8] = r10; -(vout + gmem_idx)[10 * 8] = r11; -(vout + gmem_idx)[11 * 8] = r12; -(vout + gmem_idx)[12 * 8] = r13; -(vout + gmem_idx)[13 * 8] = r14; -(vout + gmem_idx)[14 * 8] = r15; -(vout + gmem_idx)[15 * 8] = r16; -} - -__kernel 
__attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_bc_2(__global HS_KEY_TYPE* const restrict vout) -{ - __local union - { - HS_KEY_TYPE m[16 * 32]; - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - uint const gmem_l_idx = (global_id / 32) * 512 + (global_id & 31); - uint const smem_l_idx = get_sub_group_id() * 32 + get_sub_group_local_id(); - { - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128]; - HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256]; - HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384]; - HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_l_idx)[8] = r0_2; - (shared.m + smem_l_idx)[16] = r0_3; - (shared.m + smem_l_idx)[24] = r0_4; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[32]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[160]; - HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[288]; - HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[416]; - HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - (shared.m + smem_l_idx)[128] = r0_1; - (shared.m + smem_l_idx)[136] = r0_2; - (shared.m + smem_l_idx)[144] = r0_3; - (shared.m + smem_l_idx)[152] = r0_4; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192]; - HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[320]; - HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[448]; - HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - (shared.m + smem_l_idx)[256] = r0_1; - (shared.m + smem_l_idx)[264] = r0_2; - (shared.m + smem_l_idx)[272] = r0_3; - (shared.m + smem_l_idx)[280] = r0_4; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[96]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[224]; - HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[352]; - HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[480]; - 
HS_CMP_XCHG(r0_1, r0_3) - HS_CMP_XCHG(r0_2, r0_4) - HS_CMP_XCHG(r0_1, r0_2) - HS_CMP_XCHG(r0_3, r0_4) - (shared.m + smem_l_idx)[384] = r0_1; - (shared.m + smem_l_idx)[392] = r0_2; - (shared.m + smem_l_idx)[400] = r0_3; - (shared.m + smem_l_idx)[408] = r0_4; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[4 * 8 * 0]; - HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[4 * 8 * 1]; - HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[4 * 8 * 2]; - HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[4 * 8 * 3]; - HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[4 * 8 * 4]; - HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[4 * 8 * 5]; - HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[4 * 8 * 6]; - HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[4 * 8 * 7]; - HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[4 * 8 * 8]; - HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[4 * 8 * 9]; - HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[4 * 8 * 10]; - HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[4 * 8 * 11]; - HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[4 * 8 * 12]; - HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[4 * 8 * 13]; - HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[4 * 8 * 14]; - HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[4 * 8 * 15]; - { { uint const half_lane_mask = 4; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, 
r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(vout + gmem_idx)[0 * 8] = r1; -(vout + gmem_idx)[1 * 8] = r2; -(vout + gmem_idx)[2 * 8] = r3; -(vout + gmem_idx)[3 * 8] = r4; -(vout + gmem_idx)[4 * 8] = r5; -(vout + gmem_idx)[5 * 8] = r6; -(vout + gmem_idx)[6 * 8] = r7; -(vout + gmem_idx)[7 * 8] = r8; -(vout + gmem_idx)[8 * 8] = r9; -(vout + gmem_idx)[9 * 8] = r10; -(vout + gmem_idx)[10 * 8] = r11; -(vout + gmem_idx)[11 * 8] = r12; -(vout + gmem_idx)[12 * 8] = r13; -(vout + gmem_idx)[13 * 8] = r14; 
-(vout + gmem_idx)[14 * 8] = r15; -(vout + gmem_idx)[15 * 8] = r16; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_bc_1(__global HS_KEY_TYPE* const restrict vout) -{ - __local union - { - HS_KEY_TYPE m[16 * 16]; - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - uint const gmem_l_idx = (global_id / 16) * 256 + (global_id & 15); - uint const smem_l_idx = get_sub_group_id() * 16 + get_sub_group_local_id(); - { - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[0] = r0_1; - (shared.m + smem_l_idx)[8] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[16]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[144]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[32] = r0_1; - (shared.m + smem_l_idx)[40] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[32]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[160]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[64] = r0_1; - (shared.m + smem_l_idx)[72] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[48]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[176]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[96] = r0_1; - (shared.m + smem_l_idx)[104] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[128] = r0_1; - (shared.m + smem_l_idx)[136] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[80]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[208]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[160] = r0_1; - (shared.m + smem_l_idx)[168] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[96]; - HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[224]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[192] = r0_1; - (shared.m + smem_l_idx)[200] = r0_2; - } - { - HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[112]; - 
HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[240]; - HS_CMP_XCHG(r0_1, r0_2) - (shared.m + smem_l_idx)[224] = r0_1; - (shared.m + smem_l_idx)[232] = r0_2; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[2 * 8 * 0]; - HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[2 * 8 * 1]; - HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[2 * 8 * 2]; - HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[2 * 8 * 3]; - HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[2 * 8 * 4]; - HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[2 * 8 * 5]; - HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[2 * 8 * 6]; - HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[2 * 8 * 7]; - HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[2 * 8 * 8]; - HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[2 * 8 * 9]; - HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[2 * 8 * 10]; - HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[2 * 8 * 11]; - HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[2 * 8 * 12]; - HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[2 * 8 * 13]; - HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[2 * 8 * 14]; - HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[2 * 8 * 15]; - { { uint const half_lane_mask = 4; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) 
- HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(vout + gmem_idx)[0 * 8] = r1; -(vout + gmem_idx)[1 * 8] = r2; -(vout + gmem_idx)[2 * 8] = r3; -(vout + gmem_idx)[3 * 8] = r4; -(vout + gmem_idx)[4 * 8] = r5; -(vout + gmem_idx)[5 * 8] = r6; -(vout + gmem_idx)[6 * 8] = r7; -(vout + gmem_idx)[7 * 8] = r8; -(vout + gmem_idx)[8 * 8] = r9; -(vout + gmem_idx)[9 * 8] = r10; -(vout + gmem_idx)[10 * 8] = r11; -(vout + gmem_idx)[11 * 8] = r12; -(vout + gmem_idx)[12 * 8] = r13; -(vout + gmem_idx)[13 * 8] = r14; -(vout + gmem_idx)[14 * 8] = r15; -(vout + gmem_idx)[15 * 8] = r16; -} - -__kernel 
__attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_bc_0(__global HS_KEY_TYPE* const restrict vout) -{ - __local union - { - } shared; - - uint const global_id = get_global_id(0); - uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7); - - HS_KEY_TYPE r1 = (vout + gmem_idx)[0 * 8]; - HS_KEY_TYPE r2 = (vout + gmem_idx)[1 * 8]; - HS_KEY_TYPE r3 = (vout + gmem_idx)[2 * 8]; - HS_KEY_TYPE r4 = (vout + gmem_idx)[3 * 8]; - HS_KEY_TYPE r5 = (vout + gmem_idx)[4 * 8]; - HS_KEY_TYPE r6 = (vout + gmem_idx)[5 * 8]; - HS_KEY_TYPE r7 = (vout + gmem_idx)[6 * 8]; - HS_KEY_TYPE r8 = (vout + gmem_idx)[7 * 8]; - HS_KEY_TYPE r9 = (vout + gmem_idx)[8 * 8]; - HS_KEY_TYPE r10 = (vout + gmem_idx)[9 * 8]; - HS_KEY_TYPE r11 = (vout + gmem_idx)[10 * 8]; - HS_KEY_TYPE r12 = (vout + gmem_idx)[11 * 8]; - HS_KEY_TYPE r13 = (vout + gmem_idx)[12 * 8]; - HS_KEY_TYPE r14 = (vout + gmem_idx)[13 * 8]; - HS_KEY_TYPE r15 = (vout + gmem_idx)[14 * 8]; - HS_KEY_TYPE r16 = (vout + gmem_idx)[15 * 8]; - { { uint const half_lane_mask = 4; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 2; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - 
HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -{ - uint const half_lane_mask = 1; - uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; - int const t_lt = get_sub_group_local_id() < half_lane_idx; - HS_CMP_HALF(0, r1) - HS_CMP_HALF(1, r2) - HS_CMP_HALF(2, r3) - HS_CMP_HALF(3, r4) - HS_CMP_HALF(4, r5) - HS_CMP_HALF(5, r6) - HS_CMP_HALF(6, r7) - HS_CMP_HALF(7, r8) - HS_CMP_HALF(8, r9) - HS_CMP_HALF(9, r10) - HS_CMP_HALF(10, r11) - HS_CMP_HALF(11, r12) - HS_CMP_HALF(12, r13) - HS_CMP_HALF(13, r14) - HS_CMP_HALF(14, r15) - HS_CMP_HALF(15, r16) -} -HS_CMP_XCHG(r1, r9) -HS_CMP_XCHG(r5, r13) -HS_CMP_XCHG(r1, r5) -HS_CMP_XCHG(r9, r13) -HS_CMP_XCHG(r3, r11) -HS_CMP_XCHG(r7, r15) -HS_CMP_XCHG(r3, r7) -HS_CMP_XCHG(r11, r15) -HS_CMP_XCHG(r1, r3) -HS_CMP_XCHG(r5, r7) -HS_CMP_XCHG(r9, r11) -HS_CMP_XCHG(r13, r15) -HS_CMP_XCHG(r2, r10) -HS_CMP_XCHG(r6, r14) -HS_CMP_XCHG(r2, r6) -HS_CMP_XCHG(r10, r14) -HS_CMP_XCHG(r4, r12) -HS_CMP_XCHG(r8, r16) -HS_CMP_XCHG(r4, r8) -HS_CMP_XCHG(r12, r16) -HS_CMP_XCHG(r2, r4) -HS_CMP_XCHG(r6, r8) -HS_CMP_XCHG(r10, r12) -HS_CMP_XCHG(r14, r16) -HS_CMP_XCHG(r1, r2) -HS_CMP_XCHG(r3, r4) -HS_CMP_XCHG(r5, r6) -HS_CMP_XCHG(r7, r8) -HS_CMP_XCHG(r9, r10) -HS_CMP_XCHG(r11, r12) -HS_CMP_XCHG(r13, r14) -HS_CMP_XCHG(r15, r16) -} -(vout + gmem_idx)[0 * 8] = r1; -(vout + gmem_idx)[1 * 8] = r2; -(vout + gmem_idx)[2 * 8] = r3; -(vout + gmem_idx)[3 * 8] = r4; -(vout + gmem_idx)[4 * 8] = r5; -(vout + gmem_idx)[5 * 8] = r6; -(vout + gmem_idx)[6 * 8] = r7; -(vout + gmem_idx)[7 * 8] = r8; -(vout + gmem_idx)[8 * 8] = r9; -(vout + gmem_idx)[9 * 8] = r10; -(vout + gmem_idx)[10 * 8] = r11; -(vout + gmem_idx)[11 * 8] = r12; -(vout + gmem_idx)[12 * 8] = r13; -(vout + gmem_idx)[13 * 8] = r14; -(vout + gmem_idx)[14 * 8] = r15; -(vout + gmem_idx)[15 * 8] = r16; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_1(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = 
(uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 0; - - uint const merge_stride = 16 * 8 << 0; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 0)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 
= merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * 
merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) 
- merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_2(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 1; - - uint const merge_stride = 16 * 8 << 1; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 1)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const 
merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - 
HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - 
HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - 
HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_3(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 2; - - uint const merge_stride = 16 * 8 << 2; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 2)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - 
HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - 
HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * 
merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] 
= r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_4(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 3; - - uint const merge_stride = 16 * 8 << 3; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 3)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * 
merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - 
merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - 
HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel 
__attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_5(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 4; - - uint const merge_stride = 16 * 8 << 4; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 4)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = 
merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * 
merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - 
HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_6(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 5; - - uint const merge_stride = 16 * 8 << 5; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = 
merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 5)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) 
- HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = 
merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - 
HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_5(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 0; - - uint const merge_stride = 16 * 8 << 0; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = (warp_idx - merge_idx * (16 << 0)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 
* merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; - HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - 
HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] = r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 
* merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_7(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 6; - - uint const merge_stride = 16 * 8 << 6; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 6)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < 
fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * 
merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, 
r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_6(__global HS_KEY_TYPE* const restrict vout) -{ - 
uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 1; - - uint const merge_stride = 16 * 8 << 1; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = (warp_idx - merge_idx * (16 << 1)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; - HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = 
merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - 
merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] = r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 * merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_8(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 7; - - uint const merge_stride = 16 * 8 << 7; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 7)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = 
- vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - 
HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) 
- HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - 
HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_7(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 2; - - uint const merge_stride = 16 * 8 << 2; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = (warp_idx - merge_idx * (16 << 2)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = 
merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; - HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, 
r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] = r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 * merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -__kernel 
__attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_9(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 8; - - uint const merge_stride = 16 * 8 << 8; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 8)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = 
merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * 
merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - 
HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_8(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 3; - - uint const merge_stride = 16 * 8 << 3; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = 
(warp_idx - merge_idx * (16 << 3)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; - HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - 
HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - 
merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] = r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 * merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_10(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 9; - - uint const merge_stride = 16 * 8 << 9; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 9)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = 
merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) 
- HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * 
merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * 
merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_9(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 4; - - uint const merge_stride = 16 * 8 << 4; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = (warp_idx - merge_idx * (16 << 4)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; - HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * 
merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - 
HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] = r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 * merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_11(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 10; - - 
uint const merge_stride = 16 * 8 << 10; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 10)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 
* merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - 
HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - 
HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_10(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 5; - - uint const merge_stride = 16 * 8 << 5; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = (warp_idx - merge_idx * (16 << 5)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - 
HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; - HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - 
HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] = r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - 
merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 * merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_12(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 11; - - uint const merge_stride = 16 * 8 << 11; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 11)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * 
merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - 
HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - 
HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] 
= r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_11(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 6; - - uint const merge_stride = 16 * 8 << 6; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = (warp_idx - merge_idx * (16 << 6)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; - HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * 
merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - 
HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] = r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 * merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_13(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 12; - - uint const merge_stride = 16 * 8 << 12; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 12)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 
1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - 
HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - 
HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, 
r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_12(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 7; - - uint const merge_stride = 16 * 8 << 7; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = (warp_idx - merge_idx * (16 << 7)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 
* merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; - HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) 
- HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] = r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 * merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * 
merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_14(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 13; - - uint const merge_stride = 16 * 8 << 13; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 13)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - 
HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * 
merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = 
merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_13(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 8; - - uint const merge_stride = 16 * 8 << 8; - uint const 
merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = (warp_idx - merge_idx * (16 << 8)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; - HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - 
HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * 
merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] = r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 * merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_15(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 14; - - uint const merge_stride = 16 * 8 << 14; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 14)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE 
r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, 
r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = 
r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - 
merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_14(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 9; - - uint const merge_stride = 16 * 8 << 9; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = (warp_idx - merge_idx * (16 << 9)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; 
- HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - 
HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] = r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 * merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_fm_16(__global HS_KEY_TYPE* const restrict vout, - uint const fm_full, - uint const fm_frac) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - 
uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = warp_idx / 16 >> 15; - - uint const merge_stride = 16 * 8 << 15; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - - uint const merge_l_off = - (warp_idx - merge_idx * (16 << 15)) * 8 + warp_lane_idx; - uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off; - - int const merge_r_off = merge_keys - merge_l_end - 1; - - __global HS_KEY_TYPE* const restrict merge_l = - vout + (merge_base + merge_l_off); - __global HS_KEY_TYPE* const restrict merge_r = - vout + (merge_base + merge_r_off); - - HS_KEY_TYPE r1 = merge_l[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_l[1 * merge_stride]; - HS_KEY_TYPE r3 = merge_l[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_l[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_l[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_l[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_l[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_l[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_l[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_l[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_l[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_l[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_l[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_l[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_l[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_l[15 * merge_stride]; - if (merge_idx < fm_full) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_KEY_TYPE r25 = merge_r[8 * merge_stride]; - HS_KEY_TYPE r26 = merge_r[9 * merge_stride]; - HS_KEY_TYPE r27 = merge_r[10 * merge_stride]; - HS_KEY_TYPE r28 = merge_r[11 * merge_stride]; - HS_KEY_TYPE r29 = merge_r[12 * 
merge_stride]; - HS_KEY_TYPE r30 = merge_r[13 * merge_stride]; - HS_KEY_TYPE r31 = merge_r[14 * merge_stride]; - HS_KEY_TYPE r32 = merge_r[15 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r8, r25) - HS_CMP_XCHG(r7, r26) - HS_CMP_XCHG(r6, r27) - HS_CMP_XCHG(r5, r28) - HS_CMP_XCHG(r4, r29) - HS_CMP_XCHG(r3, r30) - HS_CMP_XCHG(r2, r31) - HS_CMP_XCHG(r1, r32) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_r[15 * merge_stride] = r32; - merge_r[14 * merge_stride] = r31; - merge_r[13 * merge_stride] = r30; - merge_r[12 * merge_stride] = r29; - merge_r[11 * merge_stride] = r28; - merge_r[10 * merge_stride] = r27; - merge_r[9 * merge_stride] = r26; - merge_r[8 * merge_stride] = r25; - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 8) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - 
HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_KEY_TYPE r21 = merge_r[4 * merge_stride]; - HS_KEY_TYPE r22 = merge_r[5 * merge_stride]; - HS_KEY_TYPE r23 = merge_r[6 * merge_stride]; - HS_KEY_TYPE r24 = merge_r[7 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r12, r21) - HS_CMP_XCHG(r11, r22) - HS_CMP_XCHG(r10, r23) - HS_CMP_XCHG(r9, r24) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - merge_r[7 * merge_stride] = r24; - merge_r[6 * merge_stride] = r23; - merge_r[5 * merge_stride] = r22; - merge_r[4 * merge_stride] = r21; - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 4) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_KEY_TYPE r19 = merge_r[2 * merge_stride]; - HS_KEY_TYPE r20 = merge_r[3 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r14, r19) - HS_CMP_XCHG(r13, r20) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - merge_r[3 * merge_stride] = r20; - merge_r[2 * merge_stride] = r19; - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else if (fm_frac == 2) { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_KEY_TYPE r18 = merge_r[1 * merge_stride]; - HS_CMP_XCHG(r16, r17) - HS_CMP_XCHG(r15, r18) - HS_CMP_XCHG(r17, r18) - merge_r[1 * merge_stride] = r18; - merge_r[0 * merge_stride] = r17; - } else { - HS_KEY_TYPE r17 = merge_r[0 * merge_stride]; - HS_CMP_XCHG(r16, r17) - merge_r[0 * merge_stride] = r17; - } - HS_CMP_XCHG(r1, r9) - 
HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - merge_l[15 * merge_stride] = r16; - merge_l[14 * merge_stride] = r15; - merge_l[13 * merge_stride] = r14; - merge_l[12 * merge_stride] = r13; - merge_l[11 * merge_stride] = r12; - merge_l[10 * merge_stride] = r11; - merge_l[9 * merge_stride] = r10; - merge_l[8 * merge_stride] = r9; - merge_l[7 * merge_stride] = r8; - merge_l[6 * merge_stride] = r7; - merge_l[5 * merge_stride] = r6; - merge_l[4 * merge_stride] = r5; - merge_l[3 * merge_stride] = r4; - merge_l[2 * merge_stride] = r3; - merge_l[1 * merge_stride] = r2; - merge_l[0 * merge_stride] = r1; -} - -__kernel __attribute__((intel_reqd_sub_group_size(8))) void -hs_kernel_hm_15(__global HS_KEY_TYPE* const restrict vout) -{ - uint const global_id = (uint)get_global_id(0); - uint const warp_idx = global_id / 8; - uint const warp_lane_idx = global_id & 7; - - uint const merge_idx = (warp_idx / 16) >> 10; - - uint const merge_stride = 16 * 8 << 10; - uint const merge_keys = merge_stride * 32; - - uint const merge_base = merge_idx * merge_keys; - uint const merge_off = (warp_idx - merge_idx * (16 << 10)) * 8; - - __global HS_KEY_TYPE* const restrict merge_ptr = - vout + (merge_base + merge_off + warp_lane_idx); - - HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride]; - HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride]; - HS_KEY_TYPE 
r3 = merge_ptr[2 * merge_stride]; - HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride]; - HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride]; - HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride]; - HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride]; - HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride]; - HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride]; - HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride]; - HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride]; - HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride]; - HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride]; - HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride]; - HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride]; - HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride]; - HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride]; - HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride]; - HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride]; - HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride]; - HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride]; - HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride]; - HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride]; - HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride]; - HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride]; - HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride]; - HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride]; - HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride]; - HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride]; - HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride]; - HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride]; - HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride]; - HS_CMP_XCHG(r1, r17) - HS_CMP_XCHG(r9, r25) - HS_CMP_XCHG(r1, r9) - HS_CMP_XCHG(r17, r25) - HS_CMP_XCHG(r5, r21) - HS_CMP_XCHG(r13, r29) - HS_CMP_XCHG(r5, r13) - HS_CMP_XCHG(r21, r29) - HS_CMP_XCHG(r1, r5) - HS_CMP_XCHG(r9, r13) - HS_CMP_XCHG(r17, r21) - HS_CMP_XCHG(r25, r29) - HS_CMP_XCHG(r3, r19) - HS_CMP_XCHG(r11, r27) - HS_CMP_XCHG(r3, r11) - HS_CMP_XCHG(r19, r27) - HS_CMP_XCHG(r7, r23) - HS_CMP_XCHG(r15, r31) - HS_CMP_XCHG(r7, r15) - HS_CMP_XCHG(r23, r31) - HS_CMP_XCHG(r3, r7) - HS_CMP_XCHG(r11, r15) - 
HS_CMP_XCHG(r19, r23) - HS_CMP_XCHG(r27, r31) - HS_CMP_XCHG(r1, r3) - HS_CMP_XCHG(r5, r7) - HS_CMP_XCHG(r9, r11) - HS_CMP_XCHG(r13, r15) - HS_CMP_XCHG(r17, r19) - HS_CMP_XCHG(r21, r23) - HS_CMP_XCHG(r25, r27) - HS_CMP_XCHG(r29, r31) - HS_CMP_XCHG(r2, r18) - HS_CMP_XCHG(r10, r26) - HS_CMP_XCHG(r2, r10) - HS_CMP_XCHG(r18, r26) - HS_CMP_XCHG(r6, r22) - HS_CMP_XCHG(r14, r30) - HS_CMP_XCHG(r6, r14) - HS_CMP_XCHG(r22, r30) - HS_CMP_XCHG(r2, r6) - HS_CMP_XCHG(r10, r14) - HS_CMP_XCHG(r18, r22) - HS_CMP_XCHG(r26, r30) - HS_CMP_XCHG(r4, r20) - HS_CMP_XCHG(r12, r28) - HS_CMP_XCHG(r4, r12) - HS_CMP_XCHG(r20, r28) - HS_CMP_XCHG(r8, r24) - HS_CMP_XCHG(r16, r32) - HS_CMP_XCHG(r8, r16) - HS_CMP_XCHG(r24, r32) - HS_CMP_XCHG(r4, r8) - HS_CMP_XCHG(r12, r16) - HS_CMP_XCHG(r20, r24) - HS_CMP_XCHG(r28, r32) - HS_CMP_XCHG(r2, r4) - HS_CMP_XCHG(r6, r8) - HS_CMP_XCHG(r10, r12) - HS_CMP_XCHG(r14, r16) - HS_CMP_XCHG(r18, r20) - HS_CMP_XCHG(r22, r24) - HS_CMP_XCHG(r26, r28) - HS_CMP_XCHG(r30, r32) - HS_CMP_XCHG(r1, r2) - HS_CMP_XCHG(r3, r4) - HS_CMP_XCHG(r5, r6) - HS_CMP_XCHG(r7, r8) - HS_CMP_XCHG(r9, r10) - HS_CMP_XCHG(r11, r12) - HS_CMP_XCHG(r13, r14) - HS_CMP_XCHG(r15, r16) - HS_CMP_XCHG(r17, r18) - HS_CMP_XCHG(r19, r20) - HS_CMP_XCHG(r21, r22) - HS_CMP_XCHG(r23, r24) - HS_CMP_XCHG(r25, r26) - HS_CMP_XCHG(r27, r28) - HS_CMP_XCHG(r29, r30) - HS_CMP_XCHG(r31, r32) - merge_ptr[31 * merge_stride] = r32; - merge_ptr[30 * merge_stride] = r31; - merge_ptr[29 * merge_stride] = r30; - merge_ptr[28 * merge_stride] = r29; - merge_ptr[27 * merge_stride] = r28; - merge_ptr[26 * merge_stride] = r27; - merge_ptr[25 * merge_stride] = r26; - merge_ptr[24 * merge_stride] = r25; - merge_ptr[23 * merge_stride] = r24; - merge_ptr[22 * merge_stride] = r23; - merge_ptr[21 * merge_stride] = r22; - merge_ptr[20 * merge_stride] = r21; - merge_ptr[19 * merge_stride] = r20; - merge_ptr[18 * merge_stride] = r19; - merge_ptr[17 * merge_stride] = r18; - merge_ptr[16 * merge_stride] = r17; - merge_ptr[15 * merge_stride] 
= r16; - merge_ptr[14 * merge_stride] = r15; - merge_ptr[13 * merge_stride] = r14; - merge_ptr[12 * merge_stride] = r13; - merge_ptr[11 * merge_stride] = r12; - merge_ptr[10 * merge_stride] = r11; - merge_ptr[9 * merge_stride] = r10; - merge_ptr[8 * merge_stride] = r9; - merge_ptr[7 * merge_stride] = r8; - merge_ptr[6 * merge_stride] = r7; - merge_ptr[5 * merge_stride] = r6; - merge_ptr[4 * merge_stride] = r5; - merge_ptr[3 * merge_stride] = r4; - merge_ptr[2 * merge_stride] = r3; - merge_ptr[1 * merge_stride] = r2; - merge_ptr[0 * merge_stride] = r1; -} - -// -// -// diff --git a/src/compute/hs/cl/gen9/hs_cl.h b/src/compute/hs/cl/gen9/hs_cl.h deleted file mode 100644 index 4926a14fb3..0000000000 --- a/src/compute/hs/cl/gen9/hs_cl.h +++ /dev/null @@ -1,122 +0,0 @@ -// -// Copyright 2016 Google Inc. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. -// - -#ifndef HS_CL_ONCE -#define HS_CL_ONCE - -#define HS_LANES_PER_WARP_LOG2 3 -#define HS_LANES_PER_WARP (1 << HS_LANES_PER_WARP_LOG2) -#define HS_BS_WARPS 16 -#define HS_BS_WARPS_LOG2_RU 4 -#define HS_BC_WARPS_LOG2_MAX 4 -#define HS_FM_BLOCKS_LOG2_MIN 1 -#define HS_HM_BLOCKS_LOG2_MIN 1 -#define HS_KEYS_PER_LANE 16 -#define HS_REG_LAST(c) c##16 -#define HS_KEY_WORDS 2 -#define HS_KEY_TYPE ulong -#define HS_EMPTY - -#define HS_SLAB_ROWS() \ - HS_SLAB_ROW( 1, 0 ) \ - HS_SLAB_ROW( 2, 1 ) \ - HS_SLAB_ROW( 3, 2 ) \ - HS_SLAB_ROW( 4, 3 ) \ - HS_SLAB_ROW( 5, 4 ) \ - HS_SLAB_ROW( 6, 5 ) \ - HS_SLAB_ROW( 7, 6 ) \ - HS_SLAB_ROW( 8, 7 ) \ - HS_SLAB_ROW( 9, 8 ) \ - HS_SLAB_ROW( 10, 9 ) \ - HS_SLAB_ROW( 11, 10 ) \ - HS_SLAB_ROW( 12, 11 ) \ - HS_SLAB_ROW( 13, 12 ) \ - HS_SLAB_ROW( 14, 13 ) \ - HS_SLAB_ROW( 15, 14 ) \ - HS_SLAB_ROW( 16, 15 ) \ - HS_EMPTY - -#define HS_TRANSPOSE_SLAB() \ - HS_TRANSPOSE_STAGE( 1 ) \ - HS_TRANSPOSE_STAGE( 2 ) \ - HS_TRANSPOSE_STAGE( 3 ) \ - HS_TRANSPOSE_BLEND( r, s, 1, 2, 1 ) \ - HS_TRANSPOSE_BLEND( r, s, 1, 4, 3 ) \ - HS_TRANSPOSE_BLEND( r, 
s, 1, 6, 5 ) \ - HS_TRANSPOSE_BLEND( r, s, 1, 8, 7 ) \ - HS_TRANSPOSE_BLEND( r, s, 1, 10, 9 ) \ - HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 ) \ - HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 ) \ - HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 ) \ - HS_TRANSPOSE_BLEND( s, t, 2, 3, 1 ) \ - HS_TRANSPOSE_BLEND( s, t, 2, 4, 2 ) \ - HS_TRANSPOSE_BLEND( s, t, 2, 7, 5 ) \ - HS_TRANSPOSE_BLEND( s, t, 2, 8, 6 ) \ - HS_TRANSPOSE_BLEND( s, t, 2, 11, 9 ) \ - HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 ) \ - HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 ) \ - HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 ) \ - HS_TRANSPOSE_BLEND( t, u, 3, 5, 1 ) \ - HS_TRANSPOSE_BLEND( t, u, 3, 6, 2 ) \ - HS_TRANSPOSE_BLEND( t, u, 3, 7, 3 ) \ - HS_TRANSPOSE_BLEND( t, u, 3, 8, 4 ) \ - HS_TRANSPOSE_BLEND( t, u, 3, 13, 9 ) \ - HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 ) \ - HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 ) \ - HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 ) \ - HS_TRANSPOSE_REMAP( u, 1, 1 ) \ - HS_TRANSPOSE_REMAP( u, 2, 3 ) \ - HS_TRANSPOSE_REMAP( u, 3, 5 ) \ - HS_TRANSPOSE_REMAP( u, 4, 7 ) \ - HS_TRANSPOSE_REMAP( u, 5, 9 ) \ - HS_TRANSPOSE_REMAP( u, 6, 11 ) \ - HS_TRANSPOSE_REMAP( u, 7, 13 ) \ - HS_TRANSPOSE_REMAP( u, 8, 15 ) \ - HS_TRANSPOSE_REMAP( u, 9, 2 ) \ - HS_TRANSPOSE_REMAP( u, 10, 4 ) \ - HS_TRANSPOSE_REMAP( u, 11, 6 ) \ - HS_TRANSPOSE_REMAP( u, 12, 8 ) \ - HS_TRANSPOSE_REMAP( u, 13, 10 ) \ - HS_TRANSPOSE_REMAP( u, 14, 12 ) \ - HS_TRANSPOSE_REMAP( u, 15, 14 ) \ - HS_TRANSPOSE_REMAP( u, 16, 16 ) \ - HS_EMPTY - -#define HS_FM_BLOCKS_LOG2_1 0 -#define HS_FM_BLOCKS_LOG2_2 1 -#define HS_FM_BLOCKS_LOG2_3 2 -#define HS_FM_BLOCKS_LOG2_4 3 -#define HS_FM_BLOCKS_LOG2_5 4 -#define HS_FM_BLOCKS_LOG2_6 5 -#define HS_HM_BLOCKS_LOG2_5 0 -#define HS_FM_BLOCKS_LOG2_7 6 -#define HS_HM_BLOCKS_LOG2_6 1 -#define HS_FM_BLOCKS_LOG2_8 7 -#define HS_HM_BLOCKS_LOG2_7 2 -#define HS_FM_BLOCKS_LOG2_9 8 -#define HS_HM_BLOCKS_LOG2_8 3 -#define HS_FM_BLOCKS_LOG2_10 9 -#define HS_HM_BLOCKS_LOG2_9 4 -#define HS_FM_BLOCKS_LOG2_11 10 -#define HS_HM_BLOCKS_LOG2_10 5 -#define 
HS_FM_BLOCKS_LOG2_12 11 -#define HS_HM_BLOCKS_LOG2_11 6 -#define HS_FM_BLOCKS_LOG2_13 12 -#define HS_HM_BLOCKS_LOG2_12 7 -#define HS_FM_BLOCKS_LOG2_14 13 -#define HS_HM_BLOCKS_LOG2_13 8 -#define HS_FM_BLOCKS_LOG2_15 14 -#define HS_HM_BLOCKS_LOG2_14 9 -#define HS_FM_BLOCKS_LOG2_16 15 -#define HS_HM_BLOCKS_LOG2_15 10 - -#endif - -// -// -// - diff --git a/src/compute/hs/cl/gen9/hs_cl_macros.h b/src/compute/hs/cl/gen9/hs_cl_macros.h deleted file mode 100644 index d314fe88ae..0000000000 --- a/src/compute/hs/cl/gen9/hs_cl_macros.h +++ /dev/null @@ -1,199 +0,0 @@ -// -// Copyright 2016 Google Inc. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. -// - -#ifndef HS_CL_MACROS_ONCE -#define HS_CL_MACROS_ONCE - -// -// -// - -#include "hs_cl.h" - -// -// Inter-lane compare exchange -// - -// default -#define HS_CMP_XCHG_V0(a,b) \ - { \ - HS_KEY_TYPE const t = min(a,b); \ - b = max(a,b); \ - a = t; \ - } - -// super slow -#define HS_CMP_XCHG_V1(a,b) \ - { \ - HS_KEY_TYPE const tmp = a; \ - a = (a < b) ? a : b; \ - b ^= a ^ tmp; \ - } - -// best -#define HS_CMP_XCHG_V2(a,b) \ - if (a >= b) { \ - HS_KEY_TYPE const t = a; \ - a = b; \ - b = t; \ - } - -// good -#define HS_CMP_XCHG_V3(a,b) \ - { \ - int const ge = a >= b; \ - HS_KEY_TYPE const t = a; \ - a = ge ? b : a; \ - b = ge ? 
t : b; \ - } - -// -// -// - -#if (HS_KEY_WORDS == 1) -#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b) -#elif (HS_KEY_WORDS == 2) -#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b) -#endif - -// -// Conditional inter-subgroup flip/half compare exchange -// - -#define HS_CMP_FLIP(i,a,b) \ - { \ - HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,flip_lane_idx); \ - HS_KEY_TYPE const tb = intel_sub_group_shuffle(b,flip_lane_idx); \ - a = HS_COND_MIN_MAX(t_lt,a,tb); \ - b = HS_COND_MIN_MAX(t_lt,b,ta); \ - } - -#define HS_CMP_HALF(i,a) \ - { \ - HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,half_lane_idx); \ - a = HS_COND_MIN_MAX(t_lt,a,ta); \ - } - -// -// The device's comparison operator might return what we actually -// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}. -// - -#define HS_CMP_IS_ZERO_ONE - -#ifdef HS_CMP_IS_ZERO_ONE -// OpenCL requires a {true: +1, false: 0} scalar result -// (a < b) -> { +1, 0 } -> NEGATE -> { 0, 0xFFFFFFFF } -#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b)) -#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a) -#else -// However, OpenCL requires { -1, 0 } for vectors -// (a < b) -> { 0xFFFFFFFF, 0 } -#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64 -#define HS_CMP_TO_MASK(a) (a) -#endif - -// -// The flip/half comparisons rely on a "conditional min/max": -// -// - if the flag is false, return min(a,b) -// - otherwise, return max(a,b) -// -// What's a little surprising is that sequence (1) is faster than (2) -// for 32-bit keys. -// -// I suspect either a code generation problem or that the sequence -// maps well to the GEN instruction set. -// -// We mostly care about 64-bit keys and unsurprisingly sequence (2) is -// fastest for this wider type. -// - -// this is what you would normally use -#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) ^ lt) ? b : a - -// this seems to be faster for 32-bit keys -#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? 
b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b)) - -// -// -// - -#if (HS_KEY_WORDS == 1) -#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b) -#elif (HS_KEY_WORDS == 2) -#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b) -#endif - -// -// This snarl of macros is for transposing a "slab" of sorted elements -// into linear order. -// -// This can occur as the last step in hs_sort() or via a custom kernel -// that inspects the slab and then transposes and stores it to memory. -// -// The slab format can be inspected more efficiently than a linear -// arrangement. -// -// The prime example is detecting when adjacent keys (in sort order) -// have differing high order bits ("key changes"). The index of each -// change is recorded to an auxilary array. -// -// A post-processing step like this needs to be able to navigate the -// slab and eventually transpose and store the slab in linear order. -// - -#define HS_TRANSPOSE_REG(prefix,row) prefix##row -#define HS_TRANSPOSE_DECL(prefix,row) HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row) - -#define HS_TRANSPOSE_DELTA(level) (HS_LANES_PER_WARP + (1 << (level-1))) -#define HS_TRANSPOSE_IF(level) ((get_sub_group_local_id() >> (level - 1)) & 1) - -#define HS_TRANSPOSE_LL(level) HS_TRANSPOSE_IF(level) ? 0 : HS_TRANSPOSE_DELTA(level) -#define HS_TRANSPOSE_UR(level) HS_TRANSPOSE_IF(level) ? 
HS_TRANSPOSE_DELTA(level) : 0 - -#define HS_TRANSPOSE_DELTA_LL(level) delta_ll_##level -#define HS_TRANSPOSE_DELTA_UR(level) delta_ur_##level - -#define HS_TRANSPOSE_STAGE(level) \ - uint const HS_TRANSPOSE_DELTA_LL(level) = HS_TRANSPOSE_LL(level); \ - uint const HS_TRANSPOSE_DELTA_UR(level) = HS_TRANSPOSE_UR(level); - -#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \ - HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \ - intel_sub_group_shuffle_down(HS_TRANSPOSE_REG(prefix_prev,row_ll), \ - HS_TRANSPOSE_REG(prefix_prev,row_ur), \ - HS_TRANSPOSE_DELTA_LL(level)); \ - HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \ - intel_sub_group_shuffle_up(HS_TRANSPOSE_REG(prefix_prev,row_ll), \ - HS_TRANSPOSE_REG(prefix_prev,row_ur), \ - HS_TRANSPOSE_DELTA_UR(level)); \ - -// #define HS_TRANSPOSE_LOAD(row) \ -// HS_TRANSPOSE_DECL(0,row) = (vout + gmem_idx)[(row-1) << HS_LANES_PER_WARP_LOG2]; - -#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \ - (vout + gmem_idx)[(row_to-1) << HS_LANES_PER_WARP_LOG2] = \ - HS_TRANSPOSE_REG(prefix,row_from); - -// -// undefine these if you want to override -// - -#define HS_TRANSPOSE_PREAMBLE() -#define HS_TRANSPOSE_BODY() - -// -// -// - -#endif - -// -// -// diff --git a/src/compute/hs/cl/gen9/make_all.bat b/src/compute/hs/cl/gen9/make_all.bat deleted file mode 100644 index fac82b41a0..0000000000 --- a/src/compute/hs/cl/gen9/make_all.bat +++ /dev/null @@ -1,16 +0,0 @@ -@ECHO OFF - -SET HS_GEN=..\..\..\..\spinel\bin\x64\Debug\hs_gen - -REM --- 32-bit keys --- - -REM CMD /C %HS_GEN% -a 2 -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z -REM CMD /C %HS_GEN% -a 2 -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z -REM CMD /C %HS_GEN% -a 2 -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z - -REM --- 64-bit keys - -CMD /C %HS_GEN% -a 2 -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z -REM CMD /C %HS_GEN% -a 2 -t 2 -w 8 -r 
16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z - -CMD /C make_inl_cl.bat hs_cl.cl diff --git a/src/compute/hs/cl/gen9/make_inl_cl.bat b/src/compute/hs/cl/gen9/make_inl_cl.bat deleted file mode 100644 index 76cb6e569e..0000000000 --- a/src/compute/hs/cl/gen9/make_inl_cl.bat +++ /dev/null @@ -1,78 +0,0 @@ - -@ECHO OFF - -:: -:: -:: - -SET OPENCL_STD=-cl-std=CL1.2 -SET OPENCL_PRE=__OPENCL_C_VERSION__=120 - -:: SET OPENCL_STD=-cl-std=CL2.0 -:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200 - -:: -:: -:: - -SET IOC=ioc64 - -:: -:: -:: - -SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info - -SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g - -SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% - -:: -:: -:: - -SET PRE_DIR=%~p1 - -CD %PRE_DIR% - -SET PRE_CL=%~n1 -SET PRE_CL=%PRE_CL%.pre.cl - -SET PRE_SRC_INL=%~n1 -SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl - -SET PRE_BIN_IR=%~n1 -SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir - -SET PRE_BIN_INL=%~n1 -SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl - -:: -:: *.pre.cl -:: *.pre.src.inl -:: - -CMD /C clang-format -style=Mozilla -i %1 -CMD /C cl -I . 
-I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%" -CMD /C clang-format -style=Mozilla -i %PRE_CL% -CMD /C dos2unix -q %PRE_CL% -CMD /C xxd -i %PRE_CL% %PRE_SRC_INL% - -echo %PRE_CL% -echo %PRE_SRC_INL% - -:: -:: *.pre.cl -:: *.pre.src.inl -:: - -CMD /C touch %PRE_BIN_IR% -ECHO ON -@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR% -@ECHO OFF -CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL% - -echo %PRE_BIN_IR% -echo %PRE_BIN_INL% - - diff --git a/src/compute/hs/cl/hs_cl_launcher.c b/src/compute/hs/cl/hs_cl_launcher.c index f8a87f1dde..828f59ef63 100644 --- a/src/compute/hs/cl/hs_cl_launcher.c +++ b/src/compute/hs/cl/hs_cl_launcher.c @@ -11,126 +11,140 @@ // #include +#include // // // -#include "hs_cl_launcher.h" -#include "assert_cl.h" -#include "macros.h" -#include "util.h" +#include "common/cl/assert_cl.h" +#include "common/macros.h" +#include "common/util.h" // // // -typedef uint32_t uint; -typedef uint64_t ulong; +#include "hs_cl_launcher.h" // // // -#include "hs_cl.h" +struct hs_cl +{ + struct hs_cl_target_config config; + + uint32_t key_val_size; + uint32_t slab_keys; + uint32_t bs_slabs_log2_ru; + uint32_t bc_slabs_log2_max; + + struct { + uint32_t count; + cl_kernel * transpose; + cl_kernel * bs; + cl_kernel * bc; + cl_kernel * fm[3]; + cl_kernel * hm[3]; + cl_kernel all[]; + } kernels; +}; // // // -#if 0 // #ifndef NDEBUG -#define HS_KERNEL_SOURCE -#else -#define HS_KERNEL_BINARY +struct hs_state +{ +#ifndef NDEBUG + cl_ulong t_total; // 0 #endif -// -// #define HS_KERNEL_SPIRV -// + cl_command_queue cq; -// -// -// + // key buffers + cl_mem vin; + cl_mem vout; // can be vin -#ifdef NDEBUG + // enforces ordering on out-of-order queue + cl_event wait_list[3]; // worst case + uint32_t wait_list_size; -#define HS_LAUNCH_TRACE(k,g,l) + // bx_ru is number of rounded up warps in vin + uint32_t bx_ru; +}; -#else +// +// +// -#include +static +void +hs_state_wait_list_release(struct hs_state * const state) 
+{ + for (uint32_t ii=0; ii<state->wait_list_size; ii++) + cl(ReleaseEvent(state->wait_list[ii])); -#define HS_KERNEL_NAME_MAX 20 + state->wait_list_size = 0; +} static void -hs_launch_trace(cl_kernel kernel, - size_t const global_work_size, - size_t const local_work_size) +hs_state_wait_list_update(struct hs_state * const state, + uint32_t const wait_list_size, + cl_event const * const wait_list) { - if (kernel == NULL) - return; + uint32_t const new_size = state->wait_list_size + wait_list_size; - char name[HS_KERNEL_NAME_MAX]; - - cl(GetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,HS_KERNEL_NAME_MAX,name,NULL)); + for (uint32_t ii=state->wait_list_size; ii<new_size; ii++) + state->wait_list[ii] = wait_list[ii]; - fprintf(stderr,"%-19s ( %6zu, %4zu )\n",name,global_work_size,local_work_size); + state->wait_list_size = new_size; } -#define HS_LAUNCH_TRACE(k,g,l) hs_launch_trace(k,g,l) - -#endif - // // // #ifdef NDEBUG -#define HS_EVENT_NEXT() NULL -#define HS_EVENT_PROFILE(cq) +#define HS_STATE_WAIT_LIST_PROFILE(state) +#define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list) #else -#define HS_EVENTS_MAX 128 +#include -static cl_event events[HS_EVENTS_MAX]; -static uint32_t events_count; +#define HS_STATE_WAIT_LIST_PROFILE(state) \ + hs_state_wait_list_profile(state, \ + state->wait_list_size, \ + state->wait_list) -static -cl_event * -hs_event_next() -{ - if (events_count + 1 >= HS_EVENTS_MAX) // no events can be recorded? 
- { - return NULL; - } - else // return next event slot - { - return events + events_count++; - } -} +#define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list) \ + hs_state_wait_list_profile(state, \ + wait_list_size, \ + wait_list) static void -hs_event_profile(cl_command_queue cq) +hs_state_wait_list_profile(struct hs_state * const state, + uint32_t const wait_list_size, + cl_event const * const wait_list) { - cl(Finish(cq)); + cl(Finish(state->cq)); cl_command_queue_properties props; - cl(GetCommandQueueInfo(cq, + cl(GetCommandQueueInfo(state->cq, CL_QUEUE_PROPERTIES, sizeof(props), &props, NULL)); - cl_ulong t_min=UINT64_MAX, t_max=0; - - for (uint32_t ee=0; eet_total += t_end - t_start; } // @@ -164,316 +177,52 @@ hs_event_profile(cl_command_queue cq) cl_get_event_info(event,&status,&type); - fprintf(stdout,"%-3u, %-13s, %-28s, %20llu, %20llu, %20llu, %20llu\n", - ee, + fprintf(stdout,"%-13s, %-28s, %20llu, %20llu, %20llu, %20llu\n", cl_get_event_command_status_string(status), cl_get_event_command_type_string(type), - t_start,t_end,t_end-t_start,t_max-t_min); - - // release - cl(ReleaseEvent(event)); + t_start,t_end,t_end-t_start,state->t_total); } } -#define HS_EVENT_NEXT() hs_event_next() -#define HS_EVENT_PROFILE(cq) hs_event_profile(cq); - -#endif - -// -// -// - -struct hs_state -{ - cl_mem vin; - cl_mem vout; - - // bx.ru is number of rounded up warps in vin - struct { - uint32_t ru; - } bx; - - // these values change on each iteration - union { - struct { - uint32_t full; - uint32_t frac; - } bs; // warps - struct { - uint32_t full; - uint32_t na; - } bc; // warps - struct { - uint32_t full; - uint32_t frac; - } fm; // rows - }; -}; - -// -// -// - -#define HS_THREADS_PER_BLOCK (HS_BS_WARPS * HS_LANES_PER_WARP) -#define HS_KEYS_PER_WARP (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) - -#define HS_BS_KEYS_PER_BLOCK (HS_KEYS_PER_WARP * HS_BS_WARPS) -#define HS_BS_BLOCK_SIZE (HS_BS_KEYS_PER_BLOCK * sizeof(HS_KEY_TYPE)) - -#define HS_BC_KEYS_PER_BLOCK 
(HS_KEYS_PER_WARP << HS_BC_WARPS_LOG2_MAX) -#define HS_BC_BLOCK_SIZE (HS_BC_KEYS_PER_BLOCK * sizeof(HS_KEY_TYPE)) - -// -// -// - -#if defined( HS_KERNEL_SOURCE ) - -#include "hs_cl.pre.src.inl" - -#elif defined( HS_KERNEL_BINARY ) - -#include "hs_cl.pre.bin.inl" - -#elif defined( HS_KERNEL_SPIRV ) - -#include "hs_cl.pre.spv.inl" - #endif // // // -struct hs_transpose_kernel -{ - cl_kernel kernel; - char const * name; -}; - -#define HS_TRANSPOSE_KERNEL_DECLARE(n) { .name = #n } - -static struct hs_transpose_kernel transpose_kernels[] = - { - HS_TRANSPOSE_KERNEL_DECLARE(hs_kernel_transpose) - }; - -// -// -// - -struct hs_bs_kernel -{ - cl_kernel kernel; - char const * name; -}; - -#define HS_BS_KERNEL_DECLARE(n) { .name = #n } - -static struct hs_bs_kernel bs_kernels[] = - { -#if 0 <= HS_BS_WARPS_LOG2_RU - HS_BS_KERNEL_DECLARE(hs_kernel_bs_0), -#endif -#if 1 <= HS_BS_WARPS_LOG2_RU - HS_BS_KERNEL_DECLARE(hs_kernel_bs_1), -#endif -#if 2 <= HS_BS_WARPS_LOG2_RU - HS_BS_KERNEL_DECLARE(hs_kernel_bs_2), -#endif -#if 3 <= HS_BS_WARPS_LOG2_RU - HS_BS_KERNEL_DECLARE(hs_kernel_bs_3), -#endif -#if 4 <= HS_BS_WARPS_LOG2_RU - HS_BS_KERNEL_DECLARE(hs_kernel_bs_4), -#endif -#if 5 <= HS_BS_WARPS_LOG2_RU - HS_BS_KERNEL_DECLARE(hs_kernel_bs_5), -#endif -#if 6 <= HS_BS_WARPS_LOG2_RU - HS_BS_KERNEL_DECLARE(hs_kernel_bs_6), -#endif -#if 7 <= HS_BS_WARPS_LOG2_RU - HS_BS_KERNEL_DECLARE(hs_kernel_bs_7), -#endif - }; - -// -// -// +#ifdef NDEBUG -struct hs_bc_kernel -{ - cl_kernel kernel; - char const * name; -}; +#define HS_LAUNCH_TRACE(k,g,l) -#define HS_BC_KERNEL_DECLARE(n) { .name = #n } +#else -static struct hs_bc_kernel bc_kernels[] = - { -#if (0 >= HS_BC_WARPS_LOG2_MIN) && (0 <= HS_BC_WARPS_LOG2_MAX) - HS_BC_KERNEL_DECLARE(hs_kernel_bc_0), -#endif -#if (1 >= HS_BC_WARPS_LOG2_MIN) && (1 <= HS_BC_WARPS_LOG2_MAX) - HS_BC_KERNEL_DECLARE(hs_kernel_bc_1), -#endif -#if (2 >= HS_BC_WARPS_LOG2_MIN) && (2 <= HS_BC_WARPS_LOG2_MAX) - HS_BC_KERNEL_DECLARE(hs_kernel_bc_2), -#endif -#if (3 >= 
HS_BC_WARPS_LOG2_MIN) && (3 <= HS_BC_WARPS_LOG2_MAX) - HS_BC_KERNEL_DECLARE(hs_kernel_bc_3), -#endif -#if (4 >= HS_BC_WARPS_LOG2_MIN) && (4 <= HS_BC_WARPS_LOG2_MAX) - HS_BC_KERNEL_DECLARE(hs_kernel_bc_4), -#endif -#if (5 >= HS_BC_WARPS_LOG2_MIN) && (5 <= HS_BC_WARPS_LOG2_MAX) - HS_BC_KERNEL_DECLARE(hs_kernel_bc_5), -#endif -#if (6 >= HS_BC_WARPS_LOG2_MIN) && (6 <= HS_BC_WARPS_LOG2_MAX) - HS_BC_KERNEL_DECLARE(hs_kernel_bc_6), -#endif -#if (7 >= HS_BC_WARPS_LOG2_MIN) && (7 <= HS_BC_WARPS_LOG2_MAX) - HS_BC_KERNEL_DECLARE(hs_kernel_bc_7), -#endif - }; +#include -// -// -// +#define HS_KERNEL_NAME_MAX 20 -struct hs_fm_kernel +static +void +hs_launch_trace(cl_kernel kernel, + uint32_t const dim, + size_t const * const global_work_size) { - cl_kernel kernel; - char const * name; - uint32_t const log2; -}; - -#define HS_FM_KERNEL_DECLARE(n,l) { .name = #n, .log2 = l } + if (kernel == NULL) + return; -static struct hs_fm_kernel fm_kernels[] = - { -#ifdef HS_FM_BLOCKS_LOG2_0 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_0,HS_FM_BLOCKS_LOG2_0), -#endif -#ifdef HS_FM_BLOCKS_LOG2_1 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_1,HS_FM_BLOCKS_LOG2_1), -#endif -#ifdef HS_FM_BLOCKS_LOG2_2 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_2,HS_FM_BLOCKS_LOG2_2), -#endif -#ifdef HS_FM_BLOCKS_LOG2_3 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_3,HS_FM_BLOCKS_LOG2_3), -#endif -#ifdef HS_FM_BLOCKS_LOG2_4 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_4,HS_FM_BLOCKS_LOG2_4), -#endif -#ifdef HS_FM_BLOCKS_LOG2_5 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_5,HS_FM_BLOCKS_LOG2_5), -#endif -#ifdef HS_FM_BLOCKS_LOG2_6 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_6,HS_FM_BLOCKS_LOG2_6), -#endif -#ifdef HS_FM_BLOCKS_LOG2_7 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_7,HS_FM_BLOCKS_LOG2_7), -#endif -#ifdef HS_FM_BLOCKS_LOG2_8 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_8,HS_FM_BLOCKS_LOG2_8), -#endif -#ifdef HS_FM_BLOCKS_LOG2_9 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_9,HS_FM_BLOCKS_LOG2_9), -#endif -#ifdef HS_FM_BLOCKS_LOG2_10 - 
HS_FM_KERNEL_DECLARE(hs_kernel_fm_10,HS_FM_BLOCKS_LOG2_10), -#endif -#ifdef HS_FM_BLOCKS_LOG2_11 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_11,HS_FM_BLOCKS_LOG2_11), -#endif -#ifdef HS_FM_BLOCKS_LOG2_12 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_12,HS_FM_BLOCKS_LOG2_12), -#endif -#ifdef HS_FM_BLOCKS_LOG2_13 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_13,HS_FM_BLOCKS_LOG2_13), -#endif -#ifdef HS_FM_BLOCKS_LOG2_14 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_14,HS_FM_BLOCKS_LOG2_14), -#endif -#ifdef HS_FM_BLOCKS_LOG2_15 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_15,HS_FM_BLOCKS_LOG2_15), -#endif -#ifdef HS_FM_BLOCKS_LOG2_16 - HS_FM_KERNEL_DECLARE(hs_kernel_fm_16,HS_FM_BLOCKS_LOG2_16), -#endif - }; + char name[HS_KERNEL_NAME_MAX]; -// -// -// + cl(GetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,HS_KERNEL_NAME_MAX,name,NULL)); -struct hs_hm_kernel -{ - cl_kernel kernel; - char const * name; - uint32_t const log2; -}; + fprintf(stderr,"%-19s ( %6zu, %6zu, %6zu )\n", + name, + global_work_size[0], + dim < 2 ? 0 : global_work_size[1], + dim < 3 ? 
0 : global_work_size[2]); +} -#define HS_HM_KERNEL_DECLARE(n,l) { .name = #n, .log2 = l } +#define HS_LAUNCH_TRACE(k,d,g) hs_launch_trace(k,d,g) -static struct hs_hm_kernel hm_kernels[] = - { -#ifdef HS_HM_BLOCKS_LOG2_0 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_0,HS_HM_BLOCKS_LOG2_0), -#endif -#ifdef HS_HM_BLOCKS_LOG2_1 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_1,HS_HM_BLOCKS_LOG2_1), -#endif -#ifdef HS_HM_BLOCKS_LOG2_2 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_2,HS_HM_BLOCKS_LOG2_2), -#endif -#ifdef HS_HM_BLOCKS_LOG2_3 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_3,HS_HM_BLOCKS_LOG2_3), -#endif -#ifdef HS_HM_BLOCKS_LOG2_4 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_4,HS_HM_BLOCKS_LOG2_4), -#endif -#ifdef HS_HM_BLOCKS_LOG2_5 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_5,HS_HM_BLOCKS_LOG2_5), -#endif -#ifdef HS_HM_BLOCKS_LOG2_6 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_6,HS_HM_BLOCKS_LOG2_6), -#endif -#ifdef HS_HM_BLOCKS_LOG2_7 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_7,HS_HM_BLOCKS_LOG2_7), -#endif -#ifdef HS_HM_BLOCKS_LOG2_8 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_8,HS_HM_BLOCKS_LOG2_8), -#endif -#ifdef HS_HM_BLOCKS_LOG2_9 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_9,HS_HM_BLOCKS_LOG2_9), -#endif -#ifdef HS_HM_BLOCKS_LOG2_10 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_10,HS_HM_BLOCKS_LOG2_10), -#endif -#ifdef HS_HM_BLOCKS_LOG2_11 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_11,HS_HM_BLOCKS_LOG2_11), #endif -#ifdef HS_HM_BLOCKS_LOG2_12 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_12,HS_HM_BLOCKS_LOG2_12), -#endif -#ifdef HS_HM_BLOCKS_LOG2_13 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_13,HS_HM_BLOCKS_LOG2_13), -#endif -#ifdef HS_HM_BLOCKS_LOG2_14 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_14,HS_HM_BLOCKS_LOG2_14), -#endif -#ifdef HS_HM_BLOCKS_LOG2_15 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_15,HS_HM_BLOCKS_LOG2_15), -#endif -#ifdef HS_HM_BLOCKS_LOG2_16 - HS_HM_KERNEL_DECLARE(hs_kernel_hm_16,HS_HM_BLOCKS_LOG2_16), -#endif - }; // // @@ -481,36 +230,38 @@ static struct hs_hm_kernel hm_kernels[] = static void -hs_barrier(cl_command_queue cq) +hs_transpose_launcher(struct hs_cl 
const * const hs, + struct hs_state * const state) { - cl(EnqueueBarrierWithWaitList(cq,0,NULL,NULL)); -} + size_t const size[1] = { state->bx_ru << hs->config.slab.threads_log2 }; + cl_kernel kernel = hs->kernels.transpose[0]; -// -// -// - -static -void -hs_launch_transpose(struct hs_state const * const state, - cl_command_queue cq, - cl_kernel kernel, - size_t const global_work_size, - size_t const local_work_size) -{ - HS_LAUNCH_TRACE(kernel,global_work_size,local_work_size); + HS_LAUNCH_TRACE(kernel,1,size); + // + // The transpose kernel operates on a single slab. For now, let's + // rely on the driver to choose a workgroup size. + // + // size_t local_work_size[1] = { HS_SLAB_THREADS }; + // cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout)); - cl(EnqueueNDRangeKernel(cq, + cl_event wait_list_out[1]; + + cl(EnqueueNDRangeKernel(state->cq, kernel, 1, NULL, - &global_work_size, - &local_work_size, - 0, + size, NULL, - HS_EVENT_NEXT())); + state->wait_list_size, + state->wait_list, + wait_list_out)); + + hs_state_wait_list_release(state); + hs_state_wait_list_update(state,1,wait_list_out); + + HS_STATE_WAIT_LIST_PROFILE(state); } // @@ -519,49 +270,63 @@ hs_launch_transpose(struct hs_state const * const state, static void -hs_launch_bs(struct hs_state const * const state, - cl_command_queue cq, - cl_kernel kernel_full, - cl_kernel kernel_frac, - size_t const global_work_size_full, - size_t const local_work_size_full, - size_t const local_work_size_frac) - +hs_launch_bs(struct hs_cl const * const hs, + struct hs_state * const state, + uint32_t const full, + uint32_t const frac, + uint32_t const wait_list_size, + cl_event * wait_list) { - HS_LAUNCH_TRACE(kernel_full,global_work_size_full,local_work_size_full); - HS_LAUNCH_TRACE(kernel_frac,local_work_size_frac,local_work_size_frac); + uint32_t wait_list_out_size = 0; + cl_event wait_list_out[2]; - if (kernel_full != NULL) + if (full > 0) { + size_t const size_full[1] = { full << 
hs->config.slab.threads_log2 }; + cl_kernel kernel_full = hs->kernels.bs[hs->bs_slabs_log2_ru]; + + HS_LAUNCH_TRACE(kernel_full,1,size_full); + cl(SetKernelArg(kernel_full,0,sizeof(state->vin), &state->vin)); cl(SetKernelArg(kernel_full,1,sizeof(state->vout),&state->vout)); - cl(EnqueueNDRangeKernel(cq, + cl(EnqueueNDRangeKernel(state->cq, kernel_full, 1, NULL, - &global_work_size_full, - &local_work_size_full, - 0, + size_full, NULL, - HS_EVENT_NEXT())); + wait_list_size, + wait_list, + wait_list_out+wait_list_out_size++)); } - if (kernel_frac != NULL) + if (frac > 0) { + size_t const offset_frac[1] = { full << hs->config.slab.threads_log2 }; + size_t const size_frac [1] = { frac << hs->config.slab.threads_log2 }; + cl_kernel kernel_frac = hs->kernels.bs[msb_idx_u32(frac)]; + + HS_LAUNCH_TRACE(kernel_frac,1,size_frac); + cl(SetKernelArg(kernel_frac,0,sizeof(state->vin), &state->vin)); cl(SetKernelArg(kernel_frac,1,sizeof(state->vout),&state->vout)); - cl(EnqueueNDRangeKernel(cq, + cl(EnqueueNDRangeKernel(state->cq, kernel_frac, 1, - &global_work_size_full, - &local_work_size_frac, - &local_work_size_frac, - 0, + offset_frac, + size_frac, NULL, - HS_EVENT_NEXT())); + wait_list_size, + wait_list, + wait_list_out+wait_list_out_size++)); } + + hs_state_wait_list_release(state); + hs_state_wait_list_update(state,wait_list_out_size,wait_list_out); + + HS_STATE_WAIT_LIST_PROFILE(state); } // @@ -570,25 +335,34 @@ hs_launch_bs(struct hs_state const * const state, static void -hs_launch_bc(struct hs_state const * const state, - cl_command_queue cq, - cl_kernel kernel, - size_t const global_work_size, - size_t const local_work_size) +hs_launch_bc(struct hs_cl const * const hs, + struct hs_state * const state, + uint32_t const full, + uint32_t const clean_slabs_log2) { - HS_LAUNCH_TRACE(kernel,global_work_size,local_work_size); + size_t const size[1] = { full << hs->config.slab.threads_log2 }; + cl_kernel kernel = hs->kernels.bc[clean_slabs_log2]; + + 
HS_LAUNCH_TRACE(kernel,1,size); cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout)); - cl(EnqueueNDRangeKernel(cq, + cl_event wait_list_out[1]; + + cl(EnqueueNDRangeKernel(state->cq, kernel, 1, NULL, - &global_work_size, - &local_work_size, - 0, + size, NULL, - HS_EVENT_NEXT())); + state->wait_list_size, + state->wait_list, + wait_list_out)); + + hs_state_wait_list_release(state); + hs_state_wait_list_update(state,1,wait_list_out); + + HS_STATE_WAIT_LIST_PROFILE(state); } // @@ -597,26 +371,64 @@ hs_launch_bc(struct hs_state const * const state, static void -hs_launch_fm(struct hs_state const * const state, - cl_command_queue cq, - cl_kernel kernel, - size_t const global_work_size) +hs_launch_fm(struct hs_cl const * const hs, + struct hs_state * const state, + uint32_t const scale_log2, + uint32_t const fm_full, + uint32_t const fm_frac, + uint32_t const span_threads) { - HS_LAUNCH_TRACE(kernel,global_work_size,0); + // + // Note that some platforms might need to use .z on large grids + // + uint32_t wait_list_out_size = 0; + cl_event wait_list_out[2]; - cl(SetKernelArg(kernel,0,sizeof(state->vout), &state->vout)); - cl(SetKernelArg(kernel,1,sizeof(state->fm.full),&state->fm.full)); - cl(SetKernelArg(kernel,2,sizeof(state->fm.frac),&state->fm.frac)); + if (fm_full > 0) + { + size_t const size_full[3] = { span_threads, fm_full, 1 }; + cl_kernel kernel_full = hs->kernels.fm[scale_log2][hs->bs_slabs_log2_ru]; - cl(EnqueueNDRangeKernel(cq, - kernel, - 1, - NULL, - &global_work_size, - NULL, - 0, - NULL, - HS_EVENT_NEXT())); + HS_LAUNCH_TRACE(kernel_full,3,size_full); + + cl(SetKernelArg(kernel_full,0,sizeof(state->vout),&state->vout)); + + cl(EnqueueNDRangeKernel(state->cq, + kernel_full, + 3, + NULL, + size_full, + NULL, + state->wait_list_size, + state->wait_list, + wait_list_out+wait_list_out_size++)); + } + + if (fm_frac > 0) + { + size_t const offset_frac[3] = { 0, fm_full, 0 }; + size_t const size_frac [3] = { span_threads, 1, 1 }; + cl_kernel kernel_frac 
= hs->kernels.fm[scale_log2][msb_idx_u32(fm_frac)]; + + HS_LAUNCH_TRACE(kernel_frac,3,size_frac); + + cl(SetKernelArg(kernel_frac,0,sizeof(state->vout),&state->vout)); + + cl(EnqueueNDRangeKernel(state->cq, + kernel_frac, + 3, + offset_frac, + size_frac, + NULL, + state->wait_list_size, + state->wait_list, + wait_list_out+wait_list_out_size++)); + } + + hs_state_wait_list_release(state); + hs_state_wait_list_update(state,wait_list_out_size,wait_list_out); + + HS_STATE_WAIT_LIST_PROFILE(state); } // @@ -625,24 +437,38 @@ hs_launch_fm(struct hs_state const * const state, static void -hs_launch_hm(struct hs_state const * const state, - cl_command_queue cq, - cl_kernel kernel, - size_t const global_work_size) +hs_launch_hm(struct hs_cl const * const hs, + struct hs_state * const state, + uint32_t const scale_log2, + uint32_t const spans, + uint32_t const span_threads) { - HS_LAUNCH_TRACE(kernel,global_work_size,0); + // + // Note that some platforms might need to use .z on large grids + // + size_t const size[3] = { span_threads, spans, 1 }; + cl_kernel kernel = hs->kernels.hm[scale_log2][0]; + + HS_LAUNCH_TRACE(kernel,3,size); cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout)); - cl(EnqueueNDRangeKernel(cq, + cl_event wait_list_out[1]; + + cl(EnqueueNDRangeKernel(state->cq, kernel, - 1, + 3, NULL, - &global_work_size, + size, NULL, - 0, - NULL, - HS_EVENT_NEXT())); + state->wait_list_size, + state->wait_list, + wait_list_out)); + + hs_state_wait_list_release(state); + hs_state_wait_list_update(state,1,wait_list_out); + + HS_STATE_WAIT_LIST_PROFILE(state); } // @@ -651,47 +477,54 @@ hs_launch_hm(struct hs_state const * const state, static void -hs_transpose_launcher(struct hs_state * const state, - cl_command_queue cq) +hs_keyset_pre_sort(struct hs_cl const * const hs, + struct hs_state * const state, + uint32_t const count, + uint32_t const count_hi, + uint32_t const wait_list_size, + cl_event * wait_list, + cl_event * event) { - // transpose each slab - 
size_t const global_work_size = state->bx.ru * HS_LANES_PER_WARP; - size_t const local_work_size = HS_LANES_PER_WARP; // FIXME -- might not always want to specify this - - hs_launch_transpose(state, - cq, - transpose_kernels[0].kernel, - global_work_size, - local_work_size); -} + uint32_t const vin_span = count_hi - count; + uint32_t const pattern = UINT32_MAX; -// -// -// + cl(EnqueueFillBuffer(state->cq, + state->vin, + &pattern, + sizeof(pattern), + count * hs->key_val_size, + vin_span * hs->key_val_size, + wait_list_size, + wait_list, + event)); + + HS_STATE_WAIT_LIST_PROFILE_EX(state,1,event); +} static void -hs_bs_launcher(struct hs_state * const state, - uint32_t const warps_in, - cl_command_queue cq) +hs_keyset_pre_merge(struct hs_cl const * const hs, + struct hs_state * const state, + uint32_t const count_lo, + uint32_t const count_hi, + uint32_t const wait_list_size, + cl_event * wait_list) { - // warps_in is already rounded up - uint32_t const full = (warps_in / HS_BS_WARPS) * HS_BS_WARPS; - uint32_t const frac = warps_in - full; + uint32_t const vout_span = count_hi - count_lo; + uint32_t const pattern = UINT32_MAX; - // - // FIXME -- launch on different queues - // - cl_kernel kernel_full = (full == 0) ? NULL : bs_kernels[HS_BS_WARPS_LOG2_RU].kernel; - cl_kernel kernel_frac = (frac == 0) ? 
NULL : bs_kernels[msb_idx_u32(frac)].kernel; - - hs_launch_bs(state, - cq, - kernel_full, - kernel_frac, - full * HS_LANES_PER_WARP, - HS_BS_WARPS * HS_LANES_PER_WARP, - frac * HS_LANES_PER_WARP); + // appends event to incoming wait list + cl(EnqueueFillBuffer(state->cq, + state->vout, + &pattern, + sizeof(pattern), + count_lo * hs->key_val_size, + vout_span * hs->key_val_size, + wait_list_size, + wait_list, + state->wait_list+state->wait_list_size++)); + + HS_STATE_WAIT_LIST_PROFILE(state); } // @@ -700,27 +533,19 @@ hs_bs_launcher(struct hs_state * const state, static void -hs_bc_launcher(struct hs_state * const state, - uint32_t const down_warps, - uint32_t const down_warps_log2, - cl_command_queue cq) +hs_bs_launcher(struct hs_cl const * const hs, + struct hs_state * const state, + uint32_t const count_padded_in, + uint32_t const wait_list_size, + cl_event * wait_list) { - // block clean the minimal number of down_warps_log2 spans - uint32_t const frac_ru = (1u << down_warps_log2) - 1; - state->bc.full = (down_warps + frac_ru) & ~frac_ru; - - // launch block slab sorting grid - size_t const global_work_size = state->bc.full * HS_LANES_PER_WARP; - size_t const local_work_size = HS_LANES_PER_WARP << down_warps_log2; + uint32_t const slabs_in = count_padded_in / hs->slab_keys; + uint32_t const full = (slabs_in / hs->config.block.slabs) * hs->config.block.slabs; + uint32_t const frac = slabs_in - full; - // - // we better be capable of cleaning at least two warps !!! 
- // - hs_launch_bc(state, - cq, - bc_kernels[down_warps_log2].kernel, - global_work_size, - local_work_size); + hs_launch_bs(hs,state, + full,frac, + wait_list_size,wait_list); } // @@ -728,30 +553,18 @@ hs_bc_launcher(struct hs_state * const state, // static -uint32_t -hs_hm_launcher(struct hs_state * const state, - uint32_t const down_warps, - uint32_t const down_warps_log2_in, - cl_command_queue cq) +void +hs_bc_launcher(struct hs_cl const * const hs, + struct hs_state * const state, + uint32_t const down_slabs, + uint32_t const clean_slabs_log2) { - // how many scaled half-merge spans are there? - uint32_t const frac_ru = (1 << down_warps_log2_in) - 1; - uint32_t const spans_ru = (down_warps + frac_ru) >> down_warps_log2_in; + // block clean the minimal number of down_slabs_log2 spans + uint32_t const frac_ru = (1u << clean_slabs_log2) - 1; + uint32_t const full = (down_slabs + frac_ru) & ~frac_ru; - // get the kernel record - struct hs_hm_kernel const * const hm = hm_kernels + down_warps_log2_in - HS_BC_WARPS_LOG2_MAX - 1; - - // how large is the grid? - size_t const global_work_size = HS_LANES_PER_WARP * HS_KEYS_PER_LANE * (spans_ru << hm->log2); - size_t const local_work_size = HS_LANES_PER_WARP; - - // launch the hm kernel - hs_launch_hm(state, - cq, - hm->kernel, - global_work_size); - - return hm->log2; + // we better be capable of cleaning at least two warps !!! 
+ hs_launch_bc(hs,state,full,clean_slabs_log2); } // @@ -760,63 +573,74 @@ hs_hm_launcher(struct hs_state * const state, static uint32_t -hs_fm_launcher(struct hs_state * const state, - uint32_t const up_scale_log2, - uint32_t * const down_warps, - cl_command_queue cq) +hs_fm_launcher(struct hs_cl const * const hs, + struct hs_state * const state, + uint32_t * const down_slabs, + uint32_t const up_scale_log2) { - // get the kernel record - struct hs_fm_kernel const * const fm = fm_kernels + up_scale_log2 - 1; + // + // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes + // a performance win to bias toward launching the smaller flip merge + // kernel in order to get more warps in flight (increased + // occupancy). This is useful when merging small numbers of slabs. + // + // Note that HS_FM_SCALE_MIN will always be 0 or 1. + // + // So, for now, just clamp to the max until there is a reason to + // restore the fancier and probably low-impact approach. + // + uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2); + uint32_t const clean_log2 = up_scale_log2 - scale_log2; - // number of warps in a full-sized scaled flip-merge span - uint32_t const full_span_warps = HS_BS_WARPS << up_scale_log2; + // number of slabs in a full-sized scaled flip-merge span + uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2; // how many full-sized scaled flip-merge spans are there? - state->fm.full = state->bx.ru / full_span_warps; - state->fm.frac = 0; + uint32_t fm_full = state->bx_ru / full_span_slabs; + uint32_t fm_frac = 0; - // initialize down_warps - *down_warps = state->fm.full * full_span_warps; + // initialize down_slabs + *down_slabs = fm_full * full_span_slabs; // how many half-size scaled + fractional scaled spans are there? 
- uint32_t const span_rem = state->bx.ru - state->fm.full * full_span_warps; - uint32_t const half_span_warps = full_span_warps >> 1; + uint32_t const span_rem = state->bx_ru - *down_slabs; + uint32_t const half_span_slabs = full_span_slabs >> 1; - if (span_rem > half_span_warps) + // if we have over a half-span then fractionally merge it + if (span_rem > half_span_slabs) { - uint32_t const frac_rem = span_rem - half_span_warps; + // the remaining slabs will be cleaned + *down_slabs += span_rem; + + uint32_t const frac_rem = span_rem - half_span_slabs; uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem); - if (frac_rem_pow2 >= half_span_warps) + if (frac_rem_pow2 >= half_span_slabs) { - *down_warps += full_span_warps; - state->fm.full += 1; + // bump it up to a full span + fm_full += 1; } else { - uint32_t const frac_interleaved = frac_rem_pow2 >> fm->log2; - - *down_warps += half_span_warps + frac_rem_pow2; - state->fm.frac = MAX_MACRO(1,frac_interleaved); + // otherwise, add fractional + fm_frac = MAX_MACRO(1,frac_rem_pow2 >> clean_log2); } } // size the grid - uint32_t const spans_frac = MIN_MACRO(state->fm.frac,1); - uint32_t const spans_total = state->fm.full + spans_frac; - uint32_t const scale = spans_total << fm->log2; - size_t const global_work_size = HS_LANES_PER_WARP * HS_KEYS_PER_LANE * scale; - size_t const local_work_size = HS_LANES_PER_WARP; + uint32_t const span_threads = hs->slab_keys << clean_log2; // // launch the fm kernel // - hs_launch_fm(state, - cq, - fm->kernel, - global_work_size); - - return fm->log2; + hs_launch_fm(hs, + state, + scale_log2, + fm_full, + fm_frac, + span_threads); + + return clean_log2; } // @@ -824,67 +648,196 @@ hs_fm_launcher(struct hs_state * const state, // static +uint32_t +hs_hm_launcher(struct hs_cl const * const hs, + struct hs_state * const state, + uint32_t const down_slabs, + uint32_t const clean_slabs_log2) +{ + // how many scaled half-merge spans are there? 
+ uint32_t const frac_ru = (1 << clean_slabs_log2) - 1; + uint32_t const spans = (down_slabs + frac_ru) >> clean_slabs_log2; + + // for now, just clamp to the max + uint32_t const log2_rem = clean_slabs_log2 - hs->bc_slabs_log2_max; + uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem); + uint32_t const log2_out = log2_rem - scale_log2; + + // size the grid + uint32_t const span_threads = hs->slab_keys << log2_out; + + // launch the hm kernel + hs_launch_hm(hs, + state, + scale_log2, + spans, + span_threads); + + return log2_out; +} + +// +// +// + void -hs_keyset_launcher(cl_mem mem, - uint32_t const offset, - uint32_t const span, - cl_command_queue cq) +hs_cl_sort(struct hs_cl const * const hs, + cl_command_queue cq, + uint32_t const wait_list_size, + cl_event * wait_list, + cl_event * event, + cl_mem vin, + cl_mem vout, + uint32_t const count, + uint32_t const count_padded_in, + uint32_t const count_padded_out, + bool const linearize) { + // is this sort in place? + bool const is_in_place = (vout == NULL); + // cq, buffers, wait list and slab count + struct hs_state state = { +#ifndef NDEBUG + .t_total = 0, +#endif + .cq = cq, + .vin = vin, + .vout = is_in_place ? vin : vout, + .wait_list_size = 0, + .bx_ru = (count + hs->slab_keys - 1) / hs->slab_keys + }; + + // initialize vin + uint32_t const count_hi = is_in_place ? 
count_padded_out : count_padded_in; + bool const is_pre_sort_keyset_reqd = count_hi > count; + cl_event event_keyset_pre_sort[1]; + + // initialize any trailing keys in vin before sorting + if (is_pre_sort_keyset_reqd) + { + hs_keyset_pre_sort(hs,&state, + count,count_hi, + wait_list_size,wait_list, + event_keyset_pre_sort); + } + + // initialize any trailing keys in vout before merging + if (!is_in_place && (count_padded_out > count_padded_in)) + { + hs_keyset_pre_merge(hs,&state, + count_padded_in,count_padded_out, + wait_list_size,wait_list); + } // - // DOES NOT TEST FOR SPAN = 0 + // sort blocks of slabs // - HS_KEY_TYPE const pattern = (HS_KEY_TYPE)-1L; + hs_bs_launcher(hs,&state, + count_padded_in, + is_pre_sort_keyset_reqd ? 1 : wait_list_size, + is_pre_sort_keyset_reqd ? event_keyset_pre_sort : wait_list); - cl(EnqueueFillBuffer(cq, - mem, - &pattern, - sizeof(HS_KEY_TYPE), - offset * sizeof(HS_KEY_TYPE), - span * sizeof(HS_KEY_TYPE), - 0, - NULL, - HS_EVENT_NEXT())); + // release the event + if (is_pre_sort_keyset_reqd) + cl(ReleaseEvent(event_keyset_pre_sort[0])); + + // + // we're done if this was a single bs block... + // + // otherwise, merge sorted spans of slabs until done + // + if (state.bx_ru > hs->config.block.slabs) + { + int32_t up_scale_log2 = 1; + + while (true) + { + uint32_t down_slabs; + + // flip merge slabs -- return span of slabs that must be cleaned + uint32_t clean_slabs_log2 = hs_fm_launcher(hs,&state, + &down_slabs, + up_scale_log2); + + // if span is gt largest slab block cleaner then half merge + while (clean_slabs_log2 > hs->bc_slabs_log2_max) + { + clean_slabs_log2 = hs_hm_launcher(hs,&state, + down_slabs, + clean_slabs_log2); + } + + // launch clean slab grid -- is it the final launch? + hs_bc_launcher(hs,&state, + down_slabs, + clean_slabs_log2); + + // was this the final block clean? 
+ if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru) + break; + + // otherwise, merge twice as many slabs + up_scale_log2 += 1; + } + } + + // slabs or linear? + if (linearize) { + hs_transpose_launcher(hs,&state); + } + + // does the caller want the final event? + if (event != NULL) { + *event = state.wait_list[0]; + } else { + cl(ReleaseEvent(state.wait_list[0])); + } } // -// all grids will be computed as a function of the minimum number of warps +// all grids will be computed as a function of the minimum number of slabs // void -hs_pad(uint32_t const count, - uint32_t * const count_padded_in, - uint32_t * const count_padded_out) +hs_cl_pad(struct hs_cl const * const hs, + uint32_t const count, + uint32_t * const count_padded_in, + uint32_t * const count_padded_out) { // - // round up the count to warps + // round up the count to slabs // - uint32_t const warps_ru = (count + HS_KEYS_PER_WARP - 1) / HS_KEYS_PER_WARP; - uint32_t const blocks = warps_ru / HS_BS_WARPS; - uint32_t const warps_mod = warps_ru % HS_BS_WARPS; - uint32_t const warps_mod_ru = MIN_MACRO(pow2_ru_u32(warps_mod),HS_BS_WARPS); + uint32_t const slabs_ru = (count + hs->slab_keys - 1) / hs->slab_keys; + uint32_t const blocks = slabs_ru / hs->config.block.slabs; + uint32_t const block_slabs = blocks * hs->config.block.slabs; + uint32_t const slabs_ru_rem = slabs_ru - block_slabs; + uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),hs->config.block.slabs); - *count_padded_in = (blocks * HS_BS_WARPS + warps_mod_ru) * HS_KEYS_PER_WARP; + *count_padded_in = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys; *count_padded_out = *count_padded_in; // - // more than a single block sort? + // will merging be required? 
// - if (warps_ru > HS_BS_WARPS) + if (slabs_ru > hs->config.block.slabs) { // more than one block - uint32_t const blocks_lo = pow2_rd_u32(blocks); - uint32_t const warps_lo = blocks_lo * HS_BS_WARPS; - uint32_t const warps_rem = warps_ru - warps_lo; + uint32_t const blocks_lo = pow2_rd_u32(blocks); + uint32_t const block_slabs_lo = blocks_lo * hs->config.block.slabs; + uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo; - if (warps_rem > 0) + if (block_slabs_rem > 0) { - uint32_t const warps_rem_ru = pow2_ru_u32(warps_rem); - uint32_t const warps_hi = MAX_MACRO(warps_rem_ru,blocks_lo << HS_FM_BLOCKS_LOG2_1); - uint32_t const warps_padded_out = MIN_MACRO(warps_lo+warps_hi,warps_lo*2); // clamp non-pow2 blocks + uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem); + + uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru, + blocks_lo << (1 - hs->config.merge.fm.scale_min)); + + uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi, + block_slabs_lo*2); // clamp non-pow2 blocks - *count_padded_out = warps_padded_out * HS_KEYS_PER_WARP; + *count_padded_out = block_slabs_padded_out * hs->slab_keys; } } } @@ -893,229 +846,291 @@ hs_pad(uint32_t const count, // // +static void -hs_sort(cl_command_queue cq, // out-of-order cq - cl_mem vin, - cl_mem vout, - uint32_t const count, - uint32_t const count_padded_in, - uint32_t const count_padded_out, - bool const linearize) +hs_create_kernel(cl_program program, + cl_kernel * const kernel, + char const * const name) { -#ifndef NDEBUG - events_count = 0; -#endif + cl_int err; - // - // FIXME -- get rid of this vestigial structure - // - struct hs_state state = { .vin = vin, .vout = vout }; + *kernel = clCreateKernel(program,name,&err); + + cl_ok(err); +} - // how many rounded-up key slabs are there? 
- state.bx.ru = (count + HS_KEYS_PER_WARP - 1) / HS_KEYS_PER_WARP; +static +void +hs_create_kernels(cl_program program, + cl_kernel * kernels, + char name_template[], + size_t const name_template_size, + uint32_t const count) +{ + char const n_max = '0'+(char)count; + + for (char n = '0'; nprogram[0] == 0); + uint32_t const program_size = NPBTOHL_MACRO(target->program+1); - if (!split) - { - uint32_t const vin_span = count_padded_out - count; + cl_program program; - if (vin_span > 0) - { - hs_keyset_launcher(state.vin, - count,vin_span, - cq); - keyset = true; - } + if (is_binary) // program is a binary + { + cl_int status, err; + + size_t const bins_sizeof[] = { program_size }; + unsigned char const * bins[] = { target->program+5 }; + + program = clCreateProgramWithBinary(context, + 1, + &device_id, + bins_sizeof, + bins, + &status, + &err); + cl_ok(err); + + cl(BuildProgram(program, + 1, + &device_id, + NULL, + NULL, + NULL)); } - else + else // program is source code { - uint32_t const vin_span = count_padded_in - count; - - if (vin_span > 0) - { - hs_keyset_launcher(state.vin, - count,vin_span, - cq); - keyset = true; - } + cl_int err; + + size_t const strings_sizeof[] = { program_size }; + char const * strings[] = { (char*)target->program+5 }; + + program = clCreateProgramWithSource(context, + 1, + strings, + strings_sizeof, + &err); + cl_ok(err); + + char const * const options = + "-cl-std=CL1.2 -cl-fast-relaxed-math " // FIXME FIXME FIXME FIXME 1.2 + "-cl-no-signed-zeros -cl-mad-enable " + "-cl-denorms-are-zero " + "-cl-kernel-arg-info"; + + cl(BuildProgram(program, + 1, + &device_id, + options, + NULL, + NULL)); + } - uint32_t const vout_span = count_padded_out - count_padded_in; + // + // we reference these values a lot + // + uint32_t const bs_slabs_log2_ru = msb_idx_u32(pow2_ru_u32(target->config.block.slabs)); + uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs)); - if (vout_span > 0) - { - 
hs_keyset_launcher(state.vout, - count_padded_in,vout_span, - cq); - keyset = true; - } + // + // how many kernels will be created? + // + uint32_t const count_bs = bs_slabs_log2_ru + 1; + uint32_t const count_bc = bc_slabs_log2_max + 1; + uint32_t count_fm[3] = { 0 }; + uint32_t count_hm[3] = { 0 }; + + // guaranteed to be in range [0,2] + for (uint32_t scale = target->config.merge.fm.scale_min; + scale <= target->config.merge.fm.scale_max; + scale++) + { + count_fm[scale] = msb_idx_u32(pow2_ru_u32(target->config.block.slabs>>(scale-1))) + 1; } - if (keyset) + // guaranteed to be in range [0,2] + for (uint32_t scale = target->config.merge.hm.scale_min; + scale <= target->config.merge.hm.scale_max; + scale++) { - hs_barrier(cq); + count_hm[scale] = 1; } + uint32_t const count_all = + 1 + + count_bs + + count_bc + + count_fm[0] + count_fm[1] + count_fm[2] + + count_hm[0] + count_hm[1] + count_hm[2]; + // - // sort blocks + // allocate hs_cl // - uint32_t const warps_in = count_padded_in / HS_KEYS_PER_WARP; + struct hs_cl * hs = malloc(sizeof(*hs) + sizeof(cl_kernel) * count_all); - hs_bs_launcher(&state,warps_in,cq); + memcpy(&hs->config,&target->config,sizeof(hs->config)); - hs_barrier(cq); + // save some frequently used calculated values + hs->key_val_size = (target->config.words.key + target->config.words.val) * 4; + hs->slab_keys = target->config.slab.height << target->config.slab.width_log2; + hs->bs_slabs_log2_ru = bs_slabs_log2_ru; + hs->bc_slabs_log2_max = bc_slabs_log2_max; + + // save kernel count + hs->kernels.count = count_all; // - // we're done if only a single bs kernel block was required + // create all the kernels and release the program // - if (state.bx.ru > HS_BS_WARPS) - { - // - // otherwise... 
merge sorted spans of warps until done - // - uint32_t up_scale_log2 = 1; + cl_kernel * kernel_next = hs->kernels.all; - while (true) - { - uint32_t down_warps; + // + // TRANSPOSE + // + { + hs->kernels.transpose = kernel_next; - // flip merge warps -- return span of warps that must be cleaned - uint32_t down_warps_log2 = hs_fm_launcher(&state, - up_scale_log2, - &down_warps, - cq); + hs_create_kernel(program, + kernel_next, + "hs_kernel_transpose"); - hs_barrier(cq); + kernel_next += 1; + } - // if span is gt largest slab block cleaner then half merge - while (down_warps_log2 > HS_BC_WARPS_LOG2_MAX) - { - down_warps_log2 = hs_hm_launcher(&state, - down_warps, - down_warps_log2, - cq); + // + // BS + // + { + hs->kernels.bs = kernel_next; - hs_barrier(cq); - } + char bs_name[] = { "hs_kernel_bs_X" }; - // launch clean slab grid -- is it the final launch? - hs_bc_launcher(&state, - down_warps, - down_warps_log2, - cq); + hs_create_kernels(program, + kernel_next, + bs_name,sizeof(bs_name), + count_bs); - hs_barrier(cq); + kernel_next += count_bs; + } - // was this the final block clean? 
- if (((uint32_t)HS_BS_WARPS << up_scale_log2) >= state.bx.ru) - break; + // + // BC + // + { + hs->kernels.bc = kernel_next; - // otherwise, merge twice as many slabs - up_scale_log2 += 1; - } - } + char bc_name[] = { "hs_kernel_bc_X" }; + + hs_create_kernels(program, + kernel_next, + bc_name,sizeof(bc_name), + count_bc); + + kernel_next += count_bc; + } - if (linearize) + // + // FM 0 + // + if (count_fm[0] > 0) { - // launch linearize; - hs_transpose_launcher(&state,cq); + hs->kernels.fm[0] = kernel_next; + + char fm_0_name[] = { "hs_kernel_fm_0_X" }; - hs_barrier(cq); + hs_create_kernels(program, + kernel_next, + fm_0_name,sizeof(fm_0_name), + count_fm[0]); + + kernel_next += count_fm[0]; } - HS_EVENT_PROFILE(cq); -} + if (count_fm[1] > 0) + { + hs->kernels.fm[1] = kernel_next; -// -// -// + char fm_1_name[] = { "hs_kernel_fm_1_X" }; -void -hs_create(cl_context context, - cl_device_id device_id, - struct hs_info * const info) -{ - // - // create and build the program from source or a precompiled binary - // - if (info != NULL) + hs_create_kernels(program, + kernel_next, + fm_1_name,sizeof(fm_1_name), + count_fm[1]); + + kernel_next += count_fm[1]; + } + + if (count_fm[2] > 0) { - info->words = HS_KEY_WORDS; - info->keys = HS_KEYS_PER_LANE; - info->lanes = HS_LANES_PER_WARP; + hs->kernels.fm[2] = kernel_next; + + char fm_2_name[] = { "hs_kernel_fm_2_X" }; + + hs_create_kernels(program, + kernel_next, + fm_2_name,sizeof(fm_2_name), + count_fm[2]); + + kernel_next += count_fm[2]; } -#if defined( HS_KERNEL_SOURCE ) + if (count_hm[0] > 0) + { + hs->kernels.hm[0] = kernel_next; - cl_int err; + hs_create_kernel(program, + kernel_next, + "hs_kernel_hm_0"); + + kernel_next += count_hm[0]; + } - size_t const strings_sizeof[] = { sizeof(hs_cl_pre_cl) }; - char const * strings[] = { (char*)hs_cl_pre_cl }; + if (count_hm[1] > 0) + { + hs->kernels.hm[1] = kernel_next; - cl_program program = clCreateProgramWithSource(context, - 1, - strings, - strings_sizeof, - &err); - 
cl_ok(err); + hs_create_kernel(program, + kernel_next, + "hs_kernel_hm_1"); - char const * const options = - "-cl-std=CL2.0 -cl-fast-relaxed-math " - "-cl-no-signed-zeros -cl-mad-enable " - "-cl-denorms-are-zero " - "-cl-kernel-arg-info"; - - cl(BuildProgram(program, - 1, - &device_id, - options, - NULL, - NULL)); - -#elif defined( HS_KERNEL_BINARY ) - - cl_int status, err; - - size_t const bins_sizeof[] = { sizeof(hs_cl_pre_ir) }; - unsigned char const * bins[] = { hs_cl_pre_ir }; - - cl_program program = clCreateProgramWithBinary(context, - 1, - &device_id, - bins_sizeof, - bins, - &status, - &err); - cl_ok(err); + kernel_next += count_hm[1]; + } - cl(BuildProgram(program, - 1, - &device_id, - NULL, - NULL, - NULL)); -#endif + if (count_hm[2] > 0) + { + hs->kernels.hm[2] = kernel_next; - // - // create all the kernels and release the program - // -#define HS_CREATE_KERNELS(ks) \ - for (uint32_t ii=0; iikernels.count; ii++) + cl(ReleaseKernel(hs->kernels.all[ii])); + + free(hs); } // diff --git a/src/compute/hs/cl/hs_cl_launcher.h b/src/compute/hs/cl/hs_cl_launcher.h index 049657cc2f..33f62d9943 100644 --- a/src/compute/hs/cl/hs_cl_launcher.h +++ b/src/compute/hs/cl/hs_cl_launcher.h @@ -17,61 +17,65 @@ #include // -// Returns some useful info about algorithm's configuration for the -// target architecture. +// // -struct hs_info -{ - uint32_t words; // words-per-key (1 = uint, 2 = ulong) - uint32_t keys; // keys-per-lane - uint32_t lanes; // lanes-per-warp -}; +#include "hs_cl_target.h" // // // -void -hs_create(cl_context context, - cl_device_id device_id, - struct hs_info * const info); +struct hs_cl * +hs_cl_create(struct hs_cl_target const * const target, + cl_context context, + cl_device_id device_id); + // // // void -hs_release(); +hs_cl_release(struct hs_cl * const hs); // -// Size the buffers. +// Determine what padding will be applied to the input and output +// buffers. +// +// Always check to see if the allocated buffers are large enough. 
+// +// count : number of keys +// count + count_padded_in : additional keys required for sorting +// count + count_padded_out : additional keys required for merging // void -hs_pad(uint32_t const count, - uint32_t * const count_padded_in, - uint32_t * const count_padded_out); +hs_cl_pad(struct hs_cl const * const hs, + uint32_t const count, + uint32_t * const count_padded_in, + uint32_t * const count_padded_out); // // Sort the keys in the vin buffer and store them in the vout buffer. // -// The vin and vout buffers can be the same buffer. -// -// If it is necessary, a barrier should be enqueued before running -// hs_sort(). +// If vout is NULL then the sort will be performed in place. // -// A final barrier will enqueued before returning. +// The implementation assumes the command queue is out-of-order. // void -hs_sort(cl_command_queue cq, // out-of-order cq - cl_mem vin, - cl_mem vout, - uint32_t const count, - uint32_t const count_padded_in, - uint32_t const count_padded_out, - bool const linearize); +hs_cl_sort(struct hs_cl const * const hs, + cl_command_queue cq, + uint32_t const wait_list_size, + cl_event * wait_list, + cl_event * event, + cl_mem vin, + cl_mem vout, + uint32_t const count, + uint32_t const count_padded_in, + uint32_t const count_padded_out, + bool const linearize); // // diff --git a/src/compute/hs/cl/hs_cl_target.h b/src/compute/hs/cl/hs_cl_target.h new file mode 100644 index 0000000000..b7bb73e0d3 --- /dev/null +++ b/src/compute/hs/cl/hs_cl_target.h @@ -0,0 +1,63 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#pragma once + +// +// +// + +#include + +// +// This structure packages all of the parameters and kernels for a +// target architecture. 
+// + +struct hs_cl_target_config +{ + struct { + uint8_t threads_log2; + uint8_t width_log2; + uint8_t height; + } slab; + + struct { + uint8_t key; + uint8_t val; + } words; + + struct { + uint8_t slabs; + } block; + + struct { + struct { + uint8_t scale_min; + uint8_t scale_max; + } fm; + struct { + uint8_t scale_min; + uint8_t scale_max; + } hm; + } merge; +}; + +// +// +// + +struct hs_cl_target +{ + struct hs_cl_target_config config; + uint8_t program[]; +}; + +// +// +// diff --git a/src/compute/hs/cl/intel/gen8/u32/make_all.bat b/src/compute/hs/cl/intel/gen8/u32/make_all.bat new file mode 100644 index 0000000000..a68057af0e --- /dev/null +++ b/src/compute/hs/cl/intel/gen8/u32/make_all.bat @@ -0,0 +1,16 @@ +@ECHO OFF + +SET HS_GEN=..\..\..\..\..\spinel\bin\x64\Debug\hs_gen + +REM --- 32-bit keys --- + +REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM --- 64-bit keys + +CMD /C %HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +CMD /C make_inl_cl.bat hs_cl.cl diff --git a/src/compute/hs/cl/intel/gen8/u32/make_inl_cl.bat b/src/compute/hs/cl/intel/gen8/u32/make_inl_cl.bat new file mode 100644 index 0000000000..54b1aac48f --- /dev/null +++ b/src/compute/hs/cl/intel/gen8/u32/make_inl_cl.bat @@ -0,0 +1,77 @@ +@ECHO OFF + +:: +:: +:: + +SET OPENCL_STD=-cl-std=CL1.2 +SET OPENCL_PRE=__OPENCL_C_VERSION__=120 + +:: SET OPENCL_STD=-cl-std=CL2.0 +:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200 + +:: +:: +:: + +SET IOC=ioc64 + +:: +:: +:: + +SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant 
-cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info + +SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g + +SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% + +:: +:: +:: + +SET PRE_DIR=%~p1 + +CD %PRE_DIR% + +SET PRE_CL=%~n1 +SET PRE_CL=%PRE_CL%.pre.cl + +SET PRE_SRC_INL=%~n1 +SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl + +SET PRE_BIN_IR=%~n1 +SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir + +SET PRE_BIN_INL=%~n1 +SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C clang-format -style=Mozilla -i %1 +CMD /C cl -I . -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%" +CMD /C clang-format -style=Mozilla -i %PRE_CL% +CMD /C dos2unix -q %PRE_CL% +CMD /C xxd -i %PRE_CL% %PRE_SRC_INL% + +echo %PRE_CL% +echo %PRE_SRC_INL% + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C touch %PRE_BIN_IR% +ECHO ON +@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR% +@ECHO OFF +CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL% + +echo %PRE_BIN_IR% +echo %PRE_BIN_INL% + + diff --git a/src/compute/hs/cl/intel/gen8/u32b32/make_all.bat b/src/compute/hs/cl/intel/gen8/u32b32/make_all.bat new file mode 100644 index 0000000000..a68057af0e --- /dev/null +++ b/src/compute/hs/cl/intel/gen8/u32b32/make_all.bat @@ -0,0 +1,16 @@ +@ECHO OFF + +SET HS_GEN=..\..\..\..\..\spinel\bin\x64\Debug\hs_gen + +REM --- 32-bit keys --- + +REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM --- 64-bit keys + +CMD /C %HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 
-F 0 -c 0 -C 0 -z + +CMD /C make_inl_cl.bat hs_cl.cl diff --git a/src/compute/hs/cl/intel/gen8/u32b32/make_inl_cl.bat b/src/compute/hs/cl/intel/gen8/u32b32/make_inl_cl.bat new file mode 100644 index 0000000000..54b1aac48f --- /dev/null +++ b/src/compute/hs/cl/intel/gen8/u32b32/make_inl_cl.bat @@ -0,0 +1,77 @@ +@ECHO OFF + +:: +:: +:: + +SET OPENCL_STD=-cl-std=CL1.2 +SET OPENCL_PRE=__OPENCL_C_VERSION__=120 + +:: SET OPENCL_STD=-cl-std=CL2.0 +:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200 + +:: +:: +:: + +SET IOC=ioc64 + +:: +:: +:: + +SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info + +SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g + +SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% + +:: +:: +:: + +SET PRE_DIR=%~p1 + +CD %PRE_DIR% + +SET PRE_CL=%~n1 +SET PRE_CL=%PRE_CL%.pre.cl + +SET PRE_SRC_INL=%~n1 +SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl + +SET PRE_BIN_IR=%~n1 +SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir + +SET PRE_BIN_INL=%~n1 +SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C clang-format -style=Mozilla -i %1 +CMD /C cl -I . -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%" +CMD /C clang-format -style=Mozilla -i %PRE_CL% +CMD /C dos2unix -q %PRE_CL% +CMD /C xxd -i %PRE_CL% %PRE_SRC_INL% + +echo %PRE_CL% +echo %PRE_SRC_INL% + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C touch %PRE_BIN_IR% +ECHO ON +@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR% +@ECHO OFF +CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL% + +echo %PRE_BIN_IR% +echo %PRE_BIN_INL% + + diff --git a/src/compute/hs/cl/intel/gen8/u64/hs_cl.cl b/src/compute/hs/cl/intel/gen8/u64/hs_cl.cl new file mode 100644 index 0000000000..b994d8276f --- /dev/null +++ b/src/compute/hs/cl/intel/gen8/u64/hs_cl.cl @@ -0,0 +1,4851 @@ +// +// Copyright 2016 Google Inc. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// + +#include "hs_cl_macros.h" + +// +// +// + +HS_TRANSPOSE_KERNEL_PROTO(8) +{ + HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 8, 0); + HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 8, 1); + HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 8, 2); + HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 8, 3); + HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 8, 4); + HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 8, 5); + HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 8, 6); + HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 8, 7); + HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8, 8); + HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 8, 9); + HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 8, 10); + HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 8, 11); + HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 8, 12); + HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 8, 13); + HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 8, 14); + HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 8, 15); + HS_TRANSPOSE_SLAB() +} + +HS_BS_KERNEL_PROTO(8, 16, 4) +{ + HS_BLOCK_LOCAL_MEM_DECL(128, 16); + + HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 8, 0); + HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 8, 1); + HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 8, 2); + HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 8, 3); + HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 8, 4); + HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 8, 5); + HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 8, 6); + HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 8, 7); + HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8, 8); + HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 8, 9); + HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 8, 10); + HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 8, 11); + HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 8, 12); + HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 8, 13); + HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 8, 14); + HS_KEY_TYPE r16 = 
HS_SLAB_GLOBAL_LOAD(vin, 8, 15); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r6, r11); + HS_CMP_XCHG(r7, r10); + HS_CMP_XCHG(r4, r13); + HS_CMP_XCHG(r14, r15); + HS_CMP_XCHG(r8, r12); + HS_CMP_XCHG(r2, r3); + HS_CMP_XCHG(r5, r9); + HS_CMP_XCHG(r2, r5); + HS_CMP_XCHG(r8, r14); + HS_CMP_XCHG(r3, r9); + HS_CMP_XCHG(r12, r15); + HS_CMP_XCHG(r3, r5); + HS_CMP_XCHG(r6, r7); + HS_CMP_XCHG(r10, r11); + HS_CMP_XCHG(r12, r14); + HS_CMP_XCHG(r4, r9); + HS_CMP_XCHG(r8, r13); + HS_CMP_XCHG(r7, r9); + HS_CMP_XCHG(r11, r13); + HS_CMP_XCHG(r4, r6); + HS_CMP_XCHG(r8, r10); + HS_CMP_XCHG(r4, r5); + HS_CMP_XCHG(r6, r7); + HS_CMP_XCHG(r8, r9); + HS_CMP_XCHG(r10, r11); + HS_CMP_XCHG(r12, r13); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + { + HS_SLAB_FLIP_PREAMBLE(1); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); 
+ HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + { + HS_SLAB_FLIP_PREAMBLE(3); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + { + HS_SLAB_FLIP_PREAMBLE(7); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, 
r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_BS_MERGE_H_PREAMBLE(8, 16); + HS_BX_LOCAL_V(16 * 8 * 0) = r1; + HS_BX_LOCAL_V(16 * 8 * 1) = r16; + HS_BX_LOCAL_V(16 * 8 * 2) = r2; + HS_BX_LOCAL_V(16 * 8 * 3) = r15; + HS_BX_LOCAL_V(16 * 8 * 4) = r3; + HS_BX_LOCAL_V(16 * 8 * 5) = r14; + HS_BX_LOCAL_V(16 * 8 * 6) = r4; + 
HS_BX_LOCAL_V(16 * 8 * 7) = r13; + HS_BX_LOCAL_V(16 * 8 * 8) = r5; + HS_BX_LOCAL_V(16 * 8 * 9) = r12; + HS_BX_LOCAL_V(16 * 8 * 10) = r6; + HS_BX_LOCAL_V(16 * 8 * 11) = r11; + HS_BX_LOCAL_V(16 * 8 * 12) = r7; + HS_BX_LOCAL_V(16 * 8 * 13) = r10; + HS_BX_LOCAL_V(16 * 8 * 14) = r8; + HS_BX_LOCAL_V(16 * 8 * 15) = r9; + HS_BLOCK_BARRIER(); + { + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_R(8) = r0_2; + } + { + HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24); + HS_CMP_XCHG(r1_1, r1_2); + HS_SLAB_LOCAL_L(16) = r1_1; + HS_SLAB_LOCAL_R(24) = r1_2; + } + { + HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(32); + HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(40); + HS_CMP_XCHG(r2_1, r2_2); + HS_SLAB_LOCAL_L(32) = r2_1; + HS_SLAB_LOCAL_R(40) = r2_2; + } + { + HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(48); + HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(56); + HS_CMP_XCHG(r3_1, r3_2); + HS_SLAB_LOCAL_L(48) = r3_1; + HS_SLAB_LOCAL_R(56) = r3_2; + } + { + HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(64); + HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(72); + HS_CMP_XCHG(r4_1, r4_2); + HS_SLAB_LOCAL_L(64) = r4_1; + HS_SLAB_LOCAL_R(72) = r4_2; + } + { + HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(80); + HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(88); + HS_CMP_XCHG(r5_1, r5_2); + HS_SLAB_LOCAL_L(80) = r5_1; + HS_SLAB_LOCAL_R(88) = r5_2; + } + { + HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(96); + HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(104); + HS_CMP_XCHG(r6_1, r6_2); + HS_SLAB_LOCAL_L(96) = r6_1; + HS_SLAB_LOCAL_R(104) = r6_2; + } + { + HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(112); + HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(120); + HS_CMP_XCHG(r7_1, r7_2); + HS_SLAB_LOCAL_L(112) = r7_1; + HS_SLAB_LOCAL_R(120) = r7_2; + } + } + HS_BLOCK_BARRIER(); + r1 = HS_BX_LOCAL_V(16 * 8 * 0); + r16 = HS_BX_LOCAL_V(16 * 8 * 1); + r2 = HS_BX_LOCAL_V(16 * 8 * 2); + r15 = HS_BX_LOCAL_V(16 * 8 * 3); + r3 = HS_BX_LOCAL_V(16 * 8 * 4); + r14 = HS_BX_LOCAL_V(16 * 
8 * 5); + r4 = HS_BX_LOCAL_V(16 * 8 * 6); + r13 = HS_BX_LOCAL_V(16 * 8 * 7); + r5 = HS_BX_LOCAL_V(16 * 8 * 8); + r12 = HS_BX_LOCAL_V(16 * 8 * 9); + r6 = HS_BX_LOCAL_V(16 * 8 * 10); + r11 = HS_BX_LOCAL_V(16 * 8 * 11); + r7 = HS_BX_LOCAL_V(16 * 8 * 12); + r10 = HS_BX_LOCAL_V(16 * 8 * 13); + r8 = HS_BX_LOCAL_V(16 * 8 * 14); + r9 = HS_BX_LOCAL_V(16 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + 
HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_BX_LOCAL_V(16 * 8 * 0) = r1; + HS_BX_LOCAL_V(16 * 8 * 1) = r16; + HS_BX_LOCAL_V(16 * 8 * 2) = r2; + HS_BX_LOCAL_V(16 * 8 * 3) = r15; + HS_BX_LOCAL_V(16 * 8 * 4) = r3; + HS_BX_LOCAL_V(16 * 8 * 5) = r14; + HS_BX_LOCAL_V(16 * 8 * 6) = r4; + HS_BX_LOCAL_V(16 * 8 * 7) = r13; + HS_BX_LOCAL_V(16 * 8 * 8) = r5; + HS_BX_LOCAL_V(16 * 8 * 9) = r12; + HS_BX_LOCAL_V(16 * 8 * 10) = r6; + HS_BX_LOCAL_V(16 * 8 * 11) = r11; + HS_BX_LOCAL_V(16 * 8 * 12) = r7; + HS_BX_LOCAL_V(16 * 8 * 13) = r10; + HS_BX_LOCAL_V(16 * 8 * 14) = r8; + HS_BX_LOCAL_V(16 * 8 * 15) = r9; + HS_BLOCK_BARRIER(); + { + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24); + HS_CMP_XCHG(r0_2, r0_3); + HS_CMP_XCHG(r0_1, r0_4); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_L(8) = r0_2; + HS_SLAB_LOCAL_R(16) = r0_3; + HS_SLAB_LOCAL_R(24) = r0_4; + } + { + HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(40); + HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(48); + HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(56); + HS_CMP_XCHG(r1_2, r1_3); + HS_CMP_XCHG(r1_1, r1_4); + HS_CMP_XCHG(r1_3, r1_4); + HS_CMP_XCHG(r1_1, r1_2); + HS_SLAB_LOCAL_L(32) = r1_1; + HS_SLAB_LOCAL_L(40) = r1_2; + HS_SLAB_LOCAL_R(48) = r1_3; + HS_SLAB_LOCAL_R(56) = r1_4; + } + { + HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(64); + HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(72); + HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(80); + HS_KEY_TYPE r2_4 = HS_SLAB_LOCAL_R(88); + HS_CMP_XCHG(r2_2, r2_3); + HS_CMP_XCHG(r2_1, r2_4); + HS_CMP_XCHG(r2_3, r2_4); + HS_CMP_XCHG(r2_1, 
r2_2); + HS_SLAB_LOCAL_L(64) = r2_1; + HS_SLAB_LOCAL_L(72) = r2_2; + HS_SLAB_LOCAL_R(80) = r2_3; + HS_SLAB_LOCAL_R(88) = r2_4; + } + { + HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(96); + HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(104); + HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(112); + HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(120); + HS_CMP_XCHG(r3_2, r3_3); + HS_CMP_XCHG(r3_1, r3_4); + HS_CMP_XCHG(r3_3, r3_4); + HS_CMP_XCHG(r3_1, r3_2); + HS_SLAB_LOCAL_L(96) = r3_1; + HS_SLAB_LOCAL_L(104) = r3_2; + HS_SLAB_LOCAL_R(112) = r3_3; + HS_SLAB_LOCAL_R(120) = r3_4; + } + } + HS_BLOCK_BARRIER(); + r1 = HS_BX_LOCAL_V(16 * 8 * 0); + r16 = HS_BX_LOCAL_V(16 * 8 * 1); + r2 = HS_BX_LOCAL_V(16 * 8 * 2); + r15 = HS_BX_LOCAL_V(16 * 8 * 3); + r3 = HS_BX_LOCAL_V(16 * 8 * 4); + r14 = HS_BX_LOCAL_V(16 * 8 * 5); + r4 = HS_BX_LOCAL_V(16 * 8 * 6); + r13 = HS_BX_LOCAL_V(16 * 8 * 7); + r5 = HS_BX_LOCAL_V(16 * 8 * 8); + r12 = HS_BX_LOCAL_V(16 * 8 * 9); + r6 = HS_BX_LOCAL_V(16 * 8 * 10); + r11 = HS_BX_LOCAL_V(16 * 8 * 11); + r7 = HS_BX_LOCAL_V(16 * 8 * 12); + r10 = HS_BX_LOCAL_V(16 * 8 * 13); + r8 = HS_BX_LOCAL_V(16 * 8 * 14); + r9 = HS_BX_LOCAL_V(16 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + 
HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_BX_LOCAL_V(16 * 8 * 0) = r1; + HS_BX_LOCAL_V(16 * 8 * 1) = r16; + HS_BX_LOCAL_V(16 * 8 * 2) = r2; + HS_BX_LOCAL_V(16 * 8 * 3) = r15; + HS_BX_LOCAL_V(16 * 8 * 4) = r3; + HS_BX_LOCAL_V(16 * 8 * 5) = r14; + HS_BX_LOCAL_V(16 * 8 * 6) = r4; + HS_BX_LOCAL_V(16 * 8 * 7) = r13; + HS_BX_LOCAL_V(16 * 8 * 8) = r5; + HS_BX_LOCAL_V(16 * 8 * 9) = r12; + HS_BX_LOCAL_V(16 * 8 * 10) = r6; + HS_BX_LOCAL_V(16 * 8 * 11) = r11; + HS_BX_LOCAL_V(16 * 8 * 12) = r7; + HS_BX_LOCAL_V(16 * 8 * 13) = r10; + HS_BX_LOCAL_V(16 * 8 * 14) = r8; + HS_BX_LOCAL_V(16 * 8 * 15) = r9; + HS_BLOCK_BARRIER(); + { + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24); + HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(32); + HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(40); + HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(48); + 
HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(56); + HS_CMP_XCHG(r0_4, r0_5); + HS_CMP_XCHG(r0_3, r0_6); + HS_CMP_XCHG(r0_2, r0_7); + HS_CMP_XCHG(r0_1, r0_8); + HS_CMP_XCHG(r0_5, r0_7); + HS_CMP_XCHG(r0_6, r0_8); + HS_CMP_XCHG(r0_5, r0_6); + HS_CMP_XCHG(r0_7, r0_8); + HS_CMP_XCHG(r0_1, r0_3); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_L(8) = r0_2; + HS_SLAB_LOCAL_L(16) = r0_3; + HS_SLAB_LOCAL_L(24) = r0_4; + HS_SLAB_LOCAL_R(32) = r0_5; + HS_SLAB_LOCAL_R(40) = r0_6; + HS_SLAB_LOCAL_R(48) = r0_7; + HS_SLAB_LOCAL_R(56) = r0_8; + } + { + HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(72); + HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_L(80); + HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(88); + HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(96); + HS_KEY_TYPE r1_6 = HS_SLAB_LOCAL_R(104); + HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(112); + HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(120); + HS_CMP_XCHG(r1_4, r1_5); + HS_CMP_XCHG(r1_3, r1_6); + HS_CMP_XCHG(r1_2, r1_7); + HS_CMP_XCHG(r1_1, r1_8); + HS_CMP_XCHG(r1_5, r1_7); + HS_CMP_XCHG(r1_6, r1_8); + HS_CMP_XCHG(r1_5, r1_6); + HS_CMP_XCHG(r1_7, r1_8); + HS_CMP_XCHG(r1_1, r1_3); + HS_CMP_XCHG(r1_2, r1_4); + HS_CMP_XCHG(r1_1, r1_2); + HS_CMP_XCHG(r1_3, r1_4); + HS_SLAB_LOCAL_L(64) = r1_1; + HS_SLAB_LOCAL_L(72) = r1_2; + HS_SLAB_LOCAL_L(80) = r1_3; + HS_SLAB_LOCAL_L(88) = r1_4; + HS_SLAB_LOCAL_R(96) = r1_5; + HS_SLAB_LOCAL_R(104) = r1_6; + HS_SLAB_LOCAL_R(112) = r1_7; + HS_SLAB_LOCAL_R(120) = r1_8; + } + } + HS_BLOCK_BARRIER(); + r1 = HS_BX_LOCAL_V(16 * 8 * 0); + r16 = HS_BX_LOCAL_V(16 * 8 * 1); + r2 = HS_BX_LOCAL_V(16 * 8 * 2); + r15 = HS_BX_LOCAL_V(16 * 8 * 3); + r3 = HS_BX_LOCAL_V(16 * 8 * 4); + r14 = HS_BX_LOCAL_V(16 * 8 * 5); + r4 = HS_BX_LOCAL_V(16 * 8 * 6); + r13 = HS_BX_LOCAL_V(16 * 8 * 7); + r5 = HS_BX_LOCAL_V(16 * 8 * 8); + r12 = HS_BX_LOCAL_V(16 * 8 * 9); + r6 = HS_BX_LOCAL_V(16 * 8 * 10); + r11 = HS_BX_LOCAL_V(16 * 8 * 11); + r7 = HS_BX_LOCAL_V(16 * 8 * 12); + 
r10 = HS_BX_LOCAL_V(16 * 8 * 13); + r8 = HS_BX_LOCAL_V(16 * 8 * 14); + r9 = HS_BX_LOCAL_V(16 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); 
+ HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_BX_LOCAL_V(16 * 8 * 0) = r1; + HS_BX_LOCAL_V(16 * 8 * 1) = r16; + HS_BX_LOCAL_V(16 * 8 * 2) = r2; + HS_BX_LOCAL_V(16 * 8 * 3) = r15; + HS_BX_LOCAL_V(16 * 8 * 4) = r3; + HS_BX_LOCAL_V(16 * 8 * 5) = r14; + HS_BX_LOCAL_V(16 * 8 * 6) = r4; + HS_BX_LOCAL_V(16 * 8 * 7) = r13; + HS_BX_LOCAL_V(16 * 8 * 8) = r5; + HS_BX_LOCAL_V(16 * 8 * 9) = r12; + HS_BX_LOCAL_V(16 * 8 * 10) = r6; + HS_BX_LOCAL_V(16 * 8 * 11) = r11; + HS_BX_LOCAL_V(16 * 8 * 12) = r7; + HS_BX_LOCAL_V(16 * 8 * 13) = r10; + HS_BX_LOCAL_V(16 * 8 * 14) = r8; + HS_BX_LOCAL_V(16 * 8 * 15) = r9; + HS_BLOCK_BARRIER(); + { + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24); + HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(32); + HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(40); + HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(48); + HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(56); + HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(64); + HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(72); + HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(80); + HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(88); + HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(96); + HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(104); + HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(112); + HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(120); + HS_CMP_XCHG(r0_8, r0_9); + HS_CMP_XCHG(r0_7, r0_10); + HS_CMP_XCHG(r0_6, r0_11); + HS_CMP_XCHG(r0_5, r0_12); + HS_CMP_XCHG(r0_4, r0_13); + HS_CMP_XCHG(r0_3, r0_14); + HS_CMP_XCHG(r0_2, r0_15); + HS_CMP_XCHG(r0_1, r0_16); + HS_CMP_XCHG(r0_9, r0_13); + HS_CMP_XCHG(r0_11, r0_15); + HS_CMP_XCHG(r0_9, r0_11); + HS_CMP_XCHG(r0_13, r0_15); + HS_CMP_XCHG(r0_10, r0_14); + HS_CMP_XCHG(r0_12, r0_16); + HS_CMP_XCHG(r0_10, r0_12); + HS_CMP_XCHG(r0_14, r0_16); + HS_CMP_XCHG(r0_9, r0_10); + HS_CMP_XCHG(r0_11, r0_12); + HS_CMP_XCHG(r0_13, r0_14); + HS_CMP_XCHG(r0_15, r0_16); + HS_CMP_XCHG(r0_1, r0_5); + HS_CMP_XCHG(r0_3, r0_7); + HS_CMP_XCHG(r0_1, r0_3); + 
HS_CMP_XCHG(r0_5, r0_7); + HS_CMP_XCHG(r0_2, r0_6); + HS_CMP_XCHG(r0_4, r0_8); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_6, r0_8); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_5, r0_6); + HS_CMP_XCHG(r0_7, r0_8); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_L(8) = r0_2; + HS_SLAB_LOCAL_L(16) = r0_3; + HS_SLAB_LOCAL_L(24) = r0_4; + HS_SLAB_LOCAL_L(32) = r0_5; + HS_SLAB_LOCAL_L(40) = r0_6; + HS_SLAB_LOCAL_L(48) = r0_7; + HS_SLAB_LOCAL_L(56) = r0_8; + HS_SLAB_LOCAL_R(64) = r0_9; + HS_SLAB_LOCAL_R(72) = r0_10; + HS_SLAB_LOCAL_R(80) = r0_11; + HS_SLAB_LOCAL_R(88) = r0_12; + HS_SLAB_LOCAL_R(96) = r0_13; + HS_SLAB_LOCAL_R(104) = r0_14; + HS_SLAB_LOCAL_R(112) = r0_15; + HS_SLAB_LOCAL_R(120) = r0_16; + } + } + HS_BLOCK_BARRIER(); + r1 = HS_BX_LOCAL_V(16 * 8 * 0); + r16 = HS_BX_LOCAL_V(16 * 8 * 1); + r2 = HS_BX_LOCAL_V(16 * 8 * 2); + r15 = HS_BX_LOCAL_V(16 * 8 * 3); + r3 = HS_BX_LOCAL_V(16 * 8 * 4); + r14 = HS_BX_LOCAL_V(16 * 8 * 5); + r4 = HS_BX_LOCAL_V(16 * 8 * 6); + r13 = HS_BX_LOCAL_V(16 * 8 * 7); + r5 = HS_BX_LOCAL_V(16 * 8 * 8); + r12 = HS_BX_LOCAL_V(16 * 8 * 9); + r6 = HS_BX_LOCAL_V(16 * 8 * 10); + r11 = HS_BX_LOCAL_V(16 * 8 * 11); + r7 = HS_BX_LOCAL_V(16 * 8 * 12); + r10 = HS_BX_LOCAL_V(16 * 8 * 13); + r8 = HS_BX_LOCAL_V(16 * 8 * 14); + r9 = HS_BX_LOCAL_V(16 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + 
HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_SLAB_GLOBAL_STORE(8, 0, r1); + HS_SLAB_GLOBAL_STORE(8, 1, r2); + HS_SLAB_GLOBAL_STORE(8, 2, r3); + HS_SLAB_GLOBAL_STORE(8, 3, r4); + HS_SLAB_GLOBAL_STORE(8, 4, r5); + HS_SLAB_GLOBAL_STORE(8, 5, r6); + HS_SLAB_GLOBAL_STORE(8, 6, r7); + HS_SLAB_GLOBAL_STORE(8, 7, r8); + HS_SLAB_GLOBAL_STORE(8, 8, r9); + HS_SLAB_GLOBAL_STORE(8, 9, r10); + HS_SLAB_GLOBAL_STORE(8, 10, r11); + HS_SLAB_GLOBAL_STORE(8, 11, r12); + HS_SLAB_GLOBAL_STORE(8, 12, r13); + HS_SLAB_GLOBAL_STORE(8, 13, r14); + HS_SLAB_GLOBAL_STORE(8, 14, r15); + HS_SLAB_GLOBAL_STORE(8, 15, r16); +} + +HS_BS_KERNEL_PROTO(8, 8, 3) +{ + /* HS_BS_KERNEL_PROTO(8, 8, 3) kernel body: loads 16 keys (r1..r16) with HS_SLAB_GLOBAL_LOAD, orders them with fixed HS_CMP_XCHG / HS_CMP_FLIP / HS_CMP_HALF sequences, runs three merge rounds through HS_BX_LOCAL_V / HS_SLAB_LOCAL_L / HS_SLAB_LOCAL_R separated by HS_BLOCK_BARRIER, and writes the 16 keys back with HS_SLAB_GLOBAL_STORE. NOTE(review): the meaning of the (8, 8, 3) and (64, 16) macro arguments is defined outside this chunk (presumably slab width, slabs per block and log2 of the slab count) - confirm against the hs_cl_macros.h header. The statement order is part of the algorithm; do not reorder. */ HS_BLOCK_LOCAL_MEM_DECL(64, 16); + + HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 
8, 0); + HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 8, 1); + HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 8, 2); + HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 8, 3); + HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 8, 4); + HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 8, 5); + HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 8, 6); + HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 8, 7); + HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8, 8); + HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 8, 9); + HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 8, 10); + HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 8, 11); + HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 8, 12); + HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 8, 13); + HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 8, 14); + HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 8, 15); + /* 60 register-to-register compare-exchanges over r1..r16; presumably a 16-input sorting network - TODO confirm. */ HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r6, r11); + HS_CMP_XCHG(r7, r10); + HS_CMP_XCHG(r4, r13); + HS_CMP_XCHG(r14, r15); + HS_CMP_XCHG(r8, r12); + HS_CMP_XCHG(r2, r3); + HS_CMP_XCHG(r5, r9); + HS_CMP_XCHG(r2, r5); + HS_CMP_XCHG(r8, r14); + HS_CMP_XCHG(r3, r9); + HS_CMP_XCHG(r12, r15); + HS_CMP_XCHG(r3, r5); + HS_CMP_XCHG(r6, r7); + HS_CMP_XCHG(r10, r11); + HS_CMP_XCHG(r12, r14); + HS_CMP_XCHG(r4, r9); + HS_CMP_XCHG(r8, r13); + HS_CMP_XCHG(r7, r9); + HS_CMP_XCHG(r11, 
r13); + HS_CMP_XCHG(r4, r6); + HS_CMP_XCHG(r8, r10); + HS_CMP_XCHG(r4, r5); + HS_CMP_XCHG(r6, r7); + HS_CMP_XCHG(r8, r9); + HS_CMP_XCHG(r10, r11); + HS_CMP_XCHG(r12, r13); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + /* Three flip stages with preamble arguments 1, 3 and 7; each is followed by HS_CMP_HALF passes (none, then 1, then 2 and 1) and by the same 32-step HS_CMP_XCHG sequence on r1..r16. */ { + HS_SLAB_FLIP_PREAMBLE(1); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + { + HS_SLAB_FLIP_PREAMBLE(3); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, 
r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + { + HS_SLAB_FLIP_PREAMBLE(7); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + 
HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + /* Merge round 1: spill the registers to the HS_BX_LOCAL_V slots in the interleaved order r1, r16, r2, r15, ..., r8, r9, then compare-exchange 2-key groups (one HS_SLAB_LOCAL_L key against one HS_SLAB_LOCAL_R key) between two barriers. */ HS_BS_MERGE_H_PREAMBLE(8, 8); + HS_BX_LOCAL_V(8 * 8 * 0) = r1; + HS_BX_LOCAL_V(8 * 8 * 1) = r16; + HS_BX_LOCAL_V(8 * 8 * 2) = r2; + HS_BX_LOCAL_V(8 * 8 * 3) = r15; + HS_BX_LOCAL_V(8 * 8 * 4) = r3; + HS_BX_LOCAL_V(8 * 8 * 5) = r14; + HS_BX_LOCAL_V(8 * 8 * 6) = r4; + HS_BX_LOCAL_V(8 * 8 * 7) = r13; + HS_BX_LOCAL_V(8 * 8 * 8) = r5; + HS_BX_LOCAL_V(8 * 8 * 9) = r12; + HS_BX_LOCAL_V(8 * 8 * 10) = r6; + HS_BX_LOCAL_V(8 * 8 * 11) = r11; + HS_BX_LOCAL_V(8 * 8 * 12) = r7; + HS_BX_LOCAL_V(8 * 8 * 13) = r10; + HS_BX_LOCAL_V(8 * 8 * 14) = r8; + HS_BX_LOCAL_V(8 * 8 * 15) = r9; + HS_BLOCK_BARRIER(); + { + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_R(8) = r0_2; + } + { + HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24); + HS_CMP_XCHG(r1_1, r1_2); + HS_SLAB_LOCAL_L(16) = r1_1; + HS_SLAB_LOCAL_R(24) = r1_2; + } + { + HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(32); + HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(40); + HS_CMP_XCHG(r2_1, r2_2); + HS_SLAB_LOCAL_L(32) = r2_1; + HS_SLAB_LOCAL_R(40) = r2_2; + } + { + HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(48); + HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(56); + HS_CMP_XCHG(r3_1, r3_2); + HS_SLAB_LOCAL_L(48) = r3_1; + HS_SLAB_LOCAL_R(56) = r3_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(520); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(512) = r0_1; + HS_SLAB_LOCAL_R(520) = r0_2; + } + { + HS_KEY_TYPE r1_1 = 
HS_SLAB_LOCAL_L(528); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(536); + HS_CMP_XCHG(r1_1, r1_2); + HS_SLAB_LOCAL_L(528) = r1_1; + HS_SLAB_LOCAL_R(536) = r1_2; + } + { + HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(544); + HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(552); + HS_CMP_XCHG(r2_1, r2_2); + HS_SLAB_LOCAL_L(544) = r2_1; + HS_SLAB_LOCAL_R(552) = r2_2; + } + { + HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(560); + HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(568); + HS_CMP_XCHG(r3_1, r3_2); + HS_SLAB_LOCAL_L(560) = r3_1; + HS_SLAB_LOCAL_R(568) = r3_2; + } + } + HS_BLOCK_BARRIER(); + /* Reload in the same interleaved order, then HS_CMP_HALF passes 4, 2, 1 and the 32-step HS_CMP_XCHG sequence. */ r1 = HS_BX_LOCAL_V(8 * 8 * 0); + r16 = HS_BX_LOCAL_V(8 * 8 * 1); + r2 = HS_BX_LOCAL_V(8 * 8 * 2); + r15 = HS_BX_LOCAL_V(8 * 8 * 3); + r3 = HS_BX_LOCAL_V(8 * 8 * 4); + r14 = HS_BX_LOCAL_V(8 * 8 * 5); + r4 = HS_BX_LOCAL_V(8 * 8 * 6); + r13 = HS_BX_LOCAL_V(8 * 8 * 7); + r5 = HS_BX_LOCAL_V(8 * 8 * 8); + r12 = HS_BX_LOCAL_V(8 * 8 * 9); + r6 = HS_BX_LOCAL_V(8 * 8 * 10); + r11 = HS_BX_LOCAL_V(8 * 8 * 11); + r7 = HS_BX_LOCAL_V(8 * 8 * 12); + r10 = HS_BX_LOCAL_V(8 * 8 * 13); + r8 = HS_BX_LOCAL_V(8 * 8 * 14); + r9 = HS_BX_LOCAL_V(8 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + 
HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_BX_LOCAL_V(8 * 8 * 0) = r1; + HS_BX_LOCAL_V(8 * 8 * 1) = r16; + HS_BX_LOCAL_V(8 * 8 * 2) = r2; + HS_BX_LOCAL_V(8 * 8 * 3) = r15; + HS_BX_LOCAL_V(8 * 8 * 4) = r3; + HS_BX_LOCAL_V(8 * 8 * 5) = r14; + HS_BX_LOCAL_V(8 * 8 * 6) = r4; + HS_BX_LOCAL_V(8 * 8 * 7) = r13; + HS_BX_LOCAL_V(8 * 8 * 8) = r5; + HS_BX_LOCAL_V(8 * 8 * 9) = r12; + HS_BX_LOCAL_V(8 * 8 * 10) = r6; + HS_BX_LOCAL_V(8 * 8 * 11) = r11; + HS_BX_LOCAL_V(8 * 8 * 12) = r7; + HS_BX_LOCAL_V(8 * 8 * 13) = r10; + HS_BX_LOCAL_V(8 * 8 * 14) = r8; + HS_BX_LOCAL_V(8 * 8 * 15) = r9; + HS_BLOCK_BARRIER(); + /* Merge round 2: 4-key groups (two HS_SLAB_LOCAL_L keys against two HS_SLAB_LOCAL_R keys). */ { + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24); + HS_CMP_XCHG(r0_2, r0_3); + HS_CMP_XCHG(r0_1, r0_4); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_L(8) = r0_2; + HS_SLAB_LOCAL_R(16) = 
r0_3; + HS_SLAB_LOCAL_R(24) = r0_4; + } + { + HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(40); + HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(48); + HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(56); + HS_CMP_XCHG(r1_2, r1_3); + HS_CMP_XCHG(r1_1, r1_4); + HS_CMP_XCHG(r1_3, r1_4); + HS_CMP_XCHG(r1_1, r1_2); + HS_SLAB_LOCAL_L(32) = r1_1; + HS_SLAB_LOCAL_L(40) = r1_2; + HS_SLAB_LOCAL_R(48) = r1_3; + HS_SLAB_LOCAL_R(56) = r1_4; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(520); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(528); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(536); + HS_CMP_XCHG(r0_2, r0_3); + HS_CMP_XCHG(r0_1, r0_4); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(512) = r0_1; + HS_SLAB_LOCAL_L(520) = r0_2; + HS_SLAB_LOCAL_R(528) = r0_3; + HS_SLAB_LOCAL_R(536) = r0_4; + } + { + HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(544); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(552); + HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(560); + HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(568); + HS_CMP_XCHG(r1_2, r1_3); + HS_CMP_XCHG(r1_1, r1_4); + HS_CMP_XCHG(r1_3, r1_4); + HS_CMP_XCHG(r1_1, r1_2); + HS_SLAB_LOCAL_L(544) = r1_1; + HS_SLAB_LOCAL_L(552) = r1_2; + HS_SLAB_LOCAL_R(560) = r1_3; + HS_SLAB_LOCAL_R(568) = r1_4; + } + } + HS_BLOCK_BARRIER(); + r1 = HS_BX_LOCAL_V(8 * 8 * 0); + r16 = HS_BX_LOCAL_V(8 * 8 * 1); + r2 = HS_BX_LOCAL_V(8 * 8 * 2); + r15 = HS_BX_LOCAL_V(8 * 8 * 3); + r3 = HS_BX_LOCAL_V(8 * 8 * 4); + r14 = HS_BX_LOCAL_V(8 * 8 * 5); + r4 = HS_BX_LOCAL_V(8 * 8 * 6); + r13 = HS_BX_LOCAL_V(8 * 8 * 7); + r5 = HS_BX_LOCAL_V(8 * 8 * 8); + r12 = HS_BX_LOCAL_V(8 * 8 * 9); + r6 = HS_BX_LOCAL_V(8 * 8 * 10); + r11 = HS_BX_LOCAL_V(8 * 8 * 11); + r7 = HS_BX_LOCAL_V(8 * 8 * 12); + r10 = HS_BX_LOCAL_V(8 * 8 * 13); + r8 = HS_BX_LOCAL_V(8 * 8 * 14); + r9 = HS_BX_LOCAL_V(8 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, 
r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_BX_LOCAL_V(8 * 8 * 0) = r1; + HS_BX_LOCAL_V(8 * 8 * 1) = r16; + HS_BX_LOCAL_V(8 * 8 * 2) = r2; + HS_BX_LOCAL_V(8 * 8 * 3) = r15; + HS_BX_LOCAL_V(8 * 8 * 4) = r3; + HS_BX_LOCAL_V(8 * 
8 * 5) = r14; + HS_BX_LOCAL_V(8 * 8 * 6) = r4; + HS_BX_LOCAL_V(8 * 8 * 7) = r13; + HS_BX_LOCAL_V(8 * 8 * 8) = r5; + HS_BX_LOCAL_V(8 * 8 * 9) = r12; + HS_BX_LOCAL_V(8 * 8 * 10) = r6; + HS_BX_LOCAL_V(8 * 8 * 11) = r11; + HS_BX_LOCAL_V(8 * 8 * 12) = r7; + HS_BX_LOCAL_V(8 * 8 * 13) = r10; + HS_BX_LOCAL_V(8 * 8 * 14) = r8; + HS_BX_LOCAL_V(8 * 8 * 15) = r9; + HS_BLOCK_BARRIER(); + /* Merge round 3: 8-key groups (four HS_SLAB_LOCAL_L keys against four HS_SLAB_LOCAL_R keys). */ { + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24); + HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(32); + HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(40); + HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(48); + HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(56); + HS_CMP_XCHG(r0_4, r0_5); + HS_CMP_XCHG(r0_3, r0_6); + HS_CMP_XCHG(r0_2, r0_7); + HS_CMP_XCHG(r0_1, r0_8); + HS_CMP_XCHG(r0_5, r0_7); + HS_CMP_XCHG(r0_6, r0_8); + HS_CMP_XCHG(r0_5, r0_6); + HS_CMP_XCHG(r0_7, r0_8); + HS_CMP_XCHG(r0_1, r0_3); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_L(8) = r0_2; + HS_SLAB_LOCAL_L(16) = r0_3; + HS_SLAB_LOCAL_L(24) = r0_4; + HS_SLAB_LOCAL_R(32) = r0_5; + HS_SLAB_LOCAL_R(40) = r0_6; + HS_SLAB_LOCAL_R(48) = r0_7; + HS_SLAB_LOCAL_R(56) = r0_8; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(520); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(528); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(536); + HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(544); + HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(552); + HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(560); + HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(568); + HS_CMP_XCHG(r0_4, r0_5); + HS_CMP_XCHG(r0_3, r0_6); + HS_CMP_XCHG(r0_2, r0_7); + HS_CMP_XCHG(r0_1, r0_8); + HS_CMP_XCHG(r0_5, r0_7); + HS_CMP_XCHG(r0_6, r0_8); + HS_CMP_XCHG(r0_5, r0_6); + HS_CMP_XCHG(r0_7, r0_8); + HS_CMP_XCHG(r0_1, r0_3); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_SLAB_LOCAL_L(512) = r0_1; + 
HS_SLAB_LOCAL_L(520) = r0_2; + HS_SLAB_LOCAL_L(528) = r0_3; + HS_SLAB_LOCAL_L(536) = r0_4; + HS_SLAB_LOCAL_R(544) = r0_5; + HS_SLAB_LOCAL_R(552) = r0_6; + HS_SLAB_LOCAL_R(560) = r0_7; + HS_SLAB_LOCAL_R(568) = r0_8; + } + } + HS_BLOCK_BARRIER(); + r1 = HS_BX_LOCAL_V(8 * 8 * 0); + r16 = HS_BX_LOCAL_V(8 * 8 * 1); + r2 = HS_BX_LOCAL_V(8 * 8 * 2); + r15 = HS_BX_LOCAL_V(8 * 8 * 3); + r3 = HS_BX_LOCAL_V(8 * 8 * 4); + r14 = HS_BX_LOCAL_V(8 * 8 * 5); + r4 = HS_BX_LOCAL_V(8 * 8 * 6); + r13 = HS_BX_LOCAL_V(8 * 8 * 7); + r5 = HS_BX_LOCAL_V(8 * 8 * 8); + r12 = HS_BX_LOCAL_V(8 * 8 * 9); + r6 = HS_BX_LOCAL_V(8 * 8 * 10); + r11 = HS_BX_LOCAL_V(8 * 8 * 11); + r7 = HS_BX_LOCAL_V(8 * 8 * 12); + r10 = HS_BX_LOCAL_V(8 * 8 * 13); + r8 = HS_BX_LOCAL_V(8 * 8 * 14); + r9 = HS_BX_LOCAL_V(8 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + 
HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + /* Store the 16 keys back in r1..r16 order. */ HS_SLAB_GLOBAL_STORE(8, 0, r1); + HS_SLAB_GLOBAL_STORE(8, 1, r2); + HS_SLAB_GLOBAL_STORE(8, 2, r3); + HS_SLAB_GLOBAL_STORE(8, 3, r4); + HS_SLAB_GLOBAL_STORE(8, 4, r5); + HS_SLAB_GLOBAL_STORE(8, 5, r6); + HS_SLAB_GLOBAL_STORE(8, 6, r7); + HS_SLAB_GLOBAL_STORE(8, 7, r8); + HS_SLAB_GLOBAL_STORE(8, 8, r9); + HS_SLAB_GLOBAL_STORE(8, 9, r10); + HS_SLAB_GLOBAL_STORE(8, 10, r11); + HS_SLAB_GLOBAL_STORE(8, 11, r12); + HS_SLAB_GLOBAL_STORE(8, 12, r13); + HS_SLAB_GLOBAL_STORE(8, 13, r14); + HS_SLAB_GLOBAL_STORE(8, 14, r15); + HS_SLAB_GLOBAL_STORE(8, 15, r16); +} + +HS_BS_KERNEL_PROTO(8, 4, 2) +{ + HS_BLOCK_LOCAL_MEM_DECL(32, 16); + + HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 8, 0); + HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 8, 1); + HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 8, 2); + HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 8, 3); + HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 8, 4); + HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 8, 5); + HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 8, 6); + HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 8, 7); + HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8, 8); + HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 8, 9); + HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 8, 10); + 
HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 8, 11); + HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 8, 12); + HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 8, 13); + HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 8, 14); + HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 8, 15); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r6, r11); + HS_CMP_XCHG(r7, r10); + HS_CMP_XCHG(r4, r13); + HS_CMP_XCHG(r14, r15); + HS_CMP_XCHG(r8, r12); + HS_CMP_XCHG(r2, r3); + HS_CMP_XCHG(r5, r9); + HS_CMP_XCHG(r2, r5); + HS_CMP_XCHG(r8, r14); + HS_CMP_XCHG(r3, r9); + HS_CMP_XCHG(r12, r15); + HS_CMP_XCHG(r3, r5); + HS_CMP_XCHG(r6, r7); + HS_CMP_XCHG(r10, r11); + HS_CMP_XCHG(r12, r14); + HS_CMP_XCHG(r4, r9); + HS_CMP_XCHG(r8, r13); + HS_CMP_XCHG(r7, r9); + HS_CMP_XCHG(r11, r13); + HS_CMP_XCHG(r4, r6); + HS_CMP_XCHG(r8, r10); + HS_CMP_XCHG(r4, r5); + HS_CMP_XCHG(r6, r7); + HS_CMP_XCHG(r8, r9); + HS_CMP_XCHG(r10, r11); + HS_CMP_XCHG(r12, r13); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + { + HS_SLAB_FLIP_PREAMBLE(1); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + 
HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + { + HS_SLAB_FLIP_PREAMBLE(3); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + 
HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + { + HS_SLAB_FLIP_PREAMBLE(7); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_BS_MERGE_H_PREAMBLE(8, 4); + 
HS_BX_LOCAL_V(4 * 8 * 0) = r1; + HS_BX_LOCAL_V(4 * 8 * 1) = r16; + HS_BX_LOCAL_V(4 * 8 * 2) = r2; + HS_BX_LOCAL_V(4 * 8 * 3) = r15; + HS_BX_LOCAL_V(4 * 8 * 4) = r3; + HS_BX_LOCAL_V(4 * 8 * 5) = r14; + HS_BX_LOCAL_V(4 * 8 * 6) = r4; + HS_BX_LOCAL_V(4 * 8 * 7) = r13; + HS_BX_LOCAL_V(4 * 8 * 8) = r5; + HS_BX_LOCAL_V(4 * 8 * 9) = r12; + HS_BX_LOCAL_V(4 * 8 * 10) = r6; + HS_BX_LOCAL_V(4 * 8 * 11) = r11; + HS_BX_LOCAL_V(4 * 8 * 12) = r7; + HS_BX_LOCAL_V(4 * 8 * 13) = r10; + HS_BX_LOCAL_V(4 * 8 * 14) = r8; + HS_BX_LOCAL_V(4 * 8 * 15) = r9; + HS_BLOCK_BARRIER(); + { + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_R(8) = r0_2; + } + { + HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24); + HS_CMP_XCHG(r1_1, r1_2); + HS_SLAB_LOCAL_L(16) = r1_1; + HS_SLAB_LOCAL_R(24) = r1_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(136); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(128) = r0_1; + HS_SLAB_LOCAL_R(136) = r0_2; + } + { + HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(144); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(152); + HS_CMP_XCHG(r1_1, r1_2); + HS_SLAB_LOCAL_L(144) = r1_1; + HS_SLAB_LOCAL_R(152) = r1_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(264); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(256) = r0_1; + HS_SLAB_LOCAL_R(264) = r0_2; + } + { + HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(272); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(280); + HS_CMP_XCHG(r1_1, r1_2); + HS_SLAB_LOCAL_L(272) = r1_1; + HS_SLAB_LOCAL_R(280) = r1_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(392); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(384) = r0_1; + HS_SLAB_LOCAL_R(392) = r0_2; + } + { + HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(400); + HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(408); + HS_CMP_XCHG(r1_1, r1_2); + HS_SLAB_LOCAL_L(400) = r1_1; + 
HS_SLAB_LOCAL_R(408) = r1_2; + } + } + HS_BLOCK_BARRIER(); + r1 = HS_BX_LOCAL_V(4 * 8 * 0); + r16 = HS_BX_LOCAL_V(4 * 8 * 1); + r2 = HS_BX_LOCAL_V(4 * 8 * 2); + r15 = HS_BX_LOCAL_V(4 * 8 * 3); + r3 = HS_BX_LOCAL_V(4 * 8 * 4); + r14 = HS_BX_LOCAL_V(4 * 8 * 5); + r4 = HS_BX_LOCAL_V(4 * 8 * 6); + r13 = HS_BX_LOCAL_V(4 * 8 * 7); + r5 = HS_BX_LOCAL_V(4 * 8 * 8); + r12 = HS_BX_LOCAL_V(4 * 8 * 9); + r6 = HS_BX_LOCAL_V(4 * 8 * 10); + r11 = HS_BX_LOCAL_V(4 * 8 * 11); + r7 = HS_BX_LOCAL_V(4 * 8 * 12); + r10 = HS_BX_LOCAL_V(4 * 8 * 13); + r8 = HS_BX_LOCAL_V(4 * 8 * 14); + r9 = HS_BX_LOCAL_V(4 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + 
HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_BX_LOCAL_V(4 * 8 * 0) = r1; + HS_BX_LOCAL_V(4 * 8 * 1) = r16; + HS_BX_LOCAL_V(4 * 8 * 2) = r2; + HS_BX_LOCAL_V(4 * 8 * 3) = r15; + HS_BX_LOCAL_V(4 * 8 * 4) = r3; + HS_BX_LOCAL_V(4 * 8 * 5) = r14; + HS_BX_LOCAL_V(4 * 8 * 6) = r4; + HS_BX_LOCAL_V(4 * 8 * 7) = r13; + HS_BX_LOCAL_V(4 * 8 * 8) = r5; + HS_BX_LOCAL_V(4 * 8 * 9) = r12; + HS_BX_LOCAL_V(4 * 8 * 10) = r6; + HS_BX_LOCAL_V(4 * 8 * 11) = r11; + HS_BX_LOCAL_V(4 * 8 * 12) = r7; + HS_BX_LOCAL_V(4 * 8 * 13) = r10; + HS_BX_LOCAL_V(4 * 8 * 14) = r8; + HS_BX_LOCAL_V(4 * 8 * 15) = r9; + HS_BLOCK_BARRIER(); + { + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24); + HS_CMP_XCHG(r0_2, r0_3); + HS_CMP_XCHG(r0_1, r0_4); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_L(8) = r0_2; + HS_SLAB_LOCAL_R(16) = r0_3; + HS_SLAB_LOCAL_R(24) = r0_4; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(136); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(144); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(152); + HS_CMP_XCHG(r0_2, r0_3); + HS_CMP_XCHG(r0_1, r0_4); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(128) = r0_1; + HS_SLAB_LOCAL_L(136) = r0_2; + HS_SLAB_LOCAL_R(144) = r0_3; + HS_SLAB_LOCAL_R(152) = r0_4; + } + { + HS_KEY_TYPE r0_1 = 
HS_SLAB_LOCAL_L(256); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(264); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(272); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(280); + HS_CMP_XCHG(r0_2, r0_3); + HS_CMP_XCHG(r0_1, r0_4); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(256) = r0_1; + HS_SLAB_LOCAL_L(264) = r0_2; + HS_SLAB_LOCAL_R(272) = r0_3; + HS_SLAB_LOCAL_R(280) = r0_4; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(392); + HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(400); + HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(408); + HS_CMP_XCHG(r0_2, r0_3); + HS_CMP_XCHG(r0_1, r0_4); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(384) = r0_1; + HS_SLAB_LOCAL_L(392) = r0_2; + HS_SLAB_LOCAL_R(400) = r0_3; + HS_SLAB_LOCAL_R(408) = r0_4; + } + } + HS_BLOCK_BARRIER(); + r1 = HS_BX_LOCAL_V(4 * 8 * 0); + r16 = HS_BX_LOCAL_V(4 * 8 * 1); + r2 = HS_BX_LOCAL_V(4 * 8 * 2); + r15 = HS_BX_LOCAL_V(4 * 8 * 3); + r3 = HS_BX_LOCAL_V(4 * 8 * 4); + r14 = HS_BX_LOCAL_V(4 * 8 * 5); + r4 = HS_BX_LOCAL_V(4 * 8 * 6); + r13 = HS_BX_LOCAL_V(4 * 8 * 7); + r5 = HS_BX_LOCAL_V(4 * 8 * 8); + r12 = HS_BX_LOCAL_V(4 * 8 * 9); + r6 = HS_BX_LOCAL_V(4 * 8 * 10); + r11 = HS_BX_LOCAL_V(4 * 8 * 11); + r7 = HS_BX_LOCAL_V(4 * 8 * 12); + r10 = HS_BX_LOCAL_V(4 * 8 * 13); + r8 = HS_BX_LOCAL_V(4 * 8 * 14); + r9 = HS_BX_LOCAL_V(4 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, 
r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_SLAB_GLOBAL_STORE(8, 0, r1); + HS_SLAB_GLOBAL_STORE(8, 1, r2); + HS_SLAB_GLOBAL_STORE(8, 2, r3); + HS_SLAB_GLOBAL_STORE(8, 3, r4); + HS_SLAB_GLOBAL_STORE(8, 4, r5); + HS_SLAB_GLOBAL_STORE(8, 5, r6); + HS_SLAB_GLOBAL_STORE(8, 6, r7); + HS_SLAB_GLOBAL_STORE(8, 7, r8); + HS_SLAB_GLOBAL_STORE(8, 8, r9); + HS_SLAB_GLOBAL_STORE(8, 9, r10); + HS_SLAB_GLOBAL_STORE(8, 10, r11); + HS_SLAB_GLOBAL_STORE(8, 11, r12); + HS_SLAB_GLOBAL_STORE(8, 12, r13); + HS_SLAB_GLOBAL_STORE(8, 13, r14); + HS_SLAB_GLOBAL_STORE(8, 14, r15); + HS_SLAB_GLOBAL_STORE(8, 15, r16); +} + +HS_BS_KERNEL_PROTO(8, 2, 1) +{ + HS_BLOCK_LOCAL_MEM_DECL(16, 16); + + 
HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 8, 0); + HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 8, 1); + HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 8, 2); + HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 8, 3); + HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 8, 4); + HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 8, 5); + HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 8, 6); + HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 8, 7); + HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8, 8); + HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 8, 9); + HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 8, 10); + HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 8, 11); + HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 8, 12); + HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 8, 13); + HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 8, 14); + HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 8, 15); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r6, r11); + HS_CMP_XCHG(r7, r10); + HS_CMP_XCHG(r4, r13); + HS_CMP_XCHG(r14, r15); + HS_CMP_XCHG(r8, r12); + HS_CMP_XCHG(r2, r3); + HS_CMP_XCHG(r5, r9); + HS_CMP_XCHG(r2, r5); + HS_CMP_XCHG(r8, r14); + HS_CMP_XCHG(r3, r9); + HS_CMP_XCHG(r12, r15); + HS_CMP_XCHG(r3, r5); + HS_CMP_XCHG(r6, r7); + HS_CMP_XCHG(r10, r11); + HS_CMP_XCHG(r12, r14); + 
HS_CMP_XCHG(r4, r9); + HS_CMP_XCHG(r8, r13); + HS_CMP_XCHG(r7, r9); + HS_CMP_XCHG(r11, r13); + HS_CMP_XCHG(r4, r6); + HS_CMP_XCHG(r8, r10); + HS_CMP_XCHG(r4, r5); + HS_CMP_XCHG(r6, r7); + HS_CMP_XCHG(r8, r9); + HS_CMP_XCHG(r10, r11); + HS_CMP_XCHG(r12, r13); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + { + HS_SLAB_FLIP_PREAMBLE(1); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + { + HS_SLAB_FLIP_PREAMBLE(3); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + 
HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + { + HS_SLAB_FLIP_PREAMBLE(7); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, 
r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_BS_MERGE_H_PREAMBLE(8, 2); + HS_BX_LOCAL_V(2 * 8 * 0) = r1; + HS_BX_LOCAL_V(2 * 8 * 1) = r16; + HS_BX_LOCAL_V(2 * 8 * 2) = r2; + HS_BX_LOCAL_V(2 * 8 * 3) = r15; + HS_BX_LOCAL_V(2 * 8 * 4) = r3; + HS_BX_LOCAL_V(2 * 8 * 5) = r14; + HS_BX_LOCAL_V(2 * 8 * 6) = r4; + HS_BX_LOCAL_V(2 * 8 * 7) = r13; + HS_BX_LOCAL_V(2 * 8 * 8) = r5; + HS_BX_LOCAL_V(2 * 8 * 9) = r12; + HS_BX_LOCAL_V(2 * 8 * 10) = r6; + HS_BX_LOCAL_V(2 * 8 * 11) = r11; + HS_BX_LOCAL_V(2 * 8 * 12) = r7; + HS_BX_LOCAL_V(2 * 8 * 13) = r10; + HS_BX_LOCAL_V(2 * 8 * 14) = r8; + HS_BX_LOCAL_V(2 * 8 * 15) = r9; + HS_BLOCK_BARRIER(); + { + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_R(8) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(32); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(40); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(32) = r0_1; + HS_SLAB_LOCAL_R(40) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(64); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(72); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(64) = r0_1; + HS_SLAB_LOCAL_R(72) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(96); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(104); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(96) = r0_1; + HS_SLAB_LOCAL_R(104) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(136); + HS_CMP_XCHG(r0_1, r0_2); + 
HS_SLAB_LOCAL_L(128) = r0_1; + HS_SLAB_LOCAL_R(136) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(160); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(168); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(160) = r0_1; + HS_SLAB_LOCAL_R(168) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(192); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(200); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(192) = r0_1; + HS_SLAB_LOCAL_R(200) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(224); + HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(232); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(224) = r0_1; + HS_SLAB_LOCAL_R(232) = r0_2; + } + } + HS_BLOCK_BARRIER(); + r1 = HS_BX_LOCAL_V(2 * 8 * 0); + r16 = HS_BX_LOCAL_V(2 * 8 * 1); + r2 = HS_BX_LOCAL_V(2 * 8 * 2); + r15 = HS_BX_LOCAL_V(2 * 8 * 3); + r3 = HS_BX_LOCAL_V(2 * 8 * 4); + r14 = HS_BX_LOCAL_V(2 * 8 * 5); + r4 = HS_BX_LOCAL_V(2 * 8 * 6); + r13 = HS_BX_LOCAL_V(2 * 8 * 7); + r5 = HS_BX_LOCAL_V(2 * 8 * 8); + r12 = HS_BX_LOCAL_V(2 * 8 * 9); + r6 = HS_BX_LOCAL_V(2 * 8 * 10); + r11 = HS_BX_LOCAL_V(2 * 8 * 11); + r7 = HS_BX_LOCAL_V(2 * 8 * 12); + r10 = HS_BX_LOCAL_V(2 * 8 * 13); + r8 = HS_BX_LOCAL_V(2 * 8 * 14); + r9 = HS_BX_LOCAL_V(2 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + 
HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_SLAB_GLOBAL_STORE(8, 0, r1); + HS_SLAB_GLOBAL_STORE(8, 1, r2); + HS_SLAB_GLOBAL_STORE(8, 2, r3); + HS_SLAB_GLOBAL_STORE(8, 3, r4); + HS_SLAB_GLOBAL_STORE(8, 4, r5); + HS_SLAB_GLOBAL_STORE(8, 5, r6); + HS_SLAB_GLOBAL_STORE(8, 6, r7); + HS_SLAB_GLOBAL_STORE(8, 7, r8); + HS_SLAB_GLOBAL_STORE(8, 8, r9); + HS_SLAB_GLOBAL_STORE(8, 9, r10); + HS_SLAB_GLOBAL_STORE(8, 10, r11); + HS_SLAB_GLOBAL_STORE(8, 11, r12); + HS_SLAB_GLOBAL_STORE(8, 12, r13); + HS_SLAB_GLOBAL_STORE(8, 13, r14); + HS_SLAB_GLOBAL_STORE(8, 14, r15); + HS_SLAB_GLOBAL_STORE(8, 15, r16); +} + +HS_BS_KERNEL_PROTO(8, 1, 0) +{ + HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 8, 0); + HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 8, 1); + HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 8, 2); + HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 8, 3); + 
HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 8, 4); + HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 8, 5); + HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 8, 6); + HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 8, 7); + HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8, 8); + HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 8, 9); + HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 8, 10); + HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 8, 11); + HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 8, 12); + HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 8, 13); + HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 8, 14); + HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 8, 15); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r6, r11); + HS_CMP_XCHG(r7, r10); + HS_CMP_XCHG(r4, r13); + HS_CMP_XCHG(r14, r15); + HS_CMP_XCHG(r8, r12); + HS_CMP_XCHG(r2, r3); + HS_CMP_XCHG(r5, r9); + HS_CMP_XCHG(r2, r5); + HS_CMP_XCHG(r8, r14); + HS_CMP_XCHG(r3, r9); + HS_CMP_XCHG(r12, r15); + HS_CMP_XCHG(r3, r5); + HS_CMP_XCHG(r6, r7); + HS_CMP_XCHG(r10, r11); + HS_CMP_XCHG(r12, r14); + HS_CMP_XCHG(r4, r9); + HS_CMP_XCHG(r8, r13); + HS_CMP_XCHG(r7, r9); + HS_CMP_XCHG(r11, r13); + HS_CMP_XCHG(r4, r6); + HS_CMP_XCHG(r8, r10); + HS_CMP_XCHG(r4, r5); + HS_CMP_XCHG(r6, r7); + HS_CMP_XCHG(r8, r9); + HS_CMP_XCHG(r10, r11); + HS_CMP_XCHG(r12, 
r13); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + { + HS_SLAB_FLIP_PREAMBLE(1); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + { + HS_SLAB_FLIP_PREAMBLE(3); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, 
r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + { + HS_SLAB_FLIP_PREAMBLE(7); + HS_CMP_FLIP(0, r1, r16); + HS_CMP_FLIP(1, r2, r15); + HS_CMP_FLIP(2, r3, r14); + HS_CMP_FLIP(3, r4, r13); + HS_CMP_FLIP(4, r5, r12); + HS_CMP_FLIP(5, r6, r11); + HS_CMP_FLIP(6, r7, r10); + HS_CMP_FLIP(7, r8, r9); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + 
HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_SLAB_GLOBAL_STORE(8, 0, r1); + HS_SLAB_GLOBAL_STORE(8, 1, r2); + HS_SLAB_GLOBAL_STORE(8, 2, r3); + HS_SLAB_GLOBAL_STORE(8, 3, r4); + HS_SLAB_GLOBAL_STORE(8, 4, r5); + HS_SLAB_GLOBAL_STORE(8, 5, r6); + HS_SLAB_GLOBAL_STORE(8, 6, r7); + HS_SLAB_GLOBAL_STORE(8, 7, r8); + HS_SLAB_GLOBAL_STORE(8, 8, r9); + HS_SLAB_GLOBAL_STORE(8, 9, r10); + HS_SLAB_GLOBAL_STORE(8, 10, r11); + HS_SLAB_GLOBAL_STORE(8, 11, r12); + HS_SLAB_GLOBAL_STORE(8, 12, r13); + HS_SLAB_GLOBAL_STORE(8, 13, r14); + HS_SLAB_GLOBAL_STORE(8, 14, r15); + HS_SLAB_GLOBAL_STORE(8, 15, r16); +} + +HS_BC_KERNEL_PROTO(8, 16, 4) +{ + HS_BLOCK_LOCAL_MEM_DECL(128, 16); + + HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_BC_MERGE_H_PREAMBLE(8, 16, 16); + { + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 0); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 16); + HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 32); + HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 48); + HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(8, 64); + HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(8, 80); + HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(8, 96); + HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(8, 112); + HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(8, 128); + HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(8, 144); + HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(8, 160); + HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(8, 176); + HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(8, 192); + HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(8, 208); + HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(8, 224); + HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(8, 240); + HS_CMP_XCHG(r0_1, r0_9); + HS_CMP_XCHG(r0_5, r0_13); + HS_CMP_XCHG(r0_1, r0_5); + HS_CMP_XCHG(r0_9, r0_13); + HS_CMP_XCHG(r0_3, r0_11); + HS_CMP_XCHG(r0_7, r0_15); + 
HS_CMP_XCHG(r0_3, r0_7); + HS_CMP_XCHG(r0_11, r0_15); + HS_CMP_XCHG(r0_1, r0_3); + HS_CMP_XCHG(r0_5, r0_7); + HS_CMP_XCHG(r0_9, r0_11); + HS_CMP_XCHG(r0_13, r0_15); + HS_CMP_XCHG(r0_2, r0_10); + HS_CMP_XCHG(r0_6, r0_14); + HS_CMP_XCHG(r0_2, r0_6); + HS_CMP_XCHG(r0_10, r0_14); + HS_CMP_XCHG(r0_4, r0_12); + HS_CMP_XCHG(r0_8, r0_16); + HS_CMP_XCHG(r0_4, r0_8); + HS_CMP_XCHG(r0_12, r0_16); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_6, r0_8); + HS_CMP_XCHG(r0_10, r0_12); + HS_CMP_XCHG(r0_14, r0_16); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_5, r0_6); + HS_CMP_XCHG(r0_7, r0_8); + HS_CMP_XCHG(r0_9, r0_10); + HS_CMP_XCHG(r0_11, r0_12); + HS_CMP_XCHG(r0_13, r0_14); + HS_CMP_XCHG(r0_15, r0_16); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_L(8) = r0_2; + HS_SLAB_LOCAL_L(16) = r0_3; + HS_SLAB_LOCAL_L(24) = r0_4; + HS_SLAB_LOCAL_L(32) = r0_5; + HS_SLAB_LOCAL_L(40) = r0_6; + HS_SLAB_LOCAL_L(48) = r0_7; + HS_SLAB_LOCAL_L(56) = r0_8; + HS_SLAB_LOCAL_L(64) = r0_9; + HS_SLAB_LOCAL_L(72) = r0_10; + HS_SLAB_LOCAL_L(80) = r0_11; + HS_SLAB_LOCAL_L(88) = r0_12; + HS_SLAB_LOCAL_L(96) = r0_13; + HS_SLAB_LOCAL_L(104) = r0_14; + HS_SLAB_LOCAL_L(112) = r0_15; + HS_SLAB_LOCAL_L(120) = r0_16; + } + } + HS_BLOCK_BARRIER(); + HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * 8 * 0); + HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * 8 * 1); + HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * 8 * 2); + HS_KEY_TYPE r4 = HS_BX_LOCAL_V(16 * 8 * 3); + HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * 8 * 4); + HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * 8 * 5); + HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * 8 * 6); + HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * 8 * 7); + HS_KEY_TYPE r9 = HS_BX_LOCAL_V(16 * 8 * 8); + HS_KEY_TYPE r10 = HS_BX_LOCAL_V(16 * 8 * 9); + HS_KEY_TYPE r11 = HS_BX_LOCAL_V(16 * 8 * 10); + HS_KEY_TYPE r12 = HS_BX_LOCAL_V(16 * 8 * 11); + HS_KEY_TYPE r13 = HS_BX_LOCAL_V(16 * 8 * 12); + HS_KEY_TYPE r14 = HS_BX_LOCAL_V(16 * 8 * 13); + HS_KEY_TYPE r15 = HS_BX_LOCAL_V(16 * 8 * 14); + HS_KEY_TYPE r16 = HS_BX_LOCAL_V(16 * 8 * 15); + { + { 
+ HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_SLAB_GLOBAL_STORE(8, 0, 
r1); + HS_SLAB_GLOBAL_STORE(8, 1, r2); + HS_SLAB_GLOBAL_STORE(8, 2, r3); + HS_SLAB_GLOBAL_STORE(8, 3, r4); + HS_SLAB_GLOBAL_STORE(8, 4, r5); + HS_SLAB_GLOBAL_STORE(8, 5, r6); + HS_SLAB_GLOBAL_STORE(8, 6, r7); + HS_SLAB_GLOBAL_STORE(8, 7, r8); + HS_SLAB_GLOBAL_STORE(8, 8, r9); + HS_SLAB_GLOBAL_STORE(8, 9, r10); + HS_SLAB_GLOBAL_STORE(8, 10, r11); + HS_SLAB_GLOBAL_STORE(8, 11, r12); + HS_SLAB_GLOBAL_STORE(8, 12, r13); + HS_SLAB_GLOBAL_STORE(8, 13, r14); + HS_SLAB_GLOBAL_STORE(8, 14, r15); + HS_SLAB_GLOBAL_STORE(8, 15, r16); +} + +HS_BC_KERNEL_PROTO(8, 8, 3) +{ + HS_BLOCK_LOCAL_MEM_DECL(64, 16); + + HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_BC_MERGE_H_PREAMBLE(8, 16, 8); + { + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 0); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 16); + HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 32); + HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 48); + HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(8, 64); + HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(8, 80); + HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(8, 96); + HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(8, 112); + HS_CMP_XCHG(r0_1, r0_5); + HS_CMP_XCHG(r0_3, r0_7); + HS_CMP_XCHG(r0_1, r0_3); + HS_CMP_XCHG(r0_5, r0_7); + HS_CMP_XCHG(r0_2, r0_6); + HS_CMP_XCHG(r0_4, r0_8); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_6, r0_8); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_5, r0_6); + HS_CMP_XCHG(r0_7, r0_8); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_L(8) = r0_2; + HS_SLAB_LOCAL_L(16) = r0_3; + HS_SLAB_LOCAL_L(24) = r0_4; + HS_SLAB_LOCAL_L(32) = r0_5; + HS_SLAB_LOCAL_L(40) = r0_6; + HS_SLAB_LOCAL_L(48) = r0_7; + HS_SLAB_LOCAL_L(56) = r0_8; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 8); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 24); + HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 40); + HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 56); + HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(8, 72); + HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(8, 88); + HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(8, 104); + HS_KEY_TYPE 
r0_8 = HS_BC_GLOBAL_LOAD_L(8, 120); + HS_CMP_XCHG(r0_1, r0_5); + HS_CMP_XCHG(r0_3, r0_7); + HS_CMP_XCHG(r0_1, r0_3); + HS_CMP_XCHG(r0_5, r0_7); + HS_CMP_XCHG(r0_2, r0_6); + HS_CMP_XCHG(r0_4, r0_8); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_6, r0_8); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_CMP_XCHG(r0_5, r0_6); + HS_CMP_XCHG(r0_7, r0_8); + HS_SLAB_LOCAL_L(512) = r0_1; + HS_SLAB_LOCAL_L(520) = r0_2; + HS_SLAB_LOCAL_L(528) = r0_3; + HS_SLAB_LOCAL_L(536) = r0_4; + HS_SLAB_LOCAL_L(544) = r0_5; + HS_SLAB_LOCAL_L(552) = r0_6; + HS_SLAB_LOCAL_L(560) = r0_7; + HS_SLAB_LOCAL_L(568) = r0_8; + } + } + HS_BLOCK_BARRIER(); + HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * 8 * 0); + HS_KEY_TYPE r2 = HS_BX_LOCAL_V(8 * 8 * 1); + HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * 8 * 2); + HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * 8 * 3); + HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * 8 * 4); + HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * 8 * 5); + HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * 8 * 6); + HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * 8 * 7); + HS_KEY_TYPE r9 = HS_BX_LOCAL_V(8 * 8 * 8); + HS_KEY_TYPE r10 = HS_BX_LOCAL_V(8 * 8 * 9); + HS_KEY_TYPE r11 = HS_BX_LOCAL_V(8 * 8 * 10); + HS_KEY_TYPE r12 = HS_BX_LOCAL_V(8 * 8 * 11); + HS_KEY_TYPE r13 = HS_BX_LOCAL_V(8 * 8 * 12); + HS_KEY_TYPE r14 = HS_BX_LOCAL_V(8 * 8 * 13); + HS_KEY_TYPE r15 = HS_BX_LOCAL_V(8 * 8 * 14); + HS_KEY_TYPE r16 = HS_BX_LOCAL_V(8 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + 
HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_SLAB_GLOBAL_STORE(8, 0, r1); + HS_SLAB_GLOBAL_STORE(8, 1, r2); + HS_SLAB_GLOBAL_STORE(8, 2, r3); + HS_SLAB_GLOBAL_STORE(8, 3, r4); + HS_SLAB_GLOBAL_STORE(8, 4, r5); + HS_SLAB_GLOBAL_STORE(8, 5, r6); + HS_SLAB_GLOBAL_STORE(8, 6, r7); + HS_SLAB_GLOBAL_STORE(8, 7, r8); + HS_SLAB_GLOBAL_STORE(8, 8, r9); + HS_SLAB_GLOBAL_STORE(8, 9, r10); + HS_SLAB_GLOBAL_STORE(8, 10, r11); + HS_SLAB_GLOBAL_STORE(8, 11, r12); + HS_SLAB_GLOBAL_STORE(8, 12, r13); + HS_SLAB_GLOBAL_STORE(8, 13, r14); + HS_SLAB_GLOBAL_STORE(8, 14, r15); + HS_SLAB_GLOBAL_STORE(8, 15, r16); +} + +HS_BC_KERNEL_PROTO(8, 4, 2) +{ + HS_BLOCK_LOCAL_MEM_DECL(32, 16); + + 
HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_BC_MERGE_H_PREAMBLE(8, 16, 4); + { + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 0); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 16); + HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 32); + HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 48); + HS_CMP_XCHG(r0_1, r0_3); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_L(8) = r0_2; + HS_SLAB_LOCAL_L(16) = r0_3; + HS_SLAB_LOCAL_L(24) = r0_4; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 4); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 20); + HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 36); + HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 52); + HS_CMP_XCHG(r0_1, r0_3); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_SLAB_LOCAL_L(128) = r0_1; + HS_SLAB_LOCAL_L(136) = r0_2; + HS_SLAB_LOCAL_L(144) = r0_3; + HS_SLAB_LOCAL_L(152) = r0_4; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 8); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 24); + HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 40); + HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 56); + HS_CMP_XCHG(r0_1, r0_3); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_SLAB_LOCAL_L(256) = r0_1; + HS_SLAB_LOCAL_L(264) = r0_2; + HS_SLAB_LOCAL_L(272) = r0_3; + HS_SLAB_LOCAL_L(280) = r0_4; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 12); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 28); + HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(8, 44); + HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(8, 60); + HS_CMP_XCHG(r0_1, r0_3); + HS_CMP_XCHG(r0_2, r0_4); + HS_CMP_XCHG(r0_1, r0_2); + HS_CMP_XCHG(r0_3, r0_4); + HS_SLAB_LOCAL_L(384) = r0_1; + HS_SLAB_LOCAL_L(392) = r0_2; + HS_SLAB_LOCAL_L(400) = r0_3; + HS_SLAB_LOCAL_L(408) = r0_4; + } + } + HS_BLOCK_BARRIER(); + HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * 8 * 0); + HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * 8 * 1); + HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * 8 * 2); + HS_KEY_TYPE r4 = 
HS_BX_LOCAL_V(4 * 8 * 3); + HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * 8 * 4); + HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * 8 * 5); + HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * 8 * 6); + HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * 8 * 7); + HS_KEY_TYPE r9 = HS_BX_LOCAL_V(4 * 8 * 8); + HS_KEY_TYPE r10 = HS_BX_LOCAL_V(4 * 8 * 9); + HS_KEY_TYPE r11 = HS_BX_LOCAL_V(4 * 8 * 10); + HS_KEY_TYPE r12 = HS_BX_LOCAL_V(4 * 8 * 11); + HS_KEY_TYPE r13 = HS_BX_LOCAL_V(4 * 8 * 12); + HS_KEY_TYPE r14 = HS_BX_LOCAL_V(4 * 8 * 13); + HS_KEY_TYPE r15 = HS_BX_LOCAL_V(4 * 8 * 14); + HS_KEY_TYPE r16 = HS_BX_LOCAL_V(4 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); 
+ HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_SLAB_GLOBAL_STORE(8, 0, r1); + HS_SLAB_GLOBAL_STORE(8, 1, r2); + HS_SLAB_GLOBAL_STORE(8, 2, r3); + HS_SLAB_GLOBAL_STORE(8, 3, r4); + HS_SLAB_GLOBAL_STORE(8, 4, r5); + HS_SLAB_GLOBAL_STORE(8, 5, r6); + HS_SLAB_GLOBAL_STORE(8, 6, r7); + HS_SLAB_GLOBAL_STORE(8, 7, r8); + HS_SLAB_GLOBAL_STORE(8, 8, r9); + HS_SLAB_GLOBAL_STORE(8, 9, r10); + HS_SLAB_GLOBAL_STORE(8, 10, r11); + HS_SLAB_GLOBAL_STORE(8, 11, r12); + HS_SLAB_GLOBAL_STORE(8, 12, r13); + HS_SLAB_GLOBAL_STORE(8, 13, r14); + HS_SLAB_GLOBAL_STORE(8, 14, r15); + HS_SLAB_GLOBAL_STORE(8, 15, r16); +} + +HS_BC_KERNEL_PROTO(8, 2, 1) +{ + HS_BLOCK_LOCAL_MEM_DECL(16, 16); + + HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_BC_MERGE_H_PREAMBLE(8, 16, 2); + { + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 0); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 16); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(0) = r0_1; + HS_SLAB_LOCAL_L(8) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 2); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 18); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(32) = r0_1; + HS_SLAB_LOCAL_L(40) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 4); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 20); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(64) = r0_1; + HS_SLAB_LOCAL_L(72) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 6); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 22); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(96) = r0_1; + 
HS_SLAB_LOCAL_L(104) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 8); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 24); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(128) = r0_1; + HS_SLAB_LOCAL_L(136) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 10); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 26); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(160) = r0_1; + HS_SLAB_LOCAL_L(168) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 12); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 28); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(192) = r0_1; + HS_SLAB_LOCAL_L(200) = r0_2; + } + { + HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8, 14); + HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8, 30); + HS_CMP_XCHG(r0_1, r0_2); + HS_SLAB_LOCAL_L(224) = r0_1; + HS_SLAB_LOCAL_L(232) = r0_2; + } + } + HS_BLOCK_BARRIER(); + HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * 8 * 0); + HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * 8 * 1); + HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * 8 * 2); + HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * 8 * 3); + HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * 8 * 4); + HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * 8 * 5); + HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * 8 * 6); + HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * 8 * 7); + HS_KEY_TYPE r9 = HS_BX_LOCAL_V(2 * 8 * 8); + HS_KEY_TYPE r10 = HS_BX_LOCAL_V(2 * 8 * 9); + HS_KEY_TYPE r11 = HS_BX_LOCAL_V(2 * 8 * 10); + HS_KEY_TYPE r12 = HS_BX_LOCAL_V(2 * 8 * 11); + HS_KEY_TYPE r13 = HS_BX_LOCAL_V(2 * 8 * 12); + HS_KEY_TYPE r14 = HS_BX_LOCAL_V(2 * 8 * 13); + HS_KEY_TYPE r15 = HS_BX_LOCAL_V(2 * 8 * 14); + HS_KEY_TYPE r16 = HS_BX_LOCAL_V(2 * 8 * 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + 
HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_SLAB_GLOBAL_STORE(8, 0, r1); + HS_SLAB_GLOBAL_STORE(8, 1, r2); + HS_SLAB_GLOBAL_STORE(8, 2, r3); + HS_SLAB_GLOBAL_STORE(8, 3, r4); + HS_SLAB_GLOBAL_STORE(8, 4, r5); + HS_SLAB_GLOBAL_STORE(8, 5, r6); + HS_SLAB_GLOBAL_STORE(8, 6, r7); + HS_SLAB_GLOBAL_STORE(8, 7, r8); + HS_SLAB_GLOBAL_STORE(8, 8, r9); + HS_SLAB_GLOBAL_STORE(8, 9, r10); + HS_SLAB_GLOBAL_STORE(8, 10, r11); + HS_SLAB_GLOBAL_STORE(8, 11, r12); + 
HS_SLAB_GLOBAL_STORE(8, 12, r13); + HS_SLAB_GLOBAL_STORE(8, 13, r14); + HS_SLAB_GLOBAL_STORE(8, 14, r15); + HS_SLAB_GLOBAL_STORE(8, 15, r16); +} + +HS_BC_KERNEL_PROTO(8, 1, 0) +{ + HS_SLAB_GLOBAL_PREAMBLE(8, 16); + HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 8, 0); + HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 8, 1); + HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 8, 2); + HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 8, 3); + HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 8, 4); + HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 8, 5); + HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 8, 6); + HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 8, 7); + HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8, 8); + HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 8, 9); + HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 8, 10); + HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 8, 11); + HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 8, 12); + HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 8, 13); + HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 8, 14); + HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 8, 15); + { + { + HS_SLAB_HALF_PREAMBLE(4); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(2); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + { + HS_SLAB_HALF_PREAMBLE(1); + HS_CMP_HALF(0, r1); + HS_CMP_HALF(1, r2); + HS_CMP_HALF(2, r3); + HS_CMP_HALF(3, r4); + 
HS_CMP_HALF(4, r5); + HS_CMP_HALF(5, r6); + HS_CMP_HALF(6, r7); + HS_CMP_HALF(7, r8); + HS_CMP_HALF(8, r9); + HS_CMP_HALF(9, r10); + HS_CMP_HALF(10, r11); + HS_CMP_HALF(11, r12); + HS_CMP_HALF(12, r13); + HS_CMP_HALF(13, r14); + HS_CMP_HALF(14, r15); + HS_CMP_HALF(15, r16); + } + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + } + HS_SLAB_GLOBAL_STORE(8, 0, r1); + HS_SLAB_GLOBAL_STORE(8, 1, r2); + HS_SLAB_GLOBAL_STORE(8, 2, r3); + HS_SLAB_GLOBAL_STORE(8, 3, r4); + HS_SLAB_GLOBAL_STORE(8, 4, r5); + HS_SLAB_GLOBAL_STORE(8, 5, r6); + HS_SLAB_GLOBAL_STORE(8, 6, r7); + HS_SLAB_GLOBAL_STORE(8, 7, r8); + HS_SLAB_GLOBAL_STORE(8, 8, r9); + HS_SLAB_GLOBAL_STORE(8, 9, r10); + HS_SLAB_GLOBAL_STORE(8, 10, r11); + HS_SLAB_GLOBAL_STORE(8, 11, r12); + HS_SLAB_GLOBAL_STORE(8, 12, r13); + HS_SLAB_GLOBAL_STORE(8, 13, r14); + HS_SLAB_GLOBAL_STORE(8, 14, r15); + HS_SLAB_GLOBAL_STORE(8, 15, r16); +} + +HS_FM_KERNEL_PROTO(1, 4) +{ + HS_FM_PREAMBLE(16); + HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); + HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); + HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); + HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); + HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); + HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); + HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); + HS_KEY_TYPE r8 = 
HS_XM_GLOBAL_LOAD_L(7); + HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); + HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); + HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); + HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); + HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); + HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); + HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); + HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); + HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0); + HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1); + HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2); + HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3); + HS_KEY_TYPE r21 = HS_FM_GLOBAL_LOAD_R(4); + HS_KEY_TYPE r22 = HS_FM_GLOBAL_LOAD_R(5); + HS_KEY_TYPE r23 = HS_FM_GLOBAL_LOAD_R(6); + HS_KEY_TYPE r24 = HS_FM_GLOBAL_LOAD_R(7); + HS_KEY_TYPE r25 = HS_FM_GLOBAL_LOAD_R(8); + HS_KEY_TYPE r26 = HS_FM_GLOBAL_LOAD_R(9); + HS_KEY_TYPE r27 = HS_FM_GLOBAL_LOAD_R(10); + HS_KEY_TYPE r28 = HS_FM_GLOBAL_LOAD_R(11); + HS_KEY_TYPE r29 = HS_FM_GLOBAL_LOAD_R(12); + HS_KEY_TYPE r30 = HS_FM_GLOBAL_LOAD_R(13); + HS_KEY_TYPE r31 = HS_FM_GLOBAL_LOAD_R(14); + HS_KEY_TYPE r32 = HS_FM_GLOBAL_LOAD_R(15); + HS_CMP_XCHG(r16, r17); + HS_CMP_XCHG(r15, r18); + HS_CMP_XCHG(r14, r19); + HS_CMP_XCHG(r13, r20); + HS_CMP_XCHG(r12, r21); + HS_CMP_XCHG(r11, r22); + HS_CMP_XCHG(r10, r23); + HS_CMP_XCHG(r9, r24); + HS_CMP_XCHG(r8, r25); + HS_CMP_XCHG(r7, r26); + HS_CMP_XCHG(r6, r27); + HS_CMP_XCHG(r5, r28); + HS_CMP_XCHG(r4, r29); + HS_CMP_XCHG(r3, r30); + HS_CMP_XCHG(r2, r31); + HS_CMP_XCHG(r1, r32); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, 
r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_CMP_XCHG(r17, r25); + HS_CMP_XCHG(r21, r29); + HS_CMP_XCHG(r17, r21); + HS_CMP_XCHG(r25, r29); + HS_CMP_XCHG(r19, r27); + HS_CMP_XCHG(r23, r31); + HS_CMP_XCHG(r19, r23); + HS_CMP_XCHG(r27, r31); + HS_CMP_XCHG(r17, r19); + HS_CMP_XCHG(r21, r23); + HS_CMP_XCHG(r25, r27); + HS_CMP_XCHG(r29, r31); + HS_CMP_XCHG(r18, r26); + HS_CMP_XCHG(r22, r30); + HS_CMP_XCHG(r18, r22); + HS_CMP_XCHG(r26, r30); + HS_CMP_XCHG(r20, r28); + HS_CMP_XCHG(r24, r32); + HS_CMP_XCHG(r20, r24); + HS_CMP_XCHG(r28, r32); + HS_CMP_XCHG(r18, r20); + HS_CMP_XCHG(r22, r24); + HS_CMP_XCHG(r26, r28); + HS_CMP_XCHG(r30, r32); + HS_CMP_XCHG(r17, r18); + HS_CMP_XCHG(r19, r20); + HS_CMP_XCHG(r21, r22); + HS_CMP_XCHG(r23, r24); + HS_CMP_XCHG(r25, r26); + HS_CMP_XCHG(r27, r28); + HS_CMP_XCHG(r29, r30); + HS_CMP_XCHG(r31, r32); + HS_XM_GLOBAL_STORE_L(0, r1); + HS_XM_GLOBAL_STORE_L(1, r2); + HS_XM_GLOBAL_STORE_L(2, r3); + HS_XM_GLOBAL_STORE_L(3, r4); + HS_XM_GLOBAL_STORE_L(4, r5); + HS_XM_GLOBAL_STORE_L(5, r6); + HS_XM_GLOBAL_STORE_L(6, r7); + HS_XM_GLOBAL_STORE_L(7, r8); + HS_XM_GLOBAL_STORE_L(8, r9); + HS_XM_GLOBAL_STORE_L(9, r10); + HS_XM_GLOBAL_STORE_L(10, r11); + HS_XM_GLOBAL_STORE_L(11, r12); + HS_XM_GLOBAL_STORE_L(12, r13); + HS_XM_GLOBAL_STORE_L(13, r14); + HS_XM_GLOBAL_STORE_L(14, r15); + HS_XM_GLOBAL_STORE_L(15, r16); + HS_FM_GLOBAL_STORE_R(0, r17); + HS_FM_GLOBAL_STORE_R(1, r18); + HS_FM_GLOBAL_STORE_R(2, r19); + HS_FM_GLOBAL_STORE_R(3, r20); + HS_FM_GLOBAL_STORE_R(4, r21); + HS_FM_GLOBAL_STORE_R(5, r22); + HS_FM_GLOBAL_STORE_R(6, r23); + HS_FM_GLOBAL_STORE_R(7, r24); + HS_FM_GLOBAL_STORE_R(8, r25); + HS_FM_GLOBAL_STORE_R(9, r26); + HS_FM_GLOBAL_STORE_R(10, r27); + HS_FM_GLOBAL_STORE_R(11, r28); + HS_FM_GLOBAL_STORE_R(12, r29); + 
HS_FM_GLOBAL_STORE_R(13, r30); + HS_FM_GLOBAL_STORE_R(14, r31); + HS_FM_GLOBAL_STORE_R(15, r32); +} + +HS_FM_KERNEL_PROTO(1, 3) +{ + HS_FM_PREAMBLE(16); + HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); + HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); + HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); + HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); + HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); + HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); + HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); + HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); + HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); + HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); + HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); + HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); + HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); + HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); + HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); + HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); + HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0); + HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1); + HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2); + HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3); + HS_KEY_TYPE r21 = HS_FM_GLOBAL_LOAD_R(4); + HS_KEY_TYPE r22 = HS_FM_GLOBAL_LOAD_R(5); + HS_KEY_TYPE r23 = HS_FM_GLOBAL_LOAD_R(6); + HS_KEY_TYPE r24 = HS_FM_GLOBAL_LOAD_R(7); + HS_CMP_XCHG(r16, r17); + HS_CMP_XCHG(r15, r18); + HS_CMP_XCHG(r14, r19); + HS_CMP_XCHG(r13, r20); + HS_CMP_XCHG(r12, r21); + HS_CMP_XCHG(r11, r22); + HS_CMP_XCHG(r10, r23); + HS_CMP_XCHG(r9, r24); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, 
r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_CMP_XCHG(r17, r21); + HS_CMP_XCHG(r19, r23); + HS_CMP_XCHG(r17, r19); + HS_CMP_XCHG(r21, r23); + HS_CMP_XCHG(r18, r22); + HS_CMP_XCHG(r20, r24); + HS_CMP_XCHG(r18, r20); + HS_CMP_XCHG(r22, r24); + HS_CMP_XCHG(r17, r18); + HS_CMP_XCHG(r19, r20); + HS_CMP_XCHG(r21, r22); + HS_CMP_XCHG(r23, r24); + HS_XM_GLOBAL_STORE_L(0, r1); + HS_XM_GLOBAL_STORE_L(1, r2); + HS_XM_GLOBAL_STORE_L(2, r3); + HS_XM_GLOBAL_STORE_L(3, r4); + HS_XM_GLOBAL_STORE_L(4, r5); + HS_XM_GLOBAL_STORE_L(5, r6); + HS_XM_GLOBAL_STORE_L(6, r7); + HS_XM_GLOBAL_STORE_L(7, r8); + HS_XM_GLOBAL_STORE_L(8, r9); + HS_XM_GLOBAL_STORE_L(9, r10); + HS_XM_GLOBAL_STORE_L(10, r11); + HS_XM_GLOBAL_STORE_L(11, r12); + HS_XM_GLOBAL_STORE_L(12, r13); + HS_XM_GLOBAL_STORE_L(13, r14); + HS_XM_GLOBAL_STORE_L(14, r15); + HS_XM_GLOBAL_STORE_L(15, r16); + HS_FM_GLOBAL_STORE_R(0, r17); + HS_FM_GLOBAL_STORE_R(1, r18); + HS_FM_GLOBAL_STORE_R(2, r19); + HS_FM_GLOBAL_STORE_R(3, r20); + HS_FM_GLOBAL_STORE_R(4, r21); + HS_FM_GLOBAL_STORE_R(5, r22); + HS_FM_GLOBAL_STORE_R(6, r23); + HS_FM_GLOBAL_STORE_R(7, r24); +} + +HS_FM_KERNEL_PROTO(1, 2) +{ + HS_FM_PREAMBLE(16); + HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); + HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); + HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); + HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); + HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); + HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); + HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); + HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); + HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); + HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); + HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); + HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); + HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); + HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); + HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); + HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); + 
HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0); + HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1); + HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2); + HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3); + HS_CMP_XCHG(r16, r17); + HS_CMP_XCHG(r15, r18); + HS_CMP_XCHG(r14, r19); + HS_CMP_XCHG(r13, r20); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_CMP_XCHG(r17, r19); + HS_CMP_XCHG(r18, r20); + HS_CMP_XCHG(r17, r18); + HS_CMP_XCHG(r19, r20); + HS_XM_GLOBAL_STORE_L(0, r1); + HS_XM_GLOBAL_STORE_L(1, r2); + HS_XM_GLOBAL_STORE_L(2, r3); + HS_XM_GLOBAL_STORE_L(3, r4); + HS_XM_GLOBAL_STORE_L(4, r5); + HS_XM_GLOBAL_STORE_L(5, r6); + HS_XM_GLOBAL_STORE_L(6, r7); + HS_XM_GLOBAL_STORE_L(7, r8); + HS_XM_GLOBAL_STORE_L(8, r9); + HS_XM_GLOBAL_STORE_L(9, r10); + HS_XM_GLOBAL_STORE_L(10, r11); + HS_XM_GLOBAL_STORE_L(11, r12); + HS_XM_GLOBAL_STORE_L(12, r13); + HS_XM_GLOBAL_STORE_L(13, r14); + HS_XM_GLOBAL_STORE_L(14, r15); + HS_XM_GLOBAL_STORE_L(15, r16); + HS_FM_GLOBAL_STORE_R(0, r17); + HS_FM_GLOBAL_STORE_R(1, r18); + HS_FM_GLOBAL_STORE_R(2, r19); + HS_FM_GLOBAL_STORE_R(3, r20); +} + +HS_FM_KERNEL_PROTO(1, 1) +{ + HS_FM_PREAMBLE(16); + HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); + HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); + HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); + HS_KEY_TYPE r4 = 
HS_XM_GLOBAL_LOAD_L(3); + HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); + HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); + HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); + HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); + HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); + HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); + HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); + HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); + HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); + HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); + HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); + HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); + HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0); + HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1); + HS_CMP_XCHG(r16, r17); + HS_CMP_XCHG(r15, r18); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_CMP_XCHG(r17, r18); + HS_XM_GLOBAL_STORE_L(0, r1); + HS_XM_GLOBAL_STORE_L(1, r2); + HS_XM_GLOBAL_STORE_L(2, r3); + HS_XM_GLOBAL_STORE_L(3, r4); + HS_XM_GLOBAL_STORE_L(4, r5); + HS_XM_GLOBAL_STORE_L(5, r6); + HS_XM_GLOBAL_STORE_L(6, r7); + HS_XM_GLOBAL_STORE_L(7, r8); + HS_XM_GLOBAL_STORE_L(8, r9); + HS_XM_GLOBAL_STORE_L(9, r10); + HS_XM_GLOBAL_STORE_L(10, r11); + HS_XM_GLOBAL_STORE_L(11, r12); + HS_XM_GLOBAL_STORE_L(12, r13); + HS_XM_GLOBAL_STORE_L(13, r14); + HS_XM_GLOBAL_STORE_L(14, r15); + HS_XM_GLOBAL_STORE_L(15, r16); + 
HS_FM_GLOBAL_STORE_R(0, r17); + HS_FM_GLOBAL_STORE_R(1, r18); +} + +HS_FM_KERNEL_PROTO(1, 0) +{ + HS_FM_PREAMBLE(16); + HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); + HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); + HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); + HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); + HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); + HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); + HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); + HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); + HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); + HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); + HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); + HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); + HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); + HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); + HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); + HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); + HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0); + HS_CMP_XCHG(r16, r17); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_XM_GLOBAL_STORE_L(0, r1); + HS_XM_GLOBAL_STORE_L(1, r2); + HS_XM_GLOBAL_STORE_L(2, r3); + HS_XM_GLOBAL_STORE_L(3, r4); + HS_XM_GLOBAL_STORE_L(4, r5); + HS_XM_GLOBAL_STORE_L(5, r6); + HS_XM_GLOBAL_STORE_L(6, r7); + HS_XM_GLOBAL_STORE_L(7, r8); + HS_XM_GLOBAL_STORE_L(8, r9); + HS_XM_GLOBAL_STORE_L(9, r10); + HS_XM_GLOBAL_STORE_L(10, r11); + 
HS_XM_GLOBAL_STORE_L(11, r12); + HS_XM_GLOBAL_STORE_L(12, r13); + HS_XM_GLOBAL_STORE_L(13, r14); + HS_XM_GLOBAL_STORE_L(14, r15); + HS_XM_GLOBAL_STORE_L(15, r16); + HS_FM_GLOBAL_STORE_R(0, r17); +} + +HS_HM_KERNEL_PROTO(1) +{ + HS_HM_PREAMBLE(16); + HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); + HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); + HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); + HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); + HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); + HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); + HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); + HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); + HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); + HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); + HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); + HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); + HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); + HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); + HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); + HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); + HS_KEY_TYPE r17 = HS_XM_GLOBAL_LOAD_L(16); + HS_KEY_TYPE r18 = HS_XM_GLOBAL_LOAD_L(17); + HS_KEY_TYPE r19 = HS_XM_GLOBAL_LOAD_L(18); + HS_KEY_TYPE r20 = HS_XM_GLOBAL_LOAD_L(19); + HS_KEY_TYPE r21 = HS_XM_GLOBAL_LOAD_L(20); + HS_KEY_TYPE r22 = HS_XM_GLOBAL_LOAD_L(21); + HS_KEY_TYPE r23 = HS_XM_GLOBAL_LOAD_L(22); + HS_KEY_TYPE r24 = HS_XM_GLOBAL_LOAD_L(23); + HS_KEY_TYPE r25 = HS_XM_GLOBAL_LOAD_L(24); + HS_KEY_TYPE r26 = HS_XM_GLOBAL_LOAD_L(25); + HS_KEY_TYPE r27 = HS_XM_GLOBAL_LOAD_L(26); + HS_KEY_TYPE r28 = HS_XM_GLOBAL_LOAD_L(27); + HS_KEY_TYPE r29 = HS_XM_GLOBAL_LOAD_L(28); + HS_KEY_TYPE r30 = HS_XM_GLOBAL_LOAD_L(29); + HS_KEY_TYPE r31 = HS_XM_GLOBAL_LOAD_L(30); + HS_KEY_TYPE r32 = HS_XM_GLOBAL_LOAD_L(31); + HS_CMP_XCHG(r1, r17); + HS_CMP_XCHG(r9, r25); + HS_CMP_XCHG(r1, r9); + HS_CMP_XCHG(r17, r25); + HS_CMP_XCHG(r5, r21); + HS_CMP_XCHG(r13, r29); + HS_CMP_XCHG(r5, r13); + HS_CMP_XCHG(r21, r29); + HS_CMP_XCHG(r1, r5); + HS_CMP_XCHG(r9, r13); + HS_CMP_XCHG(r17, r21); + HS_CMP_XCHG(r25, r29); + HS_CMP_XCHG(r3, r19); + 
HS_CMP_XCHG(r11, r27); + HS_CMP_XCHG(r3, r11); + HS_CMP_XCHG(r19, r27); + HS_CMP_XCHG(r7, r23); + HS_CMP_XCHG(r15, r31); + HS_CMP_XCHG(r7, r15); + HS_CMP_XCHG(r23, r31); + HS_CMP_XCHG(r3, r7); + HS_CMP_XCHG(r11, r15); + HS_CMP_XCHG(r19, r23); + HS_CMP_XCHG(r27, r31); + HS_CMP_XCHG(r1, r3); + HS_CMP_XCHG(r5, r7); + HS_CMP_XCHG(r9, r11); + HS_CMP_XCHG(r13, r15); + HS_CMP_XCHG(r17, r19); + HS_CMP_XCHG(r21, r23); + HS_CMP_XCHG(r25, r27); + HS_CMP_XCHG(r29, r31); + HS_CMP_XCHG(r2, r18); + HS_CMP_XCHG(r10, r26); + HS_CMP_XCHG(r2, r10); + HS_CMP_XCHG(r18, r26); + HS_CMP_XCHG(r6, r22); + HS_CMP_XCHG(r14, r30); + HS_CMP_XCHG(r6, r14); + HS_CMP_XCHG(r22, r30); + HS_CMP_XCHG(r2, r6); + HS_CMP_XCHG(r10, r14); + HS_CMP_XCHG(r18, r22); + HS_CMP_XCHG(r26, r30); + HS_CMP_XCHG(r4, r20); + HS_CMP_XCHG(r12, r28); + HS_CMP_XCHG(r4, r12); + HS_CMP_XCHG(r20, r28); + HS_CMP_XCHG(r8, r24); + HS_CMP_XCHG(r16, r32); + HS_CMP_XCHG(r8, r16); + HS_CMP_XCHG(r24, r32); + HS_CMP_XCHG(r4, r8); + HS_CMP_XCHG(r12, r16); + HS_CMP_XCHG(r20, r24); + HS_CMP_XCHG(r28, r32); + HS_CMP_XCHG(r2, r4); + HS_CMP_XCHG(r6, r8); + HS_CMP_XCHG(r10, r12); + HS_CMP_XCHG(r14, r16); + HS_CMP_XCHG(r18, r20); + HS_CMP_XCHG(r22, r24); + HS_CMP_XCHG(r26, r28); + HS_CMP_XCHG(r30, r32); + HS_CMP_XCHG(r1, r2); + HS_CMP_XCHG(r3, r4); + HS_CMP_XCHG(r5, r6); + HS_CMP_XCHG(r7, r8); + HS_CMP_XCHG(r9, r10); + HS_CMP_XCHG(r11, r12); + HS_CMP_XCHG(r13, r14); + HS_CMP_XCHG(r15, r16); + HS_CMP_XCHG(r17, r18); + HS_CMP_XCHG(r19, r20); + HS_CMP_XCHG(r21, r22); + HS_CMP_XCHG(r23, r24); + HS_CMP_XCHG(r25, r26); + HS_CMP_XCHG(r27, r28); + HS_CMP_XCHG(r29, r30); + HS_CMP_XCHG(r31, r32); + HS_XM_GLOBAL_STORE_L(0, r1); + HS_XM_GLOBAL_STORE_L(1, r2); + HS_XM_GLOBAL_STORE_L(2, r3); + HS_XM_GLOBAL_STORE_L(3, r4); + HS_XM_GLOBAL_STORE_L(4, r5); + HS_XM_GLOBAL_STORE_L(5, r6); + HS_XM_GLOBAL_STORE_L(6, r7); + HS_XM_GLOBAL_STORE_L(7, r8); + HS_XM_GLOBAL_STORE_L(8, r9); + HS_XM_GLOBAL_STORE_L(9, r10); + HS_XM_GLOBAL_STORE_L(10, r11); + 
HS_XM_GLOBAL_STORE_L(11, r12); + HS_XM_GLOBAL_STORE_L(12, r13); + HS_XM_GLOBAL_STORE_L(13, r14); + HS_XM_GLOBAL_STORE_L(14, r15); + HS_XM_GLOBAL_STORE_L(15, r16); + HS_XM_GLOBAL_STORE_L(16, r17); + HS_XM_GLOBAL_STORE_L(17, r18); + HS_XM_GLOBAL_STORE_L(18, r19); + HS_XM_GLOBAL_STORE_L(19, r20); + HS_XM_GLOBAL_STORE_L(20, r21); + HS_XM_GLOBAL_STORE_L(21, r22); + HS_XM_GLOBAL_STORE_L(22, r23); + HS_XM_GLOBAL_STORE_L(23, r24); + HS_XM_GLOBAL_STORE_L(24, r25); + HS_XM_GLOBAL_STORE_L(25, r26); + HS_XM_GLOBAL_STORE_L(26, r27); + HS_XM_GLOBAL_STORE_L(27, r28); + HS_XM_GLOBAL_STORE_L(28, r29); + HS_XM_GLOBAL_STORE_L(29, r30); + HS_XM_GLOBAL_STORE_L(30, r31); + HS_XM_GLOBAL_STORE_L(31, r32); +} + +// +// +// diff --git a/src/compute/hs/cl/intel/gen8/u64/hs_cl.h b/src/compute/hs/cl/intel/gen8/u64/hs_cl.h new file mode 100644 index 0000000000..d1c996fce9 --- /dev/null +++ b/src/compute/hs/cl/intel/gen8/u64/hs_cl.h @@ -0,0 +1,100 @@ +// +// Copyright 2016 Google Inc. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+// + +#ifndef HS_CL_ONCE +#define HS_CL_ONCE + +#define HS_SLAB_THREADS_LOG2 3 +#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) +#define HS_SLAB_WIDTH_LOG2 3 +#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) +#define HS_SLAB_HEIGHT 16 +#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT) +#define HS_REG_LAST(c) c##16 +#define HS_KEY_TYPE ulong +#define HS_KEY_WORDS 2 +#define HS_VAL_WORDS 0 +#define HS_BS_SLABS 16 +#define HS_BS_SLABS_LOG2_RU 4 +#define HS_BC_SLABS_LOG2_MAX 4 +#define HS_FM_SCALE_MIN 1 +#define HS_FM_SCALE_MAX 1 +#define HS_HM_SCALE_MIN 1 +#define HS_HM_SCALE_MAX 1 +#define HS_EMPTY + +#define HS_SLAB_ROWS() \ + HS_SLAB_ROW( 1, 0 ) \ + HS_SLAB_ROW( 2, 1 ) \ + HS_SLAB_ROW( 3, 2 ) \ + HS_SLAB_ROW( 4, 3 ) \ + HS_SLAB_ROW( 5, 4 ) \ + HS_SLAB_ROW( 6, 5 ) \ + HS_SLAB_ROW( 7, 6 ) \ + HS_SLAB_ROW( 8, 7 ) \ + HS_SLAB_ROW( 9, 8 ) \ + HS_SLAB_ROW( 10, 9 ) \ + HS_SLAB_ROW( 11, 10 ) \ + HS_SLAB_ROW( 12, 11 ) \ + HS_SLAB_ROW( 13, 12 ) \ + HS_SLAB_ROW( 14, 13 ) \ + HS_SLAB_ROW( 15, 14 ) \ + HS_SLAB_ROW( 16, 15 ) \ + HS_EMPTY + +#define HS_TRANSPOSE_SLAB() \ + HS_TRANSPOSE_STAGE( 1 ) \ + HS_TRANSPOSE_STAGE( 2 ) \ + HS_TRANSPOSE_STAGE( 3 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 2, 1 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 4, 3 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 6, 5 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 8, 7 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 10, 9 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 3, 1 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 4, 2 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 7, 5 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 8, 6 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 11, 9 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 5, 1 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 6, 2 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 7, 3 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 8, 4 ) \ + 
HS_TRANSPOSE_BLEND( t, u, 3, 13, 9 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 ) \ + HS_TRANSPOSE_REMAP( u, 1, 1 ) \ + HS_TRANSPOSE_REMAP( u, 2, 3 ) \ + HS_TRANSPOSE_REMAP( u, 3, 5 ) \ + HS_TRANSPOSE_REMAP( u, 4, 7 ) \ + HS_TRANSPOSE_REMAP( u, 5, 9 ) \ + HS_TRANSPOSE_REMAP( u, 6, 11 ) \ + HS_TRANSPOSE_REMAP( u, 7, 13 ) \ + HS_TRANSPOSE_REMAP( u, 8, 15 ) \ + HS_TRANSPOSE_REMAP( u, 9, 2 ) \ + HS_TRANSPOSE_REMAP( u, 10, 4 ) \ + HS_TRANSPOSE_REMAP( u, 11, 6 ) \ + HS_TRANSPOSE_REMAP( u, 12, 8 ) \ + HS_TRANSPOSE_REMAP( u, 13, 10 ) \ + HS_TRANSPOSE_REMAP( u, 14, 12 ) \ + HS_TRANSPOSE_REMAP( u, 15, 14 ) \ + HS_TRANSPOSE_REMAP( u, 16, 16 ) \ + HS_EMPTY + +#endif + +// +// +// + diff --git a/src/compute/hs/cl/intel/gen8/u64/hs_cl_macros.h b/src/compute/hs/cl/intel/gen8/u64/hs_cl_macros.h new file mode 100644 index 0000000000..9406339b36 --- /dev/null +++ b/src/compute/hs/cl/intel/gen8/u64/hs_cl_macros.h @@ -0,0 +1,361 @@ +// +// Copyright 2016 Google Inc. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+// + +#ifndef HS_CL_MACROS_ONCE +#define HS_CL_MACROS_ONCE + +// +// +// + +#include "hs_cl.h" + +// +// FYI, restrict shouldn't have any impact on these kernels and +// benchmarks appear to prove that true +// + +#define HS_RESTRICT restrict + +// +// KERNEL PROTOS +// +// Each macro expands to the signature of one kernel; the numeric +// arguments of the BS/BC/HM/FM variants are pasted into the kernel name. +// +// NOTE: the last name fragment is deliberately not followed by "##". +// Pasting an identifier onto "(" does not form a valid preprocessing +// token and is rejected by conforming preprocessors. +// + +#define HS_TRANSPOSE_KERNEL_PROTO(slab_width) \ + __kernel \ + __attribute__((intel_reqd_sub_group_size(slab_width))) \ + void \ + hs_kernel_transpose(__global HS_KEY_TYPE * const HS_RESTRICT vout) + +#define HS_BS_KERNEL_PROTO(slab_width,slab_count,slab_count_ru_log2) \ + __kernel \ + __attribute__((reqd_work_group_size(slab_count*slab_width,1,1))) \ + __attribute__((intel_reqd_sub_group_size(slab_width))) \ + void \ + hs_kernel_bs_##slab_count_ru_log2(__global HS_KEY_TYPE const * const HS_RESTRICT vin, \ + __global HS_KEY_TYPE * const HS_RESTRICT vout) + +#define HS_BC_KERNEL_PROTO(slab_width,slab_count,slab_count_log2) \ + __kernel \ + __attribute__((reqd_work_group_size(slab_count*slab_width,1,1))) \ + __attribute__((intel_reqd_sub_group_size(slab_width))) \ + void \ + hs_kernel_bc_##slab_count_log2(__global HS_KEY_TYPE * const HS_RESTRICT vout) + +#define HS_HM_KERNEL_PROTO(s) \ + __kernel void \ + hs_kernel_hm_##s(__global HS_KEY_TYPE * const HS_RESTRICT vout) + +#define HS_FM_KERNEL_PROTO(s,r) \ + __kernel void \ + hs_kernel_fm_##s##_##r(__global HS_KEY_TYPE * const HS_RESTRICT vout) + +// +// BLOCK LOCAL MEMORY DECLARATION +// + +#define HS_BLOCK_LOCAL_MEM_DECL(width,height) \ + __local struct { \ + HS_KEY_TYPE m[width * height]; \ + } shared + +// +// BLOCK BARRIER +// + +#define HS_BLOCK_BARRIER() \ + barrier(CLK_LOCAL_MEM_FENCE) + +// +// SLAB GLOBAL +// + +#define HS_SLAB_GLOBAL_PREAMBLE(slab_width,slab_height) \ + uint const gmem_idx = \ + (get_global_id(0) & ~(slab_width-1)) * slab_height + \ + get_sub_group_local_id() + +#define HS_SLAB_GLOBAL_LOAD(extent,slab_width,row_idx) \ + extent[gmem_idx + slab_width * row_idx] + +#define HS_SLAB_GLOBAL_STORE(slab_width,row_idx,reg) 
\ + vout[gmem_idx + slab_width * row_idx] = reg + +// +// SLAB LOCAL +// + +#define HS_SLAB_LOCAL_L(offset) \ + shared.m[smem_l_idx + (offset)] + +#define HS_SLAB_LOCAL_R(offset) \ + shared.m[smem_r_idx + (offset)] + +// +// SLAB LOCAL VERTICAL LOADS +// + +#define HS_BX_LOCAL_V(offset) \ + shared.m[get_local_id(0) + (offset)] + +// +// BLOCK SORT MERGE HORIZONTAL +// +// smem_l_idx is built from this lane's own subgroup id and lane index; +// smem_r_idx uses the neighbouring subgroup (id ^ 1) and the lane index +// XORed with (slab_width - 1). +// + +#define HS_BS_MERGE_H_PREAMBLE(slab_width,slab_count) \ + uint const smem_l_idx = \ + get_sub_group_id() * (slab_width * slab_count) + \ + get_sub_group_local_id(); \ + uint const smem_r_idx = \ + (get_sub_group_id() ^ 1) * (slab_width * slab_count) + \ + (get_sub_group_local_id() ^ (slab_width - 1)) + +// +// BLOCK CLEAN MERGE HORIZONTAL +// + +#define HS_BC_MERGE_H_PREAMBLE(slab_width,slab_height,slab_count) \ + uint const gmem_l_idx = \ + (get_global_id(0) & ~(slab_width*slab_count-1)) * slab_height + \ + get_local_id(0); \ + uint const smem_l_idx = \ + get_sub_group_id() * (slab_width * slab_count) + \ + get_sub_group_local_id() + +#define HS_BC_GLOBAL_LOAD_L(slab_width,slab_idx) \ + vout[gmem_l_idx + (slab_width * slab_idx)] + +// +// SLAB FLIP AND HALF PREAMBLES +// +// Both declare the partner lane (lane index XOR mask) and t_lt, which +// is non-zero when this lane's index is below its partner's. +// + +#define HS_SLAB_FLIP_PREAMBLE(mask) \ + uint const flip_lane_idx = get_sub_group_local_id() ^ mask; \ + int const t_lt = get_sub_group_local_id() < flip_lane_idx; + +#define HS_SLAB_HALF_PREAMBLE(mask) \ + uint const half_lane_idx = get_sub_group_local_id() ^ mask; \ + int const t_lt = get_sub_group_local_id() < half_lane_idx; + +// +// Inter-lane compare exchange +// + +// default +#define HS_CMP_XCHG_V0(a,b) \ + { \ + HS_KEY_TYPE const t = min(a,b); \ + b = max(a,b); \ + a = t; \ + } + +// super slow -- once a holds the minimum, b ^ a ^ tmp leaves the other value in b +#define HS_CMP_XCHG_V1(a,b) \ + { \ + HS_KEY_TYPE const tmp = a; \ + a = (a < b) ? 
a : b; \ + b ^= a ^ tmp; \ + } + +// best +#define HS_CMP_XCHG_V2(a,b) \ + if (a >= b) { \ + HS_KEY_TYPE const t = a; \ + a = b; \ + b = t; \ + } + +// good +#define HS_CMP_XCHG_V3(a,b) \ + { \ + int const ge = a >= b; \ + HS_KEY_TYPE const t = a; \ + a = ge ? b : a; \ + b = ge ? t : b; \ + } + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b) +#elif (HS_KEY_WORDS == 2) +#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b) +#endif + +// +// The flip/half comparisons rely on a "conditional min/max": +// +// - if the flag is true, return min(a,b) +// - otherwise, return max(a,b) +// +// What's a little surprising is that HS_COND_MIN_MAX_V1 is faster than +// HS_COND_MIN_MAX_V0 for 32-bit keys. +// +// I suspect either a code generation problem or that the sequence +// maps well to the GEN instruction set. +// +// We mostly care about 64-bit keys and unsurprisingly +// HS_COND_MIN_MAX_V0 is fastest for this wider type. +// + +// this is what you would normally use +#define HS_COND_MIN_MAX_V0(lt,a,b) ((((a) <= (b)) ^ (lt)) ? (b) : (a)) + +// this seems to be faster for 32-bit keys +#define HS_COND_MIN_MAX_V1(lt,a,b) (((lt) ? (b) : (a)) ^ (((a) ^ (b)) & HS_LTE_TO_MASK(a,b))) + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b) +#elif (HS_KEY_WORDS == 2) +#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b) +#endif + +// +// Conditional inter-subgroup flip/half compare exchange +// + +#define HS_CMP_FLIP(i,a,b) \ + { \ + HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,flip_lane_idx); \ + HS_KEY_TYPE const tb = intel_sub_group_shuffle(b,flip_lane_idx); \ + a = HS_COND_MIN_MAX(t_lt,a,tb); \ + b = HS_COND_MIN_MAX(t_lt,b,ta); \ + } + +#define HS_CMP_HALF(i,a) \ + { \ + HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,half_lane_idx); \ + a = HS_COND_MIN_MAX(t_lt,a,ta); \ + } + +// +// The device's comparison operator might return what we actually +// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}. 
+// + +#define HS_CMP_IS_ZERO_ONE + +#ifdef HS_CMP_IS_ZERO_ONE +// OpenCL requires a {true: +1, false: 0} scalar result +// (a <= b) -> { +1, 0 } -> NEGATE -> { all-ones, 0 } once cast to HS_KEY_TYPE +#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b)) +#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a) +#else +// However, OpenCL requires { -1, 0 } for vectors +// (a <= b) -> { 0xFFFFFFFF, 0 } +#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64 +#define HS_CMP_TO_MASK(a) (a) +#endif + +// +// The "flip-merge" and "half-merge" preambles are very similar +// +// HS_HM_PREAMBLE derives this work-item's span from the 3D global id: +// span_idx linearizes dimensions 2 and 1, span_stride is the global +// size of dimension 0, a span holds (2 * half_span) rows of span_stride +// keys, and span_l indexes this work-item's key in row 0. +// +// HS_FM_PREAMBLE additionally declares span_r, an index into row +// half_span of the same span that decreases as span_off increases. +// + +#define HS_HM_PREAMBLE(half_span) \ + uint const span_idx = get_global_id(2) * get_global_size(1) + get_global_id(1); \ + uint const span_stride = get_global_size(0); \ + uint const span_size = span_stride * half_span * 2; \ + uint const span_base = span_idx * span_size; \ + uint const span_off = get_global_id(0); \ + uint const span_l = span_base + span_off + +#define HS_FM_PREAMBLE(half_span) \ + HS_HM_PREAMBLE(half_span); \ + uint const span_r = span_base + span_stride * (half_span + 1) - span_off - 1 + +// +// Row accessors: stride_idx selects a row relative to span_l / span_r +// + +#define HS_XM_GLOBAL_L(stride_idx) \ + vout[span_l + span_stride * stride_idx] + +#define HS_XM_GLOBAL_LOAD_L(stride_idx) \ + HS_XM_GLOBAL_L(stride_idx) + +#define HS_XM_GLOBAL_STORE_L(stride_idx,reg) \ + HS_XM_GLOBAL_L(stride_idx) = reg + +#define HS_FM_GLOBAL_R(stride_idx) \ + vout[span_r + span_stride * stride_idx] + +#define HS_FM_GLOBAL_LOAD_R(stride_idx) \ + HS_FM_GLOBAL_R(stride_idx) + +#define HS_FM_GLOBAL_STORE_R(stride_idx,reg) \ + HS_FM_GLOBAL_R(stride_idx) = reg + +// +// This snarl of macros is for transposing a "slab" of sorted elements +// into linear order. +// +// This can occur as the last step in hs_sort() or via a custom kernel +// that inspects the slab and then transposes and stores it to memory. +// +// The slab format can be inspected more efficiently than a linear +// arrangement. 
+// +// The prime example is detecting when adjacent keys (in sort order) +// have differing high order bits ("key changes"). The index of each +// change is recorded to an auxiliary array. +// +// A post-processing step like this needs to be able to navigate the +// slab and eventually transpose and store the slab in linear order. +// +// HS_TRANSPOSE_STAGE(level) declares is_lo_<level>, which is true for +// lanes whose subgroup-local id has bit (level-1) clear. +// HS_TRANSPOSE_BLEND exchanges one register with the lane whose id +// differs in that bit and declares the two blended rows for the next +// level. HS_TRANSPOSE_REMAP stores a final row to vout. +// + +#define HS_SUBGROUP_SHUFFLE_XOR(v,m) intel_sub_group_shuffle_xor(v,m) + +#define HS_TRANSPOSE_REG(prefix,row) prefix##row +#define HS_TRANSPOSE_DECL(prefix,row) HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row) +#define HS_TRANSPOSE_PRED(level) is_lo_##level + +#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) \ + prefix_curr##row_ll##_##row_ur + +#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) \ + HS_KEY_TYPE const HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) + +#define HS_TRANSPOSE_STAGE(level) \ + bool const HS_TRANSPOSE_PRED(level) = \ + (get_sub_group_local_id() & (1 << (level-1))) == 0; + +#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \ + HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) = \ + HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ? \ + HS_TRANSPOSE_REG(prefix_prev,row_ll) : \ + HS_TRANSPOSE_REG(prefix_prev,row_ur), \ + 1<<(level-1)); \ + \ + HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \ + HS_TRANSPOSE_PRED(level) ? \ + HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) : \ + HS_TRANSPOSE_REG(prefix_prev,row_ll); \ + \ + HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \ + HS_TRANSPOSE_PRED(level) ? 
\ + HS_TRANSPOSE_REG(prefix_prev,row_ur) : \ + HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur); + +#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \ + vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \ + HS_TRANSPOSE_REG(prefix,row_from); + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/hs/cl/intel/gen8/u64/hs_target.h b/src/compute/hs/cl/intel/gen8/u64/hs_target.h new file mode 100644 index 0000000000..c543c7b523 --- /dev/null +++ b/src/compute/hs/cl/intel/gen8/u64/hs_target.h @@ -0,0 +1,115 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "../../../hs_cl_target.h" + +// +// +// + +#include "hs_cl.h" + +// +// +// + +#ifndef HS_TARGET_NAME +#define HS_TARGET_NAME hs_target +#endif + +#define HS_TARGET_HELPER(a) a + +// +// The target descriptor: the configuration constants from hs_cl.h +// followed by the program blob. The blob starts with a flag byte +// (0: binary kernels, 1: source kernels), then the bytes of the +// *.len.xxd file, then the bytes of the program itself. +// + +static struct hs_cl_target const HS_TARGET_NAME = +{ + .config = { + .slab = { + .threads_log2 = HS_SLAB_THREADS_LOG2, + .width_log2 = HS_SLAB_WIDTH_LOG2, + .height = HS_SLAB_HEIGHT + }, + + .words = { + .key = HS_KEY_WORDS, + .val = HS_VAL_WORDS + }, + + .block = { + .slabs = HS_BS_SLABS + }, + + .merge = { + .fm = { + .scale_min = HS_FM_SCALE_MIN, + .scale_max = HS_FM_SCALE_MAX + }, + .hm = { + .scale_min = HS_HM_SCALE_MIN, + .scale_max = HS_HM_SCALE_MAX, + } + } + }, + + .program = { +#ifndef HS_DUMP_SOURCE + 0, // KERNELS ARE BINARIES +#include "hs_cl.bin.len.xxd" + , +#include "hs_cl.bin.xxd" +#else + 1, // KERNELS ARE SOURCE +#include "hs_cl.src.len.xxd" + , +#include "hs_cl.src.xxd" +#endif + } +}; + +// +// +// + +#ifdef HS_DUMP + +#include <stdio.h> +#include <stdlib.h> + +// +// Writes the target's config followed by its program blob to +// "hs_target.bin". Returns EXIT_FAILURE if the file cannot be opened +// or is not written completely, EXIT_SUCCESS otherwise. +// + +int +main(int argc, char const * argv[]) +{ + (void)argc; + (void)argv; + + FILE * fp = fopen("hs_target.bin","wb"); + + if (fp == NULL) { + perror("hs_target.bin"); + return EXIT_FAILURE; + } + + // program[1..4] are decoded as the program length, most significant + // byte first; widen before shifting so no shift happens in a signed int + // NOTE(review): assumes program[] elements are unsigned bytes -- confirm in hs_cl_target.h + size_t progsize = + ((size_t)HS_TARGET_NAME.program[1]<<24) | ((size_t)HS_TARGET_NAME.program[2]<<16) | + ((size_t)HS_TARGET_NAME.program[3]<< 8) | (size_t)HS_TARGET_NAME.program[4]; + + // the flag byte and the 32-bit length precede the program bytes + progsize += 1 + sizeof(uint32_t); + + int ok = fwrite(&HS_TARGET_NAME.config,1,sizeof(HS_TARGET_NAME.config),fp) == sizeof(HS_TARGET_NAME.config); + + ok = ok && (fwrite(HS_TARGET_NAME.program,1,progsize,fp) == progsize); + + // fclose() flushes buffered output, so its result is part of the write status + ok = (fclose(fp) == 0) && ok; + + if (!ok) { + fprintf(stderr,"hs_target.bin: write failed\n"); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} 
+ +#endif + +// +// +// diff --git a/src/compute/hs/cl/intel/gen8/u64/make_all.bat b/src/compute/hs/cl/intel/gen8/u64/make_all.bat new file mode 100644 index 0000000000..ee075b3f92 --- /dev/null +++ b/src/compute/hs/cl/intel/gen8/u64/make_all.bat @@ -0,0 +1,26 @@ +@ECHO OFF + +SET HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen + +:: --- 32-bit keys --- + +:: %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +:: %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +:: %HS_GEN% -v -a "opencl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +:: --- 64-bit keys + +%HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +:: %HS_GEN% -v -a "opencl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +:: +:: remove trailing whitespace from generated files +:: + +sed -i 's/[[:space:]]*$//' hs_cl.h + +:: +:: preprocess and build kernels +:: + +make_inl_cl.bat hs_cl.cl diff --git a/src/compute/hs/cl/intel/gen8/u64/make_inl_cl.bat b/src/compute/hs/cl/intel/gen8/u64/make_inl_cl.bat new file mode 100644 index 0000000000..d7a3c0a951 --- /dev/null +++ b/src/compute/hs/cl/intel/gen8/u64/make_inl_cl.bat @@ -0,0 +1,113 @@ +@ECHO OFF + +:: +:: +:: + +SET OPENCL_STD=-cl-std=CL1.2 +SET OPENCL_PRE=__OPENCL_C_VERSION__=120 + +:: SET OPENCL_STD=-cl-std=CL2.0 +:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200 + +:: +:: +:: + +SET IOC=ioc64 + +:: +:: +:: + +SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info + +SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g + +SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% + +:: +:: +:: 
+ +SET PRE_DIR=%~p1 + +CD %PRE_DIR% + +SET PRE_SRC=%~n1.pre.cl +SET PRE_SRC_XXD=%~n1.src.xxd +SET PRE_SRC_LEN_XXD=%~n1.src.len.xxd + +SET PRE_BIN=%~n1.bin +SET PRE_BIN_XXD=%~n1.bin.xxd +SET PRE_BIN_LEN_XXD=%~n1.bin.len.xxd + +:: +:: *.pre.cl +:: + +clang-format -style=Mozilla -i %1 || goto :error +cl -I . -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_SRC%" || goto :error +clang-format -style=Mozilla -i %PRE_SRC% || goto :error +dos2unix -q %PRE_SRC% || goto :error + +echo %PRE_SRC% + +:: +:: *.src.xxd +:: *.src.len.xxd +:: + +xxd -i < %PRE_SRC% > %PRE_SRC_XXD% || goto :error + +for /f %%A in ('wc -c %PRE_SRC%') do ( + echo %PRE_SRC% %%A + printf "%%.8x" %%A | xxd -r -p | xxd -i > %PRE_SRC_LEN_XXD% || goto :error +) + +echo %PRE_SRC_XXD% +echo %PRE_SRC_LEN_XXD% + +:: +:: *.pre.bin +:: + +%IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_SRC% -ir=%PRE_BIN% || goto :error + +echo %PRE_BIN% + +:: +:: *.bin.xxd +:: *.bin.len.xxd +:: + +xxd -i < %PRE_BIN% > %PRE_BIN_XXD% || goto :error + +for /f %%A in ('wc -c %PRE_BIN%') do ( + echo %PRE_BIN% %%A + printf "%%.8x" %%A | xxd -r -p | xxd -i > %PRE_BIN_LEN_XXD% || goto :error +) + +echo %PRE_BIN_XXD% +echo %PRE_BIN_LEN_XXD% + +:: +:: dump a binary +:: + +cl /DHS_DUMP /Fe:hs_dump.exe /Tchs_target.h +hs_dump + +:: +:: delete temporary files +:: + +:: del *.pre.cl +del *.obj +del *.exe + +exit /b 0 + +:error + +exit /b %errorlevel% diff --git a/src/compute/hs/cl/intel/gen9lp/u32/make_inl_cl.bat b/src/compute/hs/cl/intel/gen9lp/u32/make_inl_cl.bat new file mode 100644 index 0000000000..54b1aac48f --- /dev/null +++ b/src/compute/hs/cl/intel/gen9lp/u32/make_inl_cl.bat @@ -0,0 +1,77 @@ +@ECHO OFF + +:: +:: +:: + +SET OPENCL_STD=-cl-std=CL1.2 +SET OPENCL_PRE=__OPENCL_C_VERSION__=120 + +:: SET OPENCL_STD=-cl-std=CL2.0 +:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200 + +:: +:: +:: + +SET IOC=ioc64 + +:: +:: +:: + +SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero 
-cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info + +SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g + +SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% + +:: +:: +:: + +SET PRE_DIR=%~p1 + +CD %PRE_DIR% + +SET PRE_CL=%~n1 +SET PRE_CL=%PRE_CL%.pre.cl + +SET PRE_SRC_INL=%~n1 +SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl + +SET PRE_BIN_IR=%~n1 +SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir + +SET PRE_BIN_INL=%~n1 +SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C clang-format -style=Mozilla -i %1 +CMD /C cl -I . -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%" +CMD /C clang-format -style=Mozilla -i %PRE_CL% +CMD /C dos2unix -q %PRE_CL% +CMD /C xxd -i %PRE_CL% %PRE_SRC_INL% + +echo %PRE_CL% +echo %PRE_SRC_INL% + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C touch %PRE_BIN_IR% +ECHO ON +@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR% +@ECHO OFF +CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL% + +echo %PRE_BIN_IR% +echo %PRE_BIN_INL% + + diff --git a/src/compute/hs/cl/intel/gen9lp/u32b32/make_inl_cl.bat b/src/compute/hs/cl/intel/gen9lp/u32b32/make_inl_cl.bat new file mode 100644 index 0000000000..54b1aac48f --- /dev/null +++ b/src/compute/hs/cl/intel/gen9lp/u32b32/make_inl_cl.bat @@ -0,0 +1,77 @@ +@ECHO OFF + +:: +:: +:: + +SET OPENCL_STD=-cl-std=CL1.2 +SET OPENCL_PRE=__OPENCL_C_VERSION__=120 + +:: SET OPENCL_STD=-cl-std=CL2.0 +:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200 + +:: +:: +:: + +SET IOC=ioc64 + +:: +:: +:: + +SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info + +SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g + +SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% + +:: +:: +:: + +SET PRE_DIR=%~p1 + +CD %PRE_DIR% + +SET PRE_CL=%~n1 +SET PRE_CL=%PRE_CL%.pre.cl + +SET PRE_SRC_INL=%~n1 +SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl + +SET PRE_BIN_IR=%~n1 +SET 
PRE_BIN_IR=%PRE_BIN_IR%.pre.ir + +SET PRE_BIN_INL=%~n1 +SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C clang-format -style=Mozilla -i %1 +CMD /C cl -I . -I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%" +CMD /C clang-format -style=Mozilla -i %PRE_CL% +CMD /C dos2unix -q %PRE_CL% +CMD /C xxd -i %PRE_CL% %PRE_SRC_INL% + +echo %PRE_CL% +echo %PRE_SRC_INL% + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C touch %PRE_BIN_IR% +ECHO ON +@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR% +@ECHO OFF +CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL% + +echo %PRE_BIN_IR% +echo %PRE_BIN_INL% + + diff --git a/src/compute/hs/cl/intel/gen9lp/u64/make_inl_cl.bat b/src/compute/hs/cl/intel/gen9lp/u64/make_inl_cl.bat new file mode 100644 index 0000000000..54b1aac48f --- /dev/null +++ b/src/compute/hs/cl/intel/gen9lp/u64/make_inl_cl.bat @@ -0,0 +1,77 @@ +@ECHO OFF + +:: +:: +:: + +SET OPENCL_STD=-cl-std=CL1.2 +SET OPENCL_PRE=__OPENCL_C_VERSION__=120 + +:: SET OPENCL_STD=-cl-std=CL2.0 +:: SET OPENCL_PRE=__OPENCL_C_VERSION__=200 + +:: +:: +:: + +SET IOC=ioc64 + +:: +:: +:: + +SET IOC_IR_OPTS_OPT=%OPENCL_STD% -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-kernel-arg-info + +SET IOC_IR_OPTS_DBG=%OPENCL_STD% -cl-kernel-arg-info -g + +SET IOC_IR_OPTS=%IOC_IR_OPTS_OPT% + +:: +:: +:: + +SET PRE_DIR=%~p1 + +CD %PRE_DIR% + +SET PRE_CL=%~n1 +SET PRE_CL=%PRE_CL%.pre.cl + +SET PRE_SRC_INL=%~n1 +SET PRE_SRC_INL=%PRE_SRC_INL%.pre.src.inl + +SET PRE_BIN_IR=%~n1 +SET PRE_BIN_IR=%PRE_BIN_IR%.pre.ir + +SET PRE_BIN_INL=%~n1 +SET PRE_BIN_INL=%PRE_BIN_INL%.pre.bin.inl + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C clang-format -style=Mozilla -i %1 +CMD /C cl -I . 
-I "%INTELOCLSDKROOT%\include" -D %OPENCL_PRE% -EP %1 -P -Fi"%PRE_CL%" +CMD /C clang-format -style=Mozilla -i %PRE_CL% +CMD /C dos2unix -q %PRE_CL% +CMD /C xxd -i %PRE_CL% %PRE_SRC_INL% + +echo %PRE_CL% +echo %PRE_SRC_INL% + +:: +:: *.pre.cl +:: *.pre.src.inl +:: + +CMD /C touch %PRE_BIN_IR% +ECHO ON +@CMD /C %IOC% -cmd=build -bo="%IOC_IR_OPTS%" -device=gpu -input=%PRE_CL% -ir=%PRE_BIN_IR% +@ECHO OFF +CMD /C xxd -i %PRE_BIN_IR% %PRE_BIN_INL% + +echo %PRE_BIN_IR% +echo %PRE_BIN_INL% + + diff --git a/src/compute/hs/gen/gen.h b/src/compute/hs/gen/gen.h index 4043a8df5c..3635d553cf 100644 --- a/src/compute/hs/gen/gen.h +++ b/src/compute/hs/gen/gen.h @@ -9,21 +9,20 @@ #pragma once // +// TODO: // +// Add Key-Val sorting support -- easy. // #include #include // +// All code generation is driven by the specified architectural +// details and host platform API. // -// - -#define MERGE_LEVELS_MAX_LOG2 7 // merge up to 128 warps -#define MERGE_LEVELS_MAX_SIZE (1 << MERGE_LEVELS_MAX_LOG2) // ((1 << MERGE_MAX_LOG2) - 1) // incorrect debug error - -// -// +// In general, the warps-per-block and keys-per-thread are the +// critical knobs for tuning performance. 
// struct hsg_config @@ -58,6 +57,7 @@ struct hsg_config struct { uint32_t lanes; + uint32_t lanes_log2; uint32_t skpw_bs; } warp; @@ -72,7 +72,7 @@ struct hsg_config }; // -// +// HotSort can merge non-power-of-two blocks of warps // struct hsg_level @@ -91,6 +91,16 @@ struct hsg_level } active; }; +// +// +// + +#define MERGE_LEVELS_MAX_LOG2 7 // merge up to 128 warps +#define MERGE_LEVELS_MAX_SIZE (1 << MERGE_LEVELS_MAX_LOG2) + +// +// This is computed +// struct hsg_merge { @@ -113,6 +123,8 @@ struct hsg_merge // // +#if 0 + #define HSG_FILE_NAME_SIZE 80 struct hsg_file @@ -126,18 +138,6 @@ struct hsg_file // // -typedef enum hsg_kernel_type { - - HSG_KERNEL_TYPE_SORT_BLOCK, - - HSG_KERNEL_TYPE_COUNT - -} hsg_kernel_type; - -// -// -// - typedef enum hsg_file_type { HSG_FILE_TYPE_HEADER, @@ -147,6 +147,8 @@ typedef enum hsg_file_type { } hsg_file_type; +#endif + // // // @@ -158,10 +160,8 @@ typedef enum hsg_file_type { HSG_OP_EXPAND_X(HSG_OP_TYPE_BEGIN) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_ELSE) \ \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_FILE_HEADER) \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_FILE_FOOTER) \ - \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_DUMMY_KERNEL) \ + HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_BEGIN) \ + HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_END) \ \ HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE) \ @@ -186,12 +186,13 @@ typedef enum hsg_file_type { HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT) \ + HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED) \ \ HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_LOAD) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_STORE) \ \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_WARP_FLIP) \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_WARP_HALF) \ + HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_FLIP) \ + HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_HALF) \ \ HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_FLIP) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_HALF) \ @@ -221,8 
+222,6 @@ typedef enum hsg_file_type { \ HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_ACTIVE_PRED) \ \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED) \ - \ HSG_OP_EXPAND_X(HSG_OP_TYPE_COUNT) // @@ -271,42 +270,63 @@ struct hsg_op // // -typedef void (*hsg_target_pfn)(struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth); +extern char const * const hsg_op_type_string[]; // // // -extern struct hsg_config hsg_config; -extern struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2]; +struct hsg_target +{ + struct hsg_target_state * state; +}; // +// All targets share this prototype +// + +typedef +void +(*hsg_target_pfn)(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth); // // +// + +extern +void +hsg_target_debug(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth); extern void -hsg_target_debug (struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth); +hsg_target_cuda(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth); extern void -hsg_target_cuda_sm3x(struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth); +hsg_target_opencl(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth); extern void -hsg_target_igp_genx (struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth); +hsg_target_glsl(struct hsg_target * 
const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth); // // // diff --git a/src/compute/hs/gen/main.c b/src/compute/hs/gen/main.c index 42f4518bfd..e06e23029b 100644 --- a/src/compute/hs/gen/main.c +++ b/src/compute/hs/gen/main.c @@ -20,14 +20,8 @@ // #include "networks.h" -#include "macros.h" -#include "util.h" - -// -// -// - -#define HSG_INDENT 2 +#include "common/util.h" +#include "common/macros.h" // // @@ -36,7 +30,6 @@ #undef HSG_OP_EXPAND_X #define HSG_OP_EXPAND_X(t) #t , -static char const * const hsg_op_type_string[] = { @@ -53,10 +46,8 @@ hsg_op_type_string[] = #define BEGIN() (struct hsg_op){ HSG_OP_TYPE_BEGIN } #define ELSE() (struct hsg_op){ HSG_OP_TYPE_ELSE } -#define STORE_SLAB_EARLY_EXIT() (struct hsg_op){ HSG_OP_TYPE_STORE_SLAB_EARLY_EXIT } - -#define FILE_HEADER() (struct hsg_op){ HSG_OP_TYPE_FILE_HEADER } -#define FILE_FOOTER() (struct hsg_op){ HSG_OP_TYPE_FILE_FOOTER } +#define TARGET_BEGIN() (struct hsg_op){ HSG_OP_TYPE_TARGET_BEGIN } +#define TARGET_END() (struct hsg_op){ HSG_OP_TYPE_TARGET_END } #define TRANSPOSE_KERNEL_PROTO() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO } #define TRANSPOSE_KERNEL_PREAMBLE() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE } @@ -68,11 +59,11 @@ hsg_op_type_string[] = #define BC_KERNEL_PROTO(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PROTO, { i } } #define BC_KERNEL_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PREAMBLE, { i } } -#define FM_KERNEL_PROTO(l,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO, { l, s } } -#define FM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE, { w, s } } +#define FM_KERNEL_PROTO(s,r) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO, { s, r } } +#define FM_KERNEL_PREAMBLE(h) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE, { h } } -#define HM_KERNEL_PROTO(d,w) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO, { d, w } } -#define 
HM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE, { w, s } } +#define HM_KERNEL_PROTO(s) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO, { s } } +#define HM_KERNEL_PREAMBLE(h) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE, { h } } #define BX_REG_GLOBAL_LOAD(n,v) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_LOAD, { n, v } } #define BX_REG_GLOBAL_STORE(n) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_STORE, { n } } @@ -81,12 +72,13 @@ hsg_op_type_string[] = #define FM_REG_GLOBAL_STORE_LEFT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT, { n, i } } #define FM_REG_GLOBAL_LOAD_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT, { n, i } } #define FM_REG_GLOBAL_STORE_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT, { n, i } } +#define FM_MERGE_RIGHT_PRED(n,s) (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED, { n, s } } #define HM_REG_GLOBAL_LOAD(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_LOAD, { n, i } } #define HM_REG_GLOBAL_STORE(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_STORE, { n, i } } -#define WARP_FLIP(f) (struct hsg_op){ HSG_OP_TYPE_WARP_FLIP, { f } } -#define WARP_HALF(h) (struct hsg_op){ HSG_OP_TYPE_WARP_HALF, { h } } +#define SLAB_FLIP(f) (struct hsg_op){ HSG_OP_TYPE_SLAB_FLIP, { f } } +#define SLAB_HALF(h) (struct hsg_op){ HSG_OP_TYPE_SLAB_HALF, { h } } #define CMP_FLIP(a,b,c) (struct hsg_op){ HSG_OP_TYPE_CMP_FLIP, { a, b, c } } #define CMP_HALF(a,b) (struct hsg_op){ HSG_OP_TYPE_CMP_HALF, { a, b } } @@ -121,13 +113,12 @@ hsg_op_type_string[] = #define BS_ACTIVE_PRED(m,l) (struct hsg_op){ HSG_OP_TYPE_BS_ACTIVE_PRED, { m, l } } -#define FM_MERGE_RIGHT_PRED(n,s) (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED, { n, s } } - // // DEFAULTS // -struct hsg_config hsg_config = // FIXME -- how useful is this? +static +struct hsg_config hsg_config = { .merge = { .flip = { @@ -138,8 +129,6 @@ struct hsg_config hsg_config = // FIXME -- how useful is this? 
.lo = 1, .hi = 1 }, - - .max_log2 = 27 // 2^27th = 128m }, .block = { @@ -156,6 +145,7 @@ struct hsg_config hsg_config = // FIXME -- how useful is this? .warp = { .lanes = 32, + .lanes_log2 = 5, }, .thread = { @@ -172,45 +162,11 @@ struct hsg_config hsg_config = // FIXME -- how useful is this? // ZERO HSG_MERGE STRUCT // +static struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2] = { 0 }; // -// -// - -static const hsg_target_pfn hsg_target_pfns[] = - { - hsg_target_debug, - hsg_target_cuda_sm3x, - hsg_target_igp_genx, - // hsg_target_adreno_5xx, - // hsg_target_amd_gcn, - // hsg_target_x86_sse, - // hsg_target_x86_avx2, - }; - -static const char * hsg_target_pfn_string[] = - { - "hs_debug", - "hs_cuda", - "hs_cl" - }; - -static const char * hsg_file_type_string[][2] = - { - { ".h", ".txt" }, - { ".h", ".cu" }, - { ".h", ".cl" } - }; - -// -// -// - -#define HSG_TARGET_PFN_COUNT ARRAY_LENGTH(hsg_target_pfns) - -// -// +// STATS ON INSTRUCTIONS // static hsg_op_type hsg_op_type_counts[HSG_OP_TYPE_COUNT] = { 0 }; @@ -223,8 +179,18 @@ static void hsg_op_debug() { + uint32_t total = 0; + for (hsg_op_type t=HSG_OP_TYPE_EXIT; twarps * hsg_config.warp.lanes; + uint32_t const bs_threads = merge->warps << hsg_config.warp.lanes_log2; uint32_t const bs_keys = hsg_config.block.smem_bs / (hsg_config.type.words * sizeof(uint32_t)); uint32_t const bs_kpt = bs_keys / bs_threads; uint32_t const bs_kpt_mod = (bs_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod; @@ -282,7 +248,7 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge) } // clamp to number of registers - merge->rows_bs = min(bs_rows_even, hsg_config.thread.regs); + merge->rows_bs = MIN_MACRO(bs_rows_even, hsg_config.thread.regs); } // @@ -297,19 +263,19 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge) // // if merge->warps is not pow2 then we're going to skip creating a bc elsewhere // - uint32_t const bc_warps_min = max(merge->warps,hsg_config.block.warps_min); - uint32_t const 
bc_threads = bc_warps_min * hsg_config.warp.lanes; + uint32_t const bc_warps_min = MAX_MACRO(merge->warps,hsg_config.block.warps_min); + uint32_t const bc_threads = bc_warps_min << hsg_config.warp.lanes_log2; uint32_t const bc_block_rd = (((hsg_config.block.smem_bc * bc_warps_min) / hsg_config.block.warps_max) / hsg_config.block.smem_quantum) * hsg_config.block.smem_quantum; - uint32_t const bc_block_max = max(bc_block_rd,hsg_config.block.smem_min); - uint32_t const bc_block_smem = min(bc_block_max,hsg_config.block.smem_bs); + uint32_t const bc_block_max = MAX_MACRO(bc_block_rd,hsg_config.block.smem_min); + uint32_t const bc_block_smem = MIN_MACRO(bc_block_max,hsg_config.block.smem_bs); // what is the max amount of shared in each possible bc block config? uint32_t const bc_keys = bc_block_smem / (hsg_config.type.words * sizeof(uint32_t)); uint32_t const bc_kpt = bc_keys / bc_threads; uint32_t const bc_kpt_mod = (bc_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod; - merge->rows_bc = min(bc_kpt_mod, hsg_config.thread.regs); + merge->rows_bc = MIN_MACRO(bc_kpt_mod, hsg_config.thread.regs); merge->skpw_bc = bc_keys / bc_warps_min; } } @@ -441,7 +407,7 @@ hsg_merge_levels_hint(struct hsg_merge * const merge, bool const autotune) for (uint32_t level=0; levellevels[level].networks[0], + uint32_t const n_max = MAX_MACRO(merge->levels[level].networks[0], merge->levels[level].networks[1]); if (n_max <= (merge->rows_bs + hsg_config.thread.xtra)) @@ -533,7 +499,7 @@ hsg_network_copy(struct hsg_op * ops, for (uint32_t ii=0; iia,cx->b,prefix)); } @@ -638,7 +604,7 @@ hsg_warp_half_downto(struct hsg_op * ops, uint32_t h) { ops = hsg_begin(ops); - ops = hsg_op(ops,WARP_HALF(h)); + ops = hsg_op(ops,SLAB_HALF(h)); ops = hsg_warp_half_network(ops); ops = hsg_end(ops); @@ -665,7 +631,7 @@ hsg_warp_flip(struct hsg_op * ops, uint32_t f) { ops = hsg_begin(ops); - ops = hsg_op(ops,WARP_FLIP(f)); + ops = hsg_op(ops,SLAB_FLIP(f)); ops = hsg_warp_flip_network(ops); ops = 
hsg_end(ops); @@ -782,7 +748,7 @@ hsg_bc_half_merge_level(struct hsg_op * ops, uint32_t const net_even = merge->levels[0].networks[0]; // min of warps in block and remaining horizontal rows - uint32_t const active = min(s_count, net_even); + uint32_t const active = MIN_MACRO(s_count, net_even); // conditional on blockIdx.x if (active < merge->warps) @@ -834,7 +800,7 @@ hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge) // // will only be called with merge->warps >= 2 // - uint32_t const warps = max(merge->warps,hsg_config.block.warps_min); + uint32_t const warps = MAX_MACRO(merge->warps,hsg_config.block.warps_min); // guaranteed to be an even network uint32_t const net_even = merge->levels[0].networks[0]; @@ -851,7 +817,7 @@ hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge) { // compute store count uint32_t const r_rem = hsg_config.thread.regs + 1 - r_lo; - uint32_t const s_count = min(s_max,r_rem); + uint32_t const s_count = MIN_MACRO(s_max,r_rem); // block sync -- can skip if first if (r_lo > 1) @@ -1010,7 +976,7 @@ hsg_bs_flip_merge(struct hsg_op * ops, struct hsg_merge const * const merge) uint32_t r_hi = hsg_config.thread.regs + 1 - r_lo; // compute store count - uint32_t const s_pairs = min(s_pairs_max,r_mid - r_lo); + uint32_t const s_pairs = MIN_MACRO(s_pairs_max,r_mid - r_lo); // store rows to shared for (uint32_t c=0; cindex)); @@ -1125,7 +1091,7 @@ hsg_bs_sort_all(struct hsg_op * ops) { for (uint32_t merge_idx=0; merge_idxwarps == 0) break; @@ -1142,7 +1108,7 @@ hsg_bs_sort_all(struct hsg_op * ops) static struct hsg_op * -hsg_bc_clean(struct hsg_op * ops, const struct hsg_merge * const merge) +hsg_bc_clean(struct hsg_op * ops, struct hsg_merge const * const merge) { // func proto ops = hsg_op(ops,BC_KERNEL_PROTO(merge->index)); @@ -1189,7 +1155,7 @@ hsg_bc_clean_all(struct hsg_op * ops) { for (uint32_t merge_idx=0; merge_idxwarps == 0) break; @@ -1215,9 +1181,7 @@ static struct hsg_op * 
hsg_fm_thread_load_left(struct hsg_op * ops, uint32_t const n) { - uint32_t const mid = n/2; - - for (uint32_t r=1; r<=mid; r++) + for (uint32_t r=1; r<=n; r++) ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_LEFT(r,r-1)); return ops; @@ -1227,9 +1191,7 @@ static struct hsg_op * hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n) { - uint32_t const mid = n/2; - - for (uint32_t r=mid; r>=1; r--) + for (uint32_t r=1; r<=n; r++) ops = hsg_op(ops,FM_REG_GLOBAL_STORE_LEFT(r,r-1)); return ops; @@ -1237,53 +1199,60 @@ hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n) static struct hsg_op * -hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right) +hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const half_span, uint32_t const half_case) { - uint32_t const mid = n / 2; - uint32_t const first = mid + 1; - uint32_t const last = mid + span_right; - - for (uint32_t r=first; r<=last; r++) - ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,r-first)); + for (uint32_t r=0; r=first; r--) - ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,r-first)); + for (uint32_t r=0; r= 1; span_pow2 /= 2) - { - ops = hsg_fm_thread_merge_right(ops,span,span_pow2); - } - - return ops; -} - -static -struct hsg_op * -hsg_fm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const fm_scale) -{ - // func proto - ops = hsg_op(ops,FM_KERNEL_PROTO(level,fm_scale)); - - // begin - ops = hsg_begin(ops); - - // shared declare - ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span,fm_scale)); - - // load - ops = hsg_fm_thread_load_left(ops,span); - - // right merging network - ops = hsg_fm_thread_merge_right_all(ops,span); - - // left merging network - ops = hsg_thread_merge(ops,span/2); - - // store - ops = hsg_fm_thread_store_left(ops,span); + uint32_t const span_left = (warps << scale_log2) / 2; - // end - ops = hsg_end(ops); + for (uint32_t span_right=span_left; span_right >= 1; span_right=pow2_ru_u32(span_right)/2) + ops = 
hsg_fm_merge(ops,scale_log2,span_left,span_right); return ops; } @@ -1354,7 +1290,7 @@ static struct hsg_op * hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n) { - for (uint32_t r=n; r>=1; r--) + for (uint32_t r=1; r<=n; r++) ops = hsg_op(ops,HM_REG_GLOBAL_STORE(r,r-1)); return ops; @@ -1362,16 +1298,18 @@ hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n) static struct hsg_op * -hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const hm_scale) +hsg_hm_merge(struct hsg_op * ops, uint32_t const scale_log2, uint32_t const warps_pow2) { + uint32_t const span = warps_pow2 << scale_log2; + // func proto - ops = hsg_op(ops,HM_KERNEL_PROTO(level,level-msb_idx_u32(span))); + ops = hsg_op(ops,HM_KERNEL_PROTO(scale_log2)); // begin ops = hsg_begin(ops); - // declarations - ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span,hm_scale)); + // preamble for loading/storing + ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span/2)); // load ops = hsg_hm_thread_load(ops,span); @@ -1388,55 +1326,6 @@ hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uin return ops; } -// -// -// - -static -struct hsg_op * -hsg_fm_merge_level(struct hsg_op * ops, uint32_t const level) -{ - uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps); - uint32_t const bc_max_log2 = msb_idx_u32(bc_max); - - uint32_t const fm_level = (level <= bc_max_log2) ? 
hsg_config.merge.flip.lo : min(level - bc_max_log2,hsg_config.merge.flip.hi); - uint32_t const fm_scale = level - fm_level; - - ops = hsg_fm_merge(ops, - level, - hsg_merge[0].warps * (1u << fm_level), - fm_scale); - - return ops; -} - -// -// -// - -static -struct hsg_op * -hsg_hm_merge_level(struct hsg_op * ops, uint32_t const level) -{ - uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps); - uint32_t const bc_max_log2 = msb_idx_u32(bc_max); - - uint32_t const fm_log2_max = bc_max_log2 + hsg_config.merge.flip.hi; - - if (level > fm_log2_max) - { - uint32_t const down_warps_log2 = level - fm_log2_max; - uint32_t const hm_level = max(hsg_config.merge.half.lo,min(hsg_config.merge.half.hi,down_warps_log2)); - - ops = hsg_hm_merge(ops, - level - hsg_config.merge.flip.hi, - bc_max * (1u << hm_level), - down_warps_log2 - hm_level); - } - - return ops; -} - // // GENERATE MERGE KERNELS // @@ -1445,23 +1334,20 @@ static struct hsg_op * hsg_xm_merge_all(struct hsg_op * ops) { - uint32_t const keys_per_block = hsg_merge[0].warps * hsg_config.warp.lanes * hsg_config.thread.regs; - uint32_t const blocks = ((1U << hsg_config.merge.max_log2) + keys_per_block - 1) / keys_per_block; - uint32_t const blocks_ru = pow2_ru_u32(blocks); - uint32_t const blocks_log2 = msb_idx_u32(blocks_ru); + uint32_t const warps = hsg_merge[0].warps; + uint32_t const warps_pow2 = pow2_rd_u32(warps); - for (uint32_t level=1; level<=blocks_log2; level+=1) - { - // - // GENERATE FLIP MERGE KERNELS - // - ops = hsg_fm_merge_level(ops,level); + // + // GENERATE FLIP MERGE KERNELS + // + for (uint32_t scale_log2=hsg_config.merge.flip.lo; scale_log2<=hsg_config.merge.flip.hi; scale_log2++) + ops = hsg_fm_merge_all(ops,scale_log2,warps); - // - // GENERATE HALF MERGE KERNELS - // - ops = hsg_hm_merge_level(ops,level); - } + // + // GENERATE HALF MERGE KERNELS + // + for (uint32_t scale_log2=hsg_config.merge.half.lo; scale_log2<=hsg_config.merge.half.hi; scale_log2++) + ops = 
hsg_hm_merge(ops,scale_log2,warps_pow2); return ops; } @@ -1470,93 +1356,30 @@ hsg_xm_merge_all(struct hsg_op * ops) // // -void -hsg_target_indent(struct hsg_file * const files, uint32_t const depth) -{ - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%*s", - depth*HSG_INDENT,""); -} - -void -hsg_target_debug(struct hsg_file * const files, - const struct hsg_merge * const merge, - const struct hsg_op * const ops, - uint32_t const depth) -{ - - hsg_target_indent(files,depth); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s\n", - hsg_op_type_string[ops->type]); -} - -// -// -// - -static -struct hsg_file* -hsg_files_open(const char * prefix, const char ** suffix) -{ -#define STR_BUF_SIZE 80 - - struct hsg_file * files = malloc(sizeof(struct hsg_file) * HSG_FILE_TYPE_COUNT); - - for (int32_t ii=0; iitype != HSG_OP_TYPE_EXIT) { switch (ops->type) { case HSG_OP_TYPE_END: - target_pfn(files,merge,ops,depth-1); + target_pfn(target,config,merge,ops,depth-1); return ops + 1; case HSG_OP_TYPE_BEGIN: - target_pfn(files,merge,ops,depth); - ops = hsg_op_translate_depth(target_pfn,files,merge,ops+1,depth+1); + target_pfn(target,config,merge,ops,depth); + ops = hsg_op_translate_depth(target_pfn,target,config,merge,ops+1,depth+1); break; default: - target_pfn(files,merge,ops++,depth); + target_pfn(target,config,merge,ops++,depth); } } @@ -1565,12 +1388,13 @@ hsg_op_translate_depth(hsg_target_pfn target_pfn, static void -hsg_op_translate(hsg_target_pfn target_pfn, - struct hsg_file * const files, - const struct hsg_merge * const merge, - const struct hsg_op * ops) +hsg_op_translate(hsg_target_pfn target_pfn, + struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * ops) { - hsg_op_translate_depth(target_pfn,files,merge,ops,0); + hsg_op_translate_depth(target_pfn,target,config,merge,ops,0); } // @@ -1580,38 +1404,28 @@ hsg_op_translate(hsg_target_pfn target_pfn, int main(int argc, char * argv[]) { 
- // - // INIT - // - for (uint32_t ii=0; ii<=MERGE_LEVELS_MAX_LOG2; ii++) - { - hsg_merge[ii].index = ii; - hsg_merge[ii].warps = 32 / (1u << ii); - } - // // PROCESS OPTIONS // - int32_t arch = 0; - int32_t opt = 0; - - bool quiet = false; - bool autotune = false; + int32_t opt = 0; + bool verbose = false; + bool autotune = false; + char const * arch = "undefined"; - while ((opt = getopt(argc,argv,"hqa:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF) + while ((opt = getopt(argc,argv,"hva:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF) { switch (opt) { case 'h': fprintf(stderr,"Help goes here...\n"); - return -1; + return EXIT_FAILURE; - case 'q': - quiet = true; + case 'v': + verbose = true; break; case 'a': - arch = atoi(optarg); + arch = optarg; break; case 'g': @@ -1635,30 +1449,28 @@ main(int argc, char * argv[]) break; case 'w': - hsg_config.warp.lanes = atoi(optarg); + hsg_config.warp.lanes = atoi(optarg); + hsg_config.warp.lanes_log2 = msb_idx_u32(hsg_config.warp.lanes); break; case 'b': // maximum warps in a workgroup / cta / thread block { - uint32_t const warps = atoi(optarg); - uint32_t const warps_ru_pow2 = pow2_ru_u32(warps); - - // set warps_max if not already set - if (hsg_config.block.warps_max == UINT32_MAX) - hsg_config.block.warps_max = warps_ru_pow2; + uint32_t const warps = atoi(optarg); // must always be even - if ((warps&1) != 0) + if ((warps & 1) != 0) { fprintf(stderr,"Error: -b must be even.\n"); - exit(-1); + return EXIT_FAILURE; } + hsg_merge[0].index = 0; hsg_merge[0].warps = warps; - for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++) - hsg_merge[ii].warps = warps_ru_pow2 / (1u << ii); + // set warps_max if not already set + if (hsg_config.block.warps_max == UINT32_MAX) + hsg_config.block.warps_max = pow2_ru_u32(warps); } break; @@ -1677,18 +1489,14 @@ main(int argc, char * argv[]) hsg_config.block.warps_mod = atoi(optarg); break; - case 'k': - hsg_config.merge.max_log2 = atoi(optarg); - break; - case 'r': { uint32_t const regs = 
atoi(optarg); - if ((regs&1) != 0) + if ((regs & 1) != 0) { fprintf(stderr,"Error: -r must be even.\n"); - exit(-1); + return EXIT_FAILURE; } hsg_config.thread.regs = regs; @@ -1726,17 +1534,39 @@ main(int argc, char * argv[]) } // - // WHICH ARCH TARGET? + // INIT MERGE // - hsg_target_pfn hsg_target_pfn = (arch < HSG_TARGET_PFN_COUNT) ? hsg_target_pfns[arch] : hsg_target_debug; + uint32_t const warps_ru_pow2 = pow2_ru_u32(hsg_merge[0].warps); + + for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++) + { + hsg_merge[ii].index = ii; + hsg_merge[ii].warps = warps_ru_pow2 >> ii; + } // - // OPEN FILES + // WHICH ARCH TARGET? // - struct hsg_file * files = hsg_files_open(hsg_target_pfn_string[arch],hsg_file_type_string[arch]); + hsg_target_pfn hsg_target_pfn; + + if (strcmp(arch,"debug") == 0) + hsg_target_pfn = hsg_target_debug; + else if (strcmp(arch,"cuda") == 0) + hsg_target_pfn = hsg_target_cuda; + else if (strcmp(arch,"opencl") == 0) + hsg_target_pfn = hsg_target_opencl; + else if (strcmp(arch,"glsl") == 0) + hsg_target_pfn = hsg_target_glsl; + else { + fprintf(stderr,"Invalid arch: %s\n",arch); + exit(EXIT_FAILURE); + } + + if (verbose) + fprintf(stderr,"Target: %s\n",arch); // - // INIT F_KEYS + // INIT SMEM KEY ALLOCATION // hsg_config_init_shared(); @@ -1766,27 +1596,26 @@ main(int argc, char * argv[]) // // THESE ARE FOR DEBUG/INSPECTION // - - if (!quiet) + if (verbose) { hsg_merge_levels_debug(merge); } } - if (!quiet) + if (verbose) fprintf(stderr,"\n\n"); // + // GENERATE THE OPCODES // - // - uint32_t const op_count = 1024*1024; // 2^20 ops for now! 
- struct hsg_op * const ops_begin = malloc(op_count * sizeof(*ops_begin)); + uint32_t const op_count = 1<<17; + struct hsg_op * const ops_begin = malloc(sizeof(*ops_begin) * op_count); struct hsg_op * ops = ops_begin; // - // APPEND HEADER + // OPEN INITIAL FILES AND APPEND HEADER // - ops = hsg_op(ops,FILE_HEADER()); + ops = hsg_op(ops,TARGET_BEGIN()); // // GENERATE TRANSPOSE KERNEL @@ -1809,9 +1638,9 @@ main(int argc, char * argv[]) ops = hsg_xm_merge_all(ops); // - // APPEND FOOTER + // APPEND FOOTER AND CLOSE INITIAL FILES // - ops = hsg_op(ops,FILE_FOOTER()); + ops = hsg_op(ops,TARGET_END()); // // ... WE'RE DONE! @@ -1821,20 +1650,17 @@ main(int argc, char * argv[]) // // APPLY TARGET TRANSLATOR TO ACCUMULATED OPS // - hsg_op_translate(hsg_target_pfn,files,hsg_merge,ops_begin); + struct hsg_target target; - // - // - // - if (!quiet) - hsg_op_debug(); + hsg_op_translate(hsg_target_pfn,&target,&hsg_config,hsg_merge,ops_begin); // + // DUMP INSTRUCTION COUNTS // - // - hsg_files_close(files); + if (verbose) + hsg_op_debug(); - return 0; + return EXIT_SUCCESS; } // diff --git a/src/compute/hs/gen/networks_merging.c b/src/compute/hs/gen/networks_merging.c index 90dca03c21..f93958c842 100644 --- a/src/compute/hs/gen/networks_merging.c +++ b/src/compute/hs/gen/networks_merging.c @@ -11,7 +11,7 @@ // #include "networks.h" -#include "macros.h" +#include "common/macros.h" // // @@ -24,7 +24,7 @@ // // -#define LM(n) { ARRAY_LENGTH(mn##n), mn##n } +#define LM(n) { ARRAY_LENGTH_MACRO(mn##n), mn##n } // // diff --git a/src/compute/hs/gen/networks_sorting.c b/src/compute/hs/gen/networks_sorting.c index c7beb6b45e..3d8d364399 100644 --- a/src/compute/hs/gen/networks_sorting.c +++ b/src/compute/hs/gen/networks_sorting.c @@ -14,7 +14,7 @@ // #include "networks.h" -#include "macros.h" +#include "common/macros.h" // // @@ -27,7 +27,7 @@ // // -#define LS(n) { ARRAY_LENGTH(sn##n), sn##n } +#define LS(n) { ARRAY_LENGTH_MACRO(sn##n), sn##n } // // diff --git 
a/src/compute/hs/gen/target_cuda.c b/src/compute/hs/gen/target_cuda.c new file mode 100644 index 0000000000..e140c4be4c --- /dev/null +++ b/src/compute/hs/gen/target_cuda.c @@ -0,0 +1,600 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#include +#include + +// +// +// + +#include "gen.h" +#include "transpose.h" + +#include "common/util.h" +#include "common/macros.h" + +// +// +// + +struct hsg_transpose_state +{ + FILE * header; + struct hsg_config const * config; +}; + +static +char +hsg_transpose_reg_prefix(uint32_t const cols_log2) +{ + return 'a' + (('r' + cols_log2 - 'a') % 26); +} + +static +void +hsg_transpose_blend(uint32_t const cols_log2, + uint32_t const row_ll, // lower-left + uint32_t const row_ur, // upper-right + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(cols_log2-1), + hsg_transpose_reg_prefix(cols_log2), + cols_log2,row_ll+1,row_ur+1); +} + +static +void +hsg_transpose_remap(uint32_t const row_from, + uint32_t const row_to, + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(state->config->warp.lanes_log2), + row_from+1,row_to+1); +} + +// +// +// + +static +void +hsg_copyright(FILE * file) +{ + fprintf(file, + "// \n" + "// Copyright 2016 Google Inc. \n" + "// \n" + "// Use of this source code is governed by a BSD-style \n" + "// license that can be found in the LICENSE file. 
\n" + "// \n" + "\n"); +} + +// +// +// + +struct hsg_target_state +{ + FILE * header; + FILE * source; +}; + +// +// +// + +void +hsg_target_cuda(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth) +{ + switch (ops->type) + { + case HSG_OP_TYPE_END: + fprintf(target->state->source, + "}\n"); + break; + + case HSG_OP_TYPE_BEGIN: + fprintf(target->state->source, + "{\n"); + break; + + case HSG_OP_TYPE_ELSE: + fprintf(target->state->source, + "else\n"); + break; + + case HSG_OP_TYPE_TARGET_BEGIN: + { + // allocate state + target->state = malloc(sizeof(*target->state)); + + // allocate files + fopen_s(&target->state->header,"hs_cuda.h", "wb"); + fopen_s(&target->state->source,"hs_cuda.cu","wb"); + + // initialize header + uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps)); + + hsg_copyright(target->state->header); + + fprintf(target->state->header, + "#ifndef HS_CUDA_ONCE \n" + "#define HS_CUDA_ONCE \n" + " \n" + "#define HS_SLAB_THREADS_LOG2 %u \n" + "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n" + "#define HS_SLAB_WIDTH_LOG2 %u \n" + "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n" + "#define HS_SLAB_HEIGHT %u \n" + "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n" + "#define HS_REG_LAST(c) c##%u \n" + "#define HS_KEY_TYPE %s \n" + "#define HS_KEY_WORDS %u \n" + "#define HS_VAL_WORDS 0 \n" + "#define HS_BS_SLABS %u \n" + "#define HS_BS_SLABS_LOG2_RU %u \n" + "#define HS_BC_SLABS_LOG2_MAX %u \n" + "#define HS_FM_SCALE_MIN %u \n" + "#define HS_FM_SCALE_MAX %u \n" + "#define HS_HM_SCALE_MIN %u \n" + "#define HS_HM_SCALE_MAX %u \n" + "#define HS_EMPTY \n" + " \n", + config->warp.lanes_log2, + config->warp.lanes_log2, + config->thread.regs, + config->thread.regs, + (config->type.words == 2) ? 
"ulong" : "uint", + config->type.words, + merge->warps, + msb_idx_u32(pow2_ru_u32(merge->warps)), + bc_max, + config->merge.flip.lo, + config->merge.flip.hi, + config->merge.half.lo, + config->merge.half.hi); + + fprintf(target->state->header, + "#define HS_SLAB_ROWS() \\\n"); + + for (uint32_t ii=1; ii<=config->thread.regs; ii++) + fprintf(target->state->header, + " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + + fprintf(target->state->header, + "#define HS_TRANSPOSE_SLAB() \\\n"); + + for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++) + fprintf(target->state->header, + " HS_TRANSPOSE_STAGE( %u ) \\\n",ii); + + struct hsg_transpose_state state[1] = + { + { .header = target->state->header, + .config = config + } + }; + + hsg_transpose(config->warp.lanes_log2, + config->thread.regs, + hsg_transpose_blend,state, + hsg_transpose_remap,state); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + + hsg_copyright(target->state->source); + + fprintf(target->state->source, + "#include \"hs_cuda_macros.h\" \n" + " \n" + "// \n" + "// \n" + "// \n"); + } + break; + + case HSG_OP_TYPE_TARGET_END: + // decorate the files + fprintf(target->state->header, + "#endif \n" + " \n" + "// \n" + "// \n" + "// \n" + " \n"); + fprintf(target->state->source, + " \n" + "// \n" + "// \n" + "// \n" + " \n"); + + // close files + fclose(target->state->header); + fclose(target->state->source); + + // free state + free(target->state); + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO: + { + fprintf(target->state->source, + "\nHS_TRANSPOSE_KERNEL_PROTO(%u)\n", + config->warp.lanes); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE: + { + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY: + { + fprintf(target->state->source, + "HS_TRANSPOSE_SLAB()\n"); + } + break; + + case 
HSG_OP_TYPE_BS_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const bs = pow2_ru_u32(m->warps); + uint32_t const msb = msb_idx_u32(bs); + + fprintf(target->state->source, + "\nHS_BS_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BS_KERNEL_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bs); + } + + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_BC_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const msb = msb_idx_u32(m->warps); + + fprintf(target->state->source, + "\nHS_BC_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BC_KERNEL_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bc); + } + + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_FM_KERNEL_PROTO: + fprintf(target->state->source, + "\nHS_FM_KERNEL_PROTO(%u,%u)\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_FM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_HM_KERNEL_PROTO: + { + fprintf(target->state->source, + "\nHS_HM_KERNEL_PROTO(%u)\n", + ops->a); + } + break; + + case HSG_OP_TYPE_HM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_HM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD: + { + static char const * const vstr[] = { "vin", "vout" }; + + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = 
HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n", + ops->n,vstr[ops->v],config->warp.lanes,ops->n-1); + } + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n", + config->warp.lanes,ops->n-1,ops->n); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT: + fprintf(target->state->source, + "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED: + { + if (ops->a <= ops->b) + { + fprintf(target->state->source, + "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n"); + } + else if (ops->b > 1) + { + fprintf(target->state->source, + "else if (fm_frac == %u)\n", + ops->b); + } + else + { + fprintf(target->state->source, + "else\n"); + } + } + break; + + case HSG_OP_TYPE_SLAB_FLIP: + fprintf(target->state->source, + "HS_SLAB_FLIP_PREAMBLE(%u);\n", + ops->n-1); + break; + + case HSG_OP_TYPE_SLAB_HALF: + fprintf(target->state->source, + "HS_SLAB_HALF_PREAMBLE(%u);\n", + ops->n / 2); + break; + + case HSG_OP_TYPE_CMP_FLIP: + fprintf(target->state->source, + "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c); + break; + + case HSG_OP_TYPE_CMP_HALF: + fprintf(target->state->source, + 
"HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b); + break; + + case HSG_OP_TYPE_CMP_XCHG: + if (ops->c == UINT32_MAX) + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%-3u,r%-3u);\n", + ops->a,ops->b); + } + else + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%u_%u,r%u_%u);\n", + ops->c,ops->a,ops->c,ops->b); + } + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_V: + fprintf(target->state->source, + "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n", + merge[ops->a].warps,config->warp.lanes,ops->c,ops->b); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,merge[ops->a].warps,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,ops->a,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n", + ops->c, + ops->a, + config->warp.lanes,ops->b); + break; + + case HSG_OP_TYPE_BLOCK_SYNC: + fprintf(target->state->source, + "HS_BLOCK_BARRIER();\n"); + // + // FIXME - Named barriers to allow coordinating warps to 
proceed? + // + break; + + case HSG_OP_TYPE_BS_FRAC_PRED: + { + if (ops->m == 0) + { + fprintf(target->state->source, + "if (warp_idx < bs_full)\n"); + } + else + { + fprintf(target->state->source, + "else if (bs_frac == %u)\n", + ops->w); + } + } + break; + + case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n", + config->warp.lanes,m->warps); + } + break; + + case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n", + config->warp.lanes,config->thread.regs,m->warps); + } + break; + + case HSG_OP_TYPE_BX_MERGE_H_PRED: + fprintf(target->state->source, + "if (get_sub_group_id() < %u)\n", + ops->a); + break; + + case HSG_OP_TYPE_BS_ACTIVE_PRED: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps <= 32) + { + fprintf(target->state->source, + "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n", + m->levels[ops->b].active.b32a2[0]); + } + else + { + fprintf(target->state->source, + "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n", + m->levels[ops->b].active.b32a2[1], + m->levels[ops->b].active.b32a2[0]); + } + } + break; + + default: + fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]); + exit(EXIT_FAILURE); + break; + } +} + +// +// +// diff --git a/src/compute/hs/gen/target_cuda_sm3x.c b/src/compute/hs/gen/target_cuda_sm3x.c deleted file mode 100644 index 6369aa33b0..0000000000 --- a/src/compute/hs/gen/target_cuda_sm3x.c +++ /dev/null @@ -1,776 +0,0 @@ -/* - * Copyright 2016 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#include - -// -// -// - -#include "gen.h" -#include "util.h" - -// -// -// - -void -hsg_target_cuda_sm3x(struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth) -{ - const char* const type = (hsg_config.type.words == 2) ? "uint64_t" : "uint32_t"; - const char* const type_max = (hsg_config.type.words == 2) ? "UINT64_MAX" : "UINT32_MAX"; - - switch (ops->type) - { - case HSG_OP_TYPE_END: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "}\n"); - break; - - case HSG_OP_TYPE_BEGIN: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "{\n"); - break; - - case HSG_OP_TYPE_ELSE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else\n"); - break; - - case HSG_OP_TYPE_FILE_HEADER: - { - uint32_t const bc_min = msb_idx_u32(hsg_config.block.warps_min); - uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps)); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "// \n" - "// Copyright 2016 Google Inc. \n" - "// \n" - "// Use of this source code is governed by a BSD-style \n" - "// license that can be found in the LICENSE file. \n" - "// \n" - " \n" - "#pragma once \n" - " \n" - "#include \n" - " \n" - "#define HS_LANES_PER_WARP %u \n" - "#define HS_BS_WARPS_PER_BLOCK %u \n" - "#define HS_BC_WARPS_LOG2_MIN %u \n" - "#define HS_BC_WARPS_LOG2_MAX %u \n" - "#define HS_KEYS_PER_THREAD %u \n" - "#define HS_KEY_WORDS %u \n" - "#define HS_KEY_TYPE %s \n" - " \n" - "#include <%s_args.h> \n" - " \n", - hsg_config.warp.lanes, - merge->warps, - bc_min, - bc_max, - hsg_config.thread.regs, - hsg_config.type.words, - type, - files[HSG_FILE_TYPE_SOURCE].prefix); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "// -*- compile-command: \"nvcc -arch sm_52 -Xptxas=-v,-abi=no -cubin -I. %s\"; -*-\n", - files[HSG_FILE_TYPE_SOURCE].name); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "// \n" - "// Copyright 2016 Google Inc. 
\n" - "// \n" - "// Use of this source code is governed by a BSD-style \n" - "// license that can be found in the LICENSE file. \n" - "// \n" - " \n" - "#ifdef __cplusplus \n" - "extern \"C\" { \n" - "#endif \n" - " \n" - "#include \"%s_launcher.h\" \n" - " \n" - "#ifdef __cplusplus \n" - "} \n" - "#endif \n" - " \n" - "#include \"%s_launch_bounds.h\" \n" - "#include <%s_finalize.inl> \n" - " \n" - "// \n" - "// \n" - "// \n", - files[HSG_FILE_TYPE_HEADER].prefix, - files[HSG_FILE_TYPE_SOURCE].prefix, - files[HSG_FILE_TYPE_SOURCE].prefix); - } - break; - - case HSG_OP_TYPE_FILE_FOOTER: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "// \n" - "// \n" - "// \n" - " \n" - "#include \"%s_launcher.inl\" \n" - " \n" - "// \n" - "// \n" - "// \n", - files[HSG_FILE_TYPE_SOURCE].prefix); - break; - - case HSG_OP_TYPE_BS_KERNEL_PROTO: - { - const uint32_t tpb = merge->warps * hsg_config.warp.lanes; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "extern \"C\" \n" - "__global__ \n" - "__launch_bounds__(%u,%u) \n" - "void \n" - "hs_bs_kernel(const struct hs_args args) \n", - tpb,1); - } - break; - - case HSG_OP_TYPE_BS_KERNEL_PREAMBLE: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "__shared__ union { \n"); - - for (uint32_t ii=0; iiwarps < 2) - break; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " %s m%u[%u][%u];\n", - type, - ii, - m->rows_bs, - m->warps * hsg_config.warp.lanes); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " struct { \n" - " %s f[%u][%u]; \n" - " %s l[%u]; \n" - " }; \n", - type, - merge[0].warps, - hsg_config.warp.skpw_bs - 1, - type, - merge[0].warps); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "} shared; \n" - " \n"); - - const uint32_t kpw = hsg_config.warp.lanes * hsg_config.thread.regs; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t block_warp_idx = threadIdx.x / %u; \n" - "const int32_t warp_lane_idx = threadIdx.x & %u; \n" - "const int32_t warp_idx = blockIdx.x * %u + block_warp_idx; \n" - "const 
int32_t warp_gmem_idx = warp_idx * %u + warp_lane_idx; \n" - " \n" - "%s const * const vin_ptr = args.vin + warp_gmem_idx; \n" - "%s * const vout_ptr = args.vout + warp_gmem_idx; \n" - " \n", - - hsg_config.warp.lanes, - hsg_config.warp.lanes - 1, - merge[0].warps, - kpw, - type, - type); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (warp_idx >= args.bs.full + args.bs.frac) \n" - " return; \n" - " \n"); - } - break; - - case HSG_OP_TYPE_BC_KERNEL_PROTO: - { - uint32_t const bc_warps = merge[ops->a].warps; - uint32_t const tpb = bc_warps * hsg_config.warp.lanes; - uint32_t const bpm = hsg_config.block.warps_max / bc_warps; - uint32_t const msb = msb_idx_u32(bc_warps); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "extern \"C\" \n" - "__global__ \n" - "__launch_bounds__(%u,%u) \n" - "void \n" - "hs_bc_%u_kernel(const struct hs_args args) \n", - tpb,bpm, - msb); - } - break; - - case HSG_OP_TYPE_BC_KERNEL_PREAMBLE: - { - const struct hsg_merge* const m = merge + ops->a; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "__shared__ union { \n"); - - if (m->warps >= 2) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " %s m%u[%u][%u]; \n", - type, - ops->a, - m->rows_bc, - m->warps * hsg_config.warp.lanes); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " struct { \n" - " %s f[%u][%u]; \n" - " %s l[%u]; \n" - " }; \n" - "} shared; \n" - " \n", - type,m->warps,m->skpw_bc - 1, - type,m->warps); - - const uint32_t kpw = hsg_config.warp.lanes * hsg_config.thread.regs; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t block_warp_idx = threadIdx.x / %u; \n" - "const int32_t warp_lane_idx = threadIdx.x & %u; \n" - "const int32_t warp_gmem_base = blockIdx.x * %u * %u + warp_lane_idx; \n" - "const int32_t warp_gmem_idx = warp_gmem_base + block_warp_idx * %u; \n" - " \n" - "%s * const vout_ptr = args.vout + warp_gmem_idx; \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes - 1, - m->warps,kpw, - kpw, - type); - -#if 0 - // - // NO 
LONGER NEED THIS TEST - // - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (warp_idx >= args.bc.full) \n" - " return; \n" - " \n"); -#endif - } - break; - - case HSG_OP_TYPE_FM_KERNEL_PROTO: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "#define HS_FM_WARPS_LOG2_%u %u \n" - "extern \"C\" \n" - "__global__ \n" - "HS_FM_LAUNCH_BOUNDS_%u \n" - "void \n" - "hs_fm_%u_kernel(const struct hs_args args) \n", - ops->a, - ops->b, - ops->a - ops->b, - ops->a); - break; - - case HSG_OP_TYPE_FM_KERNEL_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t warp_idx = (blockDim.x * blockIdx.x + threadIdx.x) / %u; \n" - "const int32_t warp_lane_idx = threadIdx.x & %u; \n" - " \n" - "const int32_t merge_idx = warp_idx / %u >> %u; \n" - " \n" - "const int32_t merge_stride = %u * %u << %u; \n" - "const int32_t merge_keys = merge_stride * %u; \n" - " \n" - "const int32_t merge_base = merge_idx * merge_keys; \n" - " \n" - "const int32_t merge_l_off = (warp_idx - merge_idx * (%u << %u)) * %u + warp_lane_idx; \n" - "const int32_t merge_l_end = merge_l_off + merge_stride * (%u / 2 - 1); \n" - "%s * const merge_l = args.vout + merge_base + merge_l_off; \n" - " \n" - "const int32_t merge_r_off = merge_keys - merge_l_end - 1; \n" - "%s * const merge_r = args.vout + merge_base + merge_r_off; \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes-1, - hsg_config.thread.regs,ops->b, - hsg_config.thread.regs,hsg_config.warp.lanes,ops->b, - ops->a, - hsg_config.thread.regs,ops->b,hsg_config.warp.lanes, - ops->a, - type, - type); - break; - - case HSG_OP_TYPE_HM_KERNEL_PROTO: - { - const uint32_t bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps)); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "#define HS_HM_WARPS_LOG2_%u %u \n" - "extern \"C\" \n" - "__global__ \n" - "HS_HM_LAUNCH_BOUNDS_%u \n" - "void \n" - "hs_hm_%u_kernel(const struct hs_args args) \n", - ops->a, - ops->b, - ops->a - ops->b - bc_max, - ops->a); - } - break; - - case 
HSG_OP_TYPE_HM_KERNEL_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t warp_idx = (blockDim.x * blockIdx.x + threadIdx.x) / %u; \n" - "const int32_t warp_lane_idx = threadIdx.x & %u; \n" - " \n" - "const int32_t merge_idx = (warp_idx / %u) >> %u; \n" - " \n" - "const int32_t merge_stride = %u * %u << %u; \n" - "const int32_t merge_keys = merge_stride * %u; \n" - " \n" - "const int32_t merge_base = merge_idx * merge_keys; \n" - " \n" - "const int32_t merge_off = (warp_idx - merge_idx * (%u << %u)) * %u; \n" - "%s * const merge_ptr = args.vout + merge_base + merge_off + warp_lane_idx; \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes-1, - hsg_config.thread.regs,ops->b, - hsg_config.thread.regs,hsg_config.warp.lanes,ops->b, - ops->a, - hsg_config.thread.regs,ops->b,hsg_config.warp.lanes, - type); - break; - - case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD: - { - static const char* const vstr[] = { "vin_ptr", "vout_ptr" }; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%-3u = %s[%-3u * %u]; \n", - type,ops->n,vstr[ops->v],ops->n-1,hsg_config.warp.lanes); - } - break; - - case HSG_OP_TYPE_BX_REG_GLOBAL_STORE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "vout_ptr[%-3u * %u] = r%u; \n", - ops->n-1,hsg_config.warp.lanes,ops->n); - break; - -#if 0 - case HSG_OP_TYPE_BX_WARP_STORE_PRED: - if (ops->a == 1) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (!args.is_final) \n"); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (blockIdx.x * %u + block_warp_idx >= args.bx.ru) \n" - "{ \n" - " return; \n" - "} \n" - "else if (!args.is_final) \n", - ops->a); - } - break; -#endif - - case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%-3u = merge_ptr[%-3u * merge_stride];\n", - type,ops->a,ops->b); - break; - - case HSG_OP_TYPE_HM_REG_GLOBAL_STORE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_ptr[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case 
HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%-3u = merge_l[%-3u * merge_stride];\n", - type,ops->a,ops->b); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_l[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%-3u = merge_r[%-3u * merge_stride];\n", - type,ops->a,ops->b); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_r[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case HSG_OP_TYPE_WARP_FLIP: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t flip_lane_mask = %u; \n" - "const int32_t flip_lane_idx = warp_lane_idx ^ flip_lane_mask; \n" - "const bool t_lt = warp_lane_idx < flip_lane_idx; \n", - ops->n-1); - } - break; - - case HSG_OP_TYPE_WARP_HALF: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t half_lane_mask = %u; \n" - "const int32_t half_lane_idx = warp_lane_idx ^ half_lane_mask; \n" - "const bool t_lt = warp_lane_idx < half_lane_idx; \n", - ops->n / 2); - } - break; - - case HSG_OP_TYPE_CMP_FLIP: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_FLIP(r%-3u,r%-3u,r%-3u)\n",ops->a,ops->b,ops->c); - break; - - case HSG_OP_TYPE_CMP_HALF: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_HALF(r%-3u,r%-3u)\n",ops->a,ops->b); - break; - - case HSG_OP_TYPE_CMP_XCHG: - if (ops->c == UINT32_MAX) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_XCHG(r%-3u,r%-3u)\n", - ops->a,ops->b); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_XCHG(r%u_%u,r%u_%u)\n", - ops->c,ops->a,ops->c,ops->b); - } - break; - - case HSG_OP_TYPE_BS_REG_SHARED_STORE_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "smem_v[%-3u * %-2u * %-3u] = r%u;\n", - ops->a,hsg_config.warp.lanes,ops->c,ops->b); - break; - - case 
HSG_OP_TYPE_BS_REG_SHARED_LOAD_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "r%-3u = smem_v[%-3u * %-2u * %-3u];\n", - ops->b,ops->a,hsg_config.warp.lanes,ops->c); - break; - - case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%-3u = smem_v[%-3u * %-2u * %-3u];\n", - type,ops->b,ops->a,hsg_config.warp.lanes,ops->c); - break; - - case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "smem_l[%5u] = r%u_%u;\n", - ops->b * hsg_config.warp.lanes, - ops->c, - ops->a); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "smem_r[%5u] = r%u_%u;\n", - ops->b * hsg_config.warp.lanes, - ops->c, - ops->a); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%u_%-3u = smem_l[%u];\n", - type, - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%u_%-3u = smem_r[%u];\n", - type, - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%u_%-3u = gmem_l[%u];\n", - type, - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - -#if 0 - case HSG_OP_TYPE_REG_F_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s* const f_%u_smem_st_ptr = &shared.f[block_warp_idx]", - type, - ops->a); - - if (ops->a >= (int32_t)hsg_config.warp.lanes) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "[warp_lane_idx * %u];\n", - (ops->a / hsg_config.warp.lanes) * hsg_config.warp.lanes + 1); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "[(warp_lane_idx & 0x%X) * %u + (warp_lane_idx & ~0x%X)];\n", - ops->a-1, - hsg_config.warp.lanes + 1, - ops->a-1); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const %s* const f_%u_smem_ld_ptr = &shared.f[block_warp_idx][warp_lane_idx];\n", - 
type, - ops->a); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s* const f_%u_gmem_st_ptr = args.vout + warp_gmem_idx", - type, - ops->a); - - if (ops->a >= (int32_t)hsg_config.warp.lanes) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file,";\n"); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " - warp_lane_idx + (warp_lane_idx & ~0x%X) * %u + (warp_lane_idx & 0x%X);\n", - ops->a-1, - hsg_config.thread.regs, - ops->a-1); - } - break; - - case HSG_OP_TYPE_REG_SHARED_STORE_F: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "f_%u_smem_st_ptr[%-3u] = r%u;\n", - ops->c, - ops->b, - ops->a); - break; - - case HSG_OP_TYPE_REG_SHARED_LOAD_F: - if (ops->c >= (int32_t)hsg_config.warp.lanes) - { - uint32_t const adjacent = ops->c / hsg_config.warp.lanes; - uint32_t const stride = adjacent * hsg_config.warp.lanes + 1; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "r%-3u = f_%u_smem_ld_ptr[%-3u];\n", - ops->a, - ops->c, - (ops->b / adjacent) * stride + (ops->b % adjacent) * hsg_config.warp.lanes); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "r%-3u = f_%u_smem_ld_ptr[%-3u];\n", - ops->a, - ops->c, - ops->b * (hsg_config.warp.lanes + 1)); - } - break; - - case HSG_OP_TYPE_REG_GLOBAL_STORE_F: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "f_%u_gmem_st_ptr[%-3u * %u + %-3u] = r%u;\n", - ops->c, - ops->b, - hsg_config.thread.regs, // hsg_config.warp.lanes, - (ops->a - 1) & ~(ops->c - 1), - ops->a); - break; -#endif - -#if 0 - case HSG_OP_TYPE_FINALIZE: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_FINALIZE(%s,args,shared.f[block_warp_idx],shared.l,\n" - " block_warp_idx,warp_lane_idx,warp_gmem_idx,\n" - " r%-3u", - ops->a == 1 ? 
"true" : "false", - 1); - -#define HS_WARP_FINALIZE_PRETTY_PRINT 8 - - for (uint32_t r=2; r<=hsg_config.thread.regs; r++) - { - if (r % HS_WARP_FINALIZE_PRETTY_PRINT == 1) - fprintf(files[HSG_FILE_TYPE_SOURCE].file,",\n"); - else - fprintf(files[HSG_FILE_TYPE_SOURCE].file,","); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file,"r%-3u",r); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file,");\n"); - } - break; -#endif - - case HSG_OP_TYPE_BLOCK_SYNC: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "__syncthreads();\n"); - break; - - case HSG_OP_TYPE_BS_FRAC_PRED: - { - if (ops->m == 0) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (warp_idx < args.bs.full)\n"); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else if (args.bs.frac == %u)\n", - ops->w); - } - } - break; - -#if 0 // DELETED - case HSG_OP_TYPE_BX_MERGE_V_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s * const smem_v = shared.m%u[0] + threadIdx.x; \n", - type,ops->a); - break; -#endif - - case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE: - if (ops->c == 0) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s * smem_l = shared.m%u[block_warp_idx ] + warp_lane_idx; \n" - "%s * smem_r = shared.m%u[block_warp_idx ^ 1] + (warp_lane_idx ^ %u); \n", - type,ops->a, - type,ops->a,hsg_config.warp.lanes-1); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "smem_l = shared.m%u[block_warp_idx ] + warp_lane_idx; \n" - "smem_r = shared.m%u[block_warp_idx ^ 1] + (warp_lane_idx ^ %u); \n", - ops->a, - ops->a,hsg_config.warp.lanes-1); - } - break; - - case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s const * const gmem_l = args.vout + (warp_gmem_base + block_warp_idx * %u); \n" - "%s * const smem_l = shared.m%u[block_warp_idx] + warp_lane_idx; \n" - "%s * const smem_v = shared.m%u[0] + threadIdx.x; \n", - type,hsg_config.warp.lanes, - type,ops->a, - type,ops->a); - break; - - case HSG_OP_TYPE_BX_MERGE_H_PRED: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if 
(threadIdx.x < %u)\n", - ops->a * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BS_ACTIVE_PRED: - { - const struct hsg_merge* const m = merge + ops->a; - - if (m->warps <= 32) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (((1u << block_warp_idx) & 0x%08X) != 0)\n", - m->levels[ops->b].active.b32a2[0]); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (((1UL << block_warp_idx) & 0x%08X%08XL) != 0L)\n", - m->levels[ops->b].active.b32a2[1], - m->levels[ops->b].active.b32a2[0]); - } - } - break; - - case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED: - { - if (ops->a == ops->b) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (merge_idx < args.fm.full) \n"); - } - else if (ops->b > 1) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else if (args.fm.frac == %u) \n", - ops->b); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else\n"); - } - } - break; - - default: - hsg_target_debug(files,merge,ops,depth); - break; - } -} - -// -// -// diff --git a/src/compute/hs/gen/target_debug.c b/src/compute/hs/gen/target_debug.c new file mode 100644 index 0000000000..1481ca8041 --- /dev/null +++ b/src/compute/hs/gen/target_debug.c @@ -0,0 +1,73 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#include +#include + +// +// +// + +#include "gen.h" + +// +// +// + +#define HSG_INDENT 2 + +// +// +// + +struct hsg_target_state +{ + FILE * txt; +}; + +// +// +// + +void +hsg_target_indent(struct hsg_target * const target, uint32_t const depth) +{ + fprintf(target->state->txt, + "%*s", + depth*HSG_INDENT,""); +} + +void +hsg_target_debug(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth) +{ + if (ops->type == HSG_OP_TYPE_TARGET_BEGIN) + { + target->state = malloc(sizeof(*target->state)); + fopen_s(&target->state->txt,"hs_debug.txt","wb"); + } + + hsg_target_indent(target,depth); + + fprintf(target->state->txt, + "%s\n", + hsg_op_type_string[ops->type]); + + if (ops->type == HSG_OP_TYPE_TARGET_END) + { + fclose(target->state->txt); + free(target->state); + } +} + +// +// +// diff --git a/src/compute/hs/gen/target_glsl.c b/src/compute/hs/gen/target_glsl.c new file mode 100644 index 0000000000..2bb75797ab --- /dev/null +++ b/src/compute/hs/gen/target_glsl.c @@ -0,0 +1,674 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#include +#include + +// +// +// + +#include "gen.h" +#include "transpose.h" + +#include "common/util.h" +#include "common/macros.h" + +// +// +// + +struct hsg_transpose_state +{ + FILE * header; + struct hsg_config const * config; +}; + +static +char +hsg_transpose_reg_prefix(uint32_t const cols_log2) +{ + return 'a' + (('r' + cols_log2 - 'a') % 26); +} + +static +void +hsg_transpose_blend(uint32_t const cols_log2, + uint32_t const row_ll, // lower-left + uint32_t const row_ur, // upper-right + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(cols_log2-1), + hsg_transpose_reg_prefix(cols_log2), + cols_log2,row_ll+1,row_ur+1); +} + +static +void +hsg_transpose_remap(uint32_t const row_from, + uint32_t const row_to, + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(state->config->warp.lanes_log2), + row_from+1,row_to+1); +} + +// +// +// + +static +void +hsg_copyright(FILE * file) +{ + fprintf(file, + "// \n" + "// Copyright 2016 Google Inc. \n" + "// \n" + "// Use of this source code is governed by a BSD-style \n" + "// license that can be found in the LICENSE file. 
\n" + "// \n" + "\n"); +} + +static +void +hsg_macros(FILE * file) +{ + fprintf(file, + "#include \"hs_glsl_macros.h\" \n" + " \n" + "// \n" + "// \n" + "// \n" + "\n"); +} + +// +// +// + +struct hsg_target_state +{ + FILE * header; + FILE * embeds; + FILE * source; +}; + +// +// +// + +void +hsg_target_glsl(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth) +{ + switch (ops->type) + { + case HSG_OP_TYPE_END: + fprintf(target->state->source, + "}\n"); + + if (depth == 0) { + fclose(target->state->source); + target->state->source = NULL; + } + break; + + case HSG_OP_TYPE_BEGIN: + fprintf(target->state->source, + "{\n"); + break; + + case HSG_OP_TYPE_ELSE: + fprintf(target->state->source, + "else\n"); + break; + + case HSG_OP_TYPE_TARGET_BEGIN: + { + // allocate state + target->state = malloc(sizeof(*target->state)); + + // allocate files + fopen_s(&target->state->header,"hs_glsl.h", "wb"); + fopen_s(&target->state->embeds,"hs_kernels.h","wb"); + + hsg_copyright(target->state->header); + hsg_copyright(target->state->embeds); + + // initialize header + uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps)); + + fprintf(target->state->header, + "#ifndef HS_GLSL_ONCE \n" + "#define HS_GLSL_ONCE \n" + " \n" + "#define HS_SLAB_THREADS_LOG2 %u \n" + "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n" + "#define HS_SLAB_WIDTH_LOG2 %u \n" + "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n" + "#define HS_SLAB_HEIGHT %u \n" + "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n" + "#define HS_REG_LAST(c) c##%u \n" + "#define HS_KEY_TYPE %s \n" + "#define HS_KEY_WORDS %u \n" + "#define HS_VAL_WORDS 0 \n" + "#define HS_BS_SLABS %u \n" + "#define HS_BS_SLABS_LOG2_RU %u \n" + "#define HS_BC_SLABS_LOG2_MAX %u \n" + "#define HS_FM_SCALE_MIN %u \n" + "#define HS_FM_SCALE_MAX %u \n" + "#define HS_HM_SCALE_MIN %u \n" + "#define HS_HM_SCALE_MAX 
%u \n" + "#define HS_EMPTY \n" + " \n", + config->warp.lanes_log2, + config->warp.lanes_log2, + config->thread.regs, + config->thread.regs, + (config->type.words == 2) ? "uint64_t" : "uint32_t", + config->type.words, + merge->warps, + msb_idx_u32(pow2_ru_u32(merge->warps)), + bc_max, + config->merge.flip.lo, + config->merge.flip.hi, + config->merge.half.lo, + config->merge.half.hi); + + fprintf(target->state->header, + "#define HS_SLAB_ROWS() \\\n"); + + for (uint32_t ii=1; ii<=config->thread.regs; ii++) + fprintf(target->state->header, + " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + + fprintf(target->state->header, + "#define HS_TRANSPOSE_SLAB() \\\n"); + + for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++) + fprintf(target->state->header, + " HS_TRANSPOSE_STAGE( %u ) \\\n",ii); + + struct hsg_transpose_state state[1] = + { + { .header = target->state->header, + .config = config + } + }; + + hsg_transpose(config->warp.lanes_log2, + config->thread.regs, + hsg_transpose_blend,state, + hsg_transpose_remap,state); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + +#if 0 + fprintf(target->state->source, + "#include \n" + " \n" + "// \n" + "// \n" + "// \n"); +#endif + } + break; + + case HSG_OP_TYPE_TARGET_END: + // decorate the files + fprintf(target->state->header, + "#endif \n" + " \n" + "// \n" + "// \n" + "// \n" + " \n"); + + // close files + fclose(target->state->header); + fclose(target->state->embeds); + + // free state + free(target->state); + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO: + { + fprintf(target->state->embeds, + "#include \"hs_transpose.len.xxd\"\n,\n" + "#include \"hs_transpose.spv.xxd\"\n,\n"); + + fopen_s(&target->state->source,"hs_transpose.comp","w+"); + + hsg_copyright(target->state->source); + + hsg_macros(target->state->source); + + fprintf(target->state->source, + "HS_TRANSPOSE_KERNEL_PROTO(%u)\n", + config->warp.lanes); + } + break; + + case 
HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE: + { + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY: + { + fprintf(target->state->source, + "HS_TRANSPOSE_SLAB()\n"); + } + break; + + case HSG_OP_TYPE_BS_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const bs = pow2_ru_u32(m->warps); + uint32_t const msb = msb_idx_u32(bs); + + fprintf(target->state->embeds, + "#include \"hs_bs_%u.len.xxd\"\n,\n" + "#include \"hs_bs_%u.spv.xxd\"\n,\n", + msb, + msb); + + char filename[] = { "hs_bs_123.comp" }; + sprintf_s(filename,sizeof(filename),"hs_bs_%u.comp",msb); + fopen_s(&target->state->source,filename,"w+"); + + hsg_copyright(target->state->source); + + hsg_macros(target->state->source); + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bs); + } + + fprintf(target->state->source, + "HS_BS_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BS_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + break; + + case HSG_OP_TYPE_BC_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const msb = msb_idx_u32(m->warps); + + fprintf(target->state->embeds, + "#include \"hs_bc_%u.len.xxd\"\n,\n" + "#include \"hs_bc_%u.spv.xxd\"\n,\n", + msb, + msb); + + char filename[] = { "hs_bc_123.comp" }; + sprintf_s(filename,sizeof(filename),"hs_bc_%u.comp",msb); + fopen_s(&target->state->source,filename,"w+"); + + hsg_copyright(target->state->source); + + hsg_macros(target->state->source); + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bc); + } + + fprintf(target->state->source, + 
"HS_BC_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BC_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + break; + + case HSG_OP_TYPE_FM_KERNEL_PROTO: + { + fprintf(target->state->embeds, + "#include \"hs_fm_%u_%u.len.xxd\"\n,\n" + "#include \"hs_fm_%u_%u.spv.xxd\"\n,\n", + ops->a,ops->b, + ops->a,ops->b); + + char filename[] = { "hs_fm_123_123.comp" }; + sprintf_s(filename,sizeof(filename),"hs_fm_%u_%u.comp",ops->a,ops->b); + fopen_s(&target->state->source,filename,"w+"); + + hsg_copyright(target->state->source); + + hsg_macros(target->state->source); + + fprintf(target->state->source, + "HS_FM_KERNEL_PROTO(%u,%u)\n", + ops->a,ops->b); + } + break; + + case HSG_OP_TYPE_FM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_FM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_HM_KERNEL_PROTO: + { + fprintf(target->state->embeds, + "#include \"hs_hm_%u_%u.len.xxd\"\n,\n" + "#include \"hs_hm_%u_%u.spv.xxd\"\n,\n", + ops->a,ops->b, + ops->a,ops->b); + + char filename[] = { "hs_hm_123_123.comp" }; + sprintf_s(filename,sizeof(filename),"hs_hm_%u_%u.comp",ops->a,ops->b); + fopen_s(&target->state->source,filename,"w+"); + + hsg_copyright(target->state->source); + + hsg_macros(target->state->source); + + fprintf(target->state->source, + "HS_HM_KERNEL_PROTO(%u)\n", + ops->a); + } + break; + + case HSG_OP_TYPE_HM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_HM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD: + { + static char const * const vstr[] = { "vin", "vout" }; + + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n", + ops->n,vstr[ops->v],config->warp.lanes,ops->n-1); + } + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n", + config->warp.lanes,ops->n-1,ops->n); + break; + + 
case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT: + fprintf(target->state->source, + "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED: + { + if (ops->a <= ops->b) + { + fprintf(target->state->source, + "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n"); + } + else if (ops->b > 1) + { + fprintf(target->state->source, + "else if (fm_frac == %u)\n", + ops->b); + } + else + { + fprintf(target->state->source, + "else\n"); + } + } + break; + + case HSG_OP_TYPE_SLAB_FLIP: + fprintf(target->state->source, + "HS_SLAB_FLIP_PREAMBLE(%u);\n", + ops->n-1); + break; + + case HSG_OP_TYPE_SLAB_HALF: + fprintf(target->state->source, + "HS_SLAB_HALF_PREAMBLE(%u);\n", + ops->n / 2); + break; + + case HSG_OP_TYPE_CMP_FLIP: + fprintf(target->state->source, + "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c); + break; + + case HSG_OP_TYPE_CMP_HALF: + fprintf(target->state->source, + "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b); + break; + + case HSG_OP_TYPE_CMP_XCHG: + if (ops->c == UINT32_MAX) + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%-3u,r%-3u);\n", + ops->a,ops->b); + } + else + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%u_%u,r%u_%u);\n", + 
ops->c,ops->a,ops->c,ops->b); + } + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_V: + fprintf(target->state->source, + "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n", + merge[ops->a].warps,config->warp.lanes,ops->c,ops->b); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,merge[ops->a].warps,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,ops->a,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n", + ops->c, + ops->a, + config->warp.lanes,ops->b); + break; + + case HSG_OP_TYPE_BLOCK_SYNC: + fprintf(target->state->source, + "HS_BLOCK_BARRIER();\n"); + // + // FIXME - Named barriers to allow coordinating warps to proceed? 
+ // + break; + + case HSG_OP_TYPE_BS_FRAC_PRED: + { + if (ops->m == 0) + { + fprintf(target->state->source, + "if (warp_idx < bs_full)\n"); + } + else + { + fprintf(target->state->source, + "else if (bs_frac == %u)\n", + ops->w); + } + } + break; + + case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n", + config->warp.lanes,m->warps); + } + break; + + case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n", + config->warp.lanes,config->thread.regs,m->warps); + } + break; + + case HSG_OP_TYPE_BX_MERGE_H_PRED: + fprintf(target->state->source, + "if (get_sub_group_id() < %u)\n", + ops->a); + break; + + case HSG_OP_TYPE_BS_ACTIVE_PRED: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps <= 32) + { + fprintf(target->state->source, + "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n", + m->levels[ops->b].active.b32a2[0]); + } + else + { + fprintf(target->state->source, + "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n", + m->levels[ops->b].active.b32a2[1], + m->levels[ops->b].active.b32a2[0]); + } + } + break; + + default: + fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]); + exit(EXIT_FAILURE); + break; + } +} + +// +// +// diff --git a/src/compute/hs/gen/target_igp_genx.c b/src/compute/hs/gen/target_igp_genx.c deleted file mode 100644 index 3d0f2bc1b8..0000000000 --- a/src/compute/hs/gen/target_igp_genx.c +++ /dev/null @@ -1,672 +0,0 @@ -/* - * Copyright 2016 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -#include - -// -// -// - -#include "gen.h" -#include "util.h" -#include "macros.h" -#include "transpose.h" - -// -// -// - -static -char -hsg_transpose_reg_prefix(uint32_t const cols_log2) -{ - return 'a' + (('r' + cols_log2 - 'a') % 26); -} - -static -void -hsg_transpose_blend(uint32_t const cols_log2, - uint32_t const row_ll, // lower-left - uint32_t const row_ur, // upper-right - FILE * file) -{ - // we're starting register names at '1' for now - fprintf(file, - " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n", - hsg_transpose_reg_prefix(cols_log2-1), - hsg_transpose_reg_prefix(cols_log2), - cols_log2,row_ll+1,row_ur+1); -} - -static -void -hsg_transpose_remap(uint32_t const row_from, - uint32_t const row_to, - FILE * file) -{ - // we're starting register names at '1' for now - fprintf(file, - " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n", - hsg_transpose_reg_prefix(msb_idx_u32(hsg_config.warp.lanes)), - row_from+1,row_to+1); -} - -// -// -// - -void -hsg_target_igp_genx(struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth) -{ - switch (ops->type) - { - case HSG_OP_TYPE_END: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "}\n"); - break; - - case HSG_OP_TYPE_BEGIN: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "{\n"); - break; - - case HSG_OP_TYPE_ELSE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else\n"); - break; - - case HSG_OP_TYPE_FILE_HEADER: - { - uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps)); - uint32_t const warp_lanes_log2 = msb_idx_u32(hsg_config.warp.lanes); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "// \n" - "// Copyright 2016 Google Inc. \n" - "// \n" - "// Use of this source code is governed by a BSD-style \n" - "// license that can be found in the LICENSE file. 
\n" - "// \n" - " \n" - "#ifndef HS_CL_ONCE \n" - "#define HS_CL_ONCE \n" - " \n" - "#define HS_LANES_PER_WARP_LOG2 %u \n" - "#define HS_LANES_PER_WARP (1 << HS_LANES_PER_WARP_LOG2) \n" - "#define HS_BS_WARPS %u \n" - "#define HS_BS_WARPS_LOG2_RU %u \n" - "#define HS_BC_WARPS_LOG2_MAX %u \n" - "#define HS_FM_BLOCKS_LOG2_MIN %u \n" - "#define HS_HM_BLOCKS_LOG2_MIN %u \n" - "#define HS_KEYS_PER_LANE %u \n" - "#define HS_REG_LAST(c) c##%u \n" - "#define HS_KEY_WORDS %u \n" - "#define HS_KEY_TYPE %s \n" - "#define HS_EMPTY \n" - " \n", - warp_lanes_log2, - merge->warps, - msb_idx_u32(pow2_ru_u32(merge->warps)), - bc_max, - hsg_config.merge.flip.lo, - hsg_config.merge.half.lo, - hsg_config.thread.regs, - hsg_config.thread.regs, - hsg_config.type.words, - (hsg_config.type.words == 2) ? "ulong" : "uint"); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "#define HS_SLAB_ROWS() \\\n"); - - for (uint32_t ii=1; ii<=hsg_config.thread.regs; ii++) - fprintf(files[HSG_FILE_TYPE_HEADER].file, - " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - " HS_EMPTY\n" - " \n"); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "#define HS_TRANSPOSE_SLAB() \\\n"); - - for (uint32_t ii=1; ii<=warp_lanes_log2; ii++) - fprintf(files[HSG_FILE_TYPE_HEADER].file, - " HS_TRANSPOSE_STAGE( %u ) \\\n",ii); - - hsg_transpose(msb_idx_u32(hsg_config.warp.lanes), - hsg_config.thread.regs, - files[HSG_FILE_TYPE_HEADER].file, - files[HSG_FILE_TYPE_HEADER].file, - hsg_transpose_blend, - hsg_transpose_remap); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - " HS_EMPTY\n" - " \n"); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "// \n" - "// Copyright 2016 Google Inc. \n" - "// \n" - "// Use of this source code is governed by a BSD-style \n" - "// license that can be found in the LICENSE file. 
\n" - "// \n" - " \n" - "#include <%s_macros.h> \n" - " \n" - "// \n" - "// \n" - "// \n", - files[HSG_FILE_TYPE_SOURCE].prefix); - } - break; - - case HSG_OP_TYPE_FILE_FOOTER: - fprintf(files[HSG_FILE_TYPE_HEADER].file, - " \n" - "#endif \n" - " \n" - "// \n" - "// \n" - "// \n" - " \n"); - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "// \n" - "// \n" - "// \n" - " \n"); - break; - - case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "__kernel \n" - "__attribute__((intel_reqd_sub_group_size(%u))) \n" - "void hs_kernel_transpose(__global HS_KEY_TYPE * const restrict vout) \n", - hsg_config.warp.lanes); - } - break; - - case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const global_id = get_global_id(0); \n" - "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes * hsg_config.thread.regs, - hsg_config.warp.lanes-1); - } - break; - - case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_TRANSPOSE_SLAB()\n"); - } - break; - - case HSG_OP_TYPE_BS_KERNEL_PROTO: - { - struct hsg_merge const * const m = merge + ops->a; - - uint32_t const tpb = m->warps * hsg_config.warp.lanes; - uint32_t const bs = pow2_ru_u32(m->warps); - uint32_t const msb = msb_idx_u32(bs); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "__kernel \n" - "__attribute__((reqd_work_group_size(%u,1,1))) \n" - "__attribute__((intel_reqd_sub_group_size(%u))) \n" - "void hs_kernel_bs_%u(__global HS_KEY_TYPE const * const restrict vin, \n" - " __global HS_KEY_TYPE * const restrict vout) \n", - tpb, - hsg_config.warp.lanes, - msb); - } - break; - - case HSG_OP_TYPE_BS_KERNEL_PREAMBLE: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "__local union { \n"); - - struct hsg_merge const * const m = merge + ops->a; - - if (m->warps > 1) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " 
HS_KEY_TYPE m[%u * %u];\n", - m->rows_bs, - m->warps * hsg_config.warp.lanes); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "} shared; \n" - " \n"); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const global_id = get_global_id(0); \n" - "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes * hsg_config.thread.regs, - hsg_config.warp.lanes-1); - } - break; - - case HSG_OP_TYPE_BC_KERNEL_PROTO: - { - uint32_t const bc_max = pow2_rd_u32(merge[0].warps); - uint32_t const tpb = bc_max * hsg_config.warp.lanes; - uint32_t const msb = msb_idx_u32(merge[ops->a].warps); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "__kernel \n" - "__attribute__((intel_reqd_sub_group_size(%u))) \n" - "void hs_kernel_bc_%u(__global HS_KEY_TYPE * const restrict vout) \n", - hsg_config.warp.lanes,msb); - } - break; - - case HSG_OP_TYPE_BC_KERNEL_PREAMBLE: - { - struct hsg_merge const * const m = merge + ops->a; - uint32_t const bc_max = pow2_rd_u32(merge[0].warps); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "__local union { \n"); - - if (m->warps > 1) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " HS_KEY_TYPE m[%-3u * %u];\n", - m->rows_bc, - m->warps * hsg_config.warp.lanes); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "} shared; \n" - " \n"); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const global_id = get_global_id(0); \n" - "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes * hsg_config.thread.regs, - hsg_config.warp.lanes-1); - } - break; - - case HSG_OP_TYPE_FM_KERNEL_PROTO: - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "#define HS_FM_BLOCKS_LOG2_%-2u %u \n", - ops->a,ops->b); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "__kernel \n" - "__attribute__((intel_reqd_sub_group_size(%u))) \n" - "void hs_kernel_fm_%u(__global HS_KEY_TYPE * const restrict vout, \n" - " uint const 
fm_full, \n" - " uint const fm_frac) \n", - hsg_config.warp.lanes,ops->a); - break; - - case HSG_OP_TYPE_FM_KERNEL_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const global_id = (uint)get_global_id(0); \n" - "uint const warp_idx = global_id / %u; \n" - "uint const warp_lane_idx = global_id & %u; \n" - " \n" - "uint const merge_idx = warp_idx / %u >> %u; \n" - " \n" - "uint const merge_stride = %u * %u << %u; \n" - "uint const merge_keys = merge_stride * %u; \n" - " \n" - "uint const merge_base = merge_idx * merge_keys; \n" - " \n" - "uint const merge_l_off = (warp_idx - merge_idx * (%u << %u)) * %u + warp_lane_idx; \n" - "uint const merge_l_end = merge_stride * (%u / 2 - 1) + merge_l_off; \n" - " \n" - "int const merge_r_off = merge_keys - merge_l_end - 1; \n" - " \n" - "__global HS_KEY_TYPE * const restrict merge_l = vout + (merge_base + merge_l_off); \n" - "__global HS_KEY_TYPE * const restrict merge_r = vout + (merge_base + merge_r_off); \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes-1, - hsg_config.thread.regs,ops->b, - hsg_config.thread.regs,hsg_config.warp.lanes,ops->b, - ops->a, - hsg_config.thread.regs,ops->b,hsg_config.warp.lanes, - ops->a); - break; - - case HSG_OP_TYPE_HM_KERNEL_PROTO: - { - uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps)); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "#define HS_HM_BLOCKS_LOG2_%-2u %u \n", - ops->a,ops->b); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "__kernel \n" - "__attribute__((intel_reqd_sub_group_size(%u))) \n" - "void hs_kernel_hm_%u(__global HS_KEY_TYPE * const restrict vout) \n", - hsg_config.warp.lanes,ops->a); - } - break; - - case HSG_OP_TYPE_HM_KERNEL_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const global_id = (uint)get_global_id(0); \n" - "uint const warp_idx = global_id / %u; \n" - "uint const warp_lane_idx = global_id & %u; \n" - " \n" - "uint const merge_idx = (warp_idx / %u) >> %u; \n" - " \n" - "uint const merge_stride 
= %u * %u << %u; \n" - "uint const merge_keys = merge_stride * %u; \n" - " \n" - "uint const merge_base = merge_idx * merge_keys; \n" - "uint const merge_off = (warp_idx - merge_idx * (%u << %u)) * %u; \n" - " \n" - "__global HS_KEY_TYPE * const restrict merge_ptr = vout + (merge_base + merge_off + warp_lane_idx); \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes-1, - hsg_config.thread.regs,ops->b, - hsg_config.thread.regs,hsg_config.warp.lanes,ops->b, - ops->a, - hsg_config.thread.regs,ops->b,hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD: - { - static char const * const vstr[] = { "vin", "vout" }; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%-3u = (%s + gmem_idx)[%-3u * %u]; \n", - ops->n,vstr[ops->v],ops->n-1,hsg_config.warp.lanes); - } - break; - - case HSG_OP_TYPE_BX_REG_GLOBAL_STORE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "(vout + gmem_idx)[%-3u * %u] = r%u; \n", - ops->n-1,hsg_config.warp.lanes,ops->n); - break; - - case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%-3u = merge_ptr[%-3u * merge_stride];\n", - ops->a,ops->b); - break; - - case HSG_OP_TYPE_HM_REG_GLOBAL_STORE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_ptr[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%-3u = merge_l[%-3u * merge_stride];\n", - ops->a,ops->b); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_l[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%-3u = merge_r[%-3u * merge_stride];\n", - ops->a,ops->b); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_r[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - 
- case HSG_OP_TYPE_WARP_FLIP: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const flip_lane_mask = %u; \n" - "uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; \n" - "int const t_lt = get_sub_group_local_id() < flip_lane_idx; \n", - ops->n-1); - break; - - case HSG_OP_TYPE_WARP_HALF: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const half_lane_mask = %u; \n" - "uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; \n" - "int const t_lt = get_sub_group_local_id() < half_lane_idx; \n", - ops->n / 2); - break; - - case HSG_OP_TYPE_CMP_FLIP: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_FLIP(%-3u,r%-3u,r%-3u)\n",ops->a,ops->b,ops->c); - break; - - case HSG_OP_TYPE_CMP_HALF: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_HALF(%-3u,r%-3u)\n",ops->a,ops->b); - break; - - case HSG_OP_TYPE_CMP_XCHG: - if (ops->c == UINT32_MAX) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_XCHG(r%-3u,r%-3u)\n", - ops->a,ops->b); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_XCHG(r%u_%u,r%u_%u)\n", - ops->c,ops->a,ops->c,ops->b); - } - break; - - case HSG_OP_TYPE_BS_REG_SHARED_STORE_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "(shared.m + get_local_id(0))[%-3u * %-2u * %-3u] = r%u;\n", - merge[ops->a].warps,hsg_config.warp.lanes,ops->c,ops->b); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "r%-3u = (shared.m + get_local_id(0))[%-3u * %-2u * %-3u];\n", - ops->b,merge[ops->a].warps,hsg_config.warp.lanes,ops->c); - break; - - case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%-3u = (shared.m + get_local_id(0))[%-3u * %-2u * %-3u];\n", - ops->b,ops->a,hsg_config.warp.lanes,ops->c); - break; - - case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "(shared.m + smem_l_idx)[%5u] = r%u_%u;\n", - ops->b * hsg_config.warp.lanes, - ops->c, - ops->a); - break; - - 
case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "(shared.m + smem_r_idx)[%5u] = r%u_%u;\n", - ops->b * hsg_config.warp.lanes, - ops->c, - ops->a); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%u_%-3u = (shared.m + smem_l_idx)[%u];\n", - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%u_%-3u = (shared.m + smem_r_idx)[%u];\n", - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%u_%-3u = (vout + gmem_l_idx)[%u];\n", - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BLOCK_SYNC: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "barrier(CLK_LOCAL_MEM_FENCE);\n"); // OpenCL 2.0+: work_group_barrier - break; - - case HSG_OP_TYPE_BS_FRAC_PRED: - { - if (ops->m == 0) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (warp_idx < bs_full)\n"); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else if (bs_frac == %u)\n", - ops->w); - } - } - break; - - case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE: - { - struct hsg_merge const * const m = merge + ops->a; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n" - "uint const smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n", - m->warps * hsg_config.warp.lanes, - m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1); -#if 0 - if (ops->b == true) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n" - "uint smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n", - m->warps * hsg_config.warp.lanes, - m->warps * hsg_config.warp.lanes, 
hsg_config.warp.lanes-1); - } - else // update - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n" - "smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n", - m->warps * hsg_config.warp.lanes, - m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1); - } -#endif - } - break; - - case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE: - { - struct hsg_merge const * const m = merge + ops->a; - uint32_t const b = m->warps * hsg_config.warp.lanes; - uint32_t const k = b * hsg_config.thread.regs; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const gmem_l_idx = (global_id / %u) * %u + (global_id & %u); \n" - "uint const smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n", - b,k,b-1, - b); - - } - break; - - case HSG_OP_TYPE_BX_MERGE_H_PRED: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (get_sub_group_id() < %u)\n", - ops->a); - break; - - case HSG_OP_TYPE_BS_ACTIVE_PRED: - { - struct hsg_merge const * const m = merge + ops->a; - - if (m->warps <= 32) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n", - m->levels[ops->b].active.b32a2[0]); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n", - m->levels[ops->b].active.b32a2[1], - m->levels[ops->b].active.b32a2[0]); - } - } - break; - - case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED: - { - if (ops->a == ops->b) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (merge_idx < fm_full) \n"); - } - else if (ops->b > 1) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else if (fm_frac == %u) \n", - ops->b); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else\n"); - } - } - break; - - default: - hsg_target_debug(files,merge,ops,depth); - break; - } -} - -// -// -// diff --git a/src/compute/hs/gen/target_opencl.c b/src/compute/hs/gen/target_opencl.c new file mode 100644 index 
0000000000..fe7343ba5d --- /dev/null +++ b/src/compute/hs/gen/target_opencl.c @@ -0,0 +1,600 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#include +#include + +// +// +// + +#include "gen.h" +#include "transpose.h" + +#include "common/util.h" +#include "common/macros.h" + +// +// +// + +struct hsg_transpose_state +{ + FILE * header; + struct hsg_config const * config; +}; + +static +char +hsg_transpose_reg_prefix(uint32_t const cols_log2) +{ + return 'a' + (('r' + cols_log2 - 'a') % 26); +} + +static +void +hsg_transpose_blend(uint32_t const cols_log2, + uint32_t const row_ll, // lower-left + uint32_t const row_ur, // upper-right + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(cols_log2-1), + hsg_transpose_reg_prefix(cols_log2), + cols_log2,row_ll+1,row_ur+1); +} + +static +void +hsg_transpose_remap(uint32_t const row_from, + uint32_t const row_to, + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(state->config->warp.lanes_log2), + row_from+1,row_to+1); +} + +// +// +// + +static +void +hsg_copyright(FILE * file) +{ + fprintf(file, + "// \n" + "// Copyright 2016 Google Inc. \n" + "// \n" + "// Use of this source code is governed by a BSD-style \n" + "// license that can be found in the LICENSE file. 
\n" + "// \n" + "\n"); +} + +// +// +// + +struct hsg_target_state +{ + FILE * header; + FILE * source; +}; + +// +// +// + +void +hsg_target_opencl(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth) +{ + switch (ops->type) + { + case HSG_OP_TYPE_END: + fprintf(target->state->source, + "}\n"); + break; + + case HSG_OP_TYPE_BEGIN: + fprintf(target->state->source, + "{\n"); + break; + + case HSG_OP_TYPE_ELSE: + fprintf(target->state->source, + "else\n"); + break; + + case HSG_OP_TYPE_TARGET_BEGIN: + { + // allocate state + target->state = malloc(sizeof(*target->state)); + + // allocate files + fopen_s(&target->state->header,"hs_cl.h", "wb"); + fopen_s(&target->state->source,"hs_cl.cl","wb"); + + // initialize header + uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps)); + + hsg_copyright(target->state->header); + + fprintf(target->state->header, + "#ifndef HS_CL_ONCE \n" + "#define HS_CL_ONCE \n" + " \n" + "#define HS_SLAB_THREADS_LOG2 %u \n" + "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n" + "#define HS_SLAB_WIDTH_LOG2 %u \n" + "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n" + "#define HS_SLAB_HEIGHT %u \n" + "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n" + "#define HS_REG_LAST(c) c##%u \n" + "#define HS_KEY_TYPE %s \n" + "#define HS_KEY_WORDS %u \n" + "#define HS_VAL_WORDS 0 \n" + "#define HS_BS_SLABS %u \n" + "#define HS_BS_SLABS_LOG2_RU %u \n" + "#define HS_BC_SLABS_LOG2_MAX %u \n" + "#define HS_FM_SCALE_MIN %u \n" + "#define HS_FM_SCALE_MAX %u \n" + "#define HS_HM_SCALE_MIN %u \n" + "#define HS_HM_SCALE_MAX %u \n" + "#define HS_EMPTY \n" + " \n", + config->warp.lanes_log2, + config->warp.lanes_log2, + config->thread.regs, + config->thread.regs, + (config->type.words == 2) ? 
"ulong" : "uint", + config->type.words, + merge->warps, + msb_idx_u32(pow2_ru_u32(merge->warps)), + bc_max, + config->merge.flip.lo, + config->merge.flip.hi, + config->merge.half.lo, + config->merge.half.hi); + + fprintf(target->state->header, + "#define HS_SLAB_ROWS() \\\n"); + + for (uint32_t ii=1; ii<=config->thread.regs; ii++) + fprintf(target->state->header, + " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + + fprintf(target->state->header, + "#define HS_TRANSPOSE_SLAB() \\\n"); + + for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++) + fprintf(target->state->header, + " HS_TRANSPOSE_STAGE( %u ) \\\n",ii); + + struct hsg_transpose_state state[1] = + { + { .header = target->state->header, + .config = config + } + }; + + hsg_transpose(config->warp.lanes_log2, + config->thread.regs, + hsg_transpose_blend,state, + hsg_transpose_remap,state); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + + hsg_copyright(target->state->source); + + fprintf(target->state->source, + "#include \"hs_cl_macros.h\" \n" + " \n" + "// \n" + "// \n" + "// \n"); + } + break; + + case HSG_OP_TYPE_TARGET_END: + // decorate the files + fprintf(target->state->header, + "#endif \n" + " \n" + "// \n" + "// \n" + "// \n" + " \n"); + fprintf(target->state->source, + " \n" + "// \n" + "// \n" + "// \n" + " \n"); + + // close files + fclose(target->state->header); + fclose(target->state->source); + + // free state + free(target->state); + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO: + { + fprintf(target->state->source, + "\nHS_TRANSPOSE_KERNEL_PROTO(%u)\n", + config->warp.lanes); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE: + { + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY: + { + fprintf(target->state->source, + "HS_TRANSPOSE_SLAB()\n"); + } + break; + + case 
HSG_OP_TYPE_BS_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const bs = pow2_ru_u32(m->warps); + uint32_t const msb = msb_idx_u32(bs); + + fprintf(target->state->source, + "\nHS_BS_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BS_KERNEL_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bs); + } + + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_BC_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const msb = msb_idx_u32(m->warps); + + fprintf(target->state->source, + "\nHS_BC_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BC_KERNEL_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bc); + } + + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_FM_KERNEL_PROTO: + fprintf(target->state->source, + "\nHS_FM_KERNEL_PROTO(%u,%u)\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_FM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_HM_KERNEL_PROTO: + { + fprintf(target->state->source, + "\nHS_HM_KERNEL_PROTO(%u)\n", + ops->a); + } + break; + + case HSG_OP_TYPE_HM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_HM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD: + { + static char const * const vstr[] = { "vin", "vout" }; + + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = 
HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n", + ops->n,vstr[ops->v],config->warp.lanes,ops->n-1); + } + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n", + config->warp.lanes,ops->n-1,ops->n); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT: + fprintf(target->state->source, + "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED: + { + if (ops->a <= ops->b) + { + fprintf(target->state->source, + "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n"); + } + else if (ops->b > 1) + { + fprintf(target->state->source, + "else if (fm_frac == %u)\n", + ops->b); + } + else + { + fprintf(target->state->source, + "else\n"); + } + } + break; + + case HSG_OP_TYPE_SLAB_FLIP: + fprintf(target->state->source, + "HS_SLAB_FLIP_PREAMBLE(%u);\n", + ops->n-1); + break; + + case HSG_OP_TYPE_SLAB_HALF: + fprintf(target->state->source, + "HS_SLAB_HALF_PREAMBLE(%u);\n", + ops->n / 2); + break; + + case HSG_OP_TYPE_CMP_FLIP: + fprintf(target->state->source, + "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c); + break; + + case HSG_OP_TYPE_CMP_HALF: + fprintf(target->state->source, + 
"HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b); + break; + + case HSG_OP_TYPE_CMP_XCHG: + if (ops->c == UINT32_MAX) + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%-3u,r%-3u);\n", + ops->a,ops->b); + } + else + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%u_%u,r%u_%u);\n", + ops->c,ops->a,ops->c,ops->b); + } + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_V: + fprintf(target->state->source, + "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n", + merge[ops->a].warps,config->warp.lanes,ops->c,ops->b); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,merge[ops->a].warps,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,ops->a,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n", + ops->c, + ops->a, + config->warp.lanes,ops->b); + break; + + case HSG_OP_TYPE_BLOCK_SYNC: + fprintf(target->state->source, + "HS_BLOCK_BARRIER();\n"); + // + // FIXME - Named barriers to allow coordinating warps to 
proceed? + // + break; + + case HSG_OP_TYPE_BS_FRAC_PRED: + { + if (ops->m == 0) + { + fprintf(target->state->source, + "if (warp_idx < bs_full)\n"); + } + else + { + fprintf(target->state->source, + "else if (bs_frac == %u)\n", + ops->w); + } + } + break; + + case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n", + config->warp.lanes,m->warps); + } + break; + + case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n", + config->warp.lanes,config->thread.regs,m->warps); + } + break; + + case HSG_OP_TYPE_BX_MERGE_H_PRED: + fprintf(target->state->source, + "if (get_sub_group_id() < %u)\n", + ops->a); + break; + + case HSG_OP_TYPE_BS_ACTIVE_PRED: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps <= 32) + { + fprintf(target->state->source, + "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n", + m->levels[ops->b].active.b32a2[0]); + } + else + { + fprintf(target->state->source, + "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n", + m->levels[ops->b].active.b32a2[1], + m->levels[ops->b].active.b32a2[0]); + } + } + break; + + default: + fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]); + exit(EXIT_FAILURE); + break; + } +} + +// +// +// diff --git a/src/compute/hs/gen/transpose.c b/src/compute/hs/gen/transpose.c index de15c62631..095f53d330 100644 --- a/src/compute/hs/gen/transpose.c +++ b/src/compute/hs/gen/transpose.c @@ -11,7 +11,7 @@ // #include "transpose.h" -#include "macros.h" +#include "common/macros.h" // // Rows must be an even number. This is enforced elsewhere. 
@@ -21,19 +21,19 @@ void hsg_transpose(uint32_t const cols_log2, uint32_t const rows, - void * blend, - void * remap, void (*pfn_blend)(uint32_t const cols_log2, uint32_t const row_ll, // lower-left uint32_t const row_ur, // upper-right void * blend), + void * blend, void (*pfn_remap)(uint32_t const row_from, uint32_t const row_to, - void * remap)) + void * remap), + void * remap) { // get mapping array - uint32_t * map_curr = ALLOCA(rows * sizeof(*map_curr)); - uint32_t * map_next = ALLOCA(rows * sizeof(*map_next)); + uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr)); + uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next)); // init the mapping array for (uint32_t ii=0; ii> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<> cols_log2-1) & 1) ? ll[ii^(1<> cols_log2-1) & 1) ? ll[ii^(1< + +// +// This structure packages all of the parameters and SPIR-V kernels +// for a target architecture. +// + +struct hs_spirv_target_config +{ + struct { + uint8_t threads_log2; + uint8_t width_log2; + uint8_t height; + } slab; + + struct { + uint8_t key; + uint8_t val; + } words; + + struct { + uint8_t slabs; + } block; + + struct { + struct { + uint8_t scale_min; + uint8_t scale_max; + } fm; + struct { + uint8_t scale_min; + uint8_t scale_max; + } hm; + } merge; + + uint8_t pad[2]; +}; + +static_assert(sizeof(struct hs_spirv_target_config) == 12, + "modules.words[] must start on a 32-bit boundary"); + +// +// For now, kernels are appended end-to-end with a leading big-endian +// length followed by a SPIR-V binary. +// +// The entry point for each kernel is "main". +// +// When the tools support packaging multiple named compute shaders in +// one SPIR-V module then reevaluate this encoding. 
+// + +struct hs_spirv_target +{ + struct hs_spirv_target_config config; + union { + uint8_t bytes[]; + uint32_t words[]; + } modules; +}; + +// +// +// diff --git a/src/compute/hs/vk/hs_vk_launcher.c b/src/compute/hs/vk/hs_vk_launcher.c new file mode 100644 index 0000000000..e1080a0e8b --- /dev/null +++ b/src/compute/hs/vk/hs_vk_launcher.c @@ -0,0 +1,248 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#include +#include + +#include "common/vk/assert_vk.h" +#include "common/util.h" + +#include "hs_vk_launcher.h" +#include "hs_spirv_target.h" + +// +// +// + +struct hs_vk +{ + struct hs_spirv_target_config config; + + uint32_t key_val_size; + uint32_t slab_keys; + uint32_t bs_slabs_log2_ru; + uint32_t bc_slabs_log2_max; + + VkDevice device; + VkAllocationCallbacks const * allocator; + + struct { + uint32_t count; + VkPipeline * transpose; + VkPipeline * bs; + VkPipeline * bc; + VkPipeline * fm[3]; + VkPipeline * hm[3]; + VkPipeline all[]; + } pipelines; +}; + +// +// +// + +struct hs_vk * +hs_vk_create(struct hs_spirv_target const * const target, + VkDevice device, + VkAllocationCallbacks const * allocator, + VkPipelineCache pipeline_cache) +{ + // + // we reference these values a lot + // + uint32_t const bs_slabs_log2_ru = msb_idx_u32(pow2_ru_u32(target->config.block.slabs)); + uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs)); + + // + // how many kernels will be created? 
+ // + uint32_t const count_bs = bs_slabs_log2_ru + 1; + uint32_t const count_bc = bc_slabs_log2_max + 1; + uint32_t count_fm[3] = { 0 }; + uint32_t count_hm[3] = { 0 }; + + // guaranteed to be in range [0,2] + for (uint32_t scale = target->config.merge.fm.scale_min; + scale <= target->config.merge.fm.scale_max; + scale++) + { + count_fm[scale] = msb_idx_u32(pow2_ru_u32(target->config.block.slabs>>(scale-1))) + 1; + } + + // guaranteed to be in range [0,2] + for (uint32_t scale = target->config.merge.hm.scale_min; + scale <= target->config.merge.hm.scale_max; + scale++) + { + count_hm[scale] = 1; + } + + uint32_t const count_all = + 1 + + count_bs + + count_bc + + count_fm[0] + count_fm[1] + count_fm[2] + + count_hm[0] + count_hm[1] + count_hm[2]; + + // + // allocate hs_vk + // + struct hs_vk * hs; + + if (allocator == NULL) + { + hs = malloc(sizeof(*hs) + sizeof(VkPipeline*) * count_all); + } + else + { + hs = NULL; + } + + // save the config + memcpy(&hs->config,&target->config,sizeof(hs->config)); + + // save some frequently used calculated values + hs->key_val_size = (target->config.words.key + target->config.words.val) * 4; + hs->slab_keys = target->config.slab.height << target->config.slab.width_log2; + hs->bs_slabs_log2_ru = bs_slabs_log2_ru; + hs->bc_slabs_log2_max = bc_slabs_log2_max; + + // save device & allocator + hs->device = device; + hs->allocator = allocator; + + // save kernel count + hs->pipelines.count = count_all; + + // + // create all the compute pipelines + // + VkComputePipelineCreateInfo cpci = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .pNext = NULL, + .flags = VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = NULL, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = VK_NULL_HANDLE, + .pName = "main", + .pSpecializationInfo = NULL + }, + .basePipelineHandle = VK_NULL_HANDLE, + .basePipelineIndex = -1 + }; + + // + // Create a 
shader module, use it to create a pipeline... and + // dispose of the shader module. + // + uint32_t const * modules = target->modules.words; + + for (uint32_t ii=0; iipipelines.all+ii)); + + vkDestroyShaderModule(device, + cpci.stage.module, + allocator); + } + + // + // initialize pointers to pipeline handles + // + VkPipeline * pipeline_next = hs->pipelines.all; + + // TRANSPOSE + hs->pipelines.transpose = pipeline_next; + pipeline_next += 1; + + // BS + hs->pipelines.bs = pipeline_next; + pipeline_next += count_bs; + + // BC + hs->pipelines.bc = pipeline_next; + pipeline_next += count_bc; + + // FM[0] + hs->pipelines.fm[0] = count_fm[0] ? pipeline_next : NULL; + pipeline_next += count_fm[0]; + + // FM[1] + hs->pipelines.fm[1] = count_fm[1] ? pipeline_next : NULL; + pipeline_next += count_fm[1]; + + // FM[2] + hs->pipelines.fm[2] = count_fm[2] ? pipeline_next : NULL; + pipeline_next += count_fm[2]; + + // HM[0] + hs->pipelines.hm[0] = count_hm[0] ? pipeline_next : NULL; + pipeline_next += count_hm[0]; + + // HM[1] + hs->pipelines.hm[1] = count_hm[1] ? pipeline_next : NULL; + pipeline_next += count_hm[1]; + + // HM[2] + hs->pipelines.hm[2] = count_hm[2] ? pipeline_next : NULL; + pipeline_next += count_hm[2]; + + return hs; +} + +// +// +// + +void +hs_vk_release(struct hs_vk * const hs) +{ + for (uint32_t ii=0; iipipelines.count; ii++) + vkDestroyPipeline(hs->device, + hs->pipelines.all[ii], + hs->allocator); + + if (hs->allocator == NULL) + { + free(hs); + } + else + { + ; + } +} + +// +// +// diff --git a/src/compute/hs/vk/hs_vk_launcher.h b/src/compute/hs/vk/hs_vk_launcher.h new file mode 100644 index 0000000000..a549666985 --- /dev/null +++ b/src/compute/hs/vk/hs_vk_launcher.h @@ -0,0 +1,88 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#pragma once + +// +// +// + +#include + +// +// +// + +#include +#include + +// +// +// + +#include "hs_spirv_target.h" + +// +// +// + +struct hs_vk * +hs_vk_create(struct hs_spirv_target const * const target, + VkDevice device, + VkAllocationCallbacks const * allocator, + VkPipelineCache pipeline_cache); + +// +// Resources will be disposed of with the same device and allocator +// used for creation. +// + +void +hs_vk_release(struct hs_vk * const hs); + +// +// Determine what padding will be applied to the input and output +// buffers. +// +// Always check to see if the allocated buffers are large enough. +// +// count : number of keys +// count + count_padded_in : additional keys required for sorting +// count + count_padded_out : additional keys required for merging +// + +void +hs_vk_pad(struct hs_vk const * const hs, + uint32_t const count, + uint32_t * const count_padded_in, + uint32_t * const count_padded_out); + +// +// Sort the keys in the vin buffer and store them in the vout buffer. +// +// If vout is NULL then the sort will be performed in place. 
+// + +#if 0 +void +hs_vk_sort(struct hs_vk const * const hs, + vk_command_queue cq, + uint32_t const wait_list_size, + vk_event * wait_list, + vk_event * event, + vk_mem vin, + vk_mem vout, + uint32_t const count, + uint32_t const count_padded_in, + uint32_t const count_padded_out, + bool const linearize); +#endif + +// +// +// diff --git a/src/compute/hs/vk/intel/gen8/u32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32/make_all.bat new file mode 100644 index 0000000000..9afd7b3a72 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u32/make_all.bat @@ -0,0 +1,48 @@ +@ECHO OFF + +del *.comp +del *.pre.comp +del *.spv + +REM +REM +REM + +set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen + +REM --- 32-bit keys --- + +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM --- 64-bit keys + +CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM CMD /C make_inl_cl.bat hs_cl.cl + +for %%f in (*.comp) do ( + echo %%~nf + dos2unix %%f + clang-format -style=Mozilla -i %%f || goto :error + cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error + clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error + glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . 
%%~nf.pre.comp -o %%~nf.spv || goto :error + spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error + xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error + for /f %%A in ('wc -c %%~nf.spv') do ( + printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd + ) +) + +del *.comp +del *.pre.comp +del *.spv + +exit /b 0 + +:error + +exit /b %errorlevel% diff --git a/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat new file mode 100644 index 0000000000..9afd7b3a72 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat @@ -0,0 +1,48 @@ +@ECHO OFF + +del *.comp +del *.pre.comp +del *.spv + +REM +REM +REM + +set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen + +REM --- 32-bit keys --- + +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM --- 64-bit keys + +CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM CMD /C make_inl_cl.bat hs_cl.cl + +for %%f in (*.comp) do ( + echo %%~nf + dos2unix %%f + clang-format -style=Mozilla -i %%f || goto :error + cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error + clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error + glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . 
%%~nf.pre.comp -o %%~nf.spv || goto :error + spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error + xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error + for /f %%A in ('wc -c %%~nf.spv') do ( + printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd + ) +) + +del *.comp +del *.pre.comp +del *.spv + +exit /b 0 + +:error + +exit /b %errorlevel% diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h new file mode 100644 index 0000000000..d4376114e5 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h @@ -0,0 +1,100 @@ +// +// Copyright 2016 Google Inc. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// + +#ifndef HS_GLSL_ONCE +#define HS_GLSL_ONCE + +#define HS_SLAB_THREADS_LOG2 3 +#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) +#define HS_SLAB_WIDTH_LOG2 3 +#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) +#define HS_SLAB_HEIGHT 16 +#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT) +#define HS_REG_LAST(c) c##16 +#define HS_KEY_TYPE uint64_t +#define HS_KEY_WORDS 2 +#define HS_VAL_WORDS 0 +#define HS_BS_SLABS 16 +#define HS_BS_SLABS_LOG2_RU 4 +#define HS_BC_SLABS_LOG2_MAX 4 +#define HS_FM_SCALE_MIN 1 +#define HS_FM_SCALE_MAX 1 +#define HS_HM_SCALE_MIN 1 +#define HS_HM_SCALE_MAX 1 +#define HS_EMPTY + +#define HS_SLAB_ROWS() \ + HS_SLAB_ROW( 1, 0 ) \ + HS_SLAB_ROW( 2, 1 ) \ + HS_SLAB_ROW( 3, 2 ) \ + HS_SLAB_ROW( 4, 3 ) \ + HS_SLAB_ROW( 5, 4 ) \ + HS_SLAB_ROW( 6, 5 ) \ + HS_SLAB_ROW( 7, 6 ) \ + HS_SLAB_ROW( 8, 7 ) \ + HS_SLAB_ROW( 9, 8 ) \ + HS_SLAB_ROW( 10, 9 ) \ + HS_SLAB_ROW( 11, 10 ) \ + HS_SLAB_ROW( 12, 11 ) \ + HS_SLAB_ROW( 13, 12 ) \ + HS_SLAB_ROW( 14, 13 ) \ + HS_SLAB_ROW( 15, 14 ) \ + HS_SLAB_ROW( 16, 15 ) \ + HS_EMPTY + +#define HS_TRANSPOSE_SLAB() \ + HS_TRANSPOSE_STAGE( 1 ) \ + HS_TRANSPOSE_STAGE( 2 ) \ + HS_TRANSPOSE_STAGE( 3 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 2, 1 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 4, 3 ) \ + HS_TRANSPOSE_BLEND( r, s, 
1, 6, 5 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 8, 7 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 10, 9 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 3, 1 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 4, 2 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 7, 5 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 8, 6 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 11, 9 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 5, 1 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 6, 2 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 7, 3 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 8, 4 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 13, 9 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 ) \ + HS_TRANSPOSE_REMAP( u, 1, 1 ) \ + HS_TRANSPOSE_REMAP( u, 2, 3 ) \ + HS_TRANSPOSE_REMAP( u, 3, 5 ) \ + HS_TRANSPOSE_REMAP( u, 4, 7 ) \ + HS_TRANSPOSE_REMAP( u, 5, 9 ) \ + HS_TRANSPOSE_REMAP( u, 6, 11 ) \ + HS_TRANSPOSE_REMAP( u, 7, 13 ) \ + HS_TRANSPOSE_REMAP( u, 8, 15 ) \ + HS_TRANSPOSE_REMAP( u, 9, 2 ) \ + HS_TRANSPOSE_REMAP( u, 10, 4 ) \ + HS_TRANSPOSE_REMAP( u, 11, 6 ) \ + HS_TRANSPOSE_REMAP( u, 12, 8 ) \ + HS_TRANSPOSE_REMAP( u, 13, 10 ) \ + HS_TRANSPOSE_REMAP( u, 14, 12 ) \ + HS_TRANSPOSE_REMAP( u, 15, 14 ) \ + HS_TRANSPOSE_REMAP( u, 16, 16 ) \ + HS_EMPTY + +#endif + +// +// +// + diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h new file mode 100644 index 0000000000..c67dffa3a0 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h @@ -0,0 +1,417 @@ +// +// Copyright 2016 Google Inc. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+// + +#ifndef HS_GLSL_MACROS_ONCE +#define HS_GLSL_MACROS_ONCE + +// +// +// + +#define HS_HASH # +#define HS_EVAL(a) a +#define HS_GLSL_EXT() HS_EVAL(HS_HASH)##extension +#define HS_GLSL_EXT_ENABLE(name) HS_GLSL_EXT() name : enable +#define HS_GLSL_VERSION(ver) HS_EVAL(HS_HASH)##version ver + +// +// +// + +// HS_GLSL_VERSION(460) + +HS_GLSL_EXT_ENABLE(GL_ARB_gpu_shader_int64) +HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_shuffle) +HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_basic) + +// +// +// + +#include "hs_glsl.h" + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_SHUFFLE_CAST_TO(v) v +#define HS_SHUFFLE_CAST_FROM(v) v +#elif (HS_KEY_WORDS == 2) +#define HS_SHUFFLE_CAST_TO(v) uint64BitsToDouble(v) +#define HS_SHUFFLE_CAST_FROM(v) doubleBitsToUint64(v) +#endif + +#define HS_SUBGROUP_SHUFFLE(v,i) HS_SHUFFLE_CAST_FROM(subgroupShuffle(HS_SHUFFLE_CAST_TO(v),i)) +#define HS_SUBGROUP_SHUFFLE_XOR(v,m) HS_SHUFFLE_CAST_FROM(subgroupShuffleXor(HS_SHUFFLE_CAST_TO(v),m)) +#define HS_SUBGROUP_SHUFFLE_UP(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleUp(HS_SHUFFLE_CAST_TO(v),d)) +#define HS_SUBGROUP_SHUFFLE_DOWN(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleDown(HS_SHUFFLE_CAST_TO(v),d)) + +// +// This up/down shuffle has defined values for [0,subgroup size) +// + +#define HS_SUBGROUP_SHUFFLE_UP_2(prev,curr,delta) + +#define HS_SUBGROUP_SHUFFLE_DOWN_2(curr,next,delta) + +// +// FYI, restrict shouldn't have any impact on these kernels and +// benchmarks appear to prove that true +// + +#define HS_RESTRICT restrict + +// +// +// + +#define HS_GLSL_WORKGROUP_SIZE(x,y,z) \ + layout (local_size_x = x, \ + local_size_y = y, \ + local_size_z = z) in + +#define HS_GLSL_SUBGROUP_SIZE(x) + +// +// KERNEL PROTOS +// + +#define HS_TRANSPOSE_KERNEL_PROTO(slab_width) \ + buffer _vout { HS_KEY_TYPE vout[]; }; \ + HS_GLSL_WORKGROUP_SIZE(slab_width,1,1); \ + HS_GLSL_SUBGROUP_SIZE(slab_width) \ + void main() + +#define HS_BS_KERNEL_PROTO(slab_width,slab_count,slab_count_ru_log2) \ + buffer readonly _vin 
{ HS_KEY_TYPE vin[]; }; \ + buffer writeonly _vout { HS_KEY_TYPE vout[]; }; \ + HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1); \ + HS_GLSL_SUBGROUP_SIZE(slab_width) \ + void main() + +#define HS_BC_KERNEL_PROTO(slab_width,slab_count,slab_count_log2) \ + buffer _vout { HS_KEY_TYPE vout[]; }; \ + HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1); \ + HS_GLSL_SUBGROUP_SIZE(slab_width) \ + void main() + +#define HS_HM_KERNEL_PROTO(s) \ + buffer _vout { HS_KEY_TYPE vout[]; }; \ + HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1); \ + void main() + +#define HS_FM_KERNEL_PROTO(s,r) \ + buffer _vout { HS_KEY_TYPE vout[]; }; \ + HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1); \ + void main() + +// +// BLOCK LOCAL MEMORY DECLARATION +// + +#define HS_BLOCK_LOCAL_MEM_DECL(width,height) \ + shared struct { \ + HS_KEY_TYPE m[width * height]; \ + } smem + +// +// BLOCK BARRIER +// + +#define HS_BLOCK_BARRIER() \ + barrier() + +// +// SLAB GLOBAL +// + +#define HS_SLAB_GLOBAL_PREAMBLE(slab_width,slab_height) \ + const uint gmem_idx = \ + (gl_GlobalInvocationID.x & ~(slab_width-1)) * slab_height + \ + gl_SubgroupInvocationID + +#define HS_SLAB_GLOBAL_LOAD(extent,slab_width,row_idx) \ + extent[gmem_idx + slab_width * row_idx] + +#define HS_SLAB_GLOBAL_STORE(slab_width,row_idx,reg) \ + vout[gmem_idx + slab_width * row_idx] = reg + +// +// SLAB LOCAL +// + +#define HS_SLAB_LOCAL_L(offset) \ + smem.m[smem_l_idx + (offset)] + +#define HS_SLAB_LOCAL_R(offset) \ + smem.m[smem_r_idx + (offset)] + +// +// SLAB LOCAL VERTICAL LOADS +// + +#define HS_BX_LOCAL_V(offset) \ + smem.m[gl_LocalInvocationID.x + (offset)] + +// +// BLOCK SORT MERGE HORIZONTAL +// + +#define HS_BS_MERGE_H_PREAMBLE(slab_width,slab_count) \ + const uint smem_l_idx = \ + gl_SubgroupID * (slab_width * slab_count) + \ + gl_SubgroupInvocationID; \ + const uint smem_r_idx = \ + (gl_SubgroupID ^ 1) * (slab_width * slab_count) + \ + (gl_SubgroupInvocationID ^ (slab_width - 1)) + +// +// BLOCK CLEAN MERGE HORIZONTAL +// + +#define 
HS_BC_MERGE_H_PREAMBLE(slab_width,slab_height,slab_count) \ + const uint gmem_l_idx = \ + (gl_GlobalInvocationID.x & ~(slab_width*slab_count-1)) * slab_height + \ + gl_LocalInvocationID.x; \ + const uint smem_l_idx = \ + gl_SubgroupID * (slab_width * slab_count) + \ + gl_SubgroupInvocationID + +#define HS_BC_GLOBAL_LOAD_L(slab_width,slab_idx) \ + vout[gmem_l_idx + (slab_width * slab_idx)] + +// +// SLAB FLIP AND HALF PREAMBLES +// + +#define HS_SLAB_FLIP_PREAMBLE(mask) \ + const uint flip_lane_idx = gl_SubgroupInvocationID ^ mask; \ + const bool t_lt = gl_SubgroupInvocationID < flip_lane_idx; + +#define HS_SLAB_HALF_PREAMBLE(mask) \ + const uint half_lane_idx = gl_SubgroupInvocationID ^ mask; \ + const bool t_lt = gl_SubgroupInvocationID < half_lane_idx; + +// +// Inter-lane compare exchange +// + +// default +#define HS_CMP_XCHG_V0(a,b) \ + { \ + const HS_KEY_TYPE t = min(a,b); \ + b = max(a,b); \ + a = t; \ + } + +// super slow +#define HS_CMP_XCHG_V1(a,b) \ + { \ + const HS_KEY_TYPE tmp = a; \ + a = (a < b) ? a : b; \ + b ^= a ^ tmp; \ + } + +// best +#define HS_CMP_XCHG_V2(a,b) \ + if (a >= b) { \ + const HS_KEY_TYPE t = a; \ + a = b; \ + b = t; \ + } + +// good +#define HS_CMP_XCHG_V3(a,b) \ + { \ + const bool ge = a >= b; \ + const HS_KEY_TYPE t = a; \ + a = ge ? b : a; \ + b = ge ? t : b; \ + } + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b) +#elif (HS_KEY_WORDS == 2) +#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b) +#endif + +// +// The flip/half comparisons rely on a "conditional min/max": +// +// - if the flag is false, return min(a,b) +// - otherwise, return max(a,b) +// +// What's a little surprising is that sequence (1) is faster than (2) +// for 32-bit keys. +// +// I suspect either a code generation problem or that the sequence +// maps well to the GEN instruction set. +// +// We mostly care about 64-bit keys and unsurprisingly sequence (2) is +// fastest for this wider type. 
+// + +#define HS_LOGICAL_XOR() != + +// this is what you would normally use +#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) HS_LOGICAL_XOR() lt) ? b : a + +// this seems to be faster for 32-bit keys +#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b)) + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b) +#elif (HS_KEY_WORDS == 2) +#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b) +#endif + +// +// Conditional inter-subgroup flip/half compare exchange +// + +#define HS_CMP_FLIP(i,a,b) \ + { \ + const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,flip_lane_idx); \ + const HS_KEY_TYPE tb = HS_SUBGROUP_SHUFFLE(b,flip_lane_idx); \ + a = HS_COND_MIN_MAX(t_lt,a,tb); \ + b = HS_COND_MIN_MAX(t_lt,b,ta); \ + } + +#define HS_CMP_HALF(i,a) \ + { \ + const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,half_lane_idx); \ + a = HS_COND_MIN_MAX(t_lt,a,ta); \ + } + +// +// The device's comparison operator might return what we actually +// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}. 
+// + +#define HS_CMP_IS_ZERO_ONE + +#ifdef HS_CMP_IS_ZERO_ONE +// OpenCL requires a {true: +1, false: 0} scalar result +// (a <= b) -> { +1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 } +#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b)) +#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a) +#else +// However, OpenCL requires { -1, 0 } for vectors +// (a <= b) -> { 0xFFFFFFFF, 0 } +#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64 +#define HS_CMP_TO_MASK(a) (a) +#endif + +// +// The "flip-merge" and "half-merge" preambles are very similar +// + +#define HS_HM_PREAMBLE(half_span) \ + const uint span_idx = gl_GlobalInvocationID.z * gl_NumWorkGroups.y + gl_GlobalInvocationID.y; \ + const uint span_stride = gl_NumWorkGroups.x * gl_WorkGroupSize.x; \ + const uint span_size = span_stride * half_span * 2; \ + const uint span_base = span_idx * span_size; \ + const uint span_off = gl_GlobalInvocationID.x; \ + const uint span_l = span_base + span_off + +#define HS_FM_PREAMBLE(half_span) \ + HS_HM_PREAMBLE(half_span); \ + const uint span_r = span_base + span_stride * (half_span + 1) - span_off - 1 + +// +// +// + +#define HS_XM_GLOBAL_L(stride_idx) \ + vout[span_l + span_stride * stride_idx] + +#define HS_XM_GLOBAL_LOAD_L(stride_idx) \ + HS_XM_GLOBAL_L(stride_idx) + +#define HS_XM_GLOBAL_STORE_L(stride_idx,reg) \ + HS_XM_GLOBAL_L(stride_idx) = reg + +#define HS_FM_GLOBAL_R(stride_idx) \ + vout[span_r + span_stride * stride_idx] + +#define HS_FM_GLOBAL_LOAD_R(stride_idx) \ + HS_FM_GLOBAL_R(stride_idx) + +#define HS_FM_GLOBAL_STORE_R(stride_idx,reg) \ + HS_FM_GLOBAL_R(stride_idx) = reg + +// +// This snarl of macros is for transposing a "slab" of sorted elements +// into linear order. +// +// This can occur as the last step in hs_sort() or via a custom kernel +// that inspects the slab and then transposes and stores it to memory. +// +// The slab format can be inspected more efficiently than a linear +// arrangement. 
+// +// The prime example is detecting when adjacent keys (in sort order) +// have differing high order bits ("key changes"). The index of each +// change is recorded to an auxiliary array. +// +// A post-processing step like this needs to be able to navigate the +// slab and eventually transpose and store the slab in linear order. +// + +#define HS_TRANSPOSE_REG(prefix,row) prefix##row +#define HS_TRANSPOSE_DECL(prefix,row) const HS_KEY_TYPE HS_TRANSPOSE_REG(prefix,row) +#define HS_TRANSPOSE_PRED(level) is_lo_##level + +#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) \ + prefix_curr##row_ll##_##row_ur + +#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) \ + const HS_KEY_TYPE HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) + +#define HS_TRANSPOSE_STAGE(level) \ + const bool HS_TRANSPOSE_PRED(level) = \ + (gl_SubgroupInvocationID & (1 << (level-1))) == 0; + +#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \ + HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) = \ + HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ? \ + HS_TRANSPOSE_REG(prefix_prev,row_ll) : \ + HS_TRANSPOSE_REG(prefix_prev,row_ur), \ + 1<<(level-1)); \ + \ + HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \ + HS_TRANSPOSE_PRED(level) ? \ + HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) : \ + HS_TRANSPOSE_REG(prefix_prev,row_ll); \ + \ + HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \ + HS_TRANSPOSE_PRED(level) ? \ + HS_TRANSPOSE_REG(prefix_prev,row_ur) : \ + HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur); + +#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \ + vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \ + HS_TRANSPOSE_REG(prefix,row_from); + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h new file mode 100644 index 0000000000..551fc52180 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h @@ -0,0 +1,75 @@ +// +// Copyright 2016 Google Inc. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// + +#include "hs_transpose.len.xxd" +, +#include "hs_transpose.spv.xxd" +, +#include "hs_bs_4.len.xxd" +, +#include "hs_bs_4.spv.xxd" +, +#include "hs_bs_3.len.xxd" +, +#include "hs_bs_3.spv.xxd" +, +#include "hs_bs_2.len.xxd" +, +#include "hs_bs_2.spv.xxd" +, +#include "hs_bs_1.len.xxd" +, +#include "hs_bs_1.spv.xxd" +, +#include "hs_bs_0.len.xxd" +, +#include "hs_bs_0.spv.xxd" +, +#include "hs_bc_4.len.xxd" +, +#include "hs_bc_4.spv.xxd" +, +#include "hs_bc_3.len.xxd" +, +#include "hs_bc_3.spv.xxd" +, +#include "hs_bc_2.len.xxd" +, +#include "hs_bc_2.spv.xxd" +, +#include "hs_bc_1.len.xxd" +, +#include "hs_bc_1.spv.xxd" +, +#include "hs_bc_0.len.xxd" +, +#include "hs_bc_0.spv.xxd" +, +#include "hs_fm_1_4.len.xxd" +, +#include "hs_fm_1_4.spv.xxd" +, +#include "hs_fm_1_3.len.xxd" +, +#include "hs_fm_1_3.spv.xxd" +, +#include "hs_fm_1_2.len.xxd" +, +#include "hs_fm_1_2.spv.xxd" +, +#include "hs_fm_1_1.len.xxd" +, +#include "hs_fm_1_1.spv.xxd" +, +#include "hs_fm_1_0.len.xxd" +, +#include "hs_fm_1_0.spv.xxd" +, +#include "hs_hm_1_0.len.xxd" +, +#include "hs_hm_1_0.spv.xxd" +, diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_target.h b/src/compute/hs/vk/intel/gen8/u64/hs_target.h new file mode 100644 index 0000000000..f379c23066 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_target.h @@ -0,0 +1,113 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// +// + +#include "../../../hs_spirv_target.h" + +// +// +// + +#include "hs_glsl.h" + +// +// +// + +#ifndef HS_TARGET_NAME +#define HS_TARGET_NAME hs_target +#endif + +#define HS_TARGET_HELPER(a) a + +// +// +// + +static struct hs_spirv_target const HS_TARGET_NAME = +{ + .config = { + .slab = { + .threads_log2 = HS_SLAB_THREADS_LOG2, + .width_log2 = HS_SLAB_WIDTH_LOG2, + .height = HS_SLAB_HEIGHT + }, + + .words = { + .key = HS_KEY_WORDS, + .val = HS_VAL_WORDS + }, + + .block = { + .slabs = HS_BS_SLABS + }, + + .merge = { + .fm = { + .scale_min = HS_FM_SCALE_MIN, + .scale_max = HS_FM_SCALE_MAX + }, + .hm = { + .scale_min = HS_HM_SCALE_MIN, + .scale_max = HS_HM_SCALE_MAX, + } + }, + + .pad = { 0 } + }, + + .modules.bytes = { + +#include "hs_kernels.h" + +#ifdef HS_DUMP + 0,0,0,0 +#endif + } +}; + +// +// +// + +#ifdef HS_DUMP + +#include <stdio.h> +#include <stdlib.h> + +int +main(int argc, char const * argv[]) +{ + FILE * fp = fopen("hs_target.bin","wb"); + + fwrite(&HS_TARGET_NAME.config,1,sizeof(HS_TARGET_NAME.config),fp); + + uint8_t const * modules = HS_TARGET_NAME.modules.bytes; + size_t modsize = (modules[0]<<24) | (modules[1]<<16) | (modules[2]<<8) | modules[3]; + + while (modsize > 0) { + // fprintf(stderr,"%zu\n",modsize); + modsize += sizeof(uint32_t); + fwrite(modules,1,modsize,fp); + modules += modsize; + modsize = (modules[0]<<24) | (modules[1]<<16) | (modules[2]<<8) | modules[3]; + } + + fclose(fp); + + return EXIT_SUCCESS; +} + +#endif + +// +// +// diff --git a/src/compute/hs/vk/intel/gen8/u64/make_all.bat b/src/compute/hs/vk/intel/gen8/u64/make_all.bat new file mode 100644 index 0000000000..d148ef0113 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/make_all.bat @@ -0,0 +1,79 @@ +@ECHO OFF + +:: +:: delete the previous images +:: + +del *.pre.comp +del *.comp +del *.spv +del *.xxd + +:: +:: +:: + +set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen + +:: --- 32-bit keys --- + +:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 
-b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +:: --- 64-bit keys + +%HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +:: CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +:: +:: remove trailing whitespace from generated files +:: + +sed -i 's/[[:space:]]*$//' hs_glsl.h +sed -i 's/[[:space:]]*$//' hs_kernels.h + +:: +:: FIXME -- convert this to a bash script +:: +:: Note that we can use xargs instead of the cmd for/do +:: + +for %%f in (*.comp) do ( + dos2unix %%f + clang-format -style=Mozilla -i %%f || goto :error + cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error + clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error +:: glslangValidator -V110 -o %%~nf.spv %%~nf.pre.comp || goto :error + glslc --target-env=vulkan1.1 -std=460 -I . -o %%~nf.spv %%~nf.pre.comp || goto :error + spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error +:: spirv-remap ... 
|| goto :error + xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error + for /f %%A in ('wc -c %%~nf.spv') do ( + echo %%~nf.spv %%A + printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error + ) +) + +:: +:: dump a binary +:: + +cl /DHS_DUMP /Fe:hs_dump.exe /Tchs_target.h +hs_dump + +:: +:: delete temporary files +:: + +:: del *.pre.comp +del *.comp +del *.spv +del *.obj +del *.exe + +exit /b 0 + +:error + +exit /b %errorlevel% diff --git a/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat new file mode 100644 index 0000000000..9afd7b3a72 --- /dev/null +++ b/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat @@ -0,0 +1,48 @@ +@ECHO OFF + +del *.comp +del *.pre.comp +del *.spv + +REM +REM +REM + +set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen + +REM --- 32-bit keys --- + +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM --- 64-bit keys + +CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM CMD /C make_inl_cl.bat hs_cl.cl + +for %%f in (*.comp) do ( + echo %%~nf + dos2unix %%f + clang-format -style=Mozilla -i %%f || goto :error + cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error + clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error + glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . 
%%~nf.pre.comp -o %%~nf.spv || goto :error + spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error + xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error + for /f %%A in ('wc -c %%~nf.spv') do ( + printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd + ) +) + +del *.comp +del *.pre.comp +del *.spv + +exit /b 0 + +:error + +exit /b %errorlevel% diff --git a/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat new file mode 100644 index 0000000000..9afd7b3a72 --- /dev/null +++ b/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat @@ -0,0 +1,48 @@ +@ECHO OFF + +del *.comp +del *.pre.comp +del *.spv + +REM +REM +REM + +set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen + +REM --- 32-bit keys --- + +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM --- 64-bit keys + +CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM CMD /C make_inl_cl.bat hs_cl.cl + +for %%f in (*.comp) do ( + echo %%~nf + dos2unix %%f + clang-format -style=Mozilla -i %%f || goto :error + cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error + clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error + glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . 
%%~nf.pre.comp -o %%~nf.spv || goto :error + spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error + xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error + for /f %%A in ('wc -c %%~nf.spv') do ( + printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd + ) +) + +del *.comp +del *.pre.comp +del *.spv + +exit /b 0 + +:error + +exit /b %errorlevel% diff --git a/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat new file mode 100644 index 0000000000..9afd7b3a72 --- /dev/null +++ b/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat @@ -0,0 +1,48 @@ +@ECHO OFF + +del *.comp +del *.pre.comp +del *.spv + +REM +REM +REM + +set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen + +REM --- 32-bit keys --- + +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM --- 64-bit keys + +CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z +REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z + +REM CMD /C make_inl_cl.bat hs_cl.cl + +for %%f in (*.comp) do ( + echo %%~nf + dos2unix %%f + clang-format -style=Mozilla -i %%f || goto :error + cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error + clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error + glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . 
%%~nf.pre.comp -o %%~nf.spv || goto :error + spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error + xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error + for /f %%A in ('wc -c %%~nf.spv') do ( + printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd + ) +) + +del *.comp +del *.pre.comp +del *.spv + +exit /b 0 + +:error + +exit /b %errorlevel% diff --git a/src/compute/skc/extent_ring.c b/src/compute/skc/extent_ring.c index ecb41e6fcd..251b4208b2 100644 --- a/src/compute/skc/extent_ring.c +++ b/src/compute/skc/extent_ring.c @@ -202,4 +202,3 @@ skc_extent_ring_snap_to(struct skc_extent_ring_snap const * const snap) // // // - diff --git a/src/compute/skc/main.c b/src/compute/skc/main.c index 067d24c773..2af8ebb6fc 100644 --- a/src/compute/skc/main.c +++ b/src/compute/skc/main.c @@ -69,7 +69,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context); // int -main(int argc, char** argv) +main(int argc, char const * argv[]) { // // @@ -242,7 +242,8 @@ main(int argc, char** argv) skc_composition_seal(composition); } - uint32_t const clip[] = { 0, 0, 65535, 65535 }; // tile clip is <= 9 bits (512) + uint32_t const clip[] = { 0, 0, 65535, 65535 }; + int32_t const txty[] = { 0, 0 }; // render the styled composition to the surface skc_surface_render(surface, @@ -250,6 +251,7 @@ main(int argc, char** argv) composition, skc_interop_get_framebuffer(interop), clip, + txty, NULL, NULL); diff --git a/src/compute/skc/path_builder.h b/src/compute/skc/path_builder.h index a956475f49..1c32d6a166 100644 --- a/src/compute/skc/path_builder.h +++ b/src/compute/skc/path_builder.h @@ -82,4 +82,3 @@ struct skc_path_builder // // // - diff --git a/src/compute/skc/platforms/cl_12/allocator_device_cl.c b/src/compute/skc/platforms/cl_12/allocator_device_cl.c index 8003504706..425952d09a 100644 --- a/src/compute/skc/platforms/cl_12/allocator_device_cl.c +++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.c @@ -133,4 +133,3 @@ skc_allocator_device_dispose(struct skc_runtime * const 
runtime) // // // - diff --git a/src/compute/skc/platforms/cl_12/allocator_device_cl.h b/src/compute/skc/platforms/cl_12/allocator_device_cl.h index 08c4518a6a..17011b7233 100644 --- a/src/compute/skc/platforms/cl_12/allocator_device_cl.h +++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.h @@ -51,4 +51,3 @@ skc_allocator_device_dispose(struct skc_runtime * const runtime); // // // - diff --git a/src/compute/skc/platforms/cl_12/composition_cl_12.c b/src/compute/skc/platforms/cl_12/composition_cl_12.c index 5db86762f3..96e7834b03 100644 --- a/src/compute/skc/platforms/cl_12/composition_cl_12.c +++ b/src/compute/skc/platforms/cl_12/composition_cl_12.c @@ -380,7 +380,8 @@ static void skc_composition_sort_grid_pfn_execute(skc_grid_t const grid) { - struct skc_composition_impl * const impl = skc_grid_get_data(grid); + struct skc_composition_impl * const impl = skc_grid_get_data(grid); + struct skc_runtime * const runtime = impl->runtime; // we should be sealing assert(impl->state == SKC_COMPOSITION_STATE_SEALING); @@ -395,22 +396,24 @@ skc_composition_sort_grid_pfn_execute(skc_grid_t const grid) { uint32_t keys_padded_in, keys_padded_out; - hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out); + hs_cl_pad(runtime->hs,atomics->keys,&keys_padded_in,&keys_padded_out); - hs_sort(impl->cq, - impl->keys.drw, - impl->keys.drw, - atomics->keys, - keys_padded_in, - keys_padded_out, - false); + hs_cl_sort(impl->runtime->hs, + impl->cq, + 0,NULL,NULL, + impl->keys.drw, + NULL, + atomics->keys, + keys_padded_in, + keys_padded_out, + false); cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(impl->keys.drw))); cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(impl->offsets.drw))); cl(SetKernelArg(impl->kernels.segment,2,SKC_CL_ARG(impl->atomics.drw))); // find start of each tile - skc_device_enqueue_kernel(impl->runtime->device, + skc_device_enqueue_kernel(runtime->device, SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK, impl->cq, impl->kernels.segment, diff --git 
a/src/compute/skc/platforms/cl_12/export_cl_12.h b/src/compute/skc/platforms/cl_12/export_cl_12.h index 23ff2343e6..244a5282f6 100644 --- a/src/compute/skc/platforms/cl_12/export_cl_12.h +++ b/src/compute/skc/platforms/cl_12/export_cl_12.h @@ -60,4 +60,3 @@ skc_surface_cl_12_create(struct skc_context * const context, // // // - diff --git a/src/compute/skc/platforms/cl_12/extent_cl_12.c b/src/compute/skc/platforms/cl_12/extent_cl_12.c index e145d979c2..2d90d0ecfa 100644 --- a/src/compute/skc/platforms/cl_12/extent_cl_12.c +++ b/src/compute/skc/platforms/cl_12/extent_cl_12.c @@ -166,7 +166,7 @@ skc_extent_thr_tdrw_alloc(struct skc_runtime * const runtime, { extent->size = size; extent->hr = skc_runtime_host_temp_alloc(runtime, - SKC_MEM_FLAGS_READ_WRITE, + SKC_MEM_FLAGS_READ_ONLY, size,&extent->id.hr,NULL); extent->drw = skc_runtime_device_temp_alloc(runtime, CL_MEM_READ_WRITE, diff --git a/src/compute/skc/platforms/cl_12/interop/interop_glfw.c b/src/compute/skc/platforms/cl_12/interop/interop_glfw.c index 8f94100552..f3c11ee9f1 100644 --- a/src/compute/skc/platforms/cl_12/interop/interop_glfw.c +++ b/src/compute/skc/platforms/cl_12/interop/interop_glfw.c @@ -354,7 +354,7 @@ skc_interop_create() .interop = interop, .post_render = skc_interop_blit }, - .is_msecs = true, + .is_msecs = false, .is_srgb = true, .is_vsync_on = false, .is_fullscreen = false, @@ -747,5 +747,3 @@ skc_interop_get_size(struct skc_interop * interop, // // // - - diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c index 0be97235f3..9ff0ba53b7 100644 --- a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c @@ -23,7 +23,7 @@ #include "device_cl_12.h" #include "hs/cl/hs_cl_launcher.h" -#include "hs/cl/gen9/hs_cl.h" +#include "hs/cl/intel/gen8/u64/hs_target.h" // // @@ -500,11 +500,11 @@ 
skc_device_shaper_segment_ttrk(size_t const work_size, size_t * const work_local) { // work_size is number of keys -- round up to a whole slab - size_t keys_ru = SKC_ROUND_UP(work_size,HS_LANES_PER_WARP*HS_KEYS_PER_LANE); + size_t keys_ru = SKC_ROUND_UP(work_size,HS_SLAB_WIDTH*HS_SLAB_HEIGHT); work_dim [0] = 1; - work_global[0] = keys_ru / HS_KEYS_PER_LANE; - work_local [0] = HS_LANES_PER_WARP; // or just return NULL + work_global[0] = keys_ru / HS_SLAB_HEIGHT; + work_local [0] = HS_SLAB_WIDTH; // or just return NULL return work_local; } @@ -517,11 +517,11 @@ skc_device_shaper_segment_ttck(size_t const work_size, size_t * const work_local) { // work_size is number of keys -- round up to a whole slab - size_t keys_ru = SKC_ROUND_UP(work_size,HS_LANES_PER_WARP*HS_KEYS_PER_LANE); + size_t keys_ru = SKC_ROUND_UP(work_size,HS_SLAB_WIDTH*HS_SLAB_HEIGHT); work_dim [0] = 1; - work_global[0] = keys_ru / HS_KEYS_PER_LANE; - work_local [0] = HS_LANES_PER_WARP; // or just return NULL + work_global[0] = keys_ru / HS_SLAB_HEIGHT; + work_local [0] = HS_SLAB_WIDTH; // or just return NULL return work_local; } @@ -894,8 +894,10 @@ skc_device_create(struct skc_runtime * const runtime) SKC_DEVICE_BUILD_PROGRAM(paths_reclaim); SKC_DEVICE_BUILD_PROGRAM(rasters_reclaim); - // create HotSort instance -- FIXME -- how this occurs needs to be cleaned up - hs_create(runtime->cl.context,runtime->cl.device_id,NULL); + // create HotSort instance + runtime->hs = hs_cl_create(&hs_target, + runtime->cl.context, + runtime->cl.device_id); } void @@ -906,6 +908,8 @@ skc_device_dispose(struct skc_runtime * const runtime) // skc_runtime_host_perm_free(runtime,runtime->device); + + // dispose of hotsort etc. 
} // diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h index 0cac2261e7..224d5c9d91 100644 --- a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h @@ -17,12 +17,6 @@ #include "block.h" -// -// -// - -#include - // // HOW TO SELECT A SUBBLOCK AND BLOCK SIZES: // diff --git a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl index f20f6456b3..a879c99b00 100644 --- a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl +++ b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl @@ -1029,8 +1029,8 @@ skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT c { return max(1.0f, ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC * - SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x), - fabs(t2y - 2.0f * t1y + t0y))))); + SKC_WANG_LENGTH(t2x - 2.0f * t1x + t0x, + t2y - 2.0f * t1y + t0y)))); } // diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl index 7f48978782..a6a2df661c 100644 --- a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl @@ -15,13 +15,13 @@ #include "tile.h" #include "atomic_cl.h" #include "kernel_cl_12.h" +#include "hs/cl/intel/gen8/u64/hs_cl_macros.h" // // // -#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) -#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) +#define HS_LANE_MASK (HS_SLAB_WIDTH - 1) // // @@ -35,23 +35,23 @@ // __kernel -__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) +__attribute__((intel_reqd_sub_group_size(HS_SLAB_WIDTH))) void skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout, __global uint * SKC_RESTRICT const indices, __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics) { uint const 
global_id = get_global_id(0); - uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; + uint const gmem_base = (global_id >> HS_SLAB_WIDTH_LOG2) * HS_SLAB_KEYS; uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); - uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; + uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_SLAB_HEIGHT; // // LOAD ALL THE ROWS // #undef HS_SLAB_ROW #define HS_SLAB_ROW(row,prev) \ - HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; + HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_SLAB_WIDTH]; HS_SLAB_ROWS(); @@ -103,11 +103,11 @@ skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const v // uint next = 0; - if (get_sub_group_local_id() == HS_LANES_PER_WARP-1) + if (get_sub_group_local_id() == HS_SLAB_WIDTH-1) next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset // distribute base across subgroup - next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1); + next = exclusive + sub_group_broadcast(next,HS_SLAB_WIDTH-1); // // STORE THE INDICES diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl index 9db82d5f98..c4ace0b2a1 100644 --- a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl @@ -13,15 +13,15 @@ // #include "tile.h" -#include "raster_builder_cl_12.h" // need meta_in structure #include "kernel_cl_12.h" +#include "raster_builder_cl_12.h" // need meta_in structure +#include "hs/cl/intel/gen8/u64/hs_cl_macros.h" // // // -#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) -#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) +#define HS_LANE_MASK (HS_SLAB_WIDTH - 1) // // THE BEST TYPE TO ZERO SMEM @@ -39,7 +39,7 @@ // 3: rk // -#if (HS_KEYS_PER_SLAB < 256) +#if (HS_SLAB_KEYS < 256) #define SKC_META_TYPE uint #define SKC_META_WORDS 1 @@ -96,7 
+96,7 @@ #define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS) #define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE)) -#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2)) +#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_SLAB_WIDTH_LOG2)) #define SKC_META_COMPONENTS 4 #define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE)) @@ -106,7 +106,7 @@ // __kernel -__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) +__attribute__((intel_reqd_sub_group_size(HS_SLAB_WIDTH))) void skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout, __global uint * SKC_RESTRICT const metas) @@ -119,16 +119,16 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout, } shared; uint const global_id = get_global_id(0); - uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; + uint const gmem_base = (global_id >> HS_SLAB_WIDTH_LOG2) * HS_SLAB_KEYS; uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); - uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; + uint const gmem_off = (global_id & HS_LANE_MASK) * HS_SLAB_HEIGHT; // // LOAD ALL THE ROWS // #undef HS_SLAB_ROW #define HS_SLAB_ROW(row,prev) \ - HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; + HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_SLAB_WIDTH]; HS_SLAB_ROWS(); @@ -169,7 +169,7 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout, // DEBUG // #if 0 - if (gmem_base == HS_KEYS_PER_SLAB * 7) + if (gmem_base == HS_SLAB_KEYS * 7) { if (get_sub_group_local_id() == 0) printf("\n%llX ",as_ulong(r0)); @@ -267,14 +267,14 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout, // the min cohort is the first key in the slab uint const c_min = sub_group_broadcast(c1,0); - + // the max cohort is the max across all lanes c_max = 
sub_group_reduce_max(c_max); #if 0 // REMOVE ME LATER if (get_sub_group_local_id() == 0) printf("%3u : ( %3u , %3u )\n", - get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max); + get_global_id(0)>>HS_SLAB_WIDTH_LOG2,c_min,c_max); #endif // @@ -286,7 +286,7 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout, uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id(); uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO; - for (; zz<=zz_max; zz+=HS_LANES_PER_WARP) + for (; zz<=zz_max; zz+=HS_SLAB_WIDTH) shared.z[zz] = 0; #else // ERROR -- it's highly unlikely that the zero type is smaller than @@ -348,7 +348,7 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout, // ATOMICALLY ADD THE CARRIED OUT METAS // #if 0 // BUG - if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0)) + if ((valid & (1<<(HS_SLAB_HEIGHT-1))) && (meta != 0)) SKC_META_LOCAL_ADD(meta); #else if (meta != 0) @@ -378,9 +378,9 @@ skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout, atomic_add(metas+cc,c+adjust); } - cc += HS_LANES_PER_WARP; + cc += HS_SLAB_WIDTH; - for (; cc<=cc_max; cc+=HS_LANES_PER_WARP) + for (; cc<=cc_max; cc+=HS_SLAB_WIDTH) { uint const c = shared.c[cc]; diff --git a/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c index d84b92bfd7..507e1bf077 100644 --- a/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c +++ b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c @@ -566,15 +566,17 @@ skc_raster_cohort_sort_prefix(skc_grid_t const grid) // uint32_t keys_padded_in, keys_padded_out; - hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out); - - hs_sort(cohort->cq, - cohort->keys.drw, - cohort->keys.drw, - atomics->keys, - keys_padded_in, - keys_padded_out, - false); + hs_cl_pad(runtime->hs,atomics->keys,&keys_padded_in,&keys_padded_out); + + hs_cl_sort(runtime->hs, + cohort->cq, + 0,NULL,NULL, + cohort->keys.drw, + 
NULL, + atomics->keys, + keys_padded_in, + keys_padded_out, + false); cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(cohort->keys.drw))); cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(cohort->metas.drw))); diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.c b/src/compute/skc/platforms/cl_12/runtime_cl_12.c index 81f3aba02f..55b2854c4d 100644 --- a/src/compute/skc/platforms/cl_12/runtime_cl_12.c +++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.c @@ -277,4 +277,3 @@ skc_runtime_cl_12_debug(struct skc_context * const context) // // // - diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.h b/src/compute/skc/platforms/cl_12/runtime_cl_12.h index beb924f3ca..1cfd51161e 100644 --- a/src/compute/skc/platforms/cl_12/runtime_cl_12.h +++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.h @@ -58,6 +58,8 @@ struct skc_runtime struct skc_config const * config; // FIXME: config will be determined by device with some opportunities to resize struct skc_device * device; // opaque bundle of kernels + + struct hs_cl const * hs; // opaque hotsort }; // @@ -185,4 +187,3 @@ skc_runtime_cl_12_debug(struct skc_context * const context); // // // - diff --git a/src/compute/skc/platforms/cl_12/surface_cl_12.c b/src/compute/skc/platforms/cl_12/surface_cl_12.c index c4e205a04b..e5f79c2d53 100644 --- a/src/compute/skc/platforms/cl_12/surface_cl_12.c +++ b/src/compute/skc/platforms/cl_12/surface_cl_12.c @@ -56,6 +56,7 @@ struct skc_surface_impl struct skc_surface_render { skc_uint clip[4]; + skc_uint txty[2]; struct skc_surface_impl * impl; struct skc_styling * styling; @@ -329,6 +330,7 @@ skc_surface_pfn_render(struct skc_surface_impl * const impl, skc_composition_t composition, skc_framebuffer_t fb, uint32_t const clip[4], + int32_t const txty[2], skc_surface_render_notify notify, void * data) { @@ -359,6 +361,9 @@ skc_surface_pfn_render(struct skc_surface_impl * const impl, render->clip[2] = clip[2]; render->clip[3] = clip[3]; + render->txty[0] = txty[0]; + 
render->txty[1] = txty[1]; + render->impl = impl; render->styling = styling; render->composition = composition; diff --git a/src/compute/skc/skc.h b/src/compute/skc/skc.h index a5e81fb2ff..53d5f273af 100644 --- a/src/compute/skc/skc.h +++ b/src/compute/skc/skc.h @@ -323,6 +323,7 @@ skc_surface_render(skc_surface_t surface, skc_composition_t composition, skc_framebuffer_t fb, uint32_t const clip[4], + int32_t const txty[2], skc_surface_render_notify notify, void * data); diff --git a/src/compute/skc/styling.h b/src/compute/skc/styling.h index 310a739a07..b5326f6090 100644 --- a/src/compute/skc/styling.h +++ b/src/compute/skc/styling.h @@ -52,4 +52,3 @@ struct skc_styling // // // - diff --git a/src/compute/skc/styling_types.h b/src/compute/skc/styling_types.h index ee0e7aa7e3..10442e8f05 100644 --- a/src/compute/skc/styling_types.h +++ b/src/compute/skc/styling_types.h @@ -246,5 +246,3 @@ SKC_STATIC_ASSERT(sizeof(union skc_gradient_vector) == sizeof(skc_float4)); // // // - - diff --git a/src/compute/skc/surface.c b/src/compute/skc/surface.c index 3d96bb65ac..107c02dd84 100644 --- a/src/compute/skc/surface.c +++ b/src/compute/skc/surface.c @@ -44,6 +44,7 @@ skc_surface_render(skc_surface_t surface, skc_composition_t composition, skc_framebuffer_t fb, uint32_t const clip[4], + int32_t const txty[2], skc_surface_render_notify notify, void * data) { @@ -70,7 +71,10 @@ skc_surface_render(skc_surface_t surface, // non-overlapping clips. This is fairly easy but at this point // doesn't seem like a common use case. 
// - surface->render(surface->impl,styling,composition,fb,clip,notify,data); + surface->render(surface->impl, + styling,composition, + fb,clip,txty, + notify,data); return SKC_ERR_SUCCESS; } diff --git a/src/compute/skc/surface.h b/src/compute/skc/surface.h index 94f9128841..8d363569cb 100644 --- a/src/compute/skc/surface.h +++ b/src/compute/skc/surface.h @@ -33,6 +33,7 @@ struct skc_surface skc_composition_t composition, skc_framebuffer_t fb, uint32_t const clip[4], + int32_t const txty[2], skc_surface_render_notify notify, void * data); }; diff --git a/src/compute/skc/weakref.h b/src/compute/skc/weakref.h index c6ce6490f8..d239b7e9f7 100644 --- a/src/compute/skc/weakref.h +++ b/src/compute/skc/weakref.h @@ -46,5 +46,3 @@ skc_weakref_index(skc_weakref_t const * const weakref); // // // - - -- cgit v1.2.3