diff options
Diffstat (limited to 'src/compute/hs/vk/intel')
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u32/make_all.bat | 48 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u32b32/make_all.bat | 48 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u64/hs_glsl.h | 100 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h | 417 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u64/hs_kernels.h | 75 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u64/hs_target.h | 113 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u64/make_all.bat | 79 |
7 files changed, 880 insertions, 0 deletions
diff --git a/src/compute/hs/vk/intel/gen8/u32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32/make_all.bat new file mode 100644 index 0000000000..9afd7b3a72 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u32/make_all.bat @@ -0,0 +1,48 @@ +@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM Run hs_gen (presumably emits the *.comp shaders), then for each *.comp:
+REM preprocess (cl -EP), compile (glslc), optimize (spirv-opt) and dump the
+REM SPIR-V plus its byte count as xxd C-array text (*.spv.xxd, *.len.xxd).
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys --- NOTE(review): only enabled run, yet this dir is u32; confirm
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+ )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat new file mode 100644 index 0000000000..9afd7b3a72 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat @@ -0,0 +1,48 @@ +@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM Run hs_gen (presumably emits the *.comp shaders), then for each *.comp:
+REM preprocess (cl -EP), compile (glslc), optimize (spirv-opt) and dump the
+REM SPIR-V plus its byte count as xxd C-array text (*.spv.xxd, *.len.xxd).
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys --- NOTE(review): only enabled run, yet this dir is u32b32; confirm
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+ )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h new file mode 100644 index 0000000000..d4376114e5 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h @@ -0,0 +1,100 @@ +//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#ifndef HS_GLSL_ONCE
+#define HS_GLSL_ONCE
+
+#define HS_SLAB_THREADS_LOG2 3
+#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2)
+#define HS_SLAB_WIDTH_LOG2 3
+#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2)
+#define HS_SLAB_HEIGHT 16
+#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)
+#define HS_REG_LAST(c) c##16
+#define HS_KEY_TYPE uint64_t
+#define HS_KEY_WORDS 2
+#define HS_VAL_WORDS 0
+#define HS_BS_SLABS 16
+#define HS_BS_SLABS_LOG2_RU 4
+#define HS_BC_SLABS_LOG2_MAX 4
+#define HS_FM_SCALE_MIN 1
+#define HS_FM_SCALE_MAX 1
+#define HS_HM_SCALE_MIN 1
+#define HS_HM_SCALE_MAX 1
+#define HS_EMPTY
+
+#define HS_SLAB_ROWS() \
+  HS_SLAB_ROW( 1, 0 ) \
+  HS_SLAB_ROW( 2, 1 ) \
+  HS_SLAB_ROW( 3, 2 ) \
+  HS_SLAB_ROW( 4, 3 ) \
+  HS_SLAB_ROW( 5, 4 ) \
+  HS_SLAB_ROW( 6, 5 ) \
+  HS_SLAB_ROW( 7, 6 ) \
+  HS_SLAB_ROW( 8, 7 ) \
+  HS_SLAB_ROW( 9, 8 ) \
+  HS_SLAB_ROW( 10, 9 ) \
+  HS_SLAB_ROW( 11, 10 ) \
+  HS_SLAB_ROW( 12, 11 ) \
+  HS_SLAB_ROW( 13, 12 ) \
+  HS_SLAB_ROW( 14, 13 ) \
+  HS_SLAB_ROW( 15, 14 ) \
+  HS_SLAB_ROW( 16, 15 ) \
+  HS_EMPTY
+
+#define HS_TRANSPOSE_SLAB() \
+  HS_TRANSPOSE_STAGE( 1 ) \
+  HS_TRANSPOSE_STAGE( 2 ) \
+  HS_TRANSPOSE_STAGE( 3 ) \
+  HS_TRANSPOSE_BLEND( r, s, 1, 2, 1 ) \
+  HS_TRANSPOSE_BLEND( r, s, 1, 4, 3 ) \
+  HS_TRANSPOSE_BLEND( r, s, 1, 6, 5 ) \
+  HS_TRANSPOSE_BLEND( r, s, 1, 8, 7 ) \
+  HS_TRANSPOSE_BLEND( r, s, 1, 10, 9 ) \
+  HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 ) \
+  HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 ) \
+  HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 ) \
+  HS_TRANSPOSE_BLEND( s, t, 2, 3, 1 ) \
+  HS_TRANSPOSE_BLEND( s, t, 2, 4, 2 ) \
+  HS_TRANSPOSE_BLEND( s, t, 2, 7, 5 ) \
+  HS_TRANSPOSE_BLEND( s, t, 2, 8, 6 ) \
+  HS_TRANSPOSE_BLEND( s, t, 2, 11, 9 ) \
+  HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 ) \
+  HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 ) \
+  HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 ) \
+  HS_TRANSPOSE_BLEND( t, u, 3, 5, 1 ) \
+  HS_TRANSPOSE_BLEND( t, u, 3, 6, 2 ) \
+  HS_TRANSPOSE_BLEND( t, u, 3, 7, 3 ) \
+  HS_TRANSPOSE_BLEND( t, u, 3, 8, 4 ) \
+  HS_TRANSPOSE_BLEND( t, u, 3, 13, 9 ) \
+  HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 ) \
+  HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 ) \
+  HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 ) \
+  HS_TRANSPOSE_REMAP( u, 1, 1 ) \
+  HS_TRANSPOSE_REMAP( u, 2, 3 ) \
+  HS_TRANSPOSE_REMAP( u, 3, 5 ) \
+  HS_TRANSPOSE_REMAP( u, 4, 7 ) \
+  HS_TRANSPOSE_REMAP( u, 5, 9 ) \
+  HS_TRANSPOSE_REMAP( u, 6, 11 ) \
+  HS_TRANSPOSE_REMAP( u, 7, 13 ) \
+  HS_TRANSPOSE_REMAP( u, 8, 15 ) \
+  HS_TRANSPOSE_REMAP( u, 9, 2 ) \
+  HS_TRANSPOSE_REMAP( u, 10, 4 ) \
+  HS_TRANSPOSE_REMAP( u, 11, 6 ) \
+  HS_TRANSPOSE_REMAP( u, 12, 8 ) \
+  HS_TRANSPOSE_REMAP( u, 13, 10 ) \
+  HS_TRANSPOSE_REMAP( u, 14, 12 ) \
+  HS_TRANSPOSE_REMAP( u, 15, 14 ) \
+  HS_TRANSPOSE_REMAP( u, 16, 16 ) \
+  HS_EMPTY
+
+#endif
+
+// NOTE: make_all.bat treats this header as a generated file. The values above
+// give a slab of HS_SLAB_WIDTH (8) x HS_SLAB_HEIGHT (16) = 128 keys of type
+// uint64_t (HS_KEY_WORDS 2, HS_VAL_WORDS 0); rows are numbered from 1.
+
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h new file mode 100644 index 0000000000..c67dffa3a0 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h @@ -0,0 +1,417 @@ +//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#ifndef HS_GLSL_MACROS_ONCE
+#define HS_GLSL_MACROS_ONCE
+
+//
+// Helpers that spell GLSL '#extension' / '#version' directives through
+// HS_HASH -- presumably so they pass through the 'cl -EP' step; confirm
+
+#define HS_HASH #
+#define HS_EVAL(a) a
+#define HS_GLSL_EXT() HS_EVAL(HS_HASH)##extension
+#define HS_GLSL_EXT_ENABLE(name) HS_GLSL_EXT() name : enable
+#define HS_GLSL_VERSION(ver) HS_EVAL(HS_HASH)##version ver
+
+//
+// Extensions enabled for every shader that includes this header
+//
+
+// HS_GLSL_VERSION(460)
+
+HS_GLSL_EXT_ENABLE(GL_ARB_gpu_shader_int64)
+HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_shuffle)
+HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_basic)
+
+//
+// Target configuration (HS_KEY_TYPE, HS_KEY_WORDS, HS_SLAB_*, ...)
+//
+
+#include "hs_glsl.h"
+
+//
+// Subgroup shuffles: when HS_KEY_WORDS == 2 the key is bit-cast to double
+// for the shuffle and back to uint64 afterwards; 1-word keys pass through
+
+#if (HS_KEY_WORDS == 1)
+#define HS_SHUFFLE_CAST_TO(v) v
+#define HS_SHUFFLE_CAST_FROM(v) v
+#elif (HS_KEY_WORDS == 2)
+#define HS_SHUFFLE_CAST_TO(v) uint64BitsToDouble(v)
+#define HS_SHUFFLE_CAST_FROM(v) doubleBitsToUint64(v)
+#endif
+
+#define HS_SUBGROUP_SHUFFLE(v,i) HS_SHUFFLE_CAST_FROM(subgroupShuffle(HS_SHUFFLE_CAST_TO(v),i))
+#define HS_SUBGROUP_SHUFFLE_XOR(v,m) HS_SHUFFLE_CAST_FROM(subgroupShuffleXor(HS_SHUFFLE_CAST_TO(v),m))
+#define HS_SUBGROUP_SHUFFLE_UP(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleUp(HS_SHUFFLE_CAST_TO(v),d))
+#define HS_SUBGROUP_SHUFFLE_DOWN(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleDown(HS_SHUFFLE_CAST_TO(v),d))
+
+//
+// This up/down shuffle has defined values for [0,subgroup size)
+// NOTE: both macros below currently expand to nothing in this target
+
+#define HS_SUBGROUP_SHUFFLE_UP_2(prev,curr,delta)
+
+#define HS_SUBGROUP_SHUFFLE_DOWN_2(curr,next,delta)
+
+//
+// FYI, restrict shouldn't have any impact on these kernels and
+// benchmarks appear to prove that true
+//
+
+#define HS_RESTRICT restrict
+
+//
+// Workgroup size as a GLSL layout qualifier; HS_GLSL_SUBGROUP_SIZE
+// expands to nothing in this target
+
+#define HS_GLSL_WORKGROUP_SIZE(x,y,z) \
+  layout (local_size_x = x, \
+          local_size_y = y, \
+          local_size_z = z) in
+
+#define HS_GLSL_SUBGROUP_SIZE(x)
+
+//
+// KERNEL PROTOS
+//
+
+#define HS_TRANSPOSE_KERNEL_PROTO(slab_width) \
+  buffer _vout { HS_KEY_TYPE vout[]; }; \
+  HS_GLSL_WORKGROUP_SIZE(slab_width,1,1); \
+  HS_GLSL_SUBGROUP_SIZE(slab_width) \
+  void main()
+
+#define HS_BS_KERNEL_PROTO(slab_width,slab_count,slab_count_ru_log2) \
+  buffer readonly _vin { HS_KEY_TYPE vin[]; }; \
+  buffer writeonly _vout { HS_KEY_TYPE vout[]; }; \
+  HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1); \
+  HS_GLSL_SUBGROUP_SIZE(slab_width) \
+  void main()
+
+#define HS_BC_KERNEL_PROTO(slab_width,slab_count,slab_count_log2) \
+  buffer _vout { HS_KEY_TYPE vout[]; }; \
+  HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1); \
+  HS_GLSL_SUBGROUP_SIZE(slab_width) \
+  void main()
+
+#define HS_HM_KERNEL_PROTO(s) \
+  buffer _vout { HS_KEY_TYPE vout[]; }; \
+  HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1); \
+  void main()
+
+#define HS_FM_KERNEL_PROTO(s,r) \
+  buffer _vout { HS_KEY_TYPE vout[]; }; \
+  HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1); \
+  void main()
+
+//
+// BLOCK LOCAL MEMORY DECLARATION
+//
+
+#define HS_BLOCK_LOCAL_MEM_DECL(width,height) \
+  shared struct { \
+    HS_KEY_TYPE m[width * height]; \
+  } smem
+
+//
+// BLOCK BARRIER
+//
+
+#define HS_BLOCK_BARRIER() \
+  barrier()
+
+//
+// SLAB GLOBAL
+//
+
+#define HS_SLAB_GLOBAL_PREAMBLE(slab_width,slab_height) \
+  const uint gmem_idx = \
+    (gl_GlobalInvocationID.x & ~(slab_width-1)) * slab_height + \
+    gl_SubgroupInvocationID
+
+#define HS_SLAB_GLOBAL_LOAD(extent,slab_width,row_idx) \
+  extent[gmem_idx + slab_width * row_idx]
+
+#define HS_SLAB_GLOBAL_STORE(slab_width,row_idx,reg) \
+  vout[gmem_idx + slab_width * row_idx] = reg
+
+//
+// SLAB LOCAL
+//
+
+#define HS_SLAB_LOCAL_L(offset) \
+  smem.m[smem_l_idx + (offset)]
+
+#define HS_SLAB_LOCAL_R(offset) \
+  smem.m[smem_r_idx + (offset)]
+
+//
+// SLAB LOCAL VERTICAL LOADS
+//
+
+#define HS_BX_LOCAL_V(offset) \
+  smem.m[gl_LocalInvocationID.x + (offset)]
+
+//
+// BLOCK SORT MERGE HORIZONTAL
+//
+
+#define HS_BS_MERGE_H_PREAMBLE(slab_width,slab_count) \
+  const uint smem_l_idx = \
+    gl_SubgroupID * (slab_width * slab_count) + \
+    gl_SubgroupInvocationID; \
+  const uint smem_r_idx = \
+    (gl_SubgroupID ^ 1) * (slab_width * slab_count) + \
+    (gl_SubgroupInvocationID ^ (slab_width - 1))
+
+//
+// BLOCK CLEAN MERGE HORIZONTAL
+//
+
+#define HS_BC_MERGE_H_PREAMBLE(slab_width,slab_height,slab_count) \
+  const uint gmem_l_idx = \
+    (gl_GlobalInvocationID.x & ~(slab_width*slab_count-1)) * slab_height + \
+    gl_LocalInvocationID.x; \
+  const uint smem_l_idx = \
+    gl_SubgroupID * (slab_width * slab_count) + \
+    gl_SubgroupInvocationID
+
+#define HS_BC_GLOBAL_LOAD_L(slab_width,slab_idx) \
+  vout[gmem_l_idx + (slab_width * slab_idx)]
+
+//
+// SLAB FLIP AND HALF PREAMBLES
+//
+
+#define HS_SLAB_FLIP_PREAMBLE(mask) \
+  const uint flip_lane_idx = gl_SubgroupInvocationID ^ mask; \
+  const bool t_lt = gl_SubgroupInvocationID < flip_lane_idx;
+
+#define HS_SLAB_HALF_PREAMBLE(mask) \
+  const uint half_lane_idx = gl_SubgroupInvocationID ^ mask; \
+  const bool t_lt = gl_SubgroupInvocationID < half_lane_idx;
+
+//
+// Compare exchange of two keys held by the same invocation
+//
+
+// default
+#define HS_CMP_XCHG_V0(a,b) \
+  { \
+    const HS_KEY_TYPE t = min(a,b); \
+    b = max(a,b); \
+    a = t; \
+  }
+
+// super slow
+#define HS_CMP_XCHG_V1(a,b) \
+  { \
+    const HS_KEY_TYPE tmp = a; \
+    a = (a < b) ? a : b; \
+    b ^= a ^ tmp; \
+  }
+
+// best (NOTE: expands to a bare 'if' statement, not a braced block like the others)
+#define HS_CMP_XCHG_V2(a,b) \
+  if (a >= b) { \
+    const HS_KEY_TYPE t = a; \
+    a = b; \
+    b = t; \
+  }
+
+// good
+#define HS_CMP_XCHG_V3(a,b) \
+  { \
+    const bool ge = a >= b; \
+    const HS_KEY_TYPE t = a; \
+    a = ge ? b : a; \
+    b = ge ? t : b; \
+  }
+
+//
+// Select the compare-exchange variant by HS_KEY_WORDS
+//
+
+#if (HS_KEY_WORDS == 1)
+#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b)
+#endif
+
+//
+// The flip/half comparisons rely on a "conditional min/max":
+//
+// - if the flag is true, return min(a,b)
+// - otherwise, return max(a,b)
+//
+// What's a little surprising is that sequence (1), the xor/mask form
+// (V1), is faster than (2), the plain ternary (V0), for 32-bit keys.
+//
+// I suspect either a code generation problem or that the sequence
+// maps well to the GEN instruction set.
+//
+// We mostly care about 64-bit keys and unsurprisingly sequence (2) is
+// fastest for this wider type.
+// + +#define HS_LOGICAL_XOR() != + +// this is what you would normally use +#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) HS_LOGICAL_XOR() lt) ? b : a + +// this seems to be faster for 32-bit keys +#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b)) + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b) +#elif (HS_KEY_WORDS == 2) +#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b) +#endif + +// +// Conditional inter-subgroup flip/half compare exchange +// + +#define HS_CMP_FLIP(i,a,b) \ + { \ + const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,flip_lane_idx); \ + const HS_KEY_TYPE tb = HS_SUBGROUP_SHUFFLE(b,flip_lane_idx); \ + a = HS_COND_MIN_MAX(t_lt,a,tb); \ + b = HS_COND_MIN_MAX(t_lt,b,ta); \ + } + +#define HS_CMP_HALF(i,a) \ + { \ + const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,half_lane_idx); \ + a = HS_COND_MIN_MAX(t_lt,a,ta); \ + } + +// +// The device's comparison operator might return what we actually +// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}. 
+//
+
+#define HS_CMP_IS_ZERO_ONE
+
+#ifdef HS_CMP_IS_ZERO_ONE
+// OpenCL requires a {true: +1, false: 0} scalar result; NOTE(review): C-style casts below -- confirm they compile as GLSL
+// (a <= b) -> { +1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 }
+#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
+#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a)
+#else
+// However, OpenCL requires { -1, 0 } for vectors
+// (a <= b) -> { 0xFFFFFFFF, 0 }
+#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
+#define HS_CMP_TO_MASK(a) (a)
+#endif
+
+//
+// The "flip-merge" and "half-merge" preambles are very similar
+//
+
+#define HS_HM_PREAMBLE(half_span) \
+  const uint span_idx = gl_GlobalInvocationID.z * gl_NumWorkGroups.y + gl_GlobalInvocationID.y; \
+  const uint span_stride = gl_NumWorkGroups.x * gl_WorkGroupSize.x; \
+  const uint span_size = span_stride * half_span * 2; \
+  const uint span_base = span_idx * span_size; \
+  const uint span_off = gl_GlobalInvocationID.x; \
+  const uint span_l = span_base + span_off
+
+#define HS_FM_PREAMBLE(half_span) \
+  HS_HM_PREAMBLE(half_span); \
+  const uint span_r = span_base + span_stride * (half_span + 1) - span_off - 1
+
+//
+// Global accessors into vout: the "L" forms index from span_l (HS_HM_PREAMBLE),
+// the "R" forms from span_r (HS_FM_PREAMBLE); both step by span_stride
+
+#define HS_XM_GLOBAL_L(stride_idx) \
+  vout[span_l + span_stride * stride_idx]
+
+#define HS_XM_GLOBAL_LOAD_L(stride_idx) \
+  HS_XM_GLOBAL_L(stride_idx)
+
+#define HS_XM_GLOBAL_STORE_L(stride_idx,reg) \
+  HS_XM_GLOBAL_L(stride_idx) = reg
+
+#define HS_FM_GLOBAL_R(stride_idx) \
+  vout[span_r + span_stride * stride_idx]
+
+#define HS_FM_GLOBAL_LOAD_R(stride_idx) \
+  HS_FM_GLOBAL_R(stride_idx)
+
+#define HS_FM_GLOBAL_STORE_R(stride_idx,reg) \
+  HS_FM_GLOBAL_R(stride_idx) = reg
+
+//
+// This snarl of macros is for transposing a "slab" of sorted elements
+// into linear order.
+//
+// This can occur as the last step in hs_sort() or via a custom kernel
+// that inspects the slab and then transposes and stores it to memory.
+//
+// The slab format can be inspected more efficiently than a linear
+// arrangement.
+//
+// The prime example is detecting when adjacent keys (in sort order)
+// have differing high order bits ("key changes"). The index of each
+// change is recorded to an auxiliary array.
+//
+// A post-processing step like this needs to be able to navigate the
+// slab and eventually transpose and store the slab in linear order.
+//
+
+#define HS_TRANSPOSE_REG(prefix,row) prefix##row
+#define HS_TRANSPOSE_DECL(prefix,row) const HS_KEY_TYPE HS_TRANSPOSE_REG(prefix,row)
+#define HS_TRANSPOSE_PRED(level) is_lo_##level
+
+#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) \
+  prefix_curr##row_ll##_##row_ur
+
+#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) \
+  const HS_KEY_TYPE HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)
+
+#define HS_TRANSPOSE_STAGE(level) \
+  const bool HS_TRANSPOSE_PRED(level) = \
+    (gl_SubgroupInvocationID & (1 << (level-1))) == 0;
+
+#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
+  HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) = \
+    HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ? \
+                            HS_TRANSPOSE_REG(prefix_prev,row_ll) : \
+                            HS_TRANSPOSE_REG(prefix_prev,row_ur), \
+                            1<<(level-1)); \
+  \
+  HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \
+    HS_TRANSPOSE_PRED(level) ? \
+    HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) : \
+    HS_TRANSPOSE_REG(prefix_prev,row_ll); \
+  \
+  HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \
+    HS_TRANSPOSE_PRED(level) ? \
+    HS_TRANSPOSE_REG(prefix_prev,row_ur) : \
+    HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur);
+
+#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \
+  vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \
+    HS_TRANSPOSE_REG(prefix,row_from);
+
+//
+// end of HS_GLSL_MACROS_ONCE
+//
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h new file mode 100644 index 0000000000..551fc52180 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h @@ -0,0 +1,75 @@ +//
+// Copyright 2016 Google Inc.
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// + +#include "hs_transpose.len.xxd" +, +#include "hs_transpose.spv.xxd" +, +#include "hs_bs_4.len.xxd" +, +#include "hs_bs_4.spv.xxd" +, +#include "hs_bs_3.len.xxd" +, +#include "hs_bs_3.spv.xxd" +, +#include "hs_bs_2.len.xxd" +, +#include "hs_bs_2.spv.xxd" +, +#include "hs_bs_1.len.xxd" +, +#include "hs_bs_1.spv.xxd" +, +#include "hs_bs_0.len.xxd" +, +#include "hs_bs_0.spv.xxd" +, +#include "hs_bc_4.len.xxd" +, +#include "hs_bc_4.spv.xxd" +, +#include "hs_bc_3.len.xxd" +, +#include "hs_bc_3.spv.xxd" +, +#include "hs_bc_2.len.xxd" +, +#include "hs_bc_2.spv.xxd" +, +#include "hs_bc_1.len.xxd" +, +#include "hs_bc_1.spv.xxd" +, +#include "hs_bc_0.len.xxd" +, +#include "hs_bc_0.spv.xxd" +, +#include "hs_fm_1_4.len.xxd" +, +#include "hs_fm_1_4.spv.xxd" +, +#include "hs_fm_1_3.len.xxd" +, +#include "hs_fm_1_3.spv.xxd" +, +#include "hs_fm_1_2.len.xxd" +, +#include "hs_fm_1_2.spv.xxd" +, +#include "hs_fm_1_1.len.xxd" +, +#include "hs_fm_1_1.spv.xxd" +, +#include "hs_fm_1_0.len.xxd" +, +#include "hs_fm_1_0.spv.xxd" +, +#include "hs_hm_1_0.len.xxd" +, +#include "hs_hm_1_0.spv.xxd" +, diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_target.h b/src/compute/hs/vk/intel/gen8/u64/hs_target.h new file mode 100644 index 0000000000..f379c23066 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_target.h @@ -0,0 +1,113 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +// +// +// + +#include "../../../hs_spirv_target.h" + +// +// +// + +#include "hs_glsl.h" + +// +// +// + +#ifndef HS_TARGET_NAME +#define HS_TARGET_NAME hs_target +#endif + +#define HS_TARGET_HELPER(a) a + +// +// +// + +static struct hs_spirv_target const HS_TARGET_NAME = +{ + .config = { + .slab = { + .threads_log2 = HS_SLAB_THREADS_LOG2, + .width_log2 = HS_SLAB_WIDTH_LOG2, + .height = HS_SLAB_HEIGHT + }, + + .words = { + .key = HS_KEY_WORDS, + .val = HS_VAL_WORDS + }, + + .block = { + .slabs = HS_BS_SLABS + }, + + .merge = { + .fm = { + .scale_min = HS_FM_SCALE_MIN, + .scale_max = HS_FM_SCALE_MAX + }, + .hm = { + .scale_min = HS_HM_SCALE_MIN, + .scale_max = HS_HM_SCALE_MAX, + } + }, + + .pad = { 0 } + }, + + .modules.bytes = { + +#include "hs_kernels.h" + +#ifdef HS_DUMP + 0,0,0,0 +#endif + } +}; + +// +// +// + +#ifdef HS_DUMP + +#include <stdlib.h> +#include <stdio.h> + +int +main(int argc, char const * argv[]) +{ + FILE * fp = fopen("hs_target.bin","wb"); + + fwrite(&HS_TARGET_NAME.config,1,sizeof(HS_TARGET_NAME.config),fp); + + uint8_t const * modules = HS_TARGET_NAME.modules.bytes; + size_t modsize = (modules[0]<<24) | (modules[1]<<16) | (modules[2]<<8) | modules[3]; + + while (modsize > 0) { + // fprintf(stderr,"%zu\n",modsize); + modsize += sizeof(uint32_t); + fwrite(modules,1,modsize,fp); + modules += modsize; + modsize = (modules[0]<<24) | (modules[1]<<16) | (modules[2]<<8) | modules[3]; + } + + fclose(fp); + + return EXIT_SUCCESS; +} + +#endif + +// +// +// diff --git a/src/compute/hs/vk/intel/gen8/u64/make_all.bat b/src/compute/hs/vk/intel/gen8/u64/make_all.bat new file mode 100644 index 0000000000..d148ef0113 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/make_all.bat @@ -0,0 +1,79 @@ +@ECHO OFF + +:: +:: delete the previous images +:: + +del *.pre.comp +del *.comp +del *.spv +del *.xxd + +:: +:: +:: + +set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen + +:: --- 32-bit keys --- + +:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 
24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+:: --- 64-bit keys
+
+%HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+::
+:: remove trailing whitespace from generated files
+::
+
+sed -i 's/[[:space:]]*$//' hs_glsl.h
+sed -i 's/[[:space:]]*$//' hs_kernels.h
+
+::
+:: FIXME -- convert this to a bash script
+::
+:: Note that we can use xargs instead of the cmd for/do
+::
+:: For each shader: normalize line endings, format, preprocess (cl -EP),
+:: compile (glslc), optimize (spirv-opt), then dump the SPIR-V and its byte
+:: count as xxd C-array text. Comments inside the parenthesized block use
+:: REM because "::" is a label and is fragile inside a code block.
+::
+
+for %%f in (*.comp) do (
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ REM glslangValidator -V110 -o %%~nf.spv %%~nf.pre.comp
+ glslc --target-env=vulkan1.1 -std=460 -I . -o %%~nf.spv %%~nf.pre.comp || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ REM spirv-remap ...
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ echo %%~nf.spv %%A
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+ )
+)
+
+::
+:: dump a binary: build hs_target.h as a C program with HS_DUMP defined and
+:: run it; stop with the failing exit code if either step fails
+::
+
+cl /DHS_DUMP /Fe:hs_dump.exe /Tchs_target.h || goto :error
+hs_dump || goto :error
+
+::
+:: delete temporary files
+:: NOTE(review): "del *.comp" presumably also matches *.pre.comp, so leaving
+:: the next line commented out may not keep those files -- confirm
+::
+
+:: del *.pre.comp
+del *.comp
+del *.spv
+del *.obj
+del *.exe
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel% |