diff options
author | 2018-07-16 15:57:05 -0700 | |
---|---|---|
committer | 2018-07-17 17:01:41 +0000 | |
commit | 9e0d7e4072e43495a3907bb2bac7824e8e60c368 (patch) | |
tree | baaff58dd81c1dc5e26668a8d517cbdf568bdb94 /src/compute/hs/vk | |
parent | 53c876900247ad700ce28f7b33031047a6cff402 (diff) |
Bug fixes and improvements to SKC and HotSort. Vulkan is WIP.
Bug: skia:
Change-Id: Iffc75a5b4dfcbfa4a6c23d972bb9798c2f550335
Reviewed-on: https://skia-review.googlesource.com/141582
Reviewed-by: Mike Reed <reed@google.com>
Reviewed-by: Allan MacKinnon <allanmac@google.com>
Commit-Queue: Allan MacKinnon <allanmac@google.com>
Diffstat (limited to 'src/compute/hs/vk')
-rw-r--r-- | src/compute/hs/vk/hs_spirv_target.h | 77 | ||||
-rw-r--r-- | src/compute/hs/vk/hs_vk_launcher.c | 248 | ||||
-rw-r--r-- | src/compute/hs/vk/hs_vk_launcher.h | 88 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u32/make_all.bat | 48 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u32b32/make_all.bat | 48 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u64/hs_glsl.h | 100 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h | 417 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u64/hs_kernels.h | 75 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u64/hs_target.h | 113 | ||||
-rw-r--r-- | src/compute/hs/vk/intel/gen8/u64/make_all.bat | 79 | ||||
-rw-r--r-- | src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat | 48 | ||||
-rw-r--r-- | src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat | 48 | ||||
-rw-r--r-- | src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat | 48 |
13 files changed, 1437 insertions, 0 deletions
diff --git a/src/compute/hs/vk/hs_spirv_target.h b/src/compute/hs/vk/hs_spirv_target.h
new file mode 100644
index 0000000000..aa711efc6d
--- /dev/null
+++ b/src/compute/hs/vk/hs_spirv_target.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+//
+// static_assert() is a macro provided by <assert.h> in C11 -- include
+// it here so this header compiles no matter what was included first.
+//
+
+#include <assert.h>
+#include <stdint.h>
+
+//
+// This structure packages all of the parameters and SPIR-V kernels
+// for a target architecture.
+//
+
+struct hs_spirv_target_config
+{
+  struct {
+    uint8_t threads_log2;
+    uint8_t width_log2;
+    uint8_t height;
+  } slab;
+
+  struct {
+    uint8_t key;
+    uint8_t val;
+  } words;
+
+  struct {
+    uint8_t slabs;
+  } block;
+
+  struct {
+    struct {
+      uint8_t scale_min;
+      uint8_t scale_max;
+    } fm;
+    struct {
+      uint8_t scale_min;
+      uint8_t scale_max;
+    } hm;
+  } merge;
+
+  uint8_t pad[2];
+};
+
+//
+// The config is ten bytes of parameters plus two bytes of padding so
+// that the modules that follow it start at a multiple of four bytes.
+//
+
+static_assert(sizeof(struct hs_spirv_target_config) == 12,
+              "modules.words[] must start on a 32-bit boundary");
+
+//
+// For now, kernels are appended end-to-end with a leading big-endian
+// length followed by a SPIR-V binary.
+//
+// The length is a 32-bit count of the bytes in the SPIR-V binary that
+// follows it and does not include the four length bytes themselves.
+//
+// The entry point for each kernel is "main".
+//
+// When the tools support packaging multiple named compute shaders in
+// one SPIR-V module then reevaluate this encoding.
+//
+
+struct hs_spirv_target
+{
+  struct hs_spirv_target_config config;
+  union {
+    uint8_t  bytes[];
+    uint32_t words[];
+  } modules;
+};
+
+//
+//
+//
diff --git a/src/compute/hs/vk/hs_vk_launcher.c b/src/compute/hs/vk/hs_vk_launcher.c
new file mode 100644
index 0000000000..e1080a0e8b
--- /dev/null
+++ b/src/compute/hs/vk/hs_vk_launcher.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/vk/assert_vk.h"
+#include "common/util.h"
+
+#include "hs_vk_launcher.h"
+#include "hs_spirv_target.h"
+
+//
+// Host-side state for one target: a copy of the target config, a few
+// values derived from it, the device/allocator used to create the
+// pipelines and the pipelines themselves.
+//
+// pipelines.all[] holds every pipeline handle.  The other pipeline
+// members point into all[] at the first handle of each kernel family
+// or are NULL when the family has no kernels.
+//
+
+struct hs_vk
+{
+  struct hs_spirv_target_config config;
+
+  uint32_t                      key_val_size;
+  uint32_t                      slab_keys;
+  uint32_t                      bs_slabs_log2_ru;
+  uint32_t                      bc_slabs_log2_max;
+
+  VkDevice                      device;
+  VkAllocationCallbacks const * allocator;
+
+  struct {
+    uint32_t     count;
+    VkPipeline * transpose;
+    VkPipeline * bs;
+    VkPipeline * bc;
+    VkPipeline * fm[3];
+    VkPipeline * hm[3];
+    VkPipeline   all[];
+  } pipelines;
+};
+
+//
+// Decode the big-endian 32-bit byte count that precedes each SPIR-V
+// module in a target.
+//
+
+static
+uint32_t
+hs_vk_load_be32(uint8_t const * const p)
+{
+  return ((uint32_t)p[0] << 24) |
+         ((uint32_t)p[1] << 16) |
+         ((uint32_t)p[2] <<  8) |
+         ((uint32_t)p[3]      );
+}
+
+//
+// Create one compute pipeline per SPIR-V module packaged in the target.
+//
+// Returns NULL when:
+//
+//   - a non-NULL allocator is supplied (not implemented yet)
+//   - the merge scales in the target config are outside what the
+//     kernel counting below can handle
+//   - the host allocation fails
+//
+
+struct hs_vk *
+hs_vk_create(struct hs_spirv_target const * const target,
+             VkDevice                             device,
+             VkAllocationCallbacks        const * allocator,
+             VkPipelineCache                      pipeline_cache)
+{
+  //
+  // allocating the hs_vk struct through VkAllocationCallbacks isn't
+  // implemented yet -- fail cleanly instead of writing through NULL
+  //
+  if (allocator != NULL)
+    return NULL;
+
+  //
+  // count_fm[] and count_hm[] only have slots for scales [0,2] and
+  // the flip-merge count below shifts by (scale-1) which is undefined
+  // for an unsigned scale of 0
+  //
+  // NOTE(review): the kernel count for flip-merge scale 0 still needs
+  // to be defined -- until then such targets are rejected
+  //
+  if ((target->config.merge.fm.scale_min == 0) ||
+      (target->config.merge.fm.scale_max >  2) ||
+      (target->config.merge.hm.scale_max >  2))
+    return NULL;
+
+  //
+  // we reference these values a lot
+  //
+  uint32_t const bs_slabs_log2_ru  = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
+  uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));
+
+  //
+  // how many kernels will be created?
+  //
+  uint32_t const count_bs    = bs_slabs_log2_ru  + 1;
+  uint32_t const count_bc    = bc_slabs_log2_max + 1;
+  uint32_t       count_fm[3] = { 0 };
+  uint32_t       count_hm[3] = { 0 };
+
+  // validated above to be in range [1,2]
+  for (uint32_t scale = target->config.merge.fm.scale_min;
+       scale <= target->config.merge.fm.scale_max;
+       scale++)
+    {
+      count_fm[scale] = msb_idx_u32(pow2_ru_u32(target->config.block.slabs>>(scale-1))) + 1;
+    }
+
+  // validated above to be in range [0,2]
+  for (uint32_t scale = target->config.merge.hm.scale_min;
+       scale <= target->config.merge.hm.scale_max;
+       scale++)
+    {
+      count_hm[scale] = 1;
+    }
+
+  uint32_t const count_all =
+    1
+    + count_bs
+    + count_bc
+    + count_fm[0] + count_fm[1] + count_fm[2]
+    + count_hm[0] + count_hm[1] + count_hm[2];
+
+  //
+  // allocate hs_vk -- the trailing array holds VkPipeline handles,
+  // not pointers to them
+  //
+  struct hs_vk * const hs = malloc(sizeof(*hs) + sizeof(VkPipeline) * count_all);
+
+  if (hs == NULL)
+    return NULL;
+
+  // save the config
+  memcpy(&hs->config,&target->config,sizeof(hs->config));
+
+  // save some frequently used calculated values
+  hs->key_val_size      = (target->config.words.key + target->config.words.val) * 4;
+  hs->slab_keys         = target->config.slab.height << target->config.slab.width_log2;
+  hs->bs_slabs_log2_ru  = bs_slabs_log2_ru;
+  hs->bc_slabs_log2_max = bc_slabs_log2_max;
+
+  // save device & allocator
+  hs->device            = device;
+  hs->allocator         = allocator;
+
+  // save kernel count
+  hs->pipelines.count   = count_all;
+
+  //
+  // create all the compute pipelines
+  //
+  // NOTE(review): no .layout is set here so it is zero-initialized --
+  // confirm that supplying a pipeline layout is pending WIP
+  //
+  VkComputePipelineCreateInfo cpci = {
+    .sType                 = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+    .pNext                 = NULL,
+    .flags                 = VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT,
+    .stage = {
+      .sType               = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+      .pNext               = NULL,
+      .flags               = 0,
+      .stage               = VK_SHADER_STAGE_COMPUTE_BIT,
+      .module              = VK_NULL_HANDLE,
+      .pName               = "main",
+      .pSpecializationInfo = NULL
+    },
+    .basePipelineHandle    = VK_NULL_HANDLE,
+    .basePipelineIndex     = -1
+  };
+
+  //
+  // Create a shader module, use it to create a pipeline... and
+  // dispose of the shader module.
+  //
+  uint32_t const * modules = target->modules.words;
+
+  for (uint32_t ii=0; ii<count_all; ii++)
+    {
+      // each module is preceded by its size in bytes, stored big-endian
+      uint32_t const module_size = hs_vk_load_be32((uint8_t const *)modules);
+
+      modules += 1;
+
+      VkShaderModuleCreateInfo const smci = {
+        .sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+        .pNext    = NULL,
+        .flags    = 0,
+        .codeSize = module_size,
+        .pCode    = modules
+      };
+
+      // module_size is in bytes but the cursor advances in 32-bit words
+      modules += module_size / sizeof(*modules);
+
+      vk(CreateShaderModule(device,
+                            &smci,
+                            allocator,
+                            &cpci.stage.module));
+
+      // exactly one create info is passed and one pipeline is written
+      vk(CreateComputePipelines(device,
+                                pipeline_cache,
+                                1,
+                                &cpci,
+                                allocator,
+                                hs->pipelines.all+ii));
+
+      vkDestroyShaderModule(device,
+                            cpci.stage.module,
+                            allocator);
+    }
+
+  //
+  // initialize pointers to pipeline handles
+  //
+  VkPipeline * pipeline_next = hs->pipelines.all;
+
+  // TRANSPOSE
+  hs->pipelines.transpose = pipeline_next;
+  pipeline_next          += 1;
+
+  // BS
+  hs->pipelines.bs        = pipeline_next;
+  pipeline_next          += count_bs;
+
+  // BC
+  hs->pipelines.bc        = pipeline_next;
+  pipeline_next          += count_bc;
+
+  // FM[0..2] then HM[0..2] -- a family with no kernels gets NULL
+  for (uint32_t scale=0; scale<3; scale++)
+    {
+      hs->pipelines.fm[scale] = count_fm[scale] ? pipeline_next : NULL;
+      pipeline_next          += count_fm[scale];
+    }
+
+  for (uint32_t scale=0; scale<3; scale++)
+    {
+      hs->pipelines.hm[scale] = count_hm[scale] ? pipeline_next : NULL;
+      pipeline_next          += count_hm[scale];
+    }
+
+  return hs;
+}
+
+//
+// Destroy every pipeline and free the hs_vk struct.
+//
+// A NULL hs is ignored so the result of a failed hs_vk_create() can
+// be passed straight through.
+//
+
+void
+hs_vk_release(struct hs_vk * const hs)
+{
+  if (hs == NULL)
+    return;
+
+  for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
+    vkDestroyPipeline(hs->device,
+                      hs->pipelines.all[ii],
+                      hs->allocator);
+
+  // the struct is only ever obtained from malloc() with a NULL allocator
+  if (hs->allocator == NULL)
+    {
+      free(hs);
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/vk/hs_vk_launcher.h b/src/compute/hs/vk/hs_vk_launcher.h
new file mode 100644
index 0000000000..a549666985
--- /dev/null
+++ b/src/compute/hs/vk/hs_vk_launcher.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+//
+//
+//
+
+#include <vulkan/vulkan.h>
+
+//
+//
+//
+
+#include <stdint.h>
+#include <stdbool.h>
+
+//
+//
+//
+
+#include "hs_spirv_target.h"
+
+//
+// Create the compute pipelines for a target.
+//
+// Returns NULL on failure.  Only a NULL allocator is supported for
+// now -- passing VkAllocationCallbacks returns NULL.
+//
+
+struct hs_vk *
+hs_vk_create(struct hs_spirv_target const * const target,
+             VkDevice                             device,
+             VkAllocationCallbacks        const * allocator,
+             VkPipelineCache                      pipeline_cache);
+
+//
+// Resources will be disposed of with the same device and allocator
+// used for creation.
+//
+// A NULL hs is ignored.
+//
+
+void
+hs_vk_release(struct hs_vk * const hs);
+
+//
+// Determine what padding will be applied to the input and output
+// buffers.
+//
+// Always check to see if the allocated buffers are large enough.
+//
+// count                    : number of keys
+// count + count_padded_in  : additional keys required for sorting
+// count + count_padded_out : additional keys required for merging
+//
+
+void
+hs_vk_pad(struct hs_vk const * const hs,
+          uint32_t             const count,
+          uint32_t           * const count_padded_in,
+          uint32_t           * const count_padded_out);
+
+//
+// Sort the keys in the vin buffer and store them in the vout buffer.
+//
+// If vout is NULL then the sort will be performed in place.
+//
+
+#if 0
+void
+hs_vk_sort(struct hs_vk const * const hs,
+           vk_command_queue           cq,
+           uint32_t             const wait_list_size,
+           vk_event           *       wait_list,
+           vk_event           *       event,
+           vk_mem                     vin,
+           vk_mem                     vout,
+           uint32_t             const count,
+           uint32_t             const count_padded_in,
+           uint32_t             const count_padded_out,
+           bool                 const linearize);
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+REM
+REM delete the previous images -- *.pre.comp goes first because
+REM *.comp matches those files too
+REM
+
+del *.pre.comp
+del *.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+REM
+REM for each generated shader: format, preprocess, compile to SPIR-V,
+REM optimize and then emit the module (.spv.xxd) and its big-endian
+REM byte length (.len.xxd) as C initializer lists
+REM
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+ )
+)
+
+del *.pre.comp
+del *.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat new file mode 100644 index 0000000000..9afd7b3a72 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat @@ -0,0 +1,48 @@ +@ECHO OFF
+
+REM
+REM delete the previous images -- *.pre.comp goes first because
+REM *.comp matches those files too
+REM
+
+del *.pre.comp
+del *.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+REM
+REM for each generated shader: format, preprocess, compile to SPIR-V,
+REM optimize and then emit the module (.spv.xxd) and its big-endian
+REM byte length (.len.xxd) as C initializer lists
+REM
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+ )
+)
+
+del *.pre.comp
+del *.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h new file mode 100644 index 0000000000..d4376114e5 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h @@ -0,0 +1,100 @@ +// +// Copyright 2016 Google Inc. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// + +#ifndef HS_GLSL_ONCE +#define HS_GLSL_ONCE + +#define HS_SLAB_THREADS_LOG2 3 +#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) +#define HS_SLAB_WIDTH_LOG2 3 +#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) +#define HS_SLAB_HEIGHT 16 +#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT) +#define HS_REG_LAST(c) c##16 +#define HS_KEY_TYPE uint64_t +#define HS_KEY_WORDS 2 +#define HS_VAL_WORDS 0 +#define HS_BS_SLABS 16 +#define HS_BS_SLABS_LOG2_RU 4 +#define HS_BC_SLABS_LOG2_MAX 4 +#define HS_FM_SCALE_MIN 1 +#define HS_FM_SCALE_MAX 1 +#define HS_HM_SCALE_MIN 1 +#define HS_HM_SCALE_MAX 1 +#define HS_EMPTY + +#define HS_SLAB_ROWS() \ + HS_SLAB_ROW( 1, 0 ) \ + HS_SLAB_ROW( 2, 1 ) \ + HS_SLAB_ROW( 3, 2 ) \ + HS_SLAB_ROW( 4, 3 ) \ + HS_SLAB_ROW( 5, 4 ) \ + HS_SLAB_ROW( 6, 5 ) \ + HS_SLAB_ROW( 7, 6 ) \ + HS_SLAB_ROW( 8, 7 ) \ + HS_SLAB_ROW( 9, 8 ) \ + HS_SLAB_ROW( 10, 9 ) \ + HS_SLAB_ROW( 11, 10 ) \ + HS_SLAB_ROW( 12, 11 ) \ + HS_SLAB_ROW( 13, 12 ) \ + HS_SLAB_ROW( 14, 13 ) \ + HS_SLAB_ROW( 15, 14 ) \ + HS_SLAB_ROW( 16, 15 ) \ + HS_EMPTY + +#define HS_TRANSPOSE_SLAB() \ + HS_TRANSPOSE_STAGE( 1 ) \ + HS_TRANSPOSE_STAGE( 2 ) \ + HS_TRANSPOSE_STAGE( 3 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 2, 1 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 4, 3 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 6, 5 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 8, 7 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 10, 9 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 ) \ + HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 3, 1 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 4, 2 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 7, 5 ) \ + 
HS_TRANSPOSE_BLEND( s, t, 2, 8, 6 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 11, 9 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 ) \ + HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 5, 1 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 6, 2 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 7, 3 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 8, 4 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 13, 9 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 ) \ + HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 ) \ + HS_TRANSPOSE_REMAP( u, 1, 1 ) \ + HS_TRANSPOSE_REMAP( u, 2, 3 ) \ + HS_TRANSPOSE_REMAP( u, 3, 5 ) \ + HS_TRANSPOSE_REMAP( u, 4, 7 ) \ + HS_TRANSPOSE_REMAP( u, 5, 9 ) \ + HS_TRANSPOSE_REMAP( u, 6, 11 ) \ + HS_TRANSPOSE_REMAP( u, 7, 13 ) \ + HS_TRANSPOSE_REMAP( u, 8, 15 ) \ + HS_TRANSPOSE_REMAP( u, 9, 2 ) \ + HS_TRANSPOSE_REMAP( u, 10, 4 ) \ + HS_TRANSPOSE_REMAP( u, 11, 6 ) \ + HS_TRANSPOSE_REMAP( u, 12, 8 ) \ + HS_TRANSPOSE_REMAP( u, 13, 10 ) \ + HS_TRANSPOSE_REMAP( u, 14, 12 ) \ + HS_TRANSPOSE_REMAP( u, 15, 14 ) \ + HS_TRANSPOSE_REMAP( u, 16, 16 ) \ + HS_EMPTY + +#endif + +// +// +// + diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h new file mode 100644 index 0000000000..c67dffa3a0 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h @@ -0,0 +1,417 @@ +// +// Copyright 2016 Google Inc. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+// + +#ifndef HS_GLSL_MACROS_ONCE +#define HS_GLSL_MACROS_ONCE + +// +// +// + +#define HS_HASH # +#define HS_EVAL(a) a +#define HS_GLSL_EXT() HS_EVAL(HS_HASH)##extension +#define HS_GLSL_EXT_ENABLE(name) HS_GLSL_EXT() name : enable +#define HS_GLSL_VERSION(ver) HS_EVAL(HS_HASH)##version ver + +// +// +// + +// HS_GLSL_VERSION(460) + +HS_GLSL_EXT_ENABLE(GL_ARB_gpu_shader_int64) +HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_shuffle) +HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_basic) + +// +// +// + +#include "hs_glsl.h" + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_SHUFFLE_CAST_TO(v) v +#define HS_SHUFFLE_CAST_FROM(v) v +#elif (HS_KEY_WORDS == 2) +#define HS_SHUFFLE_CAST_TO(v) uint64BitsToDouble(v) +#define HS_SHUFFLE_CAST_FROM(v) doubleBitsToUint64(v) +#endif + +#define HS_SUBGROUP_SHUFFLE(v,i) HS_SHUFFLE_CAST_FROM(subgroupShuffle(HS_SHUFFLE_CAST_TO(v),i)) +#define HS_SUBGROUP_SHUFFLE_XOR(v,m) HS_SHUFFLE_CAST_FROM(subgroupShuffleXor(HS_SHUFFLE_CAST_TO(v),m)) +#define HS_SUBGROUP_SHUFFLE_UP(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleUp(HS_SHUFFLE_CAST_TO(v),d)) +#define HS_SUBGROUP_SHUFFLE_DOWN(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleDown(HS_SHUFFLE_CAST_TO(v),d)) + +// +// This up/down shuffle has defined values for [0,subgroup size) +// + +#define HS_SUBGROUP_SHUFFLE_UP_2(prev,curr,delta) + +#define HS_SUBGROUP_SHUFFLE_DOWN_2(curr,next,delta) + +// +// FYI, restrict shouldn't have any impact on these kernels and +// benchmarks appear to prove that true +// + +#define HS_RESTRICT restrict + +// +// +// + +#define HS_GLSL_WORKGROUP_SIZE(x,y,z) \ + layout (local_size_x = x, \ + local_size_y = y, \ + local_size_z = z) in + +#define HS_GLSL_SUBGROUP_SIZE(x) + +// +// KERNEL PROTOS +// + +#define HS_TRANSPOSE_KERNEL_PROTO(slab_width) \ + buffer _vout { HS_KEY_TYPE vout[]; }; \ + HS_GLSL_WORKGROUP_SIZE(slab_width,1,1); \ + HS_GLSL_SUBGROUP_SIZE(slab_width) \ + void main() + +#define HS_BS_KERNEL_PROTO(slab_width,slab_count,slab_count_ru_log2) \ + buffer readonly _vin 
{ HS_KEY_TYPE vin[]; }; \ + buffer writeonly _vout { HS_KEY_TYPE vout[]; }; \ + HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1); \ + HS_GLSL_SUBGROUP_SIZE(slab_width) \ + void main() + +#define HS_BC_KERNEL_PROTO(slab_width,slab_count,slab_count_log2) \ + buffer _vout { HS_KEY_TYPE vout[]; }; \ + HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1); \ + HS_GLSL_SUBGROUP_SIZE(slab_width) \ + void main() + +#define HS_HM_KERNEL_PROTO(s) \ + buffer _vout { HS_KEY_TYPE vout[]; }; \ + HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1); \ + void main() + +#define HS_FM_KERNEL_PROTO(s,r) \ + buffer _vout { HS_KEY_TYPE vout[]; }; \ + HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1); \ + void main() + +// +// BLOCK LOCAL MEMORY DECLARATION +// + +#define HS_BLOCK_LOCAL_MEM_DECL(width,height) \ + shared struct { \ + HS_KEY_TYPE m[width * height]; \ + } smem + +// +// BLOCK BARRIER +// + +#define HS_BLOCK_BARRIER() \ + barrier() + +// +// SLAB GLOBAL +// + +#define HS_SLAB_GLOBAL_PREAMBLE(slab_width,slab_height) \ + const uint gmem_idx = \ + (gl_GlobalInvocationID.x & ~(slab_width-1)) * slab_height + \ + gl_SubgroupInvocationID + +#define HS_SLAB_GLOBAL_LOAD(extent,slab_width,row_idx) \ + extent[gmem_idx + slab_width * row_idx] + +#define HS_SLAB_GLOBAL_STORE(slab_width,row_idx,reg) \ + vout[gmem_idx + slab_width * row_idx] = reg + +// +// SLAB LOCAL +// + +#define HS_SLAB_LOCAL_L(offset) \ + smem.m[smem_l_idx + (offset)] + +#define HS_SLAB_LOCAL_R(offset) \ + smem.m[smem_r_idx + (offset)] + +// +// SLAB LOCAL VERTICAL LOADS +// + +#define HS_BX_LOCAL_V(offset) \ + smem.m[gl_LocalInvocationID.x + (offset)] + +// +// BLOCK SORT MERGE HORIZONTAL +// + +#define HS_BS_MERGE_H_PREAMBLE(slab_width,slab_count) \ + const uint smem_l_idx = \ + gl_SubgroupID * (slab_width * slab_count) + \ + gl_SubgroupInvocationID; \ + const uint smem_r_idx = \ + (gl_SubgroupID ^ 1) * (slab_width * slab_count) + \ + (gl_SubgroupInvocationID ^ (slab_width - 1)) + +// +// BLOCK CLEAN MERGE HORIZONTAL +// + +#define 
HS_BC_MERGE_H_PREAMBLE(slab_width,slab_height,slab_count) \ + const uint gmem_l_idx = \ + (gl_GlobalInvocationID.x & ~(slab_width*slab_count-1)) * slab_height + \ + gl_LocalInvocationID.x; \ + const uint smem_l_idx = \ + gl_SubgroupID * (slab_width * slab_count) + \ + gl_SubgroupInvocationID + +#define HS_BC_GLOBAL_LOAD_L(slab_width,slab_idx) \ + vout[gmem_l_idx + (slab_width * slab_idx)] + +// +// SLAB FLIP AND HALF PREAMBLES +// + +#define HS_SLAB_FLIP_PREAMBLE(mask) \ + const uint flip_lane_idx = gl_SubgroupInvocationID ^ mask; \ + const bool t_lt = gl_SubgroupInvocationID < flip_lane_idx; + +#define HS_SLAB_HALF_PREAMBLE(mask) \ + const uint half_lane_idx = gl_SubgroupInvocationID ^ mask; \ + const bool t_lt = gl_SubgroupInvocationID < half_lane_idx; + +// +// Inter-lane compare exchange +// + +// default +#define HS_CMP_XCHG_V0(a,b) \ + { \ + const HS_KEY_TYPE t = min(a,b); \ + b = max(a,b); \ + a = t; \ + } + +// super slow +#define HS_CMP_XCHG_V1(a,b) \ + { \ + const HS_KEY_TYPE tmp = a; \ + a = (a < b) ? a : b; \ + b ^= a ^ tmp; \ + } + +// best +#define HS_CMP_XCHG_V2(a,b) \ + if (a >= b) { \ + const HS_KEY_TYPE t = a; \ + a = b; \ + b = t; \ + } + +// good +#define HS_CMP_XCHG_V3(a,b) \ + { \ + const bool ge = a >= b; \ + const HS_KEY_TYPE t = a; \ + a = ge ? b : a; \ + b = ge ? t : b; \ + } + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b) +#elif (HS_KEY_WORDS == 2) +#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b) +#endif + +// +// The flip/half comparisons rely on a "conditional min/max": +// +// - if the flag is false, return min(a,b) +// - otherwise, return max(a,b) +// +// What's a little surprising is that sequence (1) is faster than (2) +// for 32-bit keys. +// +// I suspect either a code generation problem or that the sequence +// maps well to the GEN instruction set. +// +// We mostly care about 64-bit keys and unsurprisingly sequence (2) is +// fastest for this wider type. 
+// + +#define HS_LOGICAL_XOR() != + +// this is what you would normally use +#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) HS_LOGICAL_XOR() lt) ? b : a + +// this seems to be faster for 32-bit keys +#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b)) + +// +// +// + +#if (HS_KEY_WORDS == 1) +#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b) +#elif (HS_KEY_WORDS == 2) +#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b) +#endif + +// +// Conditional inter-subgroup flip/half compare exchange +// + +#define HS_CMP_FLIP(i,a,b) \ + { \ + const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,flip_lane_idx); \ + const HS_KEY_TYPE tb = HS_SUBGROUP_SHUFFLE(b,flip_lane_idx); \ + a = HS_COND_MIN_MAX(t_lt,a,tb); \ + b = HS_COND_MIN_MAX(t_lt,b,ta); \ + } + +#define HS_CMP_HALF(i,a) \ + { \ + const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,half_lane_idx); \ + a = HS_COND_MIN_MAX(t_lt,a,ta); \ + } + +// +// The device's comparison operator might return what we actually +// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}. 
+// + +#define HS_CMP_IS_ZERO_ONE + +#ifdef HS_CMP_IS_ZERO_ONE +// OpenCL requires a {true: +1, false: 0} scalar result +// (a <= b) -> { +1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 } +#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b)) +#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a) +#else +// However, OpenCL requires { -1, 0 } for vectors +// (a <= b) -> { 0xFFFFFFFF, 0 } +#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64 +#define HS_CMP_TO_MASK(a) (a) +#endif + +// +// The "flip-merge" and "half-merge" preambles are very similar +// + +#define HS_HM_PREAMBLE(half_span) \ + const uint span_idx = gl_GlobalInvocationID.z * gl_NumWorkGroups.y + gl_GlobalInvocationID.y; \ + const uint span_stride = gl_NumWorkGroups.x * gl_WorkGroupSize.x; \ + const uint span_size = span_stride * half_span * 2; \ + const uint span_base = span_idx * span_size; \ + const uint span_off = gl_GlobalInvocationID.x; \ + const uint span_l = span_base + span_off + +#define HS_FM_PREAMBLE(half_span) \ + HS_HM_PREAMBLE(half_span); \ + const uint span_r = span_base + span_stride * (half_span + 1) - span_off - 1 + +// +// +// + +#define HS_XM_GLOBAL_L(stride_idx) \ + vout[span_l + span_stride * stride_idx] + +#define HS_XM_GLOBAL_LOAD_L(stride_idx) \ + HS_XM_GLOBAL_L(stride_idx) + +#define HS_XM_GLOBAL_STORE_L(stride_idx,reg) \ + HS_XM_GLOBAL_L(stride_idx) = reg + +#define HS_FM_GLOBAL_R(stride_idx) \ + vout[span_r + span_stride * stride_idx] + +#define HS_FM_GLOBAL_LOAD_R(stride_idx) \ + HS_FM_GLOBAL_R(stride_idx) + +#define HS_FM_GLOBAL_STORE_R(stride_idx,reg) \ + HS_FM_GLOBAL_R(stride_idx) = reg + +// +// This snarl of macros is for transposing a "slab" of sorted elements +// into linear order. +// +// This can occur as the last step in hs_sort() or via a custom kernel +// that inspects the slab and then transposes and stores it to memory. +// +// The slab format can be inspected more efficiently than a linear +// arrangement. 
+// +// The prime example is detecting when adjacent keys (in sort order) +// have differing high order bits ("key changes"). The index of each +// change is recorded to an auxiliary array. +// +// A post-processing step like this needs to be able to navigate the +// slab and eventually transpose and store the slab in linear order. +// + +#define HS_TRANSPOSE_REG(prefix,row) prefix##row +#define HS_TRANSPOSE_DECL(prefix,row) const HS_KEY_TYPE HS_TRANSPOSE_REG(prefix,row) +#define HS_TRANSPOSE_PRED(level) is_lo_##level + +#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) \ + prefix_curr##row_ll##_##row_ur + +#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) \ + const HS_KEY_TYPE HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) + +#define HS_TRANSPOSE_STAGE(level) \ + const bool HS_TRANSPOSE_PRED(level) = \ + (gl_SubgroupInvocationID & (1 << (level-1))) == 0; + +#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \ + HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) = \ + HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ? \ + HS_TRANSPOSE_REG(prefix_prev,row_ll) : \ + HS_TRANSPOSE_REG(prefix_prev,row_ur), \ + 1<<(level-1)); \ + \ + HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \ + HS_TRANSPOSE_PRED(level) ? \ + HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) : \ + HS_TRANSPOSE_REG(prefix_prev,row_ll); \ + \ + HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \ + HS_TRANSPOSE_PRED(level) ? \ + HS_TRANSPOSE_REG(prefix_prev,row_ur) : \ + HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur); + +#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \ + vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \ + HS_TRANSPOSE_REG(prefix,row_from); + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h new file mode 100644 index 0000000000..551fc52180 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h @@ -0,0 +1,75 @@ +// +// Copyright 2016 Google Inc. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// + +#include "hs_transpose.len.xxd" +, +#include "hs_transpose.spv.xxd" +, +#include "hs_bs_4.len.xxd" +, +#include "hs_bs_4.spv.xxd" +, +#include "hs_bs_3.len.xxd" +, +#include "hs_bs_3.spv.xxd" +, +#include "hs_bs_2.len.xxd" +, +#include "hs_bs_2.spv.xxd" +, +#include "hs_bs_1.len.xxd" +, +#include "hs_bs_1.spv.xxd" +, +#include "hs_bs_0.len.xxd" +, +#include "hs_bs_0.spv.xxd" +, +#include "hs_bc_4.len.xxd" +, +#include "hs_bc_4.spv.xxd" +, +#include "hs_bc_3.len.xxd" +, +#include "hs_bc_3.spv.xxd" +, +#include "hs_bc_2.len.xxd" +, +#include "hs_bc_2.spv.xxd" +, +#include "hs_bc_1.len.xxd" +, +#include "hs_bc_1.spv.xxd" +, +#include "hs_bc_0.len.xxd" +, +#include "hs_bc_0.spv.xxd" +, +#include "hs_fm_1_4.len.xxd" +, +#include "hs_fm_1_4.spv.xxd" +, +#include "hs_fm_1_3.len.xxd" +, +#include "hs_fm_1_3.spv.xxd" +, +#include "hs_fm_1_2.len.xxd" +, +#include "hs_fm_1_2.spv.xxd" +, +#include "hs_fm_1_1.len.xxd" +, +#include "hs_fm_1_1.spv.xxd" +, +#include "hs_fm_1_0.len.xxd" +, +#include "hs_fm_1_0.spv.xxd" +, +#include "hs_hm_1_0.len.xxd" +, +#include "hs_hm_1_0.spv.xxd" +, diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_target.h b/src/compute/hs/vk/intel/gen8/u64/hs_target.h new file mode 100644 index 0000000000..f379c23066 --- /dev/null +++ b/src/compute/hs/vk/intel/gen8/u64/hs_target.h @@ -0,0 +1,113 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ *
+ */
+
+//
+//
+//
+
+#include "../../../hs_spirv_target.h"
+
+//
+//
+//
+
+#include "hs_glsl.h"
+
+//
+//
+//
+
+#ifndef HS_TARGET_NAME
+#define HS_TARGET_NAME hs_target
+#endif
+
+#define HS_TARGET_HELPER(a) a
+
+//
+// The target: the config is filled in from the hs_glsl.h constants
+// and the modules are the length-prefixed SPIR-V binaries listed in
+// hs_kernels.h.
+//
+// When dumping, four zero bytes are appended so that the dump loop
+// below finds a zero length after the last module.
+//
+
+static struct hs_spirv_target const HS_TARGET_NAME =
+{
+  .config = {
+    .slab = {
+      .threads_log2 = HS_SLAB_THREADS_LOG2,
+      .width_log2   = HS_SLAB_WIDTH_LOG2,
+      .height       = HS_SLAB_HEIGHT
+    },
+
+    .words = {
+      .key = HS_KEY_WORDS,
+      .val = HS_VAL_WORDS
+    },
+
+    .block = {
+      .slabs = HS_BS_SLABS
+    },
+
+    .merge = {
+      .fm = {
+        .scale_min = HS_FM_SCALE_MIN,
+        .scale_max = HS_FM_SCALE_MAX
+      },
+      .hm = {
+        .scale_min = HS_HM_SCALE_MIN,
+        .scale_max = HS_HM_SCALE_MAX,
+      }
+    },
+
+    .pad = { 0 }
+  },
+
+  .modules.bytes = {
+
+#include "hs_kernels.h"
+
+#ifdef HS_DUMP
+    0,0,0,0
+#endif
+  }
+};
+
+//
+//
+//
+
+#ifdef HS_DUMP
+
+#include <stdlib.h>
+#include <stdio.h>
+
+//
+// Decode the big-endian 32-bit length that precedes each module.
+//
+// Each byte is widened before shifting so that a high bit in the
+// first byte is never shifted into the sign bit of an int.
+//
+
+static
+size_t
+hs_target_modsize(uint8_t const * const p)
+{
+  return ((size_t)p[0] << 24) |
+         ((size_t)p[1] << 16) |
+         ((size_t)p[2] <<  8) |
+         ((size_t)p[3]      );
+}
+
+//
+// Write the config followed by every length-prefixed module to
+// "hs_target.bin".  The zero terminator itself is not written.
+//
+// Returns EXIT_FAILURE if the file can't be opened, written or closed.
+//
+
+int
+main(int argc, char const * argv[])
+{
+  (void)argc;
+  (void)argv;
+
+  FILE * fp = fopen("hs_target.bin","wb");
+
+  if (fp == NULL) {
+    perror("hs_target.bin");
+    return EXIT_FAILURE;
+  }
+
+  int ok = (fwrite(&HS_TARGET_NAME.config,1,sizeof(HS_TARGET_NAME.config),fp) == sizeof(HS_TARGET_NAME.config));
+
+  uint8_t const * modules = HS_TARGET_NAME.modules.bytes;
+  size_t          modsize = hs_target_modsize(modules);
+
+  while (ok && (modsize > 0)) {
+    // the module is written together with its 4-byte length prefix
+    modsize += sizeof(uint32_t);
+    ok       = (fwrite(modules,1,modsize,fp) == modsize);
+    modules += modsize;
+    modsize  = hs_target_modsize(modules);
+  }
+
+  // fclose() flushes so a failure here means the file is incomplete
+  if (fclose(fp) != 0)
+    ok = 0;
+
+  return ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u64/make_all.bat b/src/compute/hs/vk/intel/gen8/u64/make_all.bat
new file mode 100644
index 0000000000..d148ef0113
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/make_all.bat
@@ -0,0 +1,79 @@
+@ECHO OFF
+
+::
+:: delete the previous images
+::
+
+del *.pre.comp
+del *.comp
+del *.spv
+del *.xxd
+
+::
+::
+::
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+:: --- 32-bit keys ---
+
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+:: --- 64-bit keys
+
+%HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+::
+:: remove trailing whitespace from generated files
+::
+
+sed -i 's/[[:space:]]*$//' hs_glsl.h
+sed -i 's/[[:space:]]*$//' hs_kernels.h
+
+::
+:: FIXME -- convert this to a bash script
+::
+:: Note that we can use xargs instead of the cmd for/do
+::
+
+for %%f in (*.comp) do (
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+:: glslangValidator -V110 -o %%~nf.spv %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=460 -I . -o %%~nf.spv %%~nf.pre.comp || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+:: spirv-remap ... || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ echo %%~nf.spv %%A
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+ )
+)
+
+::
+:: dump a binary -- stop if the dump tool fails to build or run
+::
+
+cl /DHS_DUMP /Fe:hs_dump.exe /Tchs_target.h || goto :error
+hs_dump || goto :error
+
+::
+:: delete temporary files
+::
+
+:: del *.pre.comp
+del *.comp
+del *.spv
+del *.obj
+del *.exe
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM Rebuild the shaders: hs_gen is expected to write the GLSL .comp files; each
+REM one is preprocessed with cl, compiled by glslc, optimized in place by
+REM spirv-opt and dumped by xxd together with its size as four big-endian bytes.
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+REM NOTE: review -- this configuration is the enabled one although the script lives under u32; confirm
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+  echo %%~nf
+  dos2unix %%f
+  clang-format -style=Mozilla -i %%f || goto :error
+  cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+  clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+  glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+  spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+  xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+  for /f %%A in ('wc -c %%~nf.spv') do (
+    printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+  )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM Rebuild the shaders: hs_gen is expected to write the GLSL .comp files; each
+REM one is preprocessed with cl, compiled by glslc, optimized in place by
+REM spirv-opt and dumped by xxd together with its size as four big-endian bytes.
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+  echo %%~nf
+  dos2unix %%f
+  clang-format -style=Mozilla -i %%f || goto :error
+  cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+  clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+  glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+  spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+  xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+  for /f %%A in ('wc -c %%~nf.spv') do (
+    printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+  )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM Rebuild the shaders: hs_gen is expected to write the GLSL .comp files; each
+REM one is preprocessed with cl, compiled by glslc, optimized in place by
+REM spirv-opt and dumped by xxd together with its size as four big-endian bytes.
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+  echo %%~nf
+  dos2unix %%f
+  clang-format -style=Mozilla -i %%f || goto :error
+  cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+  clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+  glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+  spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+  xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+  for /f %%A in ('wc -c %%~nf.spv') do (
+    printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+  )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
|