aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/compute/hs/vk
diff options
context:
space:
mode:
authorGravatar Allan MacKinnon <allanmac@google.com>2018-07-16 15:57:05 -0700
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2018-07-17 17:01:41 +0000
commit9e0d7e4072e43495a3907bb2bac7824e8e60c368 (patch)
treebaaff58dd81c1dc5e26668a8d517cbdf568bdb94 /src/compute/hs/vk
parent53c876900247ad700ce28f7b33031047a6cff402 (diff)
Bug fixes and improvements to SKC and HotSort. Vulkan is WIP.
Bug: skia: Change-Id: Iffc75a5b4dfcbfa4a6c23d972bb9798c2f550335 Reviewed-on: https://skia-review.googlesource.com/141582 Reviewed-by: Mike Reed <reed@google.com> Reviewed-by: Allan MacKinnon <allanmac@google.com> Commit-Queue: Allan MacKinnon <allanmac@google.com>
Diffstat (limited to 'src/compute/hs/vk')
-rw-r--r--src/compute/hs/vk/hs_spirv_target.h77
-rw-r--r--src/compute/hs/vk/hs_vk_launcher.c248
-rw-r--r--src/compute/hs/vk/hs_vk_launcher.h88
-rw-r--r--src/compute/hs/vk/intel/gen8/u32/make_all.bat48
-rw-r--r--src/compute/hs/vk/intel/gen8/u32b32/make_all.bat48
-rw-r--r--src/compute/hs/vk/intel/gen8/u64/hs_glsl.h100
-rw-r--r--src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h417
-rw-r--r--src/compute/hs/vk/intel/gen8/u64/hs_kernels.h75
-rw-r--r--src/compute/hs/vk/intel/gen8/u64/hs_target.h113
-rw-r--r--src/compute/hs/vk/intel/gen8/u64/make_all.bat79
-rw-r--r--src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat48
-rw-r--r--src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat48
-rw-r--r--src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat48
13 files changed, 1437 insertions, 0 deletions
diff --git a/src/compute/hs/vk/hs_spirv_target.h b/src/compute/hs/vk/hs_spirv_target.h
new file mode 100644
index 0000000000..aa711efc6d
--- /dev/null
+++ b/src/compute/hs/vk/hs_spirv_target.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+//
+//
+//
+
+#include <stdint.h>
+
+//
+// This structure packages all of the parameters and SPIR-V kernels
+// for a target architecture.
+//
+
+//
+// Fixed-size parameter block describing how the kernels of one
+// target were generated.  Every field is a single byte so the block
+// has no internal padding: 3 + 2 + 1 + 4 parameter bytes plus 2
+// explicit pad bytes give the 12 bytes asserted below.
+//
+struct hs_spirv_target_config
+{
+ // slab geometry
+ struct {
+ uint8_t threads_log2; // log2 of the threads per slab
+ uint8_t width_log2; // log2 of the slab width
+ uint8_t height; // slab height (keys per slab = height << width_log2)
+ } slab;
+
+ // key and value sizes counted in 32-bit words
+ struct {
+ uint8_t key;
+ uint8_t val;
+ } words;
+
+ // slabs per block -- presumably the block sort ("bs") maximum; confirm against hs_gen
+ struct {
+ uint8_t slabs;
+ } block;
+
+ // inclusive [scale_min,scale_max] ranges for the flip-merge (fm)
+ // and half-merge (hm) kernels
+ struct {
+ struct {
+ uint8_t scale_min;
+ uint8_t scale_max;
+ } fm;
+ struct {
+ uint8_t scale_min;
+ uint8_t scale_max;
+ } hm;
+ } merge;
+
+ // explicit padding that rounds the structure up to 12 bytes
+ uint8_t pad[2];
+};
+
+// The module stream is laid out directly after this structure, so its
+// size must remain a multiple of 4.
+// NOTE(review): in C11 static_assert is a macro provided by <assert.h>,
+// which this header does not include -- confirm every compiler in use
+// accepts it as written.
+static_assert(sizeof(struct hs_spirv_target_config) == 12,
+ "modules.words[] must start on a 32-bit boundary");
+
+//
+// For now, kernels are appended end-to-end, each with a leading
+// big-endian 32-bit length in bytes followed by a SPIR-V binary.
+//
+// The entry point for each kernel is "main".
+//
+// When the tools support packaging multiple named compute shaders in
+// one SPIR-V module then reevaluate this encoding.
+//
+
+//
+// A complete target: the configuration block immediately followed by
+// the concatenated kernel modules.
+//
+struct hs_spirv_target
+{
+ struct hs_spirv_target_config config;
+ // Two views of the same trailing module stream: byte-wise and as
+ // 32-bit words.
+ // NOTE(review): flexible array members inside a union are not ISO C;
+ // this appears to rely on a compiler extension -- confirm.
+ union {
+ uint8_t bytes[];
+ uint32_t words[];
+ } modules;
+};
+
+//
+//
+//
diff --git a/src/compute/hs/vk/hs_vk_launcher.c b/src/compute/hs/vk/hs_vk_launcher.c
new file mode 100644
index 0000000000..e1080a0e8b
--- /dev/null
+++ b/src/compute/hs/vk/hs_vk_launcher.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/vk/assert_vk.h"
+#include "common/util.h"
+
+#include "hs_vk_launcher.h"
+#include "hs_spirv_target.h"
+
+//
+//
+//
+
+//
+// Runtime state of one HotSort instance.  It is allocated by
+// hs_vk_create() as a single block whose tail holds the pipeline
+// handles.
+//
+struct hs_vk
+{
+ // copy of the target's configuration
+ struct hs_spirv_target_config config;
+
+ // values derived from the configuration by hs_vk_create()
+ uint32_t key_val_size; // (words.key + words.val) * 4
+ uint32_t slab_keys; // slab.height << slab.width_log2
+ uint32_t bs_slabs_log2_ru; // log2 of block.slabs rounded up to a power of two
+ uint32_t bc_slabs_log2_max; // log2 of block.slabs rounded down to a power of two
+
+ // device and allocator used for creation and reused on release
+ VkDevice device;
+ VkAllocationCallbacks const * allocator;
+
+ struct {
+ uint32_t count; // number of handles in all[]
+ // the following point into all[]; an fm/hm entry is NULL when no
+ // kernel exists for that scale
+ VkPipeline * transpose;
+ VkPipeline * bs;
+ VkPipeline * bc;
+ VkPipeline * fm[3];
+ VkPipeline * hm[3];
+ VkPipeline all[]; // every pipeline handle, in module order
+ } pipelines;
+};
+
+//
+//
+//
+
+//
+// Decode the big-endian 32-bit byte count that precedes each SPIR-V
+// module in the target's module stream.
+//
+static
+uint32_t
+hs_vk_module_size(uint32_t const * const word)
+{
+  uint8_t const * const b = (uint8_t const *)word;
+
+  // widen before shifting so the top byte never lands in a signed int
+  return ((uint32_t)b[0] << 24) |
+         ((uint32_t)b[1] << 16) |
+         ((uint32_t)b[2] <<  8) |
+         ((uint32_t)b[3]      );
+}
+
+//
+// Allocate host memory for the instance: malloc() when no allocator
+// was supplied, otherwise the caller's Vulkan allocation callbacks.
+//
+static
+void *
+hs_vk_host_alloc(VkAllocationCallbacks const * const allocator,
+                 size_t                        const size)
+{
+  if (allocator == NULL)
+    {
+      return malloc(size);
+    }
+
+  // 8-byte alignment covers pointers and 64-bit non-dispatchable handles
+  return allocator->pfnAllocation(allocator->pUserData,
+                                  size,
+                                  sizeof(uint64_t),
+                                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+}
+
+//
+// Free memory obtained from hs_vk_host_alloc() with the same allocator.
+//
+static
+void
+hs_vk_host_free(VkAllocationCallbacks const * const allocator,
+                void                        * const mem)
+{
+  if (allocator == NULL)
+    {
+      free(mem);
+    }
+  else
+    {
+      allocator->pfnFree(allocator->pUserData,mem);
+    }
+}
+
+//
+// Create a HotSort instance: build one compute pipeline per SPIR-V
+// module packaged in the target and record where each kernel family
+// starts in the pipeline array.
+//
+// target         : configuration plus length-prefixed SPIR-V modules
+// device         : device the pipelines are created on
+// allocator      : host allocation callbacks or NULL for malloc/free
+// pipeline_cache : pipeline cache passed to pipeline creation
+//
+// Returns the new instance or NULL if the merge scale ranges are out
+// of bounds or host memory could not be allocated.
+//
+
+struct hs_vk *
+hs_vk_create(struct hs_spirv_target const * const target,
+             VkDevice                             device,
+             VkAllocationCallbacks const *        allocator,
+             VkPipelineCache                      pipeline_cache)
+{
+  //
+  // the per-scale tables only have 3 entries so reject any non-empty
+  // scale range that would index past them
+  //
+  uint32_t const fm_min = target->config.merge.fm.scale_min;
+  uint32_t const fm_max = target->config.merge.fm.scale_max;
+  uint32_t const hm_min = target->config.merge.hm.scale_min;
+  uint32_t const hm_max = target->config.merge.hm.scale_max;
+
+  if (((fm_min <= fm_max) && (fm_max > 2)) ||
+      ((hm_min <= hm_max) && (hm_max > 2)))
+    {
+      return NULL;
+    }
+
+  //
+  // we reference these values a lot
+  //
+  uint32_t const bs_slabs_log2_ru  = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
+  uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));
+
+  //
+  // how many kernels will be created?
+  //
+  uint32_t const count_bs    = bs_slabs_log2_ru  + 1;
+  uint32_t const count_bc    = bc_slabs_log2_max + 1;
+  uint32_t       count_fm[3] = { 0 };
+  uint32_t       count_hm[3] = { 0 };
+
+  // NOTE(review): a scale of 0 makes (scale-1) wrap and the shift
+  // below undefined -- confirm the intended count for fm scale 0
+  for (uint32_t scale = fm_min; scale <= fm_max; scale++)
+    {
+      count_fm[scale] = msb_idx_u32(pow2_ru_u32(target->config.block.slabs>>(scale-1))) + 1;
+    }
+
+  for (uint32_t scale = hm_min; scale <= hm_max; scale++)
+    {
+      count_hm[scale] = 1;
+    }
+
+  uint32_t const count_all =
+    1
+    + count_bs
+    + count_bc
+    + count_fm[0] + count_fm[1] + count_fm[2]
+    + count_hm[0] + count_hm[1] + count_hm[2];
+
+  //
+  // allocate hs_vk -- the trailing array holds VkPipeline handles,
+  // not pointers to them
+  //
+  struct hs_vk * const hs = hs_vk_host_alloc(allocator,
+                                             sizeof(*hs) +
+                                             sizeof(hs->pipelines.all[0]) * count_all);
+
+  if (hs == NULL)
+    {
+      return NULL;
+    }
+
+  // save the config
+  memcpy(&hs->config,&target->config,sizeof(hs->config));
+
+  // save some frequently used calculated values
+  hs->key_val_size      = (target->config.words.key + target->config.words.val) * 4;
+  hs->slab_keys         = target->config.slab.height << target->config.slab.width_log2;
+  hs->bs_slabs_log2_ru  = bs_slabs_log2_ru;
+  hs->bc_slabs_log2_max = bc_slabs_log2_max;
+
+  // save device & allocator
+  hs->device    = device;
+  hs->allocator = allocator;
+
+  // save kernel count
+  hs->pipelines.count = count_all;
+
+  //
+  // create all the compute pipelines
+  //
+  // NOTE(review): .layout is not initialized here and is therefore
+  // zero -- a pipeline layout still has to be supplied.
+  //
+  VkComputePipelineCreateInfo cpci = {
+    .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+    .pNext = NULL,
+    .flags = VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT,
+    .stage = {
+      .sType               = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+      .pNext               = NULL,
+      .flags               = 0,
+      .stage               = VK_SHADER_STAGE_COMPUTE_BIT,
+      .module              = VK_NULL_HANDLE,
+      .pName               = "main",
+      .pSpecializationInfo = NULL
+    },
+    .basePipelineHandle = VK_NULL_HANDLE,
+    .basePipelineIndex  = -1
+  };
+
+  //
+  // Create a shader module, use it to create a pipeline... and
+  // dispose of the shader module.
+  //
+  uint32_t const * modules = target->modules.words;
+
+  for (uint32_t ii=0; ii<count_all; ii++)
+    {
+      // the length prefix is a big-endian count of bytes
+      uint32_t const module_size = hs_vk_module_size(modules++);
+
+      VkShaderModuleCreateInfo const smci = {
+        .sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+        .pNext    = NULL,
+        .flags    = 0,
+        .codeSize = module_size,
+        .pCode    = modules
+      };
+
+      // codeSize is in bytes but the cursor advances in 32-bit words
+      modules += module_size / sizeof(*modules);
+
+      vk(CreateShaderModule(device,
+                            &smci,
+                            allocator,
+                            &cpci.stage.module));
+
+      // exactly one create info is passed and one handle is written
+      vk(CreateComputePipelines(device,
+                                pipeline_cache,
+                                1,
+                                &cpci,
+                                allocator,
+                                hs->pipelines.all+ii));
+
+      vkDestroyShaderModule(device,
+                            cpci.stage.module,
+                            allocator);
+    }
+
+  //
+  // initialize pointers to pipeline handles
+  //
+  VkPipeline * pipeline_next = hs->pipelines.all;
+
+  // TRANSPOSE
+  hs->pipelines.transpose = pipeline_next;
+  pipeline_next          += 1;
+
+  // BS
+  hs->pipelines.bs        = pipeline_next;
+  pipeline_next          += count_bs;
+
+  // BC
+  hs->pipelines.bc        = pipeline_next;
+  pipeline_next          += count_bc;
+
+  // FM[0..2] and HM[0..2] -- NULL when a scale has no kernels
+  for (uint32_t scale=0; scale<3; scale++)
+    {
+      hs->pipelines.fm[scale] = count_fm[scale] ? pipeline_next : NULL;
+      pipeline_next          += count_fm[scale];
+    }
+
+  for (uint32_t scale=0; scale<3; scale++)
+    {
+      hs->pipelines.hm[scale] = count_hm[scale] ? pipeline_next : NULL;
+      pipeline_next          += count_hm[scale];
+    }
+
+  return hs;
+}
+
+//
+// Destroy every pipeline owned by the instance and free the instance
+// with the allocator it was created with.  A NULL instance is ignored.
+//
+
+void
+hs_vk_release(struct hs_vk * const hs)
+{
+  if (hs == NULL)
+    {
+      return;
+    }
+
+  for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
+    {
+      vkDestroyPipeline(hs->device,
+                        hs->pipelines.all[ii],
+                        hs->allocator);
+    }
+
+  hs_vk_host_free(hs->allocator,hs);
+}
+
+//
+//
+//
diff --git a/src/compute/hs/vk/hs_vk_launcher.h b/src/compute/hs/vk/hs_vk_launcher.h
new file mode 100644
index 0000000000..a549666985
--- /dev/null
+++ b/src/compute/hs/vk/hs_vk_launcher.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+//
+//
+//
+
+#include <vulkan/vulkan.h>
+
+//
+//
+//
+
+#include <stdint.h>
+#include <stdbool.h>
+
+//
+//
+//
+
+#include "hs_spirv_target.h"
+
+//
+//
+//
+
+//
+// Create a HotSort instance from a packaged target.  One compute
+// pipeline is created on the device for each SPIR-V module in the
+// target, using the supplied allocator and pipeline cache.
+//
+// Dispose of the returned instance with hs_vk_release().
+//
+
+struct hs_vk *
+hs_vk_create(struct hs_spirv_target const * const target,
+ VkDevice device,
+ VkAllocationCallbacks const * allocator,
+ VkPipelineCache pipeline_cache);
+
+//
+// Resources will be disposed of with the same device and allocator
+// used for creation.
+//
+
+void
+hs_vk_release(struct hs_vk * const hs);
+
+//
+// Determine what padding will be applied to the input and output
+// buffers.
+//
+// Always check to see if the allocated buffers are large enough.
+//
+// count : number of keys
+// count + count_padded_in : additional keys required for sorting
+// count + count_padded_out : additional keys required for merging
+//
+
+void
+hs_vk_pad(struct hs_vk const * const hs,
+ uint32_t const count,
+ uint32_t * const count_padded_in,
+ uint32_t * const count_padded_out);
+
+//
+// Sort the keys in the vin buffer and store them in the vout buffer.
+//
+// If vout is NULL then the sort will be performed in place.
+//
+
+#if 0
+void
+hs_vk_sort(struct hs_vk const * const hs,
+ vk_command_queue cq,
+ uint32_t const wait_list_size,
+ vk_event * wait_list,
+ vk_event * event,
+ vk_mem vin,
+ vk_mem vout,
+ uint32_t const count,
+ uint32_t const count_padded_in,
+ uint32_t const count_padded_out,
+ bool const linearize);
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+REM
+REM For every generated compute shader: normalize line endings, format,
+REM preprocess with cl, compile to SPIR-V, optimize, and emit both the
+REM module and its 32-bit big-endian byte length as xxd include files.
+REM Any failing step aborts the script through the :error label.
+REM
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+ )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+REM
+REM For every generated compute shader: normalize line endings, format,
+REM preprocess with cl, compile to SPIR-V, optimize, and emit both the
+REM module and its 32-bit big-endian byte length as xxd include files.
+REM Any failing step aborts the script through the :error label.
+REM
+
+for %%f in (*.comp) do (
+ echo %%~nf
+ dos2unix %%f
+ clang-format -style=Mozilla -i %%f || goto :error
+ cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+ clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+ glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+ spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+ xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+ for /f %%A in ('wc -c %%~nf.spv') do (
+ printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+ )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h
new file mode 100644
index 0000000000..d4376114e5
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h
@@ -0,0 +1,100 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#ifndef HS_GLSL_ONCE
+#define HS_GLSL_ONCE
+
+#define HS_SLAB_THREADS_LOG2 3
+#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2)
+#define HS_SLAB_WIDTH_LOG2 3
+#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2)
+#define HS_SLAB_HEIGHT 16
+#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)
+#define HS_REG_LAST(c) c##16
+#define HS_KEY_TYPE uint64_t
+#define HS_KEY_WORDS 2
+#define HS_VAL_WORDS 0
+#define HS_BS_SLABS 16
+#define HS_BS_SLABS_LOG2_RU 4
+#define HS_BC_SLABS_LOG2_MAX 4
+#define HS_FM_SCALE_MIN 1
+#define HS_FM_SCALE_MAX 1
+#define HS_HM_SCALE_MIN 1
+#define HS_HM_SCALE_MAX 1
+#define HS_EMPTY
+
+#define HS_SLAB_ROWS() \
+ HS_SLAB_ROW( 1, 0 ) \
+ HS_SLAB_ROW( 2, 1 ) \
+ HS_SLAB_ROW( 3, 2 ) \
+ HS_SLAB_ROW( 4, 3 ) \
+ HS_SLAB_ROW( 5, 4 ) \
+ HS_SLAB_ROW( 6, 5 ) \
+ HS_SLAB_ROW( 7, 6 ) \
+ HS_SLAB_ROW( 8, 7 ) \
+ HS_SLAB_ROW( 9, 8 ) \
+ HS_SLAB_ROW( 10, 9 ) \
+ HS_SLAB_ROW( 11, 10 ) \
+ HS_SLAB_ROW( 12, 11 ) \
+ HS_SLAB_ROW( 13, 12 ) \
+ HS_SLAB_ROW( 14, 13 ) \
+ HS_SLAB_ROW( 15, 14 ) \
+ HS_SLAB_ROW( 16, 15 ) \
+ HS_EMPTY
+
+#define HS_TRANSPOSE_SLAB() \
+ HS_TRANSPOSE_STAGE( 1 ) \
+ HS_TRANSPOSE_STAGE( 2 ) \
+ HS_TRANSPOSE_STAGE( 3 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 2, 1 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 4, 3 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 6, 5 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 8, 7 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 10, 9 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 ) \
+ HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 3, 1 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 4, 2 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 7, 5 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 8, 6 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 11, 9 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 ) \
+ HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 5, 1 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 6, 2 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 7, 3 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 8, 4 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 13, 9 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 ) \
+ HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 ) \
+ HS_TRANSPOSE_REMAP( u, 1, 1 ) \
+ HS_TRANSPOSE_REMAP( u, 2, 3 ) \
+ HS_TRANSPOSE_REMAP( u, 3, 5 ) \
+ HS_TRANSPOSE_REMAP( u, 4, 7 ) \
+ HS_TRANSPOSE_REMAP( u, 5, 9 ) \
+ HS_TRANSPOSE_REMAP( u, 6, 11 ) \
+ HS_TRANSPOSE_REMAP( u, 7, 13 ) \
+ HS_TRANSPOSE_REMAP( u, 8, 15 ) \
+ HS_TRANSPOSE_REMAP( u, 9, 2 ) \
+ HS_TRANSPOSE_REMAP( u, 10, 4 ) \
+ HS_TRANSPOSE_REMAP( u, 11, 6 ) \
+ HS_TRANSPOSE_REMAP( u, 12, 8 ) \
+ HS_TRANSPOSE_REMAP( u, 13, 10 ) \
+ HS_TRANSPOSE_REMAP( u, 14, 12 ) \
+ HS_TRANSPOSE_REMAP( u, 15, 14 ) \
+ HS_TRANSPOSE_REMAP( u, 16, 16 ) \
+ HS_EMPTY
+
+#endif
+
+//
+//
+//
+
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h
new file mode 100644
index 0000000000..c67dffa3a0
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h
@@ -0,0 +1,417 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#ifndef HS_GLSL_MACROS_ONCE
+#define HS_GLSL_MACROS_ONCE
+
+//
+//
+//
+
+#define HS_HASH #
+#define HS_EVAL(a) a
+#define HS_GLSL_EXT() HS_EVAL(HS_HASH)##extension
+#define HS_GLSL_EXT_ENABLE(name) HS_GLSL_EXT() name : enable
+#define HS_GLSL_VERSION(ver) HS_EVAL(HS_HASH)##version ver
+
+//
+//
+//
+
+// HS_GLSL_VERSION(460)
+
+HS_GLSL_EXT_ENABLE(GL_ARB_gpu_shader_int64)
+HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_shuffle)
+HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_basic)
+
+//
+//
+//
+
+#include "hs_glsl.h"
+
+//
+//
+//
+
+#if (HS_KEY_WORDS == 1)
+#define HS_SHUFFLE_CAST_TO(v) v
+#define HS_SHUFFLE_CAST_FROM(v) v
+#elif (HS_KEY_WORDS == 2)
+#define HS_SHUFFLE_CAST_TO(v) uint64BitsToDouble(v)
+#define HS_SHUFFLE_CAST_FROM(v) doubleBitsToUint64(v)
+#endif
+
+#define HS_SUBGROUP_SHUFFLE(v,i) HS_SHUFFLE_CAST_FROM(subgroupShuffle(HS_SHUFFLE_CAST_TO(v),i))
+#define HS_SUBGROUP_SHUFFLE_XOR(v,m) HS_SHUFFLE_CAST_FROM(subgroupShuffleXor(HS_SHUFFLE_CAST_TO(v),m))
+#define HS_SUBGROUP_SHUFFLE_UP(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleUp(HS_SHUFFLE_CAST_TO(v),d))
+#define HS_SUBGROUP_SHUFFLE_DOWN(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleDown(HS_SHUFFLE_CAST_TO(v),d))
+
+//
+// This up/down shuffle has defined values for [0,subgroup size)
+//
+
+#define HS_SUBGROUP_SHUFFLE_UP_2(prev,curr,delta)
+
+#define HS_SUBGROUP_SHUFFLE_DOWN_2(curr,next,delta)
+
+//
+// FYI, restrict shouldn't have any impact on these kernels and
+// benchmarks appear to prove that true
+//
+
+#define HS_RESTRICT restrict
+
+//
+//
+//
+
+#define HS_GLSL_WORKGROUP_SIZE(x,y,z) \
+ layout (local_size_x = x, \
+ local_size_y = y, \
+ local_size_z = z) in
+
+#define HS_GLSL_SUBGROUP_SIZE(x)
+
+//
+// KERNEL PROTOS
+//
+
+#define HS_TRANSPOSE_KERNEL_PROTO(slab_width) \
+ buffer _vout { HS_KEY_TYPE vout[]; }; \
+ HS_GLSL_WORKGROUP_SIZE(slab_width,1,1); \
+ HS_GLSL_SUBGROUP_SIZE(slab_width) \
+ void main()
+
+#define HS_BS_KERNEL_PROTO(slab_width,slab_count,slab_count_ru_log2) \
+ buffer readonly _vin { HS_KEY_TYPE vin[]; }; \
+ buffer writeonly _vout { HS_KEY_TYPE vout[]; }; \
+ HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1); \
+ HS_GLSL_SUBGROUP_SIZE(slab_width) \
+ void main()
+
+#define HS_BC_KERNEL_PROTO(slab_width,slab_count,slab_count_log2) \
+ buffer _vout { HS_KEY_TYPE vout[]; }; \
+ HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1); \
+ HS_GLSL_SUBGROUP_SIZE(slab_width) \
+ void main()
+
+#define HS_HM_KERNEL_PROTO(s) \
+ buffer _vout { HS_KEY_TYPE vout[]; }; \
+ HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1); \
+ void main()
+
+#define HS_FM_KERNEL_PROTO(s,r) \
+ buffer _vout { HS_KEY_TYPE vout[]; }; \
+ HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1); \
+ void main()
+
+//
+// BLOCK LOCAL MEMORY DECLARATION
+//
+
+#define HS_BLOCK_LOCAL_MEM_DECL(width,height) \
+ shared struct { \
+ HS_KEY_TYPE m[width * height]; \
+ } smem
+
+//
+// BLOCK BARRIER
+//
+
+#define HS_BLOCK_BARRIER() \
+ barrier()
+
+//
+// SLAB GLOBAL
+//
+
+#define HS_SLAB_GLOBAL_PREAMBLE(slab_width,slab_height) \
+ const uint gmem_idx = \
+ (gl_GlobalInvocationID.x & ~(slab_width-1)) * slab_height + \
+ gl_SubgroupInvocationID
+
+#define HS_SLAB_GLOBAL_LOAD(extent,slab_width,row_idx) \
+ extent[gmem_idx + slab_width * row_idx]
+
+#define HS_SLAB_GLOBAL_STORE(slab_width,row_idx,reg) \
+ vout[gmem_idx + slab_width * row_idx] = reg
+
+//
+// SLAB LOCAL
+//
+
+#define HS_SLAB_LOCAL_L(offset) \
+ smem.m[smem_l_idx + (offset)]
+
+#define HS_SLAB_LOCAL_R(offset) \
+ smem.m[smem_r_idx + (offset)]
+
+//
+// SLAB LOCAL VERTICAL LOADS
+//
+
+#define HS_BX_LOCAL_V(offset) \
+ smem.m[gl_LocalInvocationID.x + (offset)]
+
+//
+// BLOCK SORT MERGE HORIZONTAL
+//
+
+#define HS_BS_MERGE_H_PREAMBLE(slab_width,slab_count) \
+ const uint smem_l_idx = \
+ gl_SubgroupID * (slab_width * slab_count) + \
+ gl_SubgroupInvocationID; \
+ const uint smem_r_idx = \
+ (gl_SubgroupID ^ 1) * (slab_width * slab_count) + \
+ (gl_SubgroupInvocationID ^ (slab_width - 1))
+
+//
+// BLOCK CLEAN MERGE HORIZONTAL
+//
+
+#define HS_BC_MERGE_H_PREAMBLE(slab_width,slab_height,slab_count) \
+ const uint gmem_l_idx = \
+ (gl_GlobalInvocationID.x & ~(slab_width*slab_count-1)) * slab_height + \
+ gl_LocalInvocationID.x; \
+ const uint smem_l_idx = \
+ gl_SubgroupID * (slab_width * slab_count) + \
+ gl_SubgroupInvocationID
+
+#define HS_BC_GLOBAL_LOAD_L(slab_width,slab_idx) \
+ vout[gmem_l_idx + (slab_width * slab_idx)]
+
+//
+// SLAB FLIP AND HALF PREAMBLES
+//
+
+#define HS_SLAB_FLIP_PREAMBLE(mask) \
+ const uint flip_lane_idx = gl_SubgroupInvocationID ^ mask; \
+ const bool t_lt = gl_SubgroupInvocationID < flip_lane_idx;
+
+#define HS_SLAB_HALF_PREAMBLE(mask) \
+ const uint half_lane_idx = gl_SubgroupInvocationID ^ mask; \
+ const bool t_lt = gl_SubgroupInvocationID < half_lane_idx;
+
+//
+// Inter-lane compare exchange
+//
+
+// default
+#define HS_CMP_XCHG_V0(a,b) \
+ { \
+ const HS_KEY_TYPE t = min(a,b); \
+ b = max(a,b); \
+ a = t; \
+ }
+
+// super slow
+#define HS_CMP_XCHG_V1(a,b) \
+ { \
+ const HS_KEY_TYPE tmp = a; \
+ a = (a < b) ? a : b; \
+ b ^= a ^ tmp; \
+ }
+
+// best
+#define HS_CMP_XCHG_V2(a,b) \
+ if (a >= b) { \
+ const HS_KEY_TYPE t = a; \
+ a = b; \
+ b = t; \
+ }
+
+// good
+#define HS_CMP_XCHG_V3(a,b) \
+ { \
+ const bool ge = a >= b; \
+ const HS_KEY_TYPE t = a; \
+ a = ge ? b : a; \
+ b = ge ? t : b; \
+ }
+
+//
+//
+//
+
+#if (HS_KEY_WORDS == 1)
+#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b)
+#endif
+
+//
+// The flip/half comparisons rely on a "conditional min/max":
+//
+// - if the flag is false, return min(a,b)
+// - otherwise, return max(a,b)
+//
+// What's a little surprising is that sequence (1) is faster than (2)
+// for 32-bit keys.
+//
+// I suspect either a code generation problem or that the sequence
+// maps well to the GEN instruction set.
+//
+// We mostly care about 64-bit keys and unsurprisingly sequence (2) is
+// fastest for this wider type.
+//
+
+#define HS_LOGICAL_XOR() !=
+
+// this is what you would normally use
+#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) HS_LOGICAL_XOR() lt) ? b : a
+
+// this seems to be faster for 32-bit keys
+#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))
+
+//
+//
+//
+
+#if (HS_KEY_WORDS == 1)
+#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
+#endif
+
+//
+// Conditional inter-subgroup flip/half compare exchange
+//
+
+#define HS_CMP_FLIP(i,a,b) \
+ { \
+ const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,flip_lane_idx); \
+ const HS_KEY_TYPE tb = HS_SUBGROUP_SHUFFLE(b,flip_lane_idx); \
+ a = HS_COND_MIN_MAX(t_lt,a,tb); \
+ b = HS_COND_MIN_MAX(t_lt,b,ta); \
+ }
+
+#define HS_CMP_HALF(i,a) \
+ { \
+ const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,half_lane_idx); \
+ a = HS_COND_MIN_MAX(t_lt,a,ta); \
+ }
+
+//
+// The device's comparison operator might return what we actually
+// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}.
+//
+
+#define HS_CMP_IS_ZERO_ONE
+
+#ifdef HS_CMP_IS_ZERO_ONE
+// OpenCL requires a {true: +1, false: 0} scalar result
+// (a <= b) -> { +1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 }
+#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
+#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a)
+#else
+// However, OpenCL requires { -1, 0 } for vectors
+// (a <= b) -> { 0xFFFFFFFF, 0 }
+#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
+#define HS_CMP_TO_MASK(a) (a)
+#endif
+
+//
+// The "flip-merge" and "half-merge" preambles are very similar
+//
+
+#define HS_HM_PREAMBLE(half_span) \
+ const uint span_idx = gl_GlobalInvocationID.z * gl_NumWorkGroups.y + gl_GlobalInvocationID.y; \
+ const uint span_stride = gl_NumWorkGroups.x * gl_WorkGroupSize.x; \
+ const uint span_size = span_stride * half_span * 2; \
+ const uint span_base = span_idx * span_size; \
+ const uint span_off = gl_GlobalInvocationID.x; \
+ const uint span_l = span_base + span_off
+
+#define HS_FM_PREAMBLE(half_span) \
+ HS_HM_PREAMBLE(half_span); \
+ const uint span_r = span_base + span_stride * (half_span + 1) - span_off - 1
+
+//
+//
+//
+
+#define HS_XM_GLOBAL_L(stride_idx) \
+ vout[span_l + span_stride * stride_idx]
+
+#define HS_XM_GLOBAL_LOAD_L(stride_idx) \
+ HS_XM_GLOBAL_L(stride_idx)
+
+#define HS_XM_GLOBAL_STORE_L(stride_idx,reg) \
+ HS_XM_GLOBAL_L(stride_idx) = reg
+
+#define HS_FM_GLOBAL_R(stride_idx) \
+ vout[span_r + span_stride * stride_idx]
+
+#define HS_FM_GLOBAL_LOAD_R(stride_idx) \
+ HS_FM_GLOBAL_R(stride_idx)
+
+#define HS_FM_GLOBAL_STORE_R(stride_idx,reg) \
+ HS_FM_GLOBAL_R(stride_idx) = reg
+
+//
+// This snarl of macros is for transposing a "slab" of sorted elements
+// into linear order.
+//
+// This can occur as the last step in hs_sort() or via a custom kernel
+// that inspects the slab and then transposes and stores it to memory.
+//
+// The slab format can be inspected more efficiently than a linear
+// arrangement.
+//
+// The prime example is detecting when adjacent keys (in sort order)
+// have differing high order bits ("key changes"). The index of each
+// change is recorded to an auxiliary array.
+//
+// A post-processing step like this needs to be able to navigate the
+// slab and eventually transpose and store the slab in linear order.
+//
+
+#define HS_TRANSPOSE_REG(prefix,row) prefix##row
+#define HS_TRANSPOSE_DECL(prefix,row) const HS_KEY_TYPE HS_TRANSPOSE_REG(prefix,row)
+#define HS_TRANSPOSE_PRED(level) is_lo_##level
+
+#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) \
+ prefix_curr##row_ll##_##row_ur
+
+#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) \
+ const HS_KEY_TYPE HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)
+
+#define HS_TRANSPOSE_STAGE(level) \
+ const bool HS_TRANSPOSE_PRED(level) = \
+ (gl_SubgroupInvocationID & (1 << (level-1))) == 0;
+
+#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
+ HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) = \
+ HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ? \
+ HS_TRANSPOSE_REG(prefix_prev,row_ll) : \
+ HS_TRANSPOSE_REG(prefix_prev,row_ur), \
+ 1<<(level-1)); \
+ \
+ HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \
+ HS_TRANSPOSE_PRED(level) ? \
+ HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) : \
+ HS_TRANSPOSE_REG(prefix_prev,row_ll); \
+ \
+ HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \
+ HS_TRANSPOSE_PRED(level) ? \
+ HS_TRANSPOSE_REG(prefix_prev,row_ur) : \
+ HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur);
+
+#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \
+ vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \
+ HS_TRANSPOSE_REG(prefix,row_from);
+
+//
+//
+//
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h
new file mode 100644
index 0000000000..551fc52180
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h
@@ -0,0 +1,75 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#include "hs_transpose.len.xxd"
+,
+#include "hs_transpose.spv.xxd"
+,
+#include "hs_bs_4.len.xxd"
+,
+#include "hs_bs_4.spv.xxd"
+,
+#include "hs_bs_3.len.xxd"
+,
+#include "hs_bs_3.spv.xxd"
+,
+#include "hs_bs_2.len.xxd"
+,
+#include "hs_bs_2.spv.xxd"
+,
+#include "hs_bs_1.len.xxd"
+,
+#include "hs_bs_1.spv.xxd"
+,
+#include "hs_bs_0.len.xxd"
+,
+#include "hs_bs_0.spv.xxd"
+,
+#include "hs_bc_4.len.xxd"
+,
+#include "hs_bc_4.spv.xxd"
+,
+#include "hs_bc_3.len.xxd"
+,
+#include "hs_bc_3.spv.xxd"
+,
+#include "hs_bc_2.len.xxd"
+,
+#include "hs_bc_2.spv.xxd"
+,
+#include "hs_bc_1.len.xxd"
+,
+#include "hs_bc_1.spv.xxd"
+,
+#include "hs_bc_0.len.xxd"
+,
+#include "hs_bc_0.spv.xxd"
+,
+#include "hs_fm_1_4.len.xxd"
+,
+#include "hs_fm_1_4.spv.xxd"
+,
+#include "hs_fm_1_3.len.xxd"
+,
+#include "hs_fm_1_3.spv.xxd"
+,
+#include "hs_fm_1_2.len.xxd"
+,
+#include "hs_fm_1_2.spv.xxd"
+,
+#include "hs_fm_1_1.len.xxd"
+,
+#include "hs_fm_1_1.spv.xxd"
+,
+#include "hs_fm_1_0.len.xxd"
+,
+#include "hs_fm_1_0.spv.xxd"
+,
+#include "hs_hm_1_0.len.xxd"
+,
+#include "hs_hm_1_0.spv.xxd"
+,
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_target.h b/src/compute/hs/vk/intel/gen8/u64/hs_target.h
new file mode 100644
index 0000000000..f379c23066
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_target.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "../../../hs_spirv_target.h"
+
+//
+//
+//
+
+#include "hs_glsl.h"
+
+//
+//
+//
+
+#ifndef HS_TARGET_NAME
+#define HS_TARGET_NAME hs_target
+#endif
+
+#define HS_TARGET_HELPER(a) a
+
+//
+//
+//
+
+//
+// The target instance: configuration values taken from the macros in
+// hs_glsl.h followed by the kernel bytes pulled in from hs_kernels.h.
+//
+static struct hs_spirv_target const HS_TARGET_NAME =
+{
+ .config = {
+ .slab = {
+ .threads_log2 = HS_SLAB_THREADS_LOG2,
+ .width_log2 = HS_SLAB_WIDTH_LOG2,
+ .height = HS_SLAB_HEIGHT
+ },
+
+ .words = {
+ .key = HS_KEY_WORDS,
+ .val = HS_VAL_WORDS
+ },
+
+ .block = {
+ .slabs = HS_BS_SLABS
+ },
+
+ .merge = {
+ .fm = {
+ .scale_min = HS_FM_SCALE_MIN,
+ .scale_max = HS_FM_SCALE_MAX
+ },
+ .hm = {
+ .scale_min = HS_HM_SCALE_MIN,
+ .scale_max = HS_HM_SCALE_MAX,
+ }
+ },
+
+ .pad = { 0 }
+ },
+
+ .modules.bytes = {
+
+// each kernel contributes a length file and a SPIR-V file, and every
+// include in hs_kernels.h is followed by a comma
+#include "hs_kernels.h"
+
+// when dumping, a zero length terminates the module list so the dump
+// loop in main() knows where to stop
+#ifdef HS_DUMP
+ 0,0,0,0
+#endif
+ }
+};
+
+//
+//
+//
+
+#ifdef HS_DUMP
+
+#include <stdlib.h>
+#include <stdio.h>
+
+//
+// Decode a big-endian 32-bit module length.
+//
+static
+uint32_t
+hs_dump_be32(uint8_t const * const b)
+{
+  // widen before shifting so the top byte never lands in a signed int
+  return ((uint32_t)b[0] << 24) |
+         ((uint32_t)b[1] << 16) |
+         ((uint32_t)b[2] <<  8) |
+         ((uint32_t)b[3]      );
+}
+
+//
+// Write the target to "hs_target.bin": the config block followed by
+// every length-prefixed module up to, but not including, the zero
+// length terminator.
+//
+// Returns EXIT_SUCCESS or EXIT_FAILURE if the file could not be
+// opened or completely written.
+//
+int
+main(int argc, char const * argv[])
+{
+  (void)argc;
+  (void)argv;
+
+  FILE * fp = fopen("hs_target.bin","wb");
+
+  if (fp == NULL)
+    {
+      perror("hs_target.bin");
+      return EXIT_FAILURE;
+    }
+
+  size_t const config_size = sizeof(HS_TARGET_NAME.config);
+  int          ok          = (fwrite(&HS_TARGET_NAME.config,1,config_size,fp) == config_size);
+
+  uint8_t const * modules = HS_TARGET_NAME.modules.bytes;
+  size_t          modsize = hs_dump_be32(modules);
+
+  while (ok && (modsize > 0)) {
+    // fprintf(stderr,"%zu\n",modsize);
+    // the record written is the 4-byte length plus the module itself
+    modsize += sizeof(uint32_t);
+    ok       = (fwrite(modules,1,modsize,fp) == modsize);
+    modules += modsize;
+    modsize  = hs_dump_be32(modules);
+  }
+
+  // fclose flushes buffered data so its result matters as well
+  if (fclose(fp) != 0)
+    {
+      ok = 0;
+    }
+
+  if (!ok)
+    {
+      fprintf(stderr,"error: failed to write hs_target.bin\n");
+      return EXIT_FAILURE;
+    }
+
+  return EXIT_SUCCESS;
+}
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u64/make_all.bat b/src/compute/hs/vk/intel/gen8/u64/make_all.bat
new file mode 100644
index 0000000000..d148ef0113
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/make_all.bat
@@ -0,0 +1,86 @@
+@ECHO OFF
+
+::
+:: delete the previous images
+::
+
+del *.pre.comp
+del *.comp
+del *.spv
+del *.xxd
+
+::
+:: location of the hs_gen code generator
+::
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+:: --- 32-bit keys ---
+
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+:: --- 64-bit keys
+
+%HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+::
+:: remove trailing whitespace from generated files
+::
+
+sed -i 's/[[:space:]]*$//' hs_glsl.h
+sed -i 's/[[:space:]]*$//' hs_kernels.h
+
+::
+:: FIXME -- convert this to a bash script
+::
+:: Note that we can use xargs instead of the cmd for/do
+::
+:: For each .comp: fix line endings, format, preprocess with cl,
+:: compile to SPIR-V, optimize, then emit the bytes as .spv.xxd and
+:: the byte count as 4 big-endian bytes in .len.xxd
+::
+:: Comments inside the for block use REM because a double-colon is a
+:: label and is not reliable inside parenthesized blocks
+::
+
+for %%f in (*.comp) do (
+  dos2unix %%f
+  clang-format -style=Mozilla -i %%f || goto :error
+  cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+  clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+  REM glslangValidator -V110 -o %%~nf.spv %%~nf.pre.comp
+  glslc --target-env=vulkan1.1 -std=460 -I . -o %%~nf.spv %%~nf.pre.comp || goto :error
+  spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+  REM spirv-remap ...
+  xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+  for /f %%A in ('wc -c %%~nf.spv') do (
+    echo %%~nf.spv %%A
+    printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+  )
+)
+
+::
+:: dump a binary -- stop if the dump tool fails to build or run
+::
+
+cl /DHS_DUMP /Fe:hs_dump.exe /Tchs_target.h || goto :error
+hs_dump || goto :error
+
+::
+:: delete temporary files
+::
+
+:: del *.pre.comp
+del *.comp
+del *.spv
+del *.obj
+del *.exe
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat
@@ -0,0 +1,66 @@
+@ECHO OFF
+
+REM
+REM delete the previous intermediates; *.pre.comp goes first because
+REM *.comp matches those files too
+REM
+
+del *.pre.comp
+del *.comp
+del *.spv
+
+REM
+REM location of the hs_gen code generator
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+REM
+REM NOTE(review): this script is in the u32 directory but the only
+REM active generator line is the 64-bit one -- confirm this is intended
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+REM
+REM For each .comp: fix line endings, format, preprocess with cl,
+REM compile to SPIR-V, optimize, then emit the bytes as .spv.xxd and
+REM the byte count as 4 big-endian bytes in .len.xxd
+REM
+
+for %%f in (*.comp) do (
+  echo %%~nf
+  dos2unix %%f
+  clang-format -style=Mozilla -i %%f || goto :error
+  cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+  clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+  glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+  spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+  xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+  for /f %%A in ('wc -c %%~nf.spv') do (
+    printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+  )
+)
+
+REM
+REM delete temporary files
+REM
+
+del *.pre.comp
+del *.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat
@@ -0,0 +1,63 @@
+@ECHO OFF
+
+REM
+REM delete the previous intermediates; *.pre.comp goes first because
+REM *.comp matches those files too
+REM
+
+del *.pre.comp
+del *.comp
+del *.spv
+
+REM
+REM location of the hs_gen code generator
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+REM
+REM For each .comp: fix line endings, format, preprocess with cl,
+REM compile to SPIR-V, optimize, then emit the bytes as .spv.xxd and
+REM the byte count as 4 big-endian bytes in .len.xxd
+REM
+
+for %%f in (*.comp) do (
+  echo %%~nf
+  dos2unix %%f
+  clang-format -style=Mozilla -i %%f || goto :error
+  cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+  clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+  glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+  spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+  xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+  for /f %%A in ('wc -c %%~nf.spv') do (
+    printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+  )
+)
+
+REM
+REM delete temporary files
+REM
+
+del *.pre.comp
+del *.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat
@@ -0,0 +1,63 @@
+@ECHO OFF
+
+REM
+REM delete the previous intermediates; *.pre.comp goes first because
+REM *.comp matches those files too
+REM
+
+del *.pre.comp
+del *.comp
+del *.spv
+
+REM
+REM location of the hs_gen code generator
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192 -S 65536 -b 8 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+REM
+REM For each .comp: fix line endings, format, preprocess with cl,
+REM compile to SPIR-V, optimize, then emit the bytes as .spv.xxd and
+REM the byte count as 4 big-endian bytes in .len.xxd
+REM
+
+for %%f in (*.comp) do (
+  echo %%~nf
+  dos2unix %%f
+  clang-format -style=Mozilla -i %%f || goto :error
+  cl -I . -EP %%f -P -Fi%%~nf.pre.comp || goto :error
+  clang-format -style=Mozilla -i %%~nf.pre.comp || goto :error
+  glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+  spirv-opt -O %%~nf.spv -o %%~nf.spv || goto :error
+  xxd -i < %%~nf.spv > %%~nf.spv.xxd || goto :error
+  for /f %%A in ('wc -c %%~nf.spv') do (
+    printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd || goto :error
+  )
+)
+
+REM
+REM delete temporary files
+REM
+
+del *.pre.comp
+del *.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%