Bug fixes and improvements to SKC and HotSort. Vulkan is WIP.

Bug: skia: Change-Id: Iffc75a5b4dfcbfa4a6c23d972bb9798c2f550335 Reviewed-on: https://skia-review.googlesource.com/141582 Reviewed-by: Mike Reed <reed@google.com> Reviewed-by: Allan MacKinnon <allanmac@google.com> Commit-Queue: Allan MacKinnon <allanmac@google.com>
author: Allan MacKinnon <allanmac@google.com> 2018-07-16 15:57:05 -0700
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2018-07-17 17:01:41 +0000
commit: 9e0d7e4072e43495a3907bb2bac7824e8e60c368 (patch)
tree: baaff58dd81c1dc5e26668a8d517cbdf568bdb94 /src/compute/hs/vk
parent: 53c876900247ad700ce28f7b33031047a6cff402 (diff)
13 files changed, 1437 insertions, 0 deletions
diff --git a/src/compute/hs/vk/hs_spirv_target.h b/src/compute/hs/vk/hs_spirv_target.h
new file mode 100644
index 0000000000..aa711efc6d
--- /dev/null
+++ b/src/compute/hs/vk/hs_spirv_target.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+//
+//
+//
+
+#include <stdint.h>
+
+//
+// This structure packages all of the parameters and SPIR-V kernels
+// for a target architecture.
+//
+
+struct hs_spirv_target_config
+{
+  struct {
+    uint8_t   threads_log2; // set from HS_SLAB_THREADS_LOG2 by the target header
+    uint8_t   width_log2;   // keys per slab = height << width_log2
+    uint8_t   height;       // see width_log2
+  } slab;
+
+  struct {
+    uint8_t   key;          // key size in 32-bit words
+    uint8_t   val;          // value size in 32-bit words
+  } words;
+
+  struct {
+    uint8_t   slabs;        // determines how many bs and bc pipelines are created
+  } block;
+
+  struct {
+    struct {
+      uint8_t scale_min;    // first fm scale (inclusive), expected in [0,2]
+      uint8_t scale_max;    // last  fm scale (inclusive), expected in [0,2]
+    } fm;
+    struct {
+      uint8_t scale_min;    // first hm scale (inclusive), expected in [0,2]
+      uint8_t scale_max;    // last  hm scale (inclusive), expected in [0,2]
+    } hm;
+  } merge;
+
+  uint8_t     pad[2];       // pads the structure to the 12 bytes asserted below
+};
+
+static_assert(sizeof(struct hs_spirv_target_config) == 12,
+              "modules.words[] must start on a 32-bit boundary");
+
+//
+// For now, kernels are appended end-to-end with a leading big-endian
+// length followed by a SPIR-V binary.
+//
+// The entry point for each kernel is "main".
+//
+// When the tools support packaging multiple named compute shaders in
+// one SPIR-V module then reevaluate this encoding.
+//
+
+struct hs_spirv_target
+{
+  struct hs_spirv_target_config config;  // parameters copied into hs_vk by hs_vk_create()
+  union {
+    uint8_t                     bytes[]; // byte view of the packed length+module stream
+    uint32_t                    words[]; // 32-bit view of the same stream, walked by hs_vk_create()
+  } modules;
+};
+
+//
+//
+//
diff --git a/src/compute/hs/vk/hs_vk_launcher.c b/src/compute/hs/vk/hs_vk_launcher.c
new file mode 100644
index 0000000000..e1080a0e8b
--- /dev/null
+++ b/src/compute/hs/vk/hs_vk_launcher.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/vk/assert_vk.h"
+#include "common/util.h"
+
+#include "hs_vk_launcher.h"
+#include "hs_spirv_target.h"
+
+//
+//
+//
+
+struct hs_vk
+{
+  struct hs_spirv_target_config config;            // copy of the target's config
+
+  uint32_t                      key_val_size;      // (words.key + words.val) * 4
+  uint32_t                      slab_keys;         // slab.height << slab.width_log2
+  uint32_t                      bs_slabs_log2_ru;  // msb_idx_u32(pow2_ru_u32(block.slabs))
+  uint32_t                      bc_slabs_log2_max; // msb_idx_u32(pow2_rd_u32(block.slabs))
+
+  VkDevice                      device;            // device the pipelines were created on
+  VkAllocationCallbacks const * allocator;         // allocator passed to hs_vk_create()
+
+  struct {
+    uint32_t                    count;             // number of handles stored in all[]
+    VkPipeline                * transpose;         // points into all[]
+    VkPipeline                * bs;                // points into all[]
+    VkPipeline                * bc;                // points into all[]
+    VkPipeline                * fm[3];             // per scale, NULL if the scale has no kernels
+    VkPipeline                * hm[3];             // per scale, NULL if the scale has no kernels
+    VkPipeline                  all[];             // every pipeline handle in creation order
+  } pipelines;
+};
+
+//
+// Create an hs_vk holding one compute pipeline per SPIR-V module packaged
+// in the target.  Returns NULL if the host allocation can't be made.
+
+struct hs_vk *
+hs_vk_create(struct hs_spirv_target const * const target,
+             VkDevice                             device,
+             VkAllocationCallbacks  const *       allocator,
+             VkPipelineCache                      pipeline_cache)
+{
+  //
+  // we reference these values a lot
+  //
+  uint32_t const bs_slabs_log2_ru  = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
+  uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));
+
+  //
+  // how many kernels will be created?
+  //
+  uint32_t const count_bs    = bs_slabs_log2_ru + 1;
+  uint32_t const count_bc    = bc_slabs_log2_max + 1;
+  uint32_t       count_fm[3] = { 0 };
+  uint32_t       count_hm[3] = { 0 };
+
+  // guaranteed to be in range [0,2]
+  // NOTE(review): a scale of 0 wraps (scale-1) and makes the shift
+  // undefined -- confirm that fm.scale_min is always at least 1
+  for (uint32_t scale = target->config.merge.fm.scale_min;
+       scale <= target->config.merge.fm.scale_max;
+       scale++)
+    {
+      count_fm[scale] = msb_idx_u32(pow2_ru_u32(target->config.block.slabs>>(scale-1))) + 1;
+    }
+
+  // guaranteed to be in range [0,2]
+  for (uint32_t scale = target->config.merge.hm.scale_min;
+       scale <= target->config.merge.hm.scale_max;
+       scale++)
+    {
+      count_hm[scale] = 1;
+    }
+
+  uint32_t const count_all =
+    1
+    + count_bs
+    + count_bc
+    + count_fm[0] + count_fm[1] + count_fm[2]
+    + count_hm[0] + count_hm[1] + count_hm[2];
+
+  //
+  // allocate hs_vk -- pipelines.all[] stores VkPipeline handles (not
+  // pointers) so the trailing array is sized by the handle type
+  //
+  // FIXME -- host allocation through VkAllocationCallbacks isn't
+  // implemented so fail cleanly instead of dereferencing NULL
+  //
+  if (allocator != NULL)
+    return NULL;
+
+  struct hs_vk * const hs = malloc(sizeof(*hs) + sizeof(hs->pipelines.all[0]) * count_all);
+  if (hs == NULL)
+    return NULL;
+
+  // save the config
+  memcpy(&hs->config,&target->config,sizeof(hs->config));
+
+  // save some frequently used calculated values
+  hs->key_val_size      = (target->config.words.key + target->config.words.val) * 4;
+  hs->slab_keys         = target->config.slab.height << target->config.slab.width_log2;
+  hs->bs_slabs_log2_ru  = bs_slabs_log2_ru;
+  hs->bc_slabs_log2_max = bc_slabs_log2_max;
+
+  // save device & allocator
+  hs->device            = device;
+  hs->allocator         = allocator;
+
+  // save kernel count
+  hs->pipelines.count   = count_all;
+
+  //
+  // create all the compute pipelines -- NOTE(review): .layout is unset
+  //
+  VkComputePipelineCreateInfo cpci = {
+    .sType                 = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+    .pNext                 = NULL,
+    .flags                 = VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT,
+    .stage = {
+      .sType               = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+      .pNext               = NULL,
+      .flags               = 0,
+      .stage               = VK_SHADER_STAGE_COMPUTE_BIT,
+      .module              = VK_NULL_HANDLE,
+      .pName               = "main",
+      .pSpecializationInfo = NULL
+    },
+    .basePipelineHandle    = VK_NULL_HANDLE,
+    .basePipelineIndex     = -1
+  };
+
+  //
+  // Create a shader module, use it to create a pipeline... and
+  // dispose of the shader module.
+  //
+  // Each module is preceded by its size in bytes stored as a
+  // big-endian 32-bit value so decode it one byte at a time.
+  //
+  uint32_t const * modules = target->modules.words;
+
+  for (uint32_t ii=0; ii<count_all; ii++)
+    {
+      uint8_t  const * const len         = (uint8_t const *)modules++;
+      uint32_t         const module_size = (((uint32_t)len[0] << 24) |
+                                            ((uint32_t)len[1] << 16) |
+                                            ((uint32_t)len[2] <<  8) |
+                                            ((uint32_t)len[3]      ));
+
+      VkShaderModuleCreateInfo const smci = {
+        .sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+        .pNext    = NULL,
+        .flags    = 0,
+        .codeSize = module_size,
+        .pCode    = modules
+      };
+
+      // codeSize is in bytes but the cursor advances in 32-bit words
+      modules += module_size / sizeof(*modules);
+
+      vk(CreateShaderModule(device,
+                            &smci,
+                            allocator,
+                            &cpci.stage.module));
+
+      // one create info is supplied so exactly one pipeline is created
+      vk(CreateComputePipelines(device,
+                                pipeline_cache,
+                                1,
+                                &cpci,
+                                allocator,
+                                hs->pipelines.all+ii));
+
+      vkDestroyShaderModule(device,
+                            cpci.stage.module,
+                            allocator);
+    }
+
+  //
+  // initialize pointers to pipeline handles
+  //
+  VkPipeline * pipeline_next = hs->pipelines.all;
+
+  // TRANSPOSE
+  hs->pipelines.transpose = pipeline_next;
+  pipeline_next          += 1;
+
+  // BS
+  hs->pipelines.bs        = pipeline_next;
+  pipeline_next          += count_bs;
+
+  // BC
+  hs->pipelines.bc        = pipeline_next;
+  pipeline_next          += count_bc;
+
+  // FM[0..2] -- NULL when a scale has no kernels
+  for (uint32_t scale=0; scale<3; scale++)
+    {
+      hs->pipelines.fm[scale] = count_fm[scale] ? pipeline_next : NULL;
+      pipeline_next          += count_fm[scale];
+    }
+
+  // HM[0..2] -- NULL when a scale has no kernels
+  for (uint32_t scale=0; scale<3; scale++)
+    {
+      hs->pipelines.hm[scale] = count_hm[scale] ? pipeline_next : NULL;
+      pipeline_next          += count_hm[scale];
+    }
+
+  return hs;
+}
+
+//
+// Destroy every pipeline and free the hs_vk.  A NULL hs is ignored.
+//
+
+void
+hs_vk_release(struct hs_vk * const hs)
+{
+  if (hs == NULL)
+    return;
+
+  for (uint32_t ii=0; ii<hs->pipelines.count; ii++)
+    vkDestroyPipeline(hs->device,
+                      hs->pipelines.all[ii],
+                      hs->allocator);
+
+  // hs_vk_create() only allocates with malloc() when no allocator is given
+  if (hs->allocator == NULL)
+    {
+      free(hs);
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/vk/hs_vk_launcher.h b/src/compute/hs/vk/hs_vk_launcher.h
new file mode 100644
index 0000000000..a549666985
--- /dev/null
+++ b/src/compute/hs/vk/hs_vk_launcher.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#pragma once
+
+//
+//
+//
+
+#include <vulkan/vulkan.h>
+
+//
+//
+//
+
+#include <stdint.h>
+#include <stdbool.h>
+
+//
+//
+//
+
+#include "hs_spirv_target.h"
+
+//
+//
+//
+
+struct hs_vk *
+hs_vk_create(struct hs_spirv_target const * const target,
+             VkDevice                             device,
+             VkAllocationCallbacks  const *       allocator,
+             VkPipelineCache                      pipeline_cache);
+
+//
+// Resources will be disposed of with the same device and allocator
+// used for creation.
+//
+
+void
+hs_vk_release(struct hs_vk * const hs);
+
+//
+// Determine what padding will be applied to the input and output
+// buffers.
+//
+// Always check to see if the allocated buffers are large enough.
+//
+// count                    : number of keys
+// count + count_padded_in  : additional keys required for sorting
+// count + count_padded_out : additional keys required for merging
+//
+
+void
+hs_vk_pad(struct hs_vk const * const hs,
+          uint32_t             const count,
+          uint32_t           * const count_padded_in,
+          uint32_t           * const count_padded_out);
+
+//
+// Sort the keys in the vin buffer and store them in the vout buffer.
+//
+// If vout is NULL then the sort will be performed in place.
+//
+
+#if 0
+void
+hs_vk_sort(struct hs_vk const * const hs,
+           vk_command_queue           cq,
+           uint32_t             const wait_list_size,
+           vk_event           *       wait_list,
+           vk_event           *       event,
+           vk_mem                     vin,
+           vk_mem                     vout,
+           uint32_t             const count,
+           uint32_t             const count_padded_in,
+           uint32_t             const count_padded_out,
+           bool                 const linearize);
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM Run hs_gen, then compile every .comp shader found here to SPIR-V and
+REM dump it as xxd byte lists: NAME.spv.xxd holds the module and
+REM NAME.len.xxd holds its size in bytes as 4 big-endian bytes.
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192  -S 65536 -b 8  -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys ---
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+    echo %%~nf
+    dos2unix %%f
+    clang-format -style=Mozilla -i %%f                                                            || goto :error
+    cl -I . -EP %%f -P -Fi%%~nf.pre.comp                                                          || goto :error
+    clang-format -style=Mozilla -i %%~nf.pre.comp                                                 || goto :error
+    glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+    spirv-opt -O %%~nf.spv -o %%~nf.spv                                                           || goto :error
+    xxd -i < %%~nf.spv > %%~nf.spv.xxd                                                            || goto :error
+    for /f %%A in ('wc -c %%~nf.spv') do (
+      printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd
+    )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u32b32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM Run hs_gen, then compile every .comp shader found here to SPIR-V and
+REM dump it as xxd byte lists: NAME.spv.xxd holds the module and
+REM NAME.len.xxd holds its size in bytes as 4 big-endian bytes.
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192  -S 65536 -b 8  -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys ---
+
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+    echo %%~nf
+    dos2unix %%f
+    clang-format -style=Mozilla -i %%f                                                            || goto :error
+    cl -I . -EP %%f -P -Fi%%~nf.pre.comp                                                          || goto :error
+    clang-format -style=Mozilla -i %%~nf.pre.comp                                                 || goto :error
+    glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+    spirv-opt -O %%~nf.spv -o %%~nf.spv                                                           || goto :error
+    xxd -i < %%~nf.spv > %%~nf.spv.xxd                                                            || goto :error
+    for /f %%A in ('wc -c %%~nf.spv') do (
+      printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd
+    )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h
new file mode 100644
index 0000000000..d4376114e5
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl.h
@@ -0,0 +1,100 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#ifndef HS_GLSL_ONCE
+#define HS_GLSL_ONCE
+
+#define HS_SLAB_THREADS_LOG2    3
+#define HS_SLAB_THREADS         (1 << HS_SLAB_THREADS_LOG2)
+#define HS_SLAB_WIDTH_LOG2      3
+#define HS_SLAB_WIDTH           (1 << HS_SLAB_WIDTH_LOG2)
+#define HS_SLAB_HEIGHT          16
+#define HS_SLAB_KEYS            (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)
+#define HS_REG_LAST(c)          c##16
+#define HS_KEY_TYPE             uint64_t
+#define HS_KEY_WORDS            2
+#define HS_VAL_WORDS            0
+#define HS_BS_SLABS             16
+#define HS_BS_SLABS_LOG2_RU     4
+#define HS_BC_SLABS_LOG2_MAX    4
+#define HS_FM_SCALE_MIN         1
+#define HS_FM_SCALE_MAX         1
+#define HS_HM_SCALE_MIN         1
+#define HS_HM_SCALE_MAX         1
+#define HS_EMPTY
+
+#define HS_SLAB_ROWS()    \
+  HS_SLAB_ROW(   1,   0 ) \
+  HS_SLAB_ROW(   2,   1 ) \
+  HS_SLAB_ROW(   3,   2 ) \
+  HS_SLAB_ROW(   4,   3 ) \
+  HS_SLAB_ROW(   5,   4 ) \
+  HS_SLAB_ROW(   6,   5 ) \
+  HS_SLAB_ROW(   7,   6 ) \
+  HS_SLAB_ROW(   8,   7 ) \
+  HS_SLAB_ROW(   9,   8 ) \
+  HS_SLAB_ROW(  10,   9 ) \
+  HS_SLAB_ROW(  11,  10 ) \
+  HS_SLAB_ROW(  12,  11 ) \
+  HS_SLAB_ROW(  13,  12 ) \
+  HS_SLAB_ROW(  14,  13 ) \
+  HS_SLAB_ROW(  15,  14 ) \
+  HS_SLAB_ROW(  16,  15 ) \
+  HS_EMPTY
+
+#define HS_TRANSPOSE_SLAB()                \
+  HS_TRANSPOSE_STAGE( 1 )                  \
+  HS_TRANSPOSE_STAGE( 2 )                  \
+  HS_TRANSPOSE_STAGE( 3 )                  \
+  HS_TRANSPOSE_BLEND( r, s,  1,   2,   1 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,   4,   3 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,   6,   5 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,   8,   7 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,  10,   9 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,  12,  11 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,  14,  13 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,  16,  15 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,   3,   1 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,   4,   2 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,   7,   5 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,   8,   6 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,  11,   9 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,  12,  10 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,  15,  13 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,  16,  14 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,   5,   1 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,   6,   2 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,   7,   3 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,   8,   4 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,  13,   9 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,  14,  10 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,  15,  11 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,  16,  12 ) \
+  HS_TRANSPOSE_REMAP( u,   1,   1 )        \
+  HS_TRANSPOSE_REMAP( u,   2,   3 )        \
+  HS_TRANSPOSE_REMAP( u,   3,   5 )        \
+  HS_TRANSPOSE_REMAP( u,   4,   7 )        \
+  HS_TRANSPOSE_REMAP( u,   5,   9 )        \
+  HS_TRANSPOSE_REMAP( u,   6,  11 )        \
+  HS_TRANSPOSE_REMAP( u,   7,  13 )        \
+  HS_TRANSPOSE_REMAP( u,   8,  15 )        \
+  HS_TRANSPOSE_REMAP( u,   9,   2 )        \
+  HS_TRANSPOSE_REMAP( u,  10,   4 )        \
+  HS_TRANSPOSE_REMAP( u,  11,   6 )        \
+  HS_TRANSPOSE_REMAP( u,  12,   8 )        \
+  HS_TRANSPOSE_REMAP( u,  13,  10 )        \
+  HS_TRANSPOSE_REMAP( u,  14,  12 )        \
+  HS_TRANSPOSE_REMAP( u,  15,  14 )        \
+  HS_TRANSPOSE_REMAP( u,  16,  16 )        \
+  HS_EMPTY
+
+#endif
+
+//
+//
+//
+
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h
new file mode 100644
index 0000000000..c67dffa3a0
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_glsl_macros.h
@@ -0,0 +1,417 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#ifndef HS_GLSL_MACROS_ONCE
+#define HS_GLSL_MACROS_ONCE
+
+//
+//
+//
+
+#define HS_HASH                  #
+#define HS_EVAL(a)               a
+#define HS_GLSL_EXT()            HS_EVAL(HS_HASH)##extension
+#define HS_GLSL_EXT_ENABLE(name) HS_GLSL_EXT() name : enable
+#define HS_GLSL_VERSION(ver)     HS_EVAL(HS_HASH)##version ver
+
+//
+//
+//
+
+// HS_GLSL_VERSION(460)
+
+HS_GLSL_EXT_ENABLE(GL_ARB_gpu_shader_int64)
+HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_shuffle)
+HS_GLSL_EXT_ENABLE(GL_KHR_shader_subgroup_basic)
+
+//
+//
+//
+
+#include "hs_glsl.h"
+
+//
+//
+//
+
+#if   (HS_KEY_WORDS == 1)
+#define HS_SHUFFLE_CAST_TO(v)   v
+#define HS_SHUFFLE_CAST_FROM(v) v
+#elif (HS_KEY_WORDS == 2)
+#define HS_SHUFFLE_CAST_TO(v)   uint64BitsToDouble(v)
+#define HS_SHUFFLE_CAST_FROM(v) doubleBitsToUint64(v)
+#endif
+
+#define HS_SUBGROUP_SHUFFLE(v,i)      HS_SHUFFLE_CAST_FROM(subgroupShuffle(HS_SHUFFLE_CAST_TO(v),i))
+#define HS_SUBGROUP_SHUFFLE_XOR(v,m)  HS_SHUFFLE_CAST_FROM(subgroupShuffleXor(HS_SHUFFLE_CAST_TO(v),m))
+#define HS_SUBGROUP_SHUFFLE_UP(v,d)   HS_SHUFFLE_CAST_FROM(subgroupShuffleUp(HS_SHUFFLE_CAST_TO(v),d))
+#define HS_SUBGROUP_SHUFFLE_DOWN(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleDown(HS_SHUFFLE_CAST_TO(v),d))
+
+//
+// This up/down shuffle has defined values for [0,subgroup size)
+//
+
+#define HS_SUBGROUP_SHUFFLE_UP_2(prev,curr,delta)
+
+#define HS_SUBGROUP_SHUFFLE_DOWN_2(curr,next,delta)
+
+//
+// FYI, restrict shouldn't have any impact on these kernels and
+// benchmarks appear to prove that true
+//
+
+#define HS_RESTRICT restrict
+
+//
+//
+//
+
+#define HS_GLSL_WORKGROUP_SIZE(x,y,z)           \
+  layout (local_size_x = x,                     \
+          local_size_y = y,                     \
+          local_size_z = z) in
+
+#define HS_GLSL_SUBGROUP_SIZE(x)
+
+//
+// KERNEL PROTOS
+//
+
+#define HS_TRANSPOSE_KERNEL_PROTO(slab_width)                           \
+  buffer _vout { HS_KEY_TYPE vout[]; };                                 \
+  HS_GLSL_WORKGROUP_SIZE(slab_width,1,1);                               \
+  HS_GLSL_SUBGROUP_SIZE(slab_width)                                     \
+  void main()
+
+#define HS_BS_KERNEL_PROTO(slab_width,slab_count,slab_count_ru_log2)    \
+  buffer readonly  _vin  { HS_KEY_TYPE vin[];  };                       \
+  buffer writeonly _vout { HS_KEY_TYPE vout[]; };                       \
+  HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1);                    \
+  HS_GLSL_SUBGROUP_SIZE(slab_width)                                     \
+  void main()
+
+#define HS_BC_KERNEL_PROTO(slab_width,slab_count,slab_count_log2)       \
+  buffer _vout { HS_KEY_TYPE vout[]; };                                 \
+  HS_GLSL_WORKGROUP_SIZE(slab_width*slab_count,1,1);                    \
+  HS_GLSL_SUBGROUP_SIZE(slab_width)                                     \
+  void main()
+
+#define HS_HM_KERNEL_PROTO(s)                                           \
+  buffer _vout { HS_KEY_TYPE vout[]; };                                 \
+  HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1);                             \
+  void main()
+
+#define HS_FM_KERNEL_PROTO(s,r)                                         \
+  buffer _vout { HS_KEY_TYPE vout[]; };                                 \
+  HS_GLSL_WORKGROUP_SIZE(HS_SLAB_KEYS,1,1);                             \
+  void main()
+
+//
+// BLOCK LOCAL MEMORY DECLARATION
+//
+
+#define HS_BLOCK_LOCAL_MEM_DECL(width,height)   \
+  shared struct {                               \
+    HS_KEY_TYPE m[width * height];              \
+  } smem
+
+//
+// BLOCK BARRIER
+//
+
+#define HS_BLOCK_BARRIER()                      \
+  barrier()
+
+//
+// SLAB GLOBAL
+//
+
+#define HS_SLAB_GLOBAL_PREAMBLE(slab_width,slab_height)         \
+  const uint gmem_idx =                                         \
+    (gl_GlobalInvocationID.x & ~(slab_width-1)) * slab_height + \
+    gl_SubgroupInvocationID
+
+#define HS_SLAB_GLOBAL_LOAD(extent,slab_width,row_idx)  \
+  extent[gmem_idx + slab_width * row_idx]
+
+#define HS_SLAB_GLOBAL_STORE(slab_width,row_idx,reg)    \
+  vout[gmem_idx + slab_width * row_idx] = reg
+
+//
+// SLAB LOCAL
+//
+
+#define HS_SLAB_LOCAL_L(offset)                 \
+    smem.m[smem_l_idx + (offset)]
+
+#define HS_SLAB_LOCAL_R(offset)                 \
+    smem.m[smem_r_idx + (offset)]
+
+//
+// SLAB LOCAL VERTICAL LOADS
+//
+
+#define HS_BX_LOCAL_V(offset)                   \
+  smem.m[gl_LocalInvocationID.x + (offset)]
+
+//
+// BLOCK SORT MERGE HORIZONTAL
+//
+
+#define HS_BS_MERGE_H_PREAMBLE(slab_width,slab_count)   \
+  const uint smem_l_idx =                               \
+    gl_SubgroupID * (slab_width * slab_count) +         \
+    gl_SubgroupInvocationID;                            \
+  const uint smem_r_idx =                               \
+    (gl_SubgroupID ^ 1) * (slab_width * slab_count) +   \
+    (gl_SubgroupInvocationID ^ (slab_width - 1))
+
+//
+// BLOCK CLEAN MERGE HORIZONTAL
+//
+
+#define HS_BC_MERGE_H_PREAMBLE(slab_width,slab_height,slab_count)       \
+  const uint gmem_l_idx =                                               \
+    (gl_GlobalInvocationID.x & ~(slab_width*slab_count-1)) * slab_height + \
+    gl_LocalInvocationID.x;                                             \
+  const uint smem_l_idx =                                               \
+    gl_SubgroupID * (slab_width * slab_count) +                         \
+    gl_SubgroupInvocationID
+
+#define HS_BC_GLOBAL_LOAD_L(slab_width,slab_idx)        \
+  vout[gmem_l_idx + (slab_width * slab_idx)]
+
+//
+// SLAB FLIP AND HALF PREAMBLES
+//
+
+#define HS_SLAB_FLIP_PREAMBLE(mask)                                     \
+  const uint flip_lane_idx = gl_SubgroupInvocationID ^ mask;            \
+  const bool t_lt          = gl_SubgroupInvocationID < flip_lane_idx;
+
+#define HS_SLAB_HALF_PREAMBLE(mask)                                     \
+  const uint half_lane_idx = gl_SubgroupInvocationID ^ mask;            \
+  const bool t_lt          = gl_SubgroupInvocationID < half_lane_idx;
+
+//
+// Inter-lane compare exchange
+//
+
+// default
+#define HS_CMP_XCHG_V0(a,b)                     \
+  {                                             \
+    const HS_KEY_TYPE t = min(a,b);             \
+    b = max(a,b);                               \
+    a = t;                                      \
+  }
+
+// super slow
+#define HS_CMP_XCHG_V1(a,b)                     \
+  {                                             \
+    const HS_KEY_TYPE tmp = a;                  \
+    a  = (a < b) ? a : b;                       \
+    b ^= a ^ tmp;                               \
+  }
+
+// best
+#define HS_CMP_XCHG_V2(a,b)                     \
+  if (a >= b) {                                 \
+    const HS_KEY_TYPE t = a;                    \
+    a = b;                                      \
+    b = t;                                      \
+  }
+
+// good
+#define HS_CMP_XCHG_V3(a,b)                     \
+  {                                             \
+    const bool        ge = a >= b;              \
+    const HS_KEY_TYPE t  = a;                   \
+    a = ge ? b : a;                             \
+    b = ge ? t : b;                             \
+  }
+
+//
+//
+//
+
+#if   (HS_KEY_WORDS == 1)
+#define HS_CMP_XCHG(a,b)  HS_CMP_XCHG_V0(a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_CMP_XCHG(a,b)  HS_CMP_XCHG_V2(a,b)
+#endif
+
+//
+// The flip/half comparisons rely on a "conditional min/max":
+//
+//  - if the flag is true, return min(a,b)
+//  - otherwise, return max(a,b)
+//
+// What's a little surprising is that sequence (1) is faster than (2)
+// for 32-bit keys.
+//
+// I suspect either a code generation problem or that the sequence
+// maps well to the GEN instruction set.
+//
+// We mostly care about 64-bit keys and unsurprisingly sequence (2) is
+// fastest for this wider type.
+//
+
+#define HS_LOGICAL_XOR() !=
+
+// this is what you would normally use -- arguments and result are parenthesized so any expression is safe
+#define HS_COND_MIN_MAX_V0(lt,a,b) ((((a) <= (b)) HS_LOGICAL_XOR() (lt)) ? (b) : (a))
+
+// this seems to be faster for 32-bit keys -- arguments and result are parenthesized as above
+#define HS_COND_MIN_MAX_V1(lt,a,b) (((lt) ? (b) : (a)) ^ (((a) ^ (b)) & HS_LTE_TO_MASK(a,b)))
+
+//
+//
+//
+
+#if   (HS_KEY_WORDS == 1)
+#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
+#endif
+
+//
+// Conditional inter-subgroup flip/half compare exchange
+//
+
+#define HS_CMP_FLIP(i,a,b)                                              \
+  {                                                                     \
+    const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,flip_lane_idx);        \
+    const HS_KEY_TYPE tb = HS_SUBGROUP_SHUFFLE(b,flip_lane_idx);        \
+    a = HS_COND_MIN_MAX(t_lt,a,tb);                                     \
+    b = HS_COND_MIN_MAX(t_lt,b,ta);                                     \
+  }
+
+#define HS_CMP_HALF(i,a)                                                \
+    {                                                                   \
+      const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,half_lane_idx);      \
+      a = HS_COND_MIN_MAX(t_lt,a,ta);                                   \
+    }
+
+//
+// The device's comparison operator might return what we actually
+// want.  For example, it appears GEN 'cmp' returns {true:-1,false:0}.
+//
+
+#define HS_CMP_IS_ZERO_ONE
+
+#ifdef HS_CMP_IS_ZERO_ONE
+// OpenCL requires a {true: +1, false: 0} scalar result
+// (a <= b) -> { +1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 }
+#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
+#define HS_CMP_TO_MASK(a)   (HS_KEY_TYPE)(-a)
+#else
+// However, OpenCL requires { -1, 0 } for vectors
+// (a < b) -> { 0xFFFFFFFF, 0 }
+#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
+#define HS_CMP_TO_MASK(a)   (a)
+#endif
+
+//
+// The "flip-merge" and "half-merge" preambles are very similar
+//
+
+#define HS_HM_PREAMBLE(half_span)                                       \
+  const uint span_idx    = gl_GlobalInvocationID.z * gl_NumWorkGroups.y + gl_GlobalInvocationID.y; \
+  const uint span_stride = gl_NumWorkGroups.x * gl_WorkGroupSize.x;     \
+  const uint span_size   = span_stride * half_span * 2;                 \
+  const uint span_base   = span_idx * span_size;                        \
+  const uint span_off    = gl_GlobalInvocationID.x;                     \
+  const uint span_l      = span_base + span_off
+
+#define HS_FM_PREAMBLE(half_span)                                       \
+  HS_HM_PREAMBLE(half_span);                                            \
+  const uint span_r      = span_base + span_stride * (half_span + 1) - span_off - 1
+
+//
+//
+//
+
+#define HS_XM_GLOBAL_L(stride_idx)              \
+  vout[span_l + span_stride * stride_idx]
+
+#define HS_XM_GLOBAL_LOAD_L(stride_idx)         \
+  HS_XM_GLOBAL_L(stride_idx)
+
+#define HS_XM_GLOBAL_STORE_L(stride_idx,reg)    \
+  HS_XM_GLOBAL_L(stride_idx) = reg
+
+#define HS_FM_GLOBAL_R(stride_idx)              \
+  vout[span_r + span_stride * stride_idx]
+
+#define HS_FM_GLOBAL_LOAD_R(stride_idx)         \
+  HS_FM_GLOBAL_R(stride_idx)
+
+#define HS_FM_GLOBAL_STORE_R(stride_idx,reg)    \
+  HS_FM_GLOBAL_R(stride_idx) = reg
+
+//
+// This snarl of macros is for transposing a "slab" of sorted elements
+// into linear order.
+//
+// This can occur as the last step in hs_sort() or via a custom kernel
+// that inspects the slab and then transposes and stores it to memory.
+//
+// The slab format can be inspected more efficiently than a linear
+// arrangement.
+//
+// The prime example is detecting when adjacent keys (in sort order)
+// have differing high order bits ("key changes").  The index of each
+// change is recorded to an auxiliary array.
+//
+// A post-processing step like this needs to be able to navigate the
+// slab and eventually transpose and store the slab in linear order.
+//
+
+#define HS_TRANSPOSE_REG(prefix,row)   prefix##row
+#define HS_TRANSPOSE_DECL(prefix,row)  const HS_KEY_TYPE HS_TRANSPOSE_REG(prefix,row)
+#define HS_TRANSPOSE_PRED(level)       is_lo_##level
+
+#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)       \
+  prefix_curr##row_ll##_##row_ur
+
+#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur)      \
+  const HS_KEY_TYPE HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)
+
+#define HS_TRANSPOSE_STAGE(level)                       \
+  const bool HS_TRANSPOSE_PRED(level) =                 \
+    (gl_SubgroupInvocationID & (1 << (level-1))) == 0;
+
+#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
+  HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) =                    \
+    HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ?                  \
+                            HS_TRANSPOSE_REG(prefix_prev,row_ll) :      \
+                            HS_TRANSPOSE_REG(prefix_prev,row_ur),       \
+                            1<<(level-1));                              \
+                                                                        \
+  HS_TRANSPOSE_DECL(prefix_curr,row_ll) =                               \
+    HS_TRANSPOSE_PRED(level)                  ?                         \
+    HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) :                   \
+    HS_TRANSPOSE_REG(prefix_prev,row_ll);                               \
+                                                                        \
+  HS_TRANSPOSE_DECL(prefix_curr,row_ur) =                               \
+    HS_TRANSPOSE_PRED(level)                  ?                         \
+    HS_TRANSPOSE_REG(prefix_prev,row_ur)      :                         \
+    HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur);
+
+#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to)      \
+  vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \
+    HS_TRANSPOSE_REG(prefix,row_from);
+
+//
+//
+//
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h
new file mode 100644
index 0000000000..551fc52180
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_kernels.h
@@ -0,0 +1,75 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+//
+// Comma-separated initializer fragment.  For each kernel it pulls in
+// the ".len.xxd" file followed by the ".spv.xxd" file, and every
+// include -- the last one too -- is followed by a comma.
+//
+// hs_target.h includes this file inside the braces of its
+// ".modules.bytes" initializer, so the order below is the order of the
+// bytes in that array.
+//
+
+#include "hs_transpose.len.xxd"
+,
+#include "hs_transpose.spv.xxd"
+,
+#include "hs_bs_4.len.xxd"
+,
+#include "hs_bs_4.spv.xxd"
+,
+#include "hs_bs_3.len.xxd"
+,
+#include "hs_bs_3.spv.xxd"
+,
+#include "hs_bs_2.len.xxd"
+,
+#include "hs_bs_2.spv.xxd"
+,
+#include "hs_bs_1.len.xxd"
+,
+#include "hs_bs_1.spv.xxd"
+,
+#include "hs_bs_0.len.xxd"
+,
+#include "hs_bs_0.spv.xxd"
+,
+#include "hs_bc_4.len.xxd"
+,
+#include "hs_bc_4.spv.xxd"
+,
+#include "hs_bc_3.len.xxd"
+,
+#include "hs_bc_3.spv.xxd"
+,
+#include "hs_bc_2.len.xxd"
+,
+#include "hs_bc_2.spv.xxd"
+,
+#include "hs_bc_1.len.xxd"
+,
+#include "hs_bc_1.spv.xxd"
+,
+#include "hs_bc_0.len.xxd"
+,
+#include "hs_bc_0.spv.xxd"
+,
+#include "hs_fm_1_4.len.xxd"
+,
+#include "hs_fm_1_4.spv.xxd"
+,
+#include "hs_fm_1_3.len.xxd"
+,
+#include "hs_fm_1_3.spv.xxd"
+,
+#include "hs_fm_1_2.len.xxd"
+,
+#include "hs_fm_1_2.spv.xxd"
+,
+#include "hs_fm_1_1.len.xxd"
+,
+#include "hs_fm_1_1.spv.xxd"
+,
+#include "hs_fm_1_0.len.xxd"
+,
+#include "hs_fm_1_0.spv.xxd"
+,
+#include "hs_hm_1_0.len.xxd"
+,
+#include "hs_hm_1_0.spv.xxd"
+,
diff --git a/src/compute/hs/vk/intel/gen8/u64/hs_target.h b/src/compute/hs/vk/intel/gen8/u64/hs_target.h
new file mode 100644
index 0000000000..f379c23066
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/hs_target.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "../../../hs_spirv_target.h"
+
+//
+//
+//
+
+#include "hs_glsl.h"
+
+//
+//
+//
+
+// Identifier of the target object this header defines.  Define
+// HS_TARGET_NAME before including the header to override the default.
+#ifndef HS_TARGET_NAME
+#define HS_TARGET_NAME      hs_target
+#endif
+
+// Expands to its argument unchanged.
+#define HS_TARGET_HELPER(a) a
+
+//
+//
+//
+
+//
+// The target description: the config block is filled in from the HS_*
+// macros (presumably supplied by hs_glsl.h, included above -- confirm)
+// and the module bytes come from hs_kernels.h.
+//
+static struct hs_spirv_target const HS_TARGET_NAME =
+{
+  .config = {
+    .slab = {
+      .threads_log2 = HS_SLAB_THREADS_LOG2,
+      .width_log2   = HS_SLAB_WIDTH_LOG2,
+      .height       = HS_SLAB_HEIGHT
+    },
+
+    .words = {
+      .key          = HS_KEY_WORDS,
+      .val          = HS_VAL_WORDS
+    },
+
+    .block = {
+      .slabs        = HS_BS_SLABS
+    },
+
+    .merge = {
+      .fm = {
+        .scale_min  = HS_FM_SCALE_MIN,
+        .scale_max  = HS_FM_SCALE_MAX
+      },
+      .hm = {
+        .scale_min  = HS_HM_SCALE_MIN,
+        .scale_max  = HS_HM_SCALE_MAX,
+      }
+    },
+
+    .pad = { 0 }
+  },
+
+  .modules.bytes = {
+
+    // length-prefixed kernels, each entry followed by a comma
+#include "hs_kernels.h"
+
+    // HS_DUMP builds append a zero length; the dump loop in main()
+    // stops when it reads a module length of zero
+#ifdef HS_DUMP
+    0,0,0,0
+#endif
+  }
+};
+
+//
+//
+//
+
+#ifdef HS_DUMP
+
+#include <stdlib.h>
+#include <stdio.h>
+
+//
+// Decodes the big-endian 32-bit length that prefixes each module.
+//
+// Every byte is widened to uint32_t before it is shifted: shifting the
+// int-promoted byte left by 24 is undefined once its top bit is set.
+//
+
+static
+uint32_t
+hs_dump_be32(uint8_t const * const bytes)
+{
+  return ((uint32_t)bytes[0] << 24) |
+         ((uint32_t)bytes[1] << 16) |
+         ((uint32_t)bytes[2] <<  8) |
+         ((uint32_t)bytes[3]      );
+}
+
+//
+// HS_DUMP entry point: writes "hs_target.bin", which holds the target
+// config followed by each module (4-byte length prefix + payload).
+//
+// The walk stops at the first zero length -- the 0,0,0,0 terminator
+// that the HS_DUMP build appends to .modules.bytes.
+//
+// Returns EXIT_SUCCESS if everything was written and the file closed
+// cleanly, EXIT_FAILURE otherwise.  argc and argv are unused.
+//
+
+int
+main(int argc, char const * argv[])
+{
+  (void)argc;
+  (void)argv;
+
+  FILE * fp = fopen("hs_target.bin","wb");
+
+  if (fp == NULL) {
+    perror("hs_target.bin");
+    return EXIT_FAILURE;
+  }
+
+  size_t const config_size = sizeof(HS_TARGET_NAME.config);
+  int          ok          = (fwrite(&HS_TARGET_NAME.config,1,config_size,fp) == config_size);
+
+  uint8_t const * modules = HS_TARGET_NAME.modules.bytes;
+  size_t          modsize = hs_dump_be32(modules);
+
+  while (ok && (modsize > 0)) {
+    modsize += sizeof(uint32_t); // the length prefix is written too
+    ok       = (fwrite(modules,1,modsize,fp) == modsize);
+    modules += modsize;
+    modsize  = hs_dump_be32(modules);
+  }
+
+  // fclose() flushes buffered data, so its failure also means a short file
+  if (fclose(fp) != 0) {
+    ok = 0;
+  }
+
+  if (!ok) {
+    fprintf(stderr,"error: failed to write hs_target.bin\n");
+    return EXIT_FAILURE;
+  }
+
+  return EXIT_SUCCESS;
+}
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/hs/vk/intel/gen8/u64/make_all.bat b/src/compute/hs/vk/intel/gen8/u64/make_all.bat
new file mode 100644
index 0000000000..d148ef0113
--- /dev/null
+++ b/src/compute/hs/vk/intel/gen8/u64/make_all.bat
@@ -0,0 +1,79 @@
+@ECHO OFF
+
+::
+:: Regenerates the HotSort Vulkan kernels for this target:
+::
+::   1. run hs_gen to emit the GLSL sources and generated headers
+::   2. preprocess, compile and optimize every *.comp into SPIR-V
+::   3. convert each SPIR-V binary and its byte count into *.xxd files
+::   4. build hs_target.h with HS_DUMP defined and run the result
+::
+:: A checked step that fails jumps to :error, which returns the
+:: failing exit code to the caller.
+::
+
+::
+:: delete the previous images
+::
+
+del *.pre.comp
+del *.comp
+del *.spv
+del *.xxd
+
+::
+::
+::
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+:: --- 32-bit keys ---
+
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+:: CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192  -S 65536 -b 8  -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+:: --- 64-bit keys
+
+:: stop here if the generator fails: nothing below has fresh inputs
+%HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z || goto :error
+:: CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+::
+:: remove trailing whitespace from generated files
+::
+
+sed -i 's/[[:space:]]*$//' hs_glsl.h
+sed -i 's/[[:space:]]*$//' hs_kernels.h
+
+::
+:: FIXME -- convert this to a bash script
+::
+:: Note that we can use xargs instead of the cmd for/do
+::
+:: Comments inside the parenthesized block use REM: a "::" line is a
+:: label and labels are fragile inside a block.
+::
+
+for %%f in (*.comp) do (
+    dos2unix %%f
+    clang-format -style=Mozilla -i %%f                                      || goto :error
+    cl -I . -EP %%f -P -Fi%%~nf.pre.comp                                    || goto :error
+    clang-format -style=Mozilla -i %%~nf.pre.comp                           || goto :error
+    REM alternative compiler: glslangValidator -V110 -o %%~nf.spv %%~nf.pre.comp
+    glslc --target-env=vulkan1.1 -std=460 -I . -o %%~nf.spv %%~nf.pre.comp  || goto :error
+    spirv-opt -O %%~nf.spv -o %%~nf.spv                                     || goto :error
+    REM spirv-remap ...
+    xxd -i < %%~nf.spv > %%~nf.spv.xxd                                      || goto :error
+    REM write the byte count as 8 hex digits, i.e. a 4-byte big-endian length
+    for /f %%A in ('wc -c %%~nf.spv') do (
+        echo %%~nf.spv %%A
+        printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd             || goto :error
+    )
+)
+
+::
+:: dump a binary
+::
+
+cl /DHS_DUMP /Fe:hs_dump.exe /Tchs_target.h                                 || goto :error
+hs_dump                                                                     || goto :error
+
+::
+:: delete temporary files
+::
+
+:: del *.pre.comp
+del *.comp
+del *.spv
+del *.obj
+del *.exe
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+REM
+REM Regenerates the HotSort Vulkan kernels for this target:
+REM
+REM   1. run hs_gen to emit the GLSL sources
+REM   2. preprocess, compile and optimize every *.comp into SPIR-V
+REM   3. convert each SPIR-V binary and its byte count into *.xxd files
+REM
+REM A checked step that fails jumps to :error, which returns the
+REM failing exit code to the caller.
+REM
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192  -S 65536 -b 8  -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+REM NOTE(review): this directory is named u32, yet the only enabled
+REM hs_gen line is the "-t 2" one listed under 64-bit keys -- confirm.
+
+REM stop here if the generator fails: nothing below has fresh inputs
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z || goto :error
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+    echo %%~nf
+    dos2unix %%f
+    clang-format -style=Mozilla -i %%f                                                            || goto :error
+    cl -I . -EP %%f -P -Fi%%~nf.pre.comp                                                          || goto :error
+    clang-format -style=Mozilla -i %%~nf.pre.comp                                                 || goto :error
+    glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+    spirv-opt -O %%~nf.spv -o %%~nf.spv                                                           || goto :error
+    xxd -i < %%~nf.spv > %%~nf.spv.xxd                                                            || goto :error
+    REM write the byte count as 8 hex digits, i.e. a 4-byte big-endian length
+    for /f %%A in ('wc -c %%~nf.spv') do (
+      printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd                                     || goto :error
+    )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u32b32/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+REM
+REM Regenerates the HotSort Vulkan kernels for this target:
+REM
+REM   1. run hs_gen to emit the GLSL sources
+REM   2. preprocess, compile and optimize every *.comp into SPIR-V
+REM   3. convert each SPIR-V binary and its byte count into *.xxd files
+REM
+REM A checked step that fails jumps to :error, which returns the
+REM failing exit code to the caller.
+REM
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192  -S 65536 -b 8  -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+REM stop here if the generator fails: nothing below has fresh inputs
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z || goto :error
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+    echo %%~nf
+    dos2unix %%f
+    clang-format -style=Mozilla -i %%f                                                            || goto :error
+    cl -I . -EP %%f -P -Fi%%~nf.pre.comp                                                          || goto :error
+    clang-format -style=Mozilla -i %%~nf.pre.comp                                                 || goto :error
+    glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+    spirv-opt -O %%~nf.spv -o %%~nf.spv                                                           || goto :error
+    xxd -i < %%~nf.spv > %%~nf.spv.xxd                                                            || goto :error
+    REM write the byte count as 8 hex digits, i.e. a 4-byte big-endian length
+    for /f %%A in ('wc -c %%~nf.spv') do (
+      printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd                                     || goto :error
+    )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
diff --git a/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat b/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat
new file mode 100644
index 0000000000..9afd7b3a72
--- /dev/null
+++ b/src/compute/hs/vk/nvidia/sm_3x/u64/make_all.bat
@@ -0,0 +1,48 @@
+@ECHO OFF
+
+REM
+REM Regenerates the HotSort Vulkan kernels for this target:
+REM
+REM   1. run hs_gen to emit the GLSL sources
+REM   2. preprocess, compile and optimize every *.comp into SPIR-V
+REM   3. convert each SPIR-V binary and its byte count into *.xxd files
+REM
+REM A checked step that fails jumps to :error, which returns the
+REM failing exit code to the caller.
+REM
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+REM
+REM
+REM
+
+set HS_GEN=..\..\..\..\..\..\spinel\bin\x64\Debug\hs_gen
+
+REM --- 32-bit keys ---
+
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 24 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z
+REM CMD /C %HS_GEN% -v -a "glsl" -t 1 -w 8 -r 32 -s 8192  -S 65536 -b 8  -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM --- 64-bit keys
+
+REM stop here if the generator fails: nothing below has fresh inputs
+CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 21504 -S 65536 -b 16 -B 48 -m 1 -M 1 -f 1 -F 1 -c 1 -C 1 -z || goto :error
+REM CMD /C %HS_GEN% -v -a "glsl" -t 2 -w 8 -r 16 -s 32768 -S 65536 -b 28 -B 56 -m 1 -M 1 -f 0 -F 0 -c 0 -C 0 -z
+
+REM CMD /C make_inl_cl.bat hs_cl.cl
+
+for %%f in (*.comp) do (
+    echo %%~nf
+    dos2unix %%f
+    clang-format -style=Mozilla -i %%f                                                            || goto :error
+    cl -I . -EP %%f -P -Fi%%~nf.pre.comp                                                          || goto :error
+    clang-format -style=Mozilla -i %%~nf.pre.comp                                                 || goto :error
+    glslc --target-env=vulkan1.1 -std=450 -fshader-stage=compute -I . %%~nf.pre.comp -o %%~nf.spv || goto :error
+    spirv-opt -O %%~nf.spv -o %%~nf.spv                                                           || goto :error
+    xxd -i < %%~nf.spv > %%~nf.spv.xxd                                                            || goto :error
+    REM write the byte count as 8 hex digits, i.e. a 4-byte big-endian length
+    for /f %%A in ('wc -c %%~nf.spv') do (
+      printf "%%.8x" %%A | xxd -r -p | xxd -i > %%~nf.len.xxd                                     || goto :error
+    )
+)
+
+del *.comp
+del *.pre.comp
+del *.spv
+
+exit /b 0
+
+:error
+
+exit /b %errorlevel%
author	Allan MacKinnon <allanmac@google.com>	2018-07-16 15:57:05 -0700
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2018-07-17 17:01:41 +0000
commit	9e0d7e4072e43495a3907bb2bac7824e8e60c368 (patch)
tree	baaff58dd81c1dc5e26668a8d517cbdf568bdb94 /src/compute/hs/vk
parent	53c876900247ad700ce28f7b33031047a6cff402 (diff)