12 files changed, 2229 insertions, 1885 deletions
diff --git a/src/compute/hs/gen/gen.h b/src/compute/hs/gen/gen.h
index 4043a8df5c..3635d553cf 100644
--- a/src/compute/hs/gen/gen.h
+++ b/src/compute/hs/gen/gen.h
@@ -9,21 +9,20 @@
 #pragma once
 
 //
+// TODO:
 //
+// Add Key-Val sorting support -- easy.
 //
 
 #include <stdio.h>
 #include <stdint.h>
 
 //
+// All code generation is driven by the specified architectural
+// details and host platform API.
 //
-//
-
-#define MERGE_LEVELS_MAX_LOG2  7                            // merge up to 128 warps
-#define MERGE_LEVELS_MAX_SIZE  (1 << MERGE_LEVELS_MAX_LOG2) // ((1 << MERGE_MAX_LOG2) - 1) // incorrect debug error
-
-//
-//
+// In general, the warps-per-block and keys-per-thread are the
+// critical knobs for tuning performance.
 //
 
 struct hsg_config
@@ -58,6 +57,7 @@ struct hsg_config
 
   struct {
     uint32_t    lanes;
+    uint32_t    lanes_log2;
     uint32_t    skpw_bs;
   } warp;
 
@@ -72,7 +72,7 @@ struct hsg_config
 };
 
 //
-//
+// HotSort can merge non-power-of-two blocks of warps
 //
 
 struct hsg_level
@@ -91,6 +91,16 @@ struct hsg_level
   } active;
 };
 
+//
+//
+//
+
+#define MERGE_LEVELS_MAX_LOG2  7 // merge up to 128 warps
+#define MERGE_LEVELS_MAX_SIZE  (1 << MERGE_LEVELS_MAX_LOG2)
+
+//
+// This is computed
+//
 
 struct hsg_merge
 {
@@ -113,6 +123,8 @@ struct hsg_merge
 //
 //
 
+#if 0
+
 #define HSG_FILE_NAME_SIZE  80
 
 struct hsg_file
@@ -126,18 +138,6 @@ struct hsg_file
 //
 //
 
-typedef enum hsg_kernel_type {
-
-  HSG_KERNEL_TYPE_SORT_BLOCK,
-
-  HSG_KERNEL_TYPE_COUNT
-
-} hsg_kernel_type;
-
-//
-//
-//
-
 typedef enum hsg_file_type {
 
   HSG_FILE_TYPE_HEADER,
@@ -147,6 +147,8 @@ typedef enum hsg_file_type {
 
 } hsg_file_type;
 
+#endif
+
 //
 //
 //
@@ -158,10 +160,8 @@ typedef enum hsg_file_type {
   HSG_OP_EXPAND_X(HSG_OP_TYPE_BEGIN)                            \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_ELSE)                             \
                                                                 \
-  HSG_OP_EXPAND_X(HSG_OP_TYPE_FILE_HEADER)                      \
-  HSG_OP_EXPAND_X(HSG_OP_TYPE_FILE_FOOTER)                      \
-                                                                \
-  HSG_OP_EXPAND_X(HSG_OP_TYPE_DUMMY_KERNEL)                     \
+  HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_BEGIN)                     \
+  HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_END)                       \
                                                                 \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO)           \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE)        \
@@ -186,12 +186,13 @@ typedef enum hsg_file_type {
   HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT)         \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT)         \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT)        \
+  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED)              \
                                                                 \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_LOAD)               \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_STORE)              \
                                                                 \
-  HSG_OP_EXPAND_X(HSG_OP_TYPE_WARP_FLIP)                        \
-  HSG_OP_EXPAND_X(HSG_OP_TYPE_WARP_HALF)                        \
+  HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_FLIP)                        \
+  HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_HALF)                        \
                                                                 \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_FLIP)                         \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_HALF)                         \
@@ -221,8 +222,6 @@ typedef enum hsg_file_type {
                                                                 \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_ACTIVE_PRED)                   \
                                                                 \
-  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED)              \
-                                                                \
   HSG_OP_EXPAND_X(HSG_OP_TYPE_COUNT)
 
 //
@@ -271,42 +270,63 @@ struct hsg_op
 //
 //
 
-typedef void (*hsg_target_pfn)(struct hsg_file        * const files,
-                               struct hsg_merge const * const merge,
-                               struct hsg_op    const * const ops,
-                               uint32_t                 const depth);
+extern char const * const hsg_op_type_string[];
 
 //
 //
 //
 
-extern struct hsg_config hsg_config;
-extern struct hsg_merge  hsg_merge[MERGE_LEVELS_MAX_LOG2];
+struct hsg_target
+{
+  struct hsg_target_state * state;
+};
 
 //
+// All targets share this prototype
+//
+
+typedef
+void
+(*hsg_target_pfn)(struct hsg_target       * const target,
+                  struct hsg_config const * const config,
+                  struct hsg_merge  const * const merge,
+                  struct hsg_op     const * const ops,
+                  uint32_t                  const depth);
 //
 //
+//
+
+extern
+void
+hsg_target_debug(struct hsg_target       * const target,
+                 struct hsg_config const * const config,
+                 struct hsg_merge  const * const merge,
+                 struct hsg_op     const * const ops,
+                 uint32_t                  const depth);
 
 extern
 void
-hsg_target_debug    (struct hsg_file        * const files,
-                     struct hsg_merge const * const merge,
-                     struct hsg_op    const * const ops,
-                     uint32_t                 const depth);
+hsg_target_cuda(struct hsg_target       * const target,
+                struct hsg_config const * const config,
+                struct hsg_merge  const * const merge,
+                struct hsg_op     const * const ops,
+                uint32_t                  const depth);
 
 extern
 void
-hsg_target_cuda_sm3x(struct hsg_file        * const files,
-                     struct hsg_merge const * const merge,
-                     struct hsg_op    const * const ops,
-                     uint32_t                 const depth);
+hsg_target_opencl(struct hsg_target       * const target,
+                  struct hsg_config const * const config,
+                  struct hsg_merge  const * const merge,
+                  struct hsg_op     const * const ops,
+                  uint32_t                  const depth);
 
 extern
 void
-hsg_target_igp_genx (struct hsg_file        * const files,
-                     struct hsg_merge const * const merge,
-                     struct hsg_op    const * const ops,
-                     uint32_t                 const depth);
+hsg_target_glsl(struct hsg_target       * const target,
+                struct hsg_config const * const config,
+                struct hsg_merge  const * const merge,
+                struct hsg_op     const * const ops,
+                uint32_t                  const depth);
 //
 //
 //
diff --git a/src/compute/hs/gen/main.c b/src/compute/hs/gen/main.c
index 42f4518bfd..e06e23029b 100644
--- a/src/compute/hs/gen/main.c
+++ b/src/compute/hs/gen/main.c
@@ -20,14 +20,8 @@
 //
 
 #include "networks.h"
-#include "macros.h"
-#include "util.h"
-
-//
-//
-//
-
-#define HSG_INDENT  2
+#include "common/util.h"
+#include "common/macros.h"
 
 //
 //
@@ -36,7 +30,6 @@
 #undef  HSG_OP_EXPAND_X
 #define HSG_OP_EXPAND_X(t) #t ,
 
-static
 char const * const
 hsg_op_type_string[] =
   {
@@ -53,10 +46,8 @@ hsg_op_type_string[] =
 #define BEGIN()                           (struct hsg_op){ HSG_OP_TYPE_BEGIN                                    }
 #define ELSE()                            (struct hsg_op){ HSG_OP_TYPE_ELSE                                     }
 
-#define STORE_SLAB_EARLY_EXIT()           (struct hsg_op){ HSG_OP_TYPE_STORE_SLAB_EARLY_EXIT                    }
-
-#define FILE_HEADER()                     (struct hsg_op){ HSG_OP_TYPE_FILE_HEADER                              }
-#define FILE_FOOTER()                     (struct hsg_op){ HSG_OP_TYPE_FILE_FOOTER                              }
+#define TARGET_BEGIN()                    (struct hsg_op){ HSG_OP_TYPE_TARGET_BEGIN                             }
+#define TARGET_END()                      (struct hsg_op){ HSG_OP_TYPE_TARGET_END                               }
 
 #define TRANSPOSE_KERNEL_PROTO()          (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO                   }
 #define TRANSPOSE_KERNEL_PREAMBLE()       (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE                }
@@ -68,11 +59,11 @@ hsg_op_type_string[] =
 #define BC_KERNEL_PROTO(i)                (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PROTO,             { i       } }
 #define BC_KERNEL_PREAMBLE(i)             (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PREAMBLE,          { i       } }
 
-#define FM_KERNEL_PROTO(l,s)              (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO,             { l, s    } }
-#define FM_KERNEL_PREAMBLE(w,s)           (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE,          { w, s    } }
+#define FM_KERNEL_PROTO(s,r)              (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO,             { s, r    } }
+#define FM_KERNEL_PREAMBLE(h)             (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE,          { h       } }
 
-#define HM_KERNEL_PROTO(d,w)              (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO,             { d, w    } }
-#define HM_KERNEL_PREAMBLE(w,s)           (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE,          { w, s    } }
+#define HM_KERNEL_PROTO(s)                (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO,             { s       } }
+#define HM_KERNEL_PREAMBLE(h)             (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE,          { h       } }
 
 #define BX_REG_GLOBAL_LOAD(n,v)           (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_LOAD,          { n, v    } }
 #define BX_REG_GLOBAL_STORE(n)            (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_STORE,         { n       } }
@@ -81,12 +72,13 @@ hsg_op_type_string[] =
 #define FM_REG_GLOBAL_STORE_LEFT(n,i)     (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT,    { n, i    } }
 #define FM_REG_GLOBAL_LOAD_RIGHT(n,i)     (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT,    { n, i    } }
 #define FM_REG_GLOBAL_STORE_RIGHT(n,i)    (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT,   { n, i    } }
+#define FM_MERGE_RIGHT_PRED(n,s)          (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED,         { n, s    } }
 
 #define HM_REG_GLOBAL_LOAD(n,i)           (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_LOAD,          { n, i    } }
 #define HM_REG_GLOBAL_STORE(n,i)          (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_STORE,         { n, i    } }
 
-#define WARP_FLIP(f)                      (struct hsg_op){ HSG_OP_TYPE_WARP_FLIP,                   { f       } }
-#define WARP_HALF(h)                      (struct hsg_op){ HSG_OP_TYPE_WARP_HALF,                   { h       } }
+#define SLAB_FLIP(f)                      (struct hsg_op){ HSG_OP_TYPE_SLAB_FLIP,                   { f       } }
+#define SLAB_HALF(h)                      (struct hsg_op){ HSG_OP_TYPE_SLAB_HALF,                   { h       } }
 
 #define CMP_FLIP(a,b,c)                   (struct hsg_op){ HSG_OP_TYPE_CMP_FLIP,                    { a, b, c } }
 #define CMP_HALF(a,b)                     (struct hsg_op){ HSG_OP_TYPE_CMP_HALF,                    { a, b    } }
@@ -121,13 +113,12 @@ hsg_op_type_string[] =
 
 #define BS_ACTIVE_PRED(m,l)               (struct hsg_op){ HSG_OP_TYPE_BS_ACTIVE_PRED,              { m, l    } }
 
-#define FM_MERGE_RIGHT_PRED(n,s)          (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED,         { n, s    } }
-
 //
 // DEFAULTS
 //
 
-struct hsg_config hsg_config = // FIXME -- how useful is this?
+static
+struct hsg_config hsg_config =
   {
     .merge  = {
       .flip = {
@@ -138,8 +129,6 @@ struct hsg_config hsg_config = // FIXME -- how useful is this?
         .lo         = 1,
         .hi         = 1
       },
-
-      .max_log2     = 27 // 2^27th = 128m
     },
 
     .block  = {
@@ -156,6 +145,7 @@ struct hsg_config hsg_config = // FIXME -- how useful is this?
 
     .warp   = {
       .lanes        = 32,
+      .lanes_log2   = 5,
     },
 
     .thread = {
@@ -172,45 +162,11 @@ struct hsg_config hsg_config = // FIXME -- how useful is this?
 // ZERO HSG_MERGE STRUCT
 //
 
+static
 struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2] = { 0 };
 
 //
-//
-//
-
-static const hsg_target_pfn hsg_target_pfns[] =
-  {
-    hsg_target_debug,
-    hsg_target_cuda_sm3x,
-    hsg_target_igp_genx,
-    // hsg_target_adreno_5xx,
-    // hsg_target_amd_gcn,
-    // hsg_target_x86_sse,
-    // hsg_target_x86_avx2,
-  };
-
-static const char * hsg_target_pfn_string[] =
-  {
-    "hs_debug",
-    "hs_cuda",
-    "hs_cl"
-  };
-
-static const char * hsg_file_type_string[][2] =
-  {
-    { ".h", ".txt" },
-    { ".h", ".cu"  },
-    { ".h", ".cl"  }
-  };
-
-//
-//
-//
-
-#define HSG_TARGET_PFN_COUNT  ARRAY_LENGTH(hsg_target_pfns)
-
-//
-//
+// STATS ON INSTRUCTIONS
 //
 
 static hsg_op_type hsg_op_type_counts[HSG_OP_TYPE_COUNT] = { 0 };
@@ -223,8 +179,18 @@ static
 void
 hsg_op_debug()
 {
+  uint32_t total = 0;
+
   for (hsg_op_type t=HSG_OP_TYPE_EXIT; t<HSG_OP_TYPE_COUNT; t++)
-    fprintf(stderr,"%-37s : %u\n",hsg_op_type_string[t],hsg_op_type_counts[t]);
+    {
+      uint32_t const count = hsg_op_type_counts[t];
+
+      total += count;
+
+      fprintf(stderr,"%-37s : %u\n",hsg_op_type_string[t],count);
+    }
+
+  fprintf(stderr,"%-37s : %u\n\n\n","TOTAL",total);
 }
 
 //
@@ -268,7 +234,7 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge)
     //
     // The provided smem_bs size will be allocated for each sorting block.
     //
-    uint32_t const bs_threads   = merge->warps * hsg_config.warp.lanes;
+    uint32_t const bs_threads   = merge->warps << hsg_config.warp.lanes_log2;
     uint32_t const bs_keys      = hsg_config.block.smem_bs / (hsg_config.type.words * sizeof(uint32_t));
     uint32_t const bs_kpt       = bs_keys / bs_threads;
     uint32_t const bs_kpt_mod   = (bs_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod;
@@ -282,7 +248,7 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge)
       }
 
     // clamp to number of registers
-    merge->rows_bs = min(bs_rows_even, hsg_config.thread.regs);
+    merge->rows_bs = MIN_MACRO(bs_rows_even, hsg_config.thread.regs);
   }
 
   //
@@ -297,19 +263,19 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge)
     //
     // if merge->warps is not pow2 then we're going to skip creating a bc elsewhere
     //
-    uint32_t const bc_warps_min  = max(merge->warps,hsg_config.block.warps_min);
-    uint32_t const bc_threads    = bc_warps_min * hsg_config.warp.lanes;
+    uint32_t const bc_warps_min  = MAX_MACRO(merge->warps,hsg_config.block.warps_min);
+    uint32_t const bc_threads    = bc_warps_min << hsg_config.warp.lanes_log2;
     uint32_t const bc_block_rd   = (((hsg_config.block.smem_bc * bc_warps_min) / hsg_config.block.warps_max) /
                                     hsg_config.block.smem_quantum) * hsg_config.block.smem_quantum;
-    uint32_t const bc_block_max  = max(bc_block_rd,hsg_config.block.smem_min);
-    uint32_t const bc_block_smem = min(bc_block_max,hsg_config.block.smem_bs);
+    uint32_t const bc_block_max  = MAX_MACRO(bc_block_rd,hsg_config.block.smem_min);
+    uint32_t const bc_block_smem = MIN_MACRO(bc_block_max,hsg_config.block.smem_bs);
 
     // what is the max amount of shared in each possible bc block config?
     uint32_t const bc_keys       = bc_block_smem / (hsg_config.type.words * sizeof(uint32_t));
     uint32_t const bc_kpt        = bc_keys / bc_threads;
     uint32_t const bc_kpt_mod    = (bc_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod;
 
-    merge->rows_bc = min(bc_kpt_mod, hsg_config.thread.regs);
+    merge->rows_bc = MIN_MACRO(bc_kpt_mod, hsg_config.thread.regs);
     merge->skpw_bc = bc_keys / bc_warps_min;
   }
 }
@@ -441,7 +407,7 @@ hsg_merge_levels_hint(struct hsg_merge * const merge, bool const autotune)
   for (uint32_t level=0; level<MERGE_LEVELS_MAX_LOG2; level++)
     {
       // max network
-      uint32_t const n_max = max(merge->levels[level].networks[0],
+      uint32_t const n_max = MAX_MACRO(merge->levels[level].networks[0],
                                  merge->levels[level].networks[1]);
 
       if (n_max <= (merge->rows_bs + hsg_config.thread.xtra))
@@ -533,7 +499,7 @@ hsg_network_copy(struct hsg_op            *       ops,
 
   for (uint32_t ii=0; ii<len; ii++)
     {
-      const struct hsg_op * const cx = cxa + ii;
+      struct hsg_op const * const cx = cxa + ii;
 
       ops = hsg_op(ops,CMP_XCHG(cx->a,cx->b,prefix));
     }
@@ -638,7 +604,7 @@ hsg_warp_half_downto(struct hsg_op * ops, uint32_t h)
     {
       ops = hsg_begin(ops);
 
-      ops = hsg_op(ops,WARP_HALF(h));
+      ops = hsg_op(ops,SLAB_HALF(h));
       ops = hsg_warp_half_network(ops);
 
       ops = hsg_end(ops);
@@ -665,7 +631,7 @@ hsg_warp_flip(struct hsg_op * ops, uint32_t f)
 {
   ops = hsg_begin(ops);
 
-  ops = hsg_op(ops,WARP_FLIP(f));
+  ops = hsg_op(ops,SLAB_FLIP(f));
   ops = hsg_warp_flip_network(ops);
 
   ops = hsg_end(ops);
@@ -782,7 +748,7 @@ hsg_bc_half_merge_level(struct hsg_op          *       ops,
   uint32_t const net_even = merge->levels[0].networks[0];
 
   // min of warps in block and remaining horizontal rows
-  uint32_t const active = min(s_count, net_even);
+  uint32_t const active = MIN_MACRO(s_count, net_even);
 
   // conditional on blockIdx.x
   if (active < merge->warps)
@@ -834,7 +800,7 @@ hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
   //
   // will only be called with merge->warps >= 2
   //
-  uint32_t const warps    = max(merge->warps,hsg_config.block.warps_min);
+  uint32_t const warps    = MAX_MACRO(merge->warps,hsg_config.block.warps_min);
 
   // guaranteed to be an even network
   uint32_t const net_even = merge->levels[0].networks[0];
@@ -851,7 +817,7 @@ hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
     {
       // compute store count
       uint32_t const r_rem   = hsg_config.thread.regs + 1 - r_lo;
-      uint32_t const s_count = min(s_max,r_rem);
+      uint32_t const s_count = MIN_MACRO(s_max,r_rem);
 
       // block sync -- can skip if first
       if (r_lo > 1)
@@ -1010,7 +976,7 @@ hsg_bs_flip_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
           uint32_t r_hi = hsg_config.thread.regs + 1 - r_lo;
 
           // compute store count
-          uint32_t const s_pairs = min(s_pairs_max,r_mid - r_lo);
+          uint32_t const s_pairs = MIN_MACRO(s_pairs_max,r_mid - r_lo);
 
           // store rows to shared
           for (uint32_t c=0; c<s_pairs; c++)
@@ -1082,7 +1048,7 @@ hsg_bs_flip_merge_all(struct hsg_op * ops, const struct hsg_merge * const merge)
 
 static
 struct hsg_op *
-hsg_bs_sort(struct hsg_op * ops, const struct hsg_merge * const merge)
+hsg_bs_sort(struct hsg_op * ops, struct hsg_merge const * const merge)
 {
   // func proto
   ops = hsg_op(ops,BS_KERNEL_PROTO(merge->index));
@@ -1125,7 +1091,7 @@ hsg_bs_sort_all(struct hsg_op * ops)
 {
   for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++)
     {
-      const struct hsg_merge* const m = hsg_merge + merge_idx;
+      struct hsg_merge const * const m = hsg_merge + merge_idx;
 
       if (m->warps == 0)
         break;
@@ -1142,7 +1108,7 @@ hsg_bs_sort_all(struct hsg_op * ops)
 
 static
 struct hsg_op *
-hsg_bc_clean(struct hsg_op * ops, const struct hsg_merge * const merge)
+hsg_bc_clean(struct hsg_op * ops, struct hsg_merge const * const merge)
 {
   // func proto
   ops = hsg_op(ops,BC_KERNEL_PROTO(merge->index));
@@ -1189,7 +1155,7 @@ hsg_bc_clean_all(struct hsg_op * ops)
 {
   for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++)
     {
-      const struct hsg_merge* const m = hsg_merge + merge_idx;
+      struct hsg_merge const * const m = hsg_merge + merge_idx;
 
       if (m->warps == 0)
         break;
@@ -1215,9 +1181,7 @@ static
 struct hsg_op *
 hsg_fm_thread_load_left(struct hsg_op * ops, uint32_t const n)
 {
-  uint32_t const mid = n/2;
-
-  for (uint32_t r=1; r<=mid; r++)
+  for (uint32_t r=1; r<=n; r++)
     ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_LEFT(r,r-1));
 
   return ops;
@@ -1227,9 +1191,7 @@ static
 struct hsg_op *
 hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n)
 {
-  uint32_t const mid = n/2;
-
-  for (uint32_t r=mid; r>=1; r--)
+  for (uint32_t r=1; r<=n; r++)
     ops = hsg_op(ops,FM_REG_GLOBAL_STORE_LEFT(r,r-1));
 
   return ops;
@@ -1237,53 +1199,60 @@ hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n)
 
 static
 struct hsg_op *
-hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
+hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const half_span, uint32_t const half_case)
 {
-  uint32_t const mid   = n / 2;
-  uint32_t const first = mid + 1;
-  uint32_t const last  = mid + span_right;
-
-  for (uint32_t r=first; r<=last; r++)
-    ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,r-first));
+  for (uint32_t r=0; r<half_case; r++)
+    ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,half_span+1+r));
 
   return ops;
 }
 
 static
 struct hsg_op *
-hsg_fm_thread_store_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
+hsg_fm_thread_store_right(struct hsg_op * ops, uint32_t const half_span, uint32_t const half_case)
 {
-  uint32_t const mid   = n / 2;
-  uint32_t const first = mid + 1;
-  uint32_t const last  = mid + span_right;
-
-  for (uint32_t r=last; r>=first; r--)
-    ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,r-first));
+  for (uint32_t r=0; r<half_case; r++)
+    ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,half_span+1+r));
 
   return ops;
 }
 
 static
 struct hsg_op *
-hsg_fm_thread_merge_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
+hsg_fm_merge(struct hsg_op * ops,
+             uint32_t const scale_log2,
+             uint32_t const span_left,
+             uint32_t const span_right)
 {
-  // conditional
-  ops = hsg_op(ops,FM_MERGE_RIGHT_PRED(n/2,span_right));
+  // func proto
+  ops = hsg_op(ops,FM_KERNEL_PROTO(scale_log2,msb_idx_u32(pow2_ru_u32(span_right))));
 
   // begin
   ops = hsg_begin(ops);
 
-  // load
-  ops = hsg_fm_thread_load_right(ops,n,span_right);
+  // preamble for loading/storing
+  ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span_left));
+
+  // load left span
+  ops = hsg_fm_thread_load_left(ops,span_left);
+
+  // load right span
+  ops = hsg_fm_thread_load_right(ops,span_left,span_right);
 
   // compare left and right
-  ops = hsg_thread_merge_left_right(ops,n/2,span_right);
+  ops = hsg_thread_merge_left_right(ops,span_left,span_right);
+
+  // left merging network
+  ops = hsg_thread_merge(ops,span_left);
 
   // right merging network
-  ops = hsg_thread_merge_offset(ops,n/2,span_right);
+  ops = hsg_thread_merge_offset(ops,span_left,span_right);
+
+  // store
+  ops = hsg_fm_thread_store_left(ops,span_left);
 
   // store
-  ops = hsg_fm_thread_store_right(ops,n,span_right);
+  ops = hsg_fm_thread_store_right(ops,span_left,span_right);
 
   // end
   ops = hsg_end(ops);
@@ -1293,45 +1262,12 @@ hsg_fm_thread_merge_right(struct hsg_op * ops, uint32_t const n, uint32_t const
 
 static
 struct hsg_op *
-hsg_fm_thread_merge_right_all(struct hsg_op * ops, uint32_t const span)
+hsg_fm_merge_all(struct hsg_op * ops, uint32_t const scale_log2, uint32_t const warps)
 {
-  ops = hsg_fm_thread_merge_right(ops,span,span/2);
-
-  for (uint32_t span_pow2 = pow2_ru_u32(span) / 4; span_pow2 >= 1; span_pow2 /= 2)
-    {
-      ops = hsg_fm_thread_merge_right(ops,span,span_pow2);
-    }
-
-  return ops;
-}
-
-static
-struct hsg_op *
-hsg_fm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const fm_scale)
-{
-  // func proto
-  ops = hsg_op(ops,FM_KERNEL_PROTO(level,fm_scale));
-
-  // begin
-  ops = hsg_begin(ops);
-
-  // shared declare
-  ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span,fm_scale));
-
-  // load
-  ops = hsg_fm_thread_load_left(ops,span);
-
-  // right merging network
-  ops = hsg_fm_thread_merge_right_all(ops,span);
-
-  // left merging network
-  ops = hsg_thread_merge(ops,span/2);
-
-  // store
-  ops = hsg_fm_thread_store_left(ops,span);
+  uint32_t const span_left = (warps << scale_log2) / 2;
 
-  // end
-  ops = hsg_end(ops);
+  for (uint32_t span_right=span_left; span_right >= 1; span_right=pow2_ru_u32(span_right)/2)
+    ops = hsg_fm_merge(ops,scale_log2,span_left,span_right);
 
   return ops;
 }
@@ -1354,7 +1290,7 @@ static
 struct hsg_op *
 hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n)
 {
-  for (uint32_t r=n; r>=1; r--)
+  for (uint32_t r=1; r<=n; r++)
     ops = hsg_op(ops,HM_REG_GLOBAL_STORE(r,r-1));
 
   return ops;
@@ -1362,16 +1298,18 @@ hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n)
 
 static
 struct hsg_op *
-hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const hm_scale)
+hsg_hm_merge(struct hsg_op * ops, uint32_t const scale_log2, uint32_t const warps_pow2)
 {
+  uint32_t const span = warps_pow2 << scale_log2;
+
   // func proto
-  ops = hsg_op(ops,HM_KERNEL_PROTO(level,level-msb_idx_u32(span)));
+  ops = hsg_op(ops,HM_KERNEL_PROTO(scale_log2));
 
   // begin
   ops = hsg_begin(ops);
 
-  // declarations
-  ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span,hm_scale));
+  // preamble for loading/storing
+  ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span/2));
 
   // load
   ops = hsg_hm_thread_load(ops,span);
@@ -1389,55 +1327,6 @@ hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uin
 }
 
 //
-//
-//
-
-static
-struct hsg_op *
-hsg_fm_merge_level(struct hsg_op * ops, uint32_t const level)
-{
-  uint32_t const bc_max      = pow2_rd_u32(hsg_merge[0].warps);
-  uint32_t const bc_max_log2 = msb_idx_u32(bc_max);
-
-  uint32_t const fm_level    = (level <= bc_max_log2) ? hsg_config.merge.flip.lo : min(level - bc_max_log2,hsg_config.merge.flip.hi);
-  uint32_t const fm_scale    = level - fm_level;
-
-  ops = hsg_fm_merge(ops,
-                     level,
-                     hsg_merge[0].warps * (1u << fm_level),
-                     fm_scale);
-
-  return ops;
-}
-
-//
-//
-//
-
-static
-struct hsg_op *
-hsg_hm_merge_level(struct hsg_op * ops, uint32_t const level)
-{
-  uint32_t const bc_max      = pow2_rd_u32(hsg_merge[0].warps);
-  uint32_t const bc_max_log2 = msb_idx_u32(bc_max);
-
-  uint32_t const fm_log2_max = bc_max_log2 + hsg_config.merge.flip.hi;
-
-  if (level > fm_log2_max)
-    {
-      uint32_t const down_warps_log2 = level - fm_log2_max;
-      uint32_t const hm_level        = max(hsg_config.merge.half.lo,min(hsg_config.merge.half.hi,down_warps_log2));
-
-      ops = hsg_hm_merge(ops,
-                         level - hsg_config.merge.flip.hi,
-                         bc_max * (1u << hm_level),
-                         down_warps_log2 - hm_level);
-    }
-
-  return ops;
-}
-
-//
 // GENERATE MERGE KERNELS
 //
 
@@ -1445,23 +1334,20 @@ static
 struct hsg_op *
 hsg_xm_merge_all(struct hsg_op * ops)
 {
-  uint32_t const keys_per_block = hsg_merge[0].warps * hsg_config.warp.lanes * hsg_config.thread.regs;
-  uint32_t const blocks         = ((1U << hsg_config.merge.max_log2) + keys_per_block - 1) / keys_per_block;
-  uint32_t const blocks_ru      = pow2_ru_u32(blocks);
-  uint32_t const blocks_log2    = msb_idx_u32(blocks_ru);
+  uint32_t const warps = hsg_merge[0].warps;
+  uint32_t const warps_pow2 = pow2_rd_u32(warps);
 
-  for (uint32_t level=1; level<=blocks_log2; level+=1)
-    {
-      //
-      // GENERATE FLIP MERGE KERNELS
-      //
-      ops = hsg_fm_merge_level(ops,level);
+  //
+  // GENERATE FLIP MERGE KERNELS
+  //
+  for (uint32_t scale_log2=hsg_config.merge.flip.lo; scale_log2<=hsg_config.merge.flip.hi; scale_log2++)
+    ops = hsg_fm_merge_all(ops,scale_log2,warps);
 
-      //
-      // GENERATE HALF MERGE KERNELS
-      //
-      ops = hsg_hm_merge_level(ops,level);
-    }
+  //
+  // GENERATE HALF MERGE KERNELS
+  //
+  for (uint32_t scale_log2=hsg_config.merge.half.lo; scale_log2<=hsg_config.merge.half.hi; scale_log2++)
+    ops = hsg_hm_merge(ops,scale_log2,warps_pow2);
 
   return ops;
 }
@@ -1470,93 +1356,30 @@ hsg_xm_merge_all(struct hsg_op * ops)
 //
 //
 
-void
-hsg_target_indent(struct hsg_file * const files, uint32_t const depth)
-{
-  fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-          "%*s",
-          depth*HSG_INDENT,"");
-}
-
-void
-hsg_target_debug(struct hsg_file        * const files,
-                 const struct hsg_merge * const merge,
-                 const struct hsg_op    * const ops,
-                 uint32_t                     const depth)
-{
-
-  hsg_target_indent(files,depth);
-
-  fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-          "%s\n",
-          hsg_op_type_string[ops->type]);
-}
-
-//
-//
-//
-
-static
-struct hsg_file*
-hsg_files_open(const char * prefix, const char ** suffix)
-{
-#define STR_BUF_SIZE 80
-
-  struct hsg_file * files = malloc(sizeof(struct hsg_file) * HSG_FILE_TYPE_COUNT);
-
-  for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++)
-    {
-      char * name = files[ii].name;
-
-      // save prefix
-      files[ii].prefix = prefix;
-
-      // build filename
-      strcpy_s(name,STR_BUF_SIZE,prefix);
-      strcat_s(name,STR_BUF_SIZE,suffix[ii]);
-
-      // open file
-      fopen_s(&files[ii].file,name,"w+");
-    }
-
-  return files;
-}
-
-static
-void
-hsg_files_close(struct hsg_file * files)
-{
-  for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++)
-    fclose(files[ii].file);
-}
-
-//
-//
-//
-
 static
-const struct hsg_op *
-hsg_op_translate_depth(hsg_target_pfn                 target_pfn,
-                       struct hsg_file        * const files,
-                       const struct hsg_merge * const merge,
-                       const struct hsg_op    *       ops,
-                       uint32_t                     const depth)
+struct hsg_op const *
+hsg_op_translate_depth(hsg_target_pfn                  target_pfn,
+                       struct hsg_target       * const target,
+                       struct hsg_config const * const config,
+                       struct hsg_merge  const * const merge,
+                       struct hsg_op     const *       ops,
+                       uint32_t                  const depth)
 {
   while (ops->type != HSG_OP_TYPE_EXIT)
     {
       switch (ops->type)
         {
         case HSG_OP_TYPE_END:
-          target_pfn(files,merge,ops,depth-1);
+          target_pfn(target,config,merge,ops,depth-1);
           return ops + 1;
 
         case HSG_OP_TYPE_BEGIN:
-          target_pfn(files,merge,ops,depth);
-          ops = hsg_op_translate_depth(target_pfn,files,merge,ops+1,depth+1);
+          target_pfn(target,config,merge,ops,depth);
+          ops = hsg_op_translate_depth(target_pfn,target,config,merge,ops+1,depth+1);
           break;
 
         default:
-          target_pfn(files,merge,ops++,depth);
+          target_pfn(target,config,merge,ops++,depth);
         }
     }
 
@@ -1565,12 +1388,13 @@ hsg_op_translate_depth(hsg_target_pfn                 target_pfn,
 
 static
 void
-hsg_op_translate(hsg_target_pfn                 target_pfn,
-                 struct hsg_file        * const files,
-                 const struct hsg_merge * const merge,
-                 const struct hsg_op    *       ops)
+hsg_op_translate(hsg_target_pfn                  target_pfn,
+                 struct hsg_target       * const target,
+                 struct hsg_config const * const config,
+                 struct hsg_merge  const * const merge,
+                 struct hsg_op     const *       ops)
 {
-  hsg_op_translate_depth(target_pfn,files,merge,ops,0);
+  hsg_op_translate_depth(target_pfn,target,config,merge,ops,0);
 }
 
 //
@@ -1581,37 +1405,27 @@ int
 main(int argc, char * argv[])
 {
   //
-  // INIT
-  //
-  for (uint32_t ii=0; ii<=MERGE_LEVELS_MAX_LOG2; ii++)
-    {
-      hsg_merge[ii].index = ii;
-      hsg_merge[ii].warps = 32 / (1u << ii);
-    }
-
-  //
   // PROCESS OPTIONS
   //
-  int32_t arch     = 0;
-  int32_t opt      = 0;
-
-  bool    quiet    = false;
-  bool    autotune = false;
+  int32_t      opt       = 0;
+  bool         verbose   = false;
+  bool         autotune  = false;
+  char const * arch      = "undefined";
 
-  while ((opt = getopt(argc,argv,"hqa:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF)
+  while ((opt = getopt(argc,argv,"hva:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF)
     {
       switch (opt)
         {
         case 'h':
           fprintf(stderr,"Help goes here...\n");
-          return -1;
+          return EXIT_FAILURE;
 
-        case 'q':
-          quiet = true;
+        case 'v':
+          verbose = true;
           break;
 
         case 'a':
-          arch = atoi(optarg);
+          arch = optarg;
           break;
 
         case 'g':
@@ -1635,30 +1449,28 @@ main(int argc, char * argv[])
           break;
 
         case 'w':
-          hsg_config.warp.lanes = atoi(optarg);
+          hsg_config.warp.lanes      = atoi(optarg);
+          hsg_config.warp.lanes_log2 = msb_idx_u32(hsg_config.warp.lanes);
           break;
 
         case 'b':
           // maximum warps in a workgroup / cta / thread block
           {
-            uint32_t const warps         = atoi(optarg);
-            uint32_t const warps_ru_pow2 = pow2_ru_u32(warps);
-
-            // set warps_max if not already set
-            if (hsg_config.block.warps_max == UINT32_MAX)
-              hsg_config.block.warps_max = warps_ru_pow2;
+            uint32_t const warps = atoi(optarg);
 
             // must always be even
-            if ((warps&1) != 0)
+            if ((warps & 1) != 0)
               {
                 fprintf(stderr,"Error: -b must be even.\n");
-                exit(-1);
+                return EXIT_FAILURE;
               }
 
+            hsg_merge[0].index = 0;
             hsg_merge[0].warps = warps;
 
-            for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++)
-              hsg_merge[ii].warps = warps_ru_pow2 / (1u << ii);
+            // set warps_max if not already set
+            if (hsg_config.block.warps_max == UINT32_MAX)
+              hsg_config.block.warps_max = pow2_ru_u32(warps);
           }
           break;
 
@@ -1677,18 +1489,14 @@ main(int argc, char * argv[])
           hsg_config.block.warps_mod = atoi(optarg);
           break;
 
-        case 'k':
-          hsg_config.merge.max_log2 = atoi(optarg);
-          break;
-
         case 'r':
           {
             uint32_t const regs = atoi(optarg);
 
-            if ((regs&1) != 0)
+            if ((regs & 1) != 0)
               {
                 fprintf(stderr,"Error: -r must be even.\n");
-                exit(-1);
+                return EXIT_FAILURE;
               }
 
             hsg_config.thread.regs = regs;
@@ -1726,17 +1534,39 @@ main(int argc, char * argv[])
     }
 
   //
-  // WHICH ARCH TARGET?
+  // INIT MERGE
   //
-  hsg_target_pfn hsg_target_pfn = (arch < HSG_TARGET_PFN_COUNT) ? hsg_target_pfns[arch] : hsg_target_debug;
+  uint32_t const warps_ru_pow2 = pow2_ru_u32(hsg_merge[0].warps);
+
+  for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++)
+    {
+      hsg_merge[ii].index = ii;
+      hsg_merge[ii].warps = warps_ru_pow2 >> ii;
+    }
 
   //
-  // OPEN FILES
+  // WHICH ARCH TARGET?
   //
-  struct hsg_file * files = hsg_files_open(hsg_target_pfn_string[arch],hsg_file_type_string[arch]);
+  hsg_target_pfn hsg_target_pfn;
+
+  if      (strcmp(arch,"debug") == 0)
+    hsg_target_pfn = hsg_target_debug;
+  else if (strcmp(arch,"cuda") == 0)
+    hsg_target_pfn = hsg_target_cuda;
+  else if (strcmp(arch,"opencl") == 0)
+    hsg_target_pfn = hsg_target_opencl;
+  else if (strcmp(arch,"glsl") == 0)
+    hsg_target_pfn = hsg_target_glsl;
+  else {
+    fprintf(stderr,"Invalid arch: %s\n",arch);
+    exit(EXIT_FAILURE);
+  }
+
+  if (verbose)
+    fprintf(stderr,"Target: %s\n",arch);
 
   //
-  // INIT F_KEYS
+  // INIT SMEM KEY ALLOCATION
   //
   hsg_config_init_shared();
 
@@ -1766,27 +1596,26 @@ main(int argc, char * argv[])
       //
       // THESE ARE FOR DEBUG/INSPECTION
       //
-
-      if (!quiet)
+      if (verbose)
         {
           hsg_merge_levels_debug(merge);
         }
     }
 
-  if (!quiet)
+  if (verbose)
     fprintf(stderr,"\n\n");
 
   //
+  // GENERATE THE OPCODES
   //
-  //
-  uint32_t        const op_count  = 1024*1024; // 2^20 ops for now!
-  struct hsg_op * const ops_begin = malloc(op_count * sizeof(*ops_begin));
+  uint32_t        const op_count  = 1<<17;
+  struct hsg_op * const ops_begin = malloc(sizeof(*ops_begin) * op_count);
   struct hsg_op *       ops       = ops_begin;
 
   //
-  // APPEND HEADER
+  // OPEN INITIAL FILES AND APPEND HEADER
   //
-  ops = hsg_op(ops,FILE_HEADER());
+  ops = hsg_op(ops,TARGET_BEGIN());
 
   //
   // GENERATE TRANSPOSE KERNEL
@@ -1809,9 +1638,9 @@ main(int argc, char * argv[])
   ops = hsg_xm_merge_all(ops);
 
   //
-  // APPEND FOOTER
+  // APPEND FOOTER AND CLOSE INITIAL FILES
   //
-  ops = hsg_op(ops,FILE_FOOTER());
+  ops = hsg_op(ops,TARGET_END());
 
   //
   // ... WE'RE DONE!
@@ -1821,20 +1650,17 @@ main(int argc, char * argv[])
   //
   // APPLY TARGET TRANSLATOR TO ACCUMULATED OPS
   //
-  hsg_op_translate(hsg_target_pfn,files,hsg_merge,ops_begin);
+  struct hsg_target target;
 
-  //
-  //
-  //
-  if (!quiet)
-    hsg_op_debug();
+  hsg_op_translate(hsg_target_pfn,&target,&hsg_config,hsg_merge,ops_begin);
 
   //
+  // DUMP INSTRUCTION COUNTS
   //
-  //
-  hsg_files_close(files);
+  if (verbose)
+    hsg_op_debug();
 
-  return 0;
+  return EXIT_SUCCESS;
 }
 
 //
diff --git a/src/compute/hs/gen/networks_merging.c b/src/compute/hs/gen/networks_merging.c
index 90dca03c21..f93958c842 100644
--- a/src/compute/hs/gen/networks_merging.c
+++ b/src/compute/hs/gen/networks_merging.c
@@ -11,7 +11,7 @@
 //
 
 #include "networks.h"
-#include "macros.h"
+#include "common/macros.h"
 
 //
 //
@@ -24,7 +24,7 @@
 //
 //
 
-#define LM(n)  { ARRAY_LENGTH(mn##n), mn##n }
+#define LM(n)  { ARRAY_LENGTH_MACRO(mn##n), mn##n }
 
 //
 //
diff --git a/src/compute/hs/gen/networks_sorting.c b/src/compute/hs/gen/networks_sorting.c
index c7beb6b45e..3d8d364399 100644
--- a/src/compute/hs/gen/networks_sorting.c
+++ b/src/compute/hs/gen/networks_sorting.c
@@ -14,7 +14,7 @@
 //
 
 #include "networks.h"
-#include "macros.h"
+#include "common/macros.h"
 
 //
 //
@@ -27,7 +27,7 @@
 //
 //
 
-#define LS(n)  { ARRAY_LENGTH(sn##n), sn##n }
+#define LS(n)  { ARRAY_LENGTH_MACRO(sn##n), sn##n }
 
 //
 //
diff --git a/src/compute/hs/gen/target_cuda.c b/src/compute/hs/gen/target_cuda.c
new file mode 100644
index 0000000000..e140c4be4c
--- /dev/null
+++ b/src/compute/hs/gen/target_cuda.c
@@ -0,0 +1,600 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+#include "transpose.h"
+
+#include "common/util.h"
+#include "common/macros.h"
+
+//
+//
+//
+
+struct hsg_transpose_state
+{
+  FILE                    * header;
+  struct hsg_config const * config;
+};
+
+static
+char
+hsg_transpose_reg_prefix(uint32_t const cols_log2)
+{
+  return 'a' + (('r' + cols_log2 - 'a') % 26);
+}
+
+static
+void
+hsg_transpose_blend(uint32_t                     const cols_log2,
+                    uint32_t                     const row_ll, // lower-left
+                    uint32_t                     const row_ur, // upper-right
+                    struct hsg_transpose_state * const state)
+{
+  // we're starting register names at '1' for now
+  fprintf(state->header,
+          "  HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
+          hsg_transpose_reg_prefix(cols_log2-1),
+          hsg_transpose_reg_prefix(cols_log2),
+          cols_log2,row_ll+1,row_ur+1);
+}
+
+static
+void
+hsg_transpose_remap(uint32_t                     const row_from,
+                    uint32_t                     const row_to,
+                    struct hsg_transpose_state * const state)
+{
+  // we're starting register names at '1' for now
+  fprintf(state->header,
+          "  HS_TRANSPOSE_REMAP( %c, %3u, %3u )        \\\n",
+          hsg_transpose_reg_prefix(state->config->warp.lanes_log2),
+          row_from+1,row_to+1);
+}
+
+//
+//
+//
+
+static
+void
+hsg_copyright(FILE * file)
+{
+  fprintf(file,
+          "//                                                    \n"
+          "// Copyright 2016 Google Inc.                         \n"
+          "//                                                    \n"
+          "// Use of this source code is governed by a BSD-style \n"
+          "// license that can be found in the LICENSE file.     \n"
+          "//                                                    \n"
+          "\n");
+}
+
+//
+//
+//
+
+struct hsg_target_state
+{
+  FILE * header;
+  FILE * source;
+};
+
+//
+//
+//
+
+void
+hsg_target_cuda(struct hsg_target       * const target,
+                struct hsg_config const * const config,
+                struct hsg_merge  const * const merge,
+                struct hsg_op     const * const ops,
+                uint32_t                  const depth)
+{
+  switch (ops->type)
+    {
+    case HSG_OP_TYPE_END:
+      fprintf(target->state->source,
+              "}\n");
+      break;
+
+    case HSG_OP_TYPE_BEGIN:
+      fprintf(target->state->source,
+              "{\n");
+      break;
+
+    case HSG_OP_TYPE_ELSE:
+      fprintf(target->state->source,
+              "else\n");
+      break;
+
+    case HSG_OP_TYPE_TARGET_BEGIN:
+      {
+        // allocate state
+        target->state = malloc(sizeof(*target->state));
+
+        // allocate files
+        fopen_s(&target->state->header,"hs_cuda.h", "wb");
+        fopen_s(&target->state->source,"hs_cuda.cu","wb");
+
+        // initialize header
+        uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
+
+        hsg_copyright(target->state->header);
+
+        fprintf(target->state->header,
+                "#ifndef HS_CUDA_ONCE                                            \n"
+                "#define HS_CUDA_ONCE                                            \n"
+                "                                                                \n"
+                "#define HS_SLAB_THREADS_LOG2    %u                              \n"
+                "#define HS_SLAB_THREADS         (1 << HS_SLAB_THREADS_LOG2)     \n"
+                "#define HS_SLAB_WIDTH_LOG2      %u                              \n"
+                "#define HS_SLAB_WIDTH           (1 << HS_SLAB_WIDTH_LOG2)       \n"
+                "#define HS_SLAB_HEIGHT          %u                              \n"
+                "#define HS_SLAB_KEYS            (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n"
+                "#define HS_REG_LAST(c)          c##%u                           \n"
+                "#define HS_KEY_TYPE             %s                              \n"
+                "#define HS_KEY_WORDS            %u                              \n"
+                "#define HS_VAL_WORDS            0                               \n"
+                "#define HS_BS_SLABS             %u                              \n"
+                "#define HS_BS_SLABS_LOG2_RU     %u                              \n"
+                "#define HS_BC_SLABS_LOG2_MAX    %u                              \n"
+                "#define HS_FM_SCALE_MIN         %u                              \n"
+                "#define HS_FM_SCALE_MAX         %u                              \n"
+                "#define HS_HM_SCALE_MIN         %u                              \n"
+                "#define HS_HM_SCALE_MAX         %u                              \n"
+                "#define HS_EMPTY                                                \n"
+                "                                                                \n",
+                config->warp.lanes_log2,
+                config->warp.lanes_log2,
+                config->thread.regs,
+                config->thread.regs,
+                (config->type.words == 2) ? "ulong" : "uint",
+                config->type.words,
+                merge->warps,
+                msb_idx_u32(pow2_ru_u32(merge->warps)),
+                bc_max,
+                config->merge.flip.lo,
+                config->merge.flip.hi,
+                config->merge.half.lo,
+                config->merge.half.hi);
+
+        fprintf(target->state->header,
+                "#define HS_SLAB_ROWS()    \\\n");
+
+        for (uint32_t ii=1; ii<=config->thread.regs; ii++)
+          fprintf(target->state->header,
+                  "  HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
+
+        fprintf(target->state->header,
+                "  HS_EMPTY\n"
+                "          \n");
+
+        fprintf(target->state->header,
+                "#define HS_TRANSPOSE_SLAB()                \\\n");
+
+        for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++)
+          fprintf(target->state->header,
+                  "  HS_TRANSPOSE_STAGE( %u )                  \\\n",ii);
+
+        struct hsg_transpose_state state[1] =
+          {
+           { .header = target->state->header,
+             .config = config
+           }
+          };
+
+        hsg_transpose(config->warp.lanes_log2,
+                      config->thread.regs,
+                      hsg_transpose_blend,state,
+                      hsg_transpose_remap,state);
+
+        fprintf(target->state->header,
+                "  HS_EMPTY\n"
+                "          \n");
+
+        hsg_copyright(target->state->source);
+
+        fprintf(target->state->source,
+                "#include \"hs_cuda_macros.h\" \n"
+                "                              \n"
+                "//                            \n"
+                "//                            \n"
+                "//                            \n");
+      }
+      break;
+
+    case HSG_OP_TYPE_TARGET_END:
+      // decorate the files
+      fprintf(target->state->header,
+              "#endif \n"
+              "       \n"
+              "//     \n"
+              "//     \n"
+              "//     \n"
+              "       \n");
+      fprintf(target->state->source,
+              "       \n"
+              "//     \n"
+              "//     \n"
+              "//     \n"
+              "       \n");
+
+      // close files
+      fclose(target->state->header);
+      fclose(target->state->source);
+
+      // free state
+      free(target->state);
+      break;
+
+    case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
+      {
+        fprintf(target->state->source,
+                "\nHS_TRANSPOSE_KERNEL_PROTO(%u)\n",
+                config->warp.lanes);
+      }
+      break;
+
+    case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
+      {
+        fprintf(target->state->source,
+                "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+                config->warp.lanes,config->thread.regs);
+      }
+      break;
+
+    case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
+      {
+        fprintf(target->state->source,
+                "HS_TRANSPOSE_SLAB()\n");
+      }
+      break;
+
+    case HSG_OP_TYPE_BS_KERNEL_PROTO:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        uint32_t const bs  = pow2_ru_u32(m->warps);
+        uint32_t const msb = msb_idx_u32(bs);
+
+        fprintf(target->state->source,
+                "\nHS_BS_KERNEL_PROTO(%u,%u,%u)\n",
+                config->warp.lanes,m->warps,msb);
+      }
+      break;
+
+    case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        if (m->warps > 1)
+          {
+            fprintf(target->state->source,
+                    "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+                    m->warps * config->warp.lanes,
+                    m->rows_bs);
+          }
+
+        fprintf(target->state->source,
+                "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+                config->warp.lanes,config->thread.regs);
+      }
+      break;
+
+    case HSG_OP_TYPE_BC_KERNEL_PROTO:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        uint32_t const msb = msb_idx_u32(m->warps);
+
+        fprintf(target->state->source,
+                "\nHS_BC_KERNEL_PROTO(%u,%u,%u)\n",
+                config->warp.lanes,m->warps,msb);
+      }
+      break;
+
+    case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        if (m->warps > 1)
+          {
+            fprintf(target->state->source,
+                    "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+                    m->warps * config->warp.lanes,
+                    m->rows_bc);
+          }
+
+        fprintf(target->state->source,
+                "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+                config->warp.lanes,config->thread.regs);
+      }
+      break;
+
+    case HSG_OP_TYPE_FM_KERNEL_PROTO:
+      fprintf(target->state->source,
+              "\nHS_FM_KERNEL_PROTO(%u,%u)\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
+      fprintf(target->state->source,
+              "HS_FM_PREAMBLE(%u);\n",
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_HM_KERNEL_PROTO:
+      {
+        fprintf(target->state->source,
+                "\nHS_HM_KERNEL_PROTO(%u)\n",
+                ops->a);
+      }
+      break;
+
+    case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
+      fprintf(target->state->source,
+              "HS_HM_PREAMBLE(%u);\n",
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
+      {
+        static char const * const vstr[] = { "vin", "vout" };
+
+        fprintf(target->state->source,
+                "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n",
+                ops->n,vstr[ops->v],config->warp.lanes,ops->n-1);
+      }
+      break;
+
+    case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
+      fprintf(target->state->source,
+              "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n",
+              config->warp.lanes,ops->n-1,ops->n);
+      break;
+
+    case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
+      fprintf(target->state->source,
+              "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+              ops->b,ops->a);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
+      fprintf(target->state->source,
+              "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+              ops->b,ops->a);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n",
+              ops->b,ops->a);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
+      fprintf(target->state->source,
+              "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
+      {
+        if (ops->a <= ops->b)
+          {
+            fprintf(target->state->source,
+                    "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n");
+          }
+        else if (ops->b > 1)
+          {
+            fprintf(target->state->source,
+                    "else if (fm_frac == %u)\n",
+                    ops->b);
+          }
+        else
+          {
+	    fprintf(target->state->source,
+		    "else\n");
+          }
+      }
+      break;
+
+    case HSG_OP_TYPE_SLAB_FLIP:
+      fprintf(target->state->source,
+              "HS_SLAB_FLIP_PREAMBLE(%u);\n",
+              ops->n-1);
+      break;
+
+    case HSG_OP_TYPE_SLAB_HALF:
+      fprintf(target->state->source,
+              "HS_SLAB_HALF_PREAMBLE(%u);\n",
+              ops->n / 2);
+      break;
+
+    case HSG_OP_TYPE_CMP_FLIP:
+      fprintf(target->state->source,
+              "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c);
+      break;
+
+    case HSG_OP_TYPE_CMP_HALF:
+      fprintf(target->state->source,
+              "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_CMP_XCHG:
+      if (ops->c == UINT32_MAX)
+        {
+          fprintf(target->state->source,
+                  "HS_CMP_XCHG(r%-3u,r%-3u);\n",
+                  ops->a,ops->b);
+        }
+      else
+        {
+          fprintf(target->state->source,
+                  "HS_CMP_XCHG(r%u_%u,r%u_%u);\n",
+                  ops->c,ops->a,ops->c,ops->b);
+        }
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
+      fprintf(target->state->source,
+              "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n",
+              merge[ops->a].warps,config->warp.lanes,ops->c,ops->b);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
+      fprintf(target->state->source,
+              "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+              ops->b,merge[ops->a].warps,config->warp.lanes,ops->c);
+      break;
+
+    case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+              ops->b,ops->a,config->warp.lanes,ops->c);
+      break;
+
+    case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
+      fprintf(target->state->source,
+              "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n",
+              ops->b * config->warp.lanes,
+              ops->c,
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
+      fprintf(target->state->source,
+              "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n",
+              ops->b * config->warp.lanes,
+              ops->c,
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n",
+              ops->c,
+              ops->a,
+              ops->b * config->warp.lanes);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n",
+              ops->c,
+              ops->a,
+              ops->b * config->warp.lanes);
+      break;
+
+    case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n",
+              ops->c,
+              ops->a,
+              config->warp.lanes,ops->b);
+      break;
+
+    case HSG_OP_TYPE_BLOCK_SYNC:
+      fprintf(target->state->source,
+              "HS_BLOCK_BARRIER();\n");
+      //
+      // FIXME - Named barriers to allow coordinating warps to proceed?
+      //
+      break;
+
+    case HSG_OP_TYPE_BS_FRAC_PRED:
+      {
+        if (ops->m == 0)
+          {
+            fprintf(target->state->source,
+                    "if (warp_idx < bs_full)\n");
+          }
+        else
+          {
+            fprintf(target->state->source,
+                    "else if (bs_frac == %u)\n",
+                    ops->w);
+          }
+      }
+      break;
+
+    case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        fprintf(target->state->source,
+                "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n",
+                config->warp.lanes,m->warps);
+      }
+      break;
+
+    case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        fprintf(target->state->source,
+                "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n",
+                config->warp.lanes,config->thread.regs,m->warps);
+      }
+      break;
+
+    case HSG_OP_TYPE_BX_MERGE_H_PRED:
+      fprintf(target->state->source,
+              "if (get_sub_group_id() < %u)\n",
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BS_ACTIVE_PRED:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        if (m->warps <= 32)
+          {
+            fprintf(target->state->source,
+                    "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
+                    m->levels[ops->b].active.b32a2[0]);
+          }
+        else
+          {
+            fprintf(target->state->source,
+                    "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
+                    m->levels[ops->b].active.b32a2[1],
+                    m->levels[ops->b].active.b32a2[0]);
+          }
+      }
+      break;
+
+    default:
+      fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]);
+      exit(EXIT_FAILURE);
+      break;
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/target_cuda_sm3x.c b/src/compute/hs/gen/target_cuda_sm3x.c
deleted file mode 100644
index 6369aa33b0..0000000000
--- a/src/compute/hs/gen/target_cuda_sm3x.c
+++ /dev/null
@@ -1,776 +0,0 @@
-/*
- * Copyright 2016 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-#include <stdio.h>
-
-//
-//
-//
-
-#include "gen.h"
-#include "util.h"
-
-//
-//
-//
-
-void
-hsg_target_cuda_sm3x(struct hsg_file        * const files,
-                     struct hsg_merge const * const merge,
-                     struct hsg_op    const * const ops,
-                     uint32_t                 const depth)
-{
-  const char* const type     = (hsg_config.type.words == 2) ? "uint64_t"   : "uint32_t";
-  const char* const type_max = (hsg_config.type.words == 2) ? "UINT64_MAX" : "UINT32_MAX";
-
-  switch (ops->type)
-    {
-    case HSG_OP_TYPE_END:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "}\n");
-      break;
-
-    case HSG_OP_TYPE_BEGIN:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "{\n");
-      break;
-
-    case HSG_OP_TYPE_ELSE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "else\n");
-      break;
-
-    case HSG_OP_TYPE_FILE_HEADER:
-      {
-        uint32_t const bc_min = msb_idx_u32(hsg_config.block.warps_min);
-        uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps));
-
-        fprintf(files[HSG_FILE_TYPE_HEADER].file,
-                "//                                                      \n"
-                "// Copyright 2016 Google Inc.                           \n"
-                "//                                                      \n"
-                "// Use of this source code is governed by a BSD-style   \n"
-                "// license that can be found in the LICENSE file.       \n"
-                "//                                                      \n"
-                "                                                        \n"
-                "#pragma once                                            \n"
-                "                                                        \n"
-                "#include <stdint.h>                                     \n"
-                "                                                        \n"
-                "#define HS_LANES_PER_WARP     %u                        \n"
-                "#define HS_BS_WARPS_PER_BLOCK %u                        \n"
-                "#define HS_BC_WARPS_LOG2_MIN  %u                        \n"
-                "#define HS_BC_WARPS_LOG2_MAX  %u                        \n"
-                "#define HS_KEYS_PER_THREAD    %u                        \n"
-                "#define HS_KEY_WORDS          %u                        \n"
-                "#define HS_KEY_TYPE           %s                        \n"
-                "                                                        \n"
-                "#include <%s_args.h>                                    \n"
-                "                                                        \n",
-                hsg_config.warp.lanes,
-                merge->warps,
-                bc_min,
-                bc_max,
-                hsg_config.thread.regs,
-                hsg_config.type.words,
-                type,
-                files[HSG_FILE_TYPE_SOURCE].prefix);
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "// -*- compile-command: \"nvcc -arch sm_52 -Xptxas=-v,-abi=no -cubin -I. %s\"; -*-\n",
-                files[HSG_FILE_TYPE_SOURCE].name);
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "//                                                      \n"
-                "// Copyright 2016 Google Inc.                           \n"
-                "//                                                      \n"
-                "// Use of this source code is governed by a BSD-style   \n"
-                "// license that can be found in the LICENSE file.       \n"
-                "//                                                      \n"
-                "                                                        \n"
-                "#ifdef __cplusplus                                      \n"
-                "extern \"C\" {                                          \n"
-                "#endif                                                  \n"
-                "                                                        \n"
-                "#include \"%s_launcher.h\"                              \n"
-                "                                                        \n"
-                "#ifdef __cplusplus                                      \n"
-                "}                                                       \n"
-                "#endif                                                  \n"
-                "                                                        \n"
-                "#include \"%s_launch_bounds.h\"                         \n"
-                "#include <%s_finalize.inl>                              \n"
-                "                                                        \n"
-                "//                                                      \n"
-                "//                                                      \n"
-                "//                                                      \n",
-                files[HSG_FILE_TYPE_HEADER].prefix,
-                files[HSG_FILE_TYPE_SOURCE].prefix,
-                files[HSG_FILE_TYPE_SOURCE].prefix);
-      }
-      break;
-
-    case HSG_OP_TYPE_FILE_FOOTER:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "                              \n"
-              "//                            \n"
-              "//                            \n"
-              "//                            \n"
-              "                              \n"
-              "#include \"%s_launcher.inl\"  \n"
-              "                              \n"
-              "//                            \n"
-              "//                            \n"
-              "//                            \n",
-              files[HSG_FILE_TYPE_SOURCE].prefix);
-      break;
-
-    case HSG_OP_TYPE_BS_KERNEL_PROTO:
-      {
-        const uint32_t tpb = merge->warps * hsg_config.warp.lanes;
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "                                                \n"
-                "extern \"C\"                                    \n"
-                "__global__                                      \n"
-                "__launch_bounds__(%u,%u)                        \n"
-                "void                                            \n"
-                "hs_bs_kernel(const struct hs_args args) \n",
-                tpb,1);
-      }
-      break;
-
-    case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
-      {
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "__shared__ union { \n");
-
-        for (uint32_t ii=0; ii<MERGE_LEVELS_MAX_LOG2; ii++)
-          {
-            const struct hsg_merge* const m = merge + ii;
-
-            if (m->warps < 2)
-              break;
-
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "  %s m%u[%u][%u];\n",
-                    type,
-                    ii,
-                    m->rows_bs,
-                    m->warps * hsg_config.warp.lanes);
-          }
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                " struct {             \n"
-                "    %s f[%u][%u];     \n"
-                "    %s l[%u];         \n"
-                " };                   \n",
-                type,
-                merge[0].warps,
-                hsg_config.warp.skpw_bs - 1,
-                type,
-                merge[0].warps);
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "} shared;          \n"
-                "                   \n");
-
-        const uint32_t kpw = hsg_config.warp.lanes * hsg_config.thread.regs;
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "const int32_t block_warp_idx = threadIdx.x / %u;                 \n"
-                "const int32_t warp_lane_idx  = threadIdx.x & %u;                 \n"
-                "const int32_t warp_idx       = blockIdx.x * %u + block_warp_idx; \n"
-                "const int32_t warp_gmem_idx  = warp_idx * %u + warp_lane_idx;    \n"
-                "                                                                 \n"
-                "%s const * const vin_ptr  = args.vin  + warp_gmem_idx;           \n"
-                "%s       * const vout_ptr = args.vout + warp_gmem_idx;           \n"
-                "                                                                 \n",
-
-                hsg_config.warp.lanes,
-                hsg_config.warp.lanes - 1,
-                merge[0].warps,
-                kpw,
-                type,
-                type);
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "if (warp_idx >= args.bs.full + args.bs.frac) \n"
-                "  return;                                    \n"
-                "                                             \n");
-      }
-      break;
-
-    case HSG_OP_TYPE_BC_KERNEL_PROTO:
-      {
-        uint32_t const bc_warps = merge[ops->a].warps;
-        uint32_t const tpb      = bc_warps * hsg_config.warp.lanes;
-        uint32_t const bpm      = hsg_config.block.warps_max / bc_warps;
-        uint32_t const msb      = msb_idx_u32(bc_warps);
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "                                                   \n"
-                "extern \"C\"                                       \n"
-                "__global__                                         \n"
-                "__launch_bounds__(%u,%u)                           \n"
-                "void                                               \n"
-                "hs_bc_%u_kernel(const struct hs_args args) \n",
-                tpb,bpm,
-                msb);
-      }
-      break;
-
-    case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
-      {
-        const struct hsg_merge* const m = merge + ops->a;
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "__shared__ union {    \n");
-
-        if (m->warps >= 2)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "  %s m%u[%u][%u];  \n",
-                    type,
-                    ops->a,
-                    m->rows_bc,
-                    m->warps * hsg_config.warp.lanes);
-          }
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                " struct {             \n"
-                "    %s f[%u][%u];     \n"
-                "    %s l[%u];         \n"
-                " };                   \n"
-                "} shared;             \n"
-                "                      \n",
-                type,m->warps,m->skpw_bc - 1,
-                type,m->warps);
-
-        const uint32_t kpw = hsg_config.warp.lanes * hsg_config.thread.regs;
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "const int32_t block_warp_idx = threadIdx.x / %u;                     \n"
-                "const int32_t warp_lane_idx  = threadIdx.x & %u;                     \n"
-                "const int32_t warp_gmem_base = blockIdx.x * %u * %u + warp_lane_idx; \n"
-                "const int32_t warp_gmem_idx  = warp_gmem_base + block_warp_idx * %u; \n"
-                "                                                                     \n"
-                "%s * const vout_ptr = args.vout + warp_gmem_idx;                     \n"
-                "                                                                     \n",
-                hsg_config.warp.lanes,
-                hsg_config.warp.lanes - 1,
-                m->warps,kpw,
-                kpw,
-                type);
-
-#if 0
-        //
-        // NO LONGER NEED THIS TEST
-        //
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "if (warp_idx >= args.bc.full) \n"
-                "  return;                     \n"
-                "                              \n");
-#endif
-      }
-      break;
-
-    case HSG_OP_TYPE_FM_KERNEL_PROTO:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "                                                   \n"
-              "#define HS_FM_WARPS_LOG2_%u  %u                \n"
-              "extern \"C\"                                       \n"
-              "__global__                                         \n"
-              "HS_FM_LAUNCH_BOUNDS_%u                         \n"
-              "void                                               \n"
-              "hs_fm_%u_kernel(const struct hs_args args) \n",
-              ops->a,
-              ops->b,
-              ops->a - ops->b,
-              ops->a);
-      break;
-
-    case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "const int32_t warp_idx      = (blockDim.x * blockIdx.x + threadIdx.x) / %u;              \n"
-              "const int32_t warp_lane_idx = threadIdx.x & %u;                                          \n"
-              "                                                                                         \n"
-              "const int32_t merge_idx     = warp_idx / %u >> %u;                                       \n"
-              "                                                                                         \n"
-              "const int32_t merge_stride  = %u * %u << %u;                                             \n"
-              "const int32_t merge_keys    = merge_stride * %u;                                         \n"
-              "                                                                                         \n"
-              "const int32_t merge_base    = merge_idx * merge_keys;                                    \n"
-              "                                                                                         \n"
-              "const int32_t merge_l_off   = (warp_idx - merge_idx * (%u << %u)) * %u + warp_lane_idx;  \n"
-              "const int32_t merge_l_end   = merge_l_off + merge_stride * (%u / 2 - 1);                 \n"
-              "%s * const merge_l   = args.vout + merge_base + merge_l_off;                             \n"
-              "                                                                                         \n"
-              "const int32_t merge_r_off   = merge_keys - merge_l_end - 1;                              \n"
-              "%s * const merge_r   = args.vout + merge_base + merge_r_off;                             \n"
-              "                                                                                         \n",
-              hsg_config.warp.lanes,
-              hsg_config.warp.lanes-1,
-              hsg_config.thread.regs,ops->b,
-              hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
-              ops->a,
-              hsg_config.thread.regs,ops->b,hsg_config.warp.lanes,
-              ops->a,
-              type,
-              type);
-      break;
-
-    case HSG_OP_TYPE_HM_KERNEL_PROTO:
-      {
-        const uint32_t bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps));
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "                                                   \n"
-                "#define HS_HM_WARPS_LOG2_%u  %u                \n"
-                "extern \"C\"                                       \n"
-                "__global__                                         \n"
-                "HS_HM_LAUNCH_BOUNDS_%u                         \n"
-                "void                                               \n"
-                "hs_hm_%u_kernel(const struct hs_args args) \n",
-                ops->a,
-                ops->b,
-                ops->a - ops->b - bc_max,
-                ops->a);
-      }
-      break;
-
-    case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "const int32_t warp_idx      = (blockDim.x * blockIdx.x + threadIdx.x) / %u;       \n"
-              "const int32_t warp_lane_idx = threadIdx.x & %u;                                   \n"
-              "                                                                                  \n"
-              "const int32_t merge_idx     = (warp_idx / %u) >> %u;                              \n"
-              "                                                                                  \n"
-              "const int32_t merge_stride  = %u * %u << %u;                                      \n"
-              "const int32_t merge_keys    = merge_stride * %u;                                  \n"
-              "                                                                                  \n"
-              "const int32_t merge_base    = merge_idx * merge_keys;                             \n"
-              "                                                                                  \n"
-              "const int32_t merge_off     = (warp_idx - merge_idx * (%u << %u)) * %u;           \n"
-              "%s * const merge_ptr     = args.vout + merge_base + merge_off + warp_lane_idx;    \n"
-              "                                                                                  \n",
-              hsg_config.warp.lanes,
-              hsg_config.warp.lanes-1,
-              hsg_config.thread.regs,ops->b,
-              hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
-              ops->a,
-              hsg_config.thread.regs,ops->b,hsg_config.warp.lanes,
-              type);
-      break;
-
-    case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
-      {
-        static const char* const vstr[] = { "vin_ptr", "vout_ptr" };
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "%s r%-3u = %s[%-3u * %u]; \n",
-                type,ops->n,vstr[ops->v],ops->n-1,hsg_config.warp.lanes);
-      }
-      break;
-
-    case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "vout_ptr[%-3u * %u] = r%u; \n",
-              ops->n-1,hsg_config.warp.lanes,ops->n);
-      break;
-
-#if 0
-    case HSG_OP_TYPE_BX_WARP_STORE_PRED:
-      if (ops->a == 1)
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "if (!args.is_final) \n");
-        }
-      else
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "if (blockIdx.x * %u + block_warp_idx >= args.bx.ru) \n"
-                  "{                                                   \n"
-                  "  return;                                           \n"
-                  "}                                                   \n"
-                  "else if (!args.is_final)                            \n",
-                  ops->a);
-        }
-      break;
-#endif
-
-    case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s r%-3u = merge_ptr[%-3u * merge_stride];\n",
-              type,ops->a,ops->b);
-      break;
-
-    case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "merge_ptr[%-3u * merge_stride] = r%u;\n",
-              ops->b,ops->a);
-      break;
-
-    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s r%-3u = merge_l[%-3u * merge_stride];\n",
-              type,ops->a,ops->b);
-      break;
-
-    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "merge_l[%-3u * merge_stride] = r%u;\n",
-              ops->b,ops->a);
-      break;
-
-    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s r%-3u = merge_r[%-3u * merge_stride];\n",
-              type,ops->a,ops->b);
-      break;
-
-    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "merge_r[%-3u * merge_stride] = r%u;\n",
-              ops->b,ops->a);
-      break;
-
-    case HSG_OP_TYPE_WARP_FLIP:
-      {
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "const int32_t flip_lane_mask = %u;                             \n"
-                "const int32_t flip_lane_idx  = warp_lane_idx ^ flip_lane_mask; \n"
-                "const bool    t_lt           = warp_lane_idx < flip_lane_idx;  \n",
-                ops->n-1);
-      }
-      break;
-
-    case HSG_OP_TYPE_WARP_HALF:
-      {
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "const int32_t half_lane_mask = %u;                             \n"
-                "const int32_t half_lane_idx  = warp_lane_idx ^ half_lane_mask; \n"
-                "const bool    t_lt           = warp_lane_idx < half_lane_idx;  \n",
-                ops->n / 2);
-      }
-      break;
-
-    case HSG_OP_TYPE_CMP_FLIP:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_CMP_FLIP(r%-3u,r%-3u,r%-3u)\n",ops->a,ops->b,ops->c);
-      break;
-
-    case HSG_OP_TYPE_CMP_HALF:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_CMP_HALF(r%-3u,r%-3u)\n",ops->a,ops->b);
-      break;
-
-    case HSG_OP_TYPE_CMP_XCHG:
-      if (ops->c == UINT32_MAX)
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "HS_CMP_XCHG(r%-3u,r%-3u)\n",
-                  ops->a,ops->b);
-        }
-      else
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "HS_CMP_XCHG(r%u_%u,r%u_%u)\n",
-                  ops->c,ops->a,ops->c,ops->b);
-        }
-      break;
-
-    case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "smem_v[%-3u * %-2u * %-3u] = r%u;\n",
-              ops->a,hsg_config.warp.lanes,ops->c,ops->b);
-      break;
-
-    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "r%-3u = smem_v[%-3u * %-2u * %-3u];\n",
-              ops->b,ops->a,hsg_config.warp.lanes,ops->c);
-      break;
-
-    case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s r%-3u = smem_v[%-3u * %-2u * %-3u];\n",
-              type,ops->b,ops->a,hsg_config.warp.lanes,ops->c);
-      break;
-
-    case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "smem_l[%5u] = r%u_%u;\n",
-              ops->b * hsg_config.warp.lanes,
-              ops->c,
-              ops->a);
-      break;
-
-    case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "smem_r[%5u] = r%u_%u;\n",
-              ops->b * hsg_config.warp.lanes,
-              ops->c,
-              ops->a);
-      break;
-
-    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s r%u_%-3u = smem_l[%u];\n",
-              type,
-              ops->c,
-              ops->a,
-              ops->b * hsg_config.warp.lanes);
-      break;
-
-    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s r%u_%-3u = smem_r[%u];\n",
-              type,
-              ops->c,
-              ops->a,
-              ops->b * hsg_config.warp.lanes);
-      break;
-
-    case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s r%u_%-3u = gmem_l[%u];\n",
-              type,
-              ops->c,
-              ops->a,
-              ops->b * hsg_config.warp.lanes);
-      break;
-
-#if 0
-    case HSG_OP_TYPE_REG_F_PREAMBLE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s*       const f_%u_smem_st_ptr = &shared.f[block_warp_idx]",
-              type,
-              ops->a);
-
-      if (ops->a >= (int32_t)hsg_config.warp.lanes)
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "[warp_lane_idx * %u];\n",
-                  (ops->a / hsg_config.warp.lanes) * hsg_config.warp.lanes + 1);
-        }
-      else
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "[(warp_lane_idx & 0x%X) * %u + (warp_lane_idx & ~0x%X)];\n",
-                  ops->a-1,
-                  hsg_config.warp.lanes + 1,
-                  ops->a-1);
-        }
-
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "const %s* const f_%u_smem_ld_ptr = &shared.f[block_warp_idx][warp_lane_idx];\n",
-              type,
-              ops->a);
-
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s*       const f_%u_gmem_st_ptr = args.vout + warp_gmem_idx",
-              type,
-              ops->a);
-
-      if (ops->a >= (int32_t)hsg_config.warp.lanes)
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,";\n");
-        }
-      else
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  " - warp_lane_idx + (warp_lane_idx & ~0x%X) * %u + (warp_lane_idx & 0x%X);\n",
-                  ops->a-1,
-                  hsg_config.thread.regs,
-                  ops->a-1);
-        }
-      break;
-
-    case HSG_OP_TYPE_REG_SHARED_STORE_F:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "f_%u_smem_st_ptr[%-3u] = r%u;\n",
-              ops->c,
-              ops->b,
-              ops->a);
-      break;
-
-    case HSG_OP_TYPE_REG_SHARED_LOAD_F:
-      if (ops->c >= (int32_t)hsg_config.warp.lanes)
-        {
-          uint32_t const adjacent = ops->c / hsg_config.warp.lanes;
-          uint32_t const stride   = adjacent * hsg_config.warp.lanes + 1;
-
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "r%-3u = f_%u_smem_ld_ptr[%-3u];\n",
-                  ops->a,
-                  ops->c,
-                  (ops->b / adjacent) * stride + (ops->b % adjacent) * hsg_config.warp.lanes);
-        }
-      else
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "r%-3u = f_%u_smem_ld_ptr[%-3u];\n",
-                  ops->a,
-                  ops->c,
-                  ops->b * (hsg_config.warp.lanes + 1));
-        }
-      break;
-
-    case HSG_OP_TYPE_REG_GLOBAL_STORE_F:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "f_%u_gmem_st_ptr[%-3u * %u + %-3u] = r%u;\n",
-              ops->c,
-              ops->b,
-              hsg_config.thread.regs, // hsg_config.warp.lanes,
-              (ops->a - 1) & ~(ops->c - 1),
-              ops->a);
-      break;
-#endif
-
-#if 0
-    case HSG_OP_TYPE_FINALIZE:
-      {
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "HS_FINALIZE(%s,args,shared.f[block_warp_idx],shared.l,\n"
-                "                block_warp_idx,warp_lane_idx,warp_gmem_idx,\n"
-                "                r%-3u",
-                ops->a == 1 ? "true" : "false",
-                1);
-
-#define HS_WARP_FINALIZE_PRETTY_PRINT  8
-
-        for (uint32_t r=2; r<=hsg_config.thread.regs; r++)
-          {
-            if (r % HS_WARP_FINALIZE_PRETTY_PRINT == 1)
-              fprintf(files[HSG_FILE_TYPE_SOURCE].file,",\n");
-            else
-              fprintf(files[HSG_FILE_TYPE_SOURCE].file,",");
-
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,"r%-3u",r);
-          }
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,");\n");
-      }
-      break;
-#endif
-
-    case HSG_OP_TYPE_BLOCK_SYNC:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "__syncthreads();\n");
-      break;
-
-    case HSG_OP_TYPE_BS_FRAC_PRED:
-      {
-        if (ops->m == 0)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "if (warp_idx < args.bs.full)\n");
-          }
-        else
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "else if (args.bs.frac == %u)\n",
-                    ops->w);
-          }
-      }
-      break;
-
-#if 0 // DELETED
-    case HSG_OP_TYPE_BX_MERGE_V_PREAMBLE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s * const smem_v = shared.m%u[0] + threadIdx.x; \n",
-              type,ops->a);
-      break;
-#endif
-
-    case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
-      if (ops->c == 0)
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "%s * smem_l = shared.m%u[block_warp_idx    ] + warp_lane_idx;        \n"
-                  "%s * smem_r = shared.m%u[block_warp_idx ^ 1] + (warp_lane_idx ^ %u); \n",
-                  type,ops->a,
-                  type,ops->a,hsg_config.warp.lanes-1);
-        }
-      else
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "smem_l = shared.m%u[block_warp_idx    ] + warp_lane_idx;        \n"
-                  "smem_r = shared.m%u[block_warp_idx ^ 1] + (warp_lane_idx ^ %u); \n",
-                  ops->a,
-                  ops->a,hsg_config.warp.lanes-1);
-        }
-      break;
-
-    case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "%s const * const gmem_l = args.vout + (warp_gmem_base + block_warp_idx * %u); \n"
-              "%s       * const smem_l = shared.m%u[block_warp_idx] + warp_lane_idx;         \n"
-              "%s       * const smem_v = shared.m%u[0] + threadIdx.x;                        \n",
-              type,hsg_config.warp.lanes,
-              type,ops->a,
-              type,ops->a);
-      break;
-
-    case HSG_OP_TYPE_BX_MERGE_H_PRED:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "if (threadIdx.x < %u)\n",
-              ops->a * hsg_config.warp.lanes);
-      break;
-
-    case HSG_OP_TYPE_BS_ACTIVE_PRED:
-      {
-        const struct hsg_merge* const m = merge + ops->a;
-
-        if (m->warps <= 32)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "if (((1u << block_warp_idx) & 0x%08X) != 0)\n",
-                    m->levels[ops->b].active.b32a2[0]);
-          }
-        else
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "if (((1UL << block_warp_idx) & 0x%08X%08XL) != 0L)\n",
-                    m->levels[ops->b].active.b32a2[1],
-                    m->levels[ops->b].active.b32a2[0]);
-          }
-      }
-      break;
-
-    case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
-      {
-        if (ops->a == ops->b)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "if (merge_idx < args.fm.full) \n");
-          }
-        else if (ops->b > 1)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "else if (args.fm.frac == %u) \n",
-                    ops->b);
-          }
-        else
-          {
-	    fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-		    "else\n");
-          }
-      }
-      break;
-
-    default:
-      hsg_target_debug(files,merge,ops,depth);
-      break;
-    }
-}
-
-//
-//
-//
diff --git a/src/compute/hs/gen/target_debug.c b/src/compute/hs/gen/target_debug.c
new file mode 100644
index 0000000000..1481ca8041
--- /dev/null
+++ b/src/compute/hs/gen/target_debug.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+
+//
+//
+//
+
+#define HSG_INDENT  2
+
+//
+//
+//
+
+struct hsg_target_state
+{
+  FILE * txt;
+};
+
+//
+//
+//
+
+void
+hsg_target_indent(struct hsg_target * const target, uint32_t const depth)
+{
+  fprintf(target->state->txt,
+          "%*s",
+          depth*HSG_INDENT,"");
+}
+
+void
+hsg_target_debug(struct hsg_target       * const target,
+                 struct hsg_config const * const config,
+                 struct hsg_merge  const * const merge,
+                 struct hsg_op     const * const ops,
+                 uint32_t                  const depth)
+{
+  if (ops->type == HSG_OP_TYPE_TARGET_BEGIN)
+    {
+      target->state = malloc(sizeof(*target->state));
+      fopen_s(&target->state->txt,"hs_debug.txt","wb");
+    }
+
+  hsg_target_indent(target,depth);
+
+  fprintf(target->state->txt,
+          "%s\n",
+          hsg_op_type_string[ops->type]);
+
+  if (ops->type == HSG_OP_TYPE_TARGET_END)
+    {
+      fclose(target->state->txt);
+      free(target->state);
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/target_glsl.c b/src/compute/hs/gen/target_glsl.c
new file mode 100644
index 0000000000..2bb75797ab
--- /dev/null
+++ b/src/compute/hs/gen/target_glsl.c
@@ -0,0 +1,674 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+#include "transpose.h"
+
+#include "common/util.h"
+#include "common/macros.h"
+
+//
+//
+//
+
+struct hsg_transpose_state
+{
+  FILE                    * header;
+  struct hsg_config const * config;
+};
+
+static
+char
+hsg_transpose_reg_prefix(uint32_t const cols_log2)
+{
+  return 'a' + (('r' + cols_log2 - 'a') % 26);
+}
+
+static
+void
+hsg_transpose_blend(uint32_t                     const cols_log2,
+                    uint32_t                     const row_ll, // lower-left
+                    uint32_t                     const row_ur, // upper-right
+                    struct hsg_transpose_state * const state)
+{
+  // we're starting register names at '1' for now
+  fprintf(state->header,
+          "  HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
+          hsg_transpose_reg_prefix(cols_log2-1),
+          hsg_transpose_reg_prefix(cols_log2),
+          cols_log2,row_ll+1,row_ur+1);
+}
+
+static
+void
+hsg_transpose_remap(uint32_t                     const row_from,
+                    uint32_t                     const row_to,
+                    struct hsg_transpose_state * const state)
+{
+  // we're starting register names at '1' for now
+  fprintf(state->header,
+          "  HS_TRANSPOSE_REMAP( %c, %3u, %3u )        \\\n",
+          hsg_transpose_reg_prefix(state->config->warp.lanes_log2),
+          row_from+1,row_to+1);
+}
+
+//
+//
+//
+
+static
+void
+hsg_copyright(FILE * file)
+{
+  fprintf(file,
+          "//                                                              \n"
+          "// Copyright 2016 Google Inc.                                   \n"
+          "//                                                              \n"
+          "// Use of this source code is governed by a BSD-style           \n"
+          "// license that can be found in the LICENSE file.               \n"
+          "//                                                              \n"
+          "\n");
+}
+
+static
+void
+hsg_macros(FILE * file)
+{
+  fprintf(file,
+          "#include \"hs_glsl_macros.h\" \n"
+          "                              \n"
+          "//                            \n"
+          "//                            \n"
+          "//                            \n"
+          "\n");
+}
+
+//
+//
+//
+
+struct hsg_target_state
+{
+  FILE * header;
+  FILE * embeds;
+  FILE * source;
+};
+
+//
+//
+//
+
+void
+hsg_target_glsl(struct hsg_target       * const target,
+                struct hsg_config const * const config,
+                struct hsg_merge  const * const merge,
+                struct hsg_op     const * const ops,
+                uint32_t                  const depth)
+{
+  switch (ops->type)
+    {
+    case HSG_OP_TYPE_END:
+      fprintf(target->state->source,
+              "}\n");
+
+      if (depth == 0) {
+        fclose(target->state->source);
+        target->state->source = NULL;
+      }
+      break;
+
+    case HSG_OP_TYPE_BEGIN:
+      fprintf(target->state->source,
+              "{\n");
+      break;
+
+    case HSG_OP_TYPE_ELSE:
+      fprintf(target->state->source,
+              "else\n");
+      break;
+
+    case HSG_OP_TYPE_TARGET_BEGIN:
+      {
+        // allocate state
+        target->state = malloc(sizeof(*target->state));
+
+        // allocate files
+        fopen_s(&target->state->header,"hs_glsl.h",   "wb");
+        fopen_s(&target->state->embeds,"hs_kernels.h","wb");
+
+        hsg_copyright(target->state->header);
+        hsg_copyright(target->state->embeds);
+
+        // initialize header
+        uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
+
+        fprintf(target->state->header,
+                "#ifndef HS_GLSL_ONCE                                            \n"
+                "#define HS_GLSL_ONCE                                            \n"
+                "                                                                \n"
+                "#define HS_SLAB_THREADS_LOG2    %u                              \n"
+                "#define HS_SLAB_THREADS         (1 << HS_SLAB_THREADS_LOG2)     \n"
+                "#define HS_SLAB_WIDTH_LOG2      %u                              \n"
+                "#define HS_SLAB_WIDTH           (1 << HS_SLAB_WIDTH_LOG2)       \n"
+                "#define HS_SLAB_HEIGHT          %u                              \n"
+                "#define HS_SLAB_KEYS            (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n"
+                "#define HS_REG_LAST(c)          c##%u                           \n"
+                "#define HS_KEY_TYPE             %s                              \n"
+                "#define HS_KEY_WORDS            %u                              \n"
+                "#define HS_VAL_WORDS            0                               \n"
+                "#define HS_BS_SLABS             %u                              \n"
+                "#define HS_BS_SLABS_LOG2_RU     %u                              \n"
+                "#define HS_BC_SLABS_LOG2_MAX    %u                              \n"
+                "#define HS_FM_SCALE_MIN         %u                              \n"
+                "#define HS_FM_SCALE_MAX         %u                              \n"
+                "#define HS_HM_SCALE_MIN         %u                              \n"
+                "#define HS_HM_SCALE_MAX         %u                              \n"
+                "#define HS_EMPTY                                                \n"
+                "                                                                \n",
+                config->warp.lanes_log2,
+                config->warp.lanes_log2,
+                config->thread.regs,
+                config->thread.regs,
+                (config->type.words == 2) ? "uint64_t" : "uint32_t",
+                config->type.words,
+                merge->warps,
+                msb_idx_u32(pow2_ru_u32(merge->warps)),
+                bc_max,
+                config->merge.flip.lo,
+                config->merge.flip.hi,
+                config->merge.half.lo,
+                config->merge.half.hi);
+
+        fprintf(target->state->header,
+                "#define HS_SLAB_ROWS()    \\\n");
+
+        for (uint32_t ii=1; ii<=config->thread.regs; ii++)
+          fprintf(target->state->header,
+                  "  HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
+
+        fprintf(target->state->header,
+                "  HS_EMPTY\n"
+                "          \n");
+
+        fprintf(target->state->header,
+                "#define HS_TRANSPOSE_SLAB()                \\\n");
+
+        for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++)
+          fprintf(target->state->header,
+                  "  HS_TRANSPOSE_STAGE( %u )                  \\\n",ii);
+
+        struct hsg_transpose_state state[1] =
+          {
+           { .header = target->state->header,
+             .config = config
+           }
+          };
+
+        hsg_transpose(config->warp.lanes_log2,
+                      config->thread.regs,
+                      hsg_transpose_blend,state,
+                      hsg_transpose_remap,state);
+
+        fprintf(target->state->header,
+                "  HS_EMPTY\n"
+                "          \n");
+
+#if 0
+        fprintf(target->state->source,
+                "#include <hs_glsl_macros.h> \n"
+                "                            \n"
+                "//                          \n"
+                "//                          \n"
+                "//                          \n");
+#endif
+      }
+      break;
+
+    case HSG_OP_TYPE_TARGET_END:
+      // decorate the files
+      fprintf(target->state->header,
+              "#endif \n"
+              "       \n"
+              "//     \n"
+              "//     \n"
+              "//     \n"
+              "       \n");
+
+      // close files
+      fclose(target->state->header);
+      fclose(target->state->embeds);
+
+      // free state
+      free(target->state);
+      break;
+
+    case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
+      {
+        fprintf(target->state->embeds,
+                "#include \"hs_transpose.len.xxd\"\n,\n"
+                "#include \"hs_transpose.spv.xxd\"\n,\n");
+
+        fopen_s(&target->state->source,"hs_transpose.comp","w+");
+
+        hsg_copyright(target->state->source);
+
+        hsg_macros(target->state->source);
+
+        fprintf(target->state->source,
+                "HS_TRANSPOSE_KERNEL_PROTO(%u)\n",
+                config->warp.lanes);
+      }
+      break;
+
+    case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
+      {
+        fprintf(target->state->source,
+                "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+                config->warp.lanes,config->thread.regs);
+      }
+      break;
+
+    case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
+      {
+        fprintf(target->state->source,
+                "HS_TRANSPOSE_SLAB()\n");
+      }
+      break;
+
+    case HSG_OP_TYPE_BS_KERNEL_PROTO:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        uint32_t const bs  = pow2_ru_u32(m->warps);
+        uint32_t const msb = msb_idx_u32(bs);
+
+        fprintf(target->state->embeds,
+                "#include \"hs_bs_%u.len.xxd\"\n,\n"
+                "#include \"hs_bs_%u.spv.xxd\"\n,\n",
+                msb,
+                msb);
+
+        char filename[] = { "hs_bs_123.comp" };
+        sprintf_s(filename,sizeof(filename),"hs_bs_%u.comp",msb);
+        fopen_s(&target->state->source,filename,"w+");
+
+        hsg_copyright(target->state->source);
+
+        hsg_macros(target->state->source);
+
+        if (m->warps > 1)
+          {
+            fprintf(target->state->source,
+                    "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+                    m->warps * config->warp.lanes,
+                    m->rows_bs);
+          }
+
+        fprintf(target->state->source,
+                "HS_BS_KERNEL_PROTO(%u,%u,%u)\n",
+                config->warp.lanes,m->warps,msb);
+      }
+      break;
+
+    case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
+      fprintf(target->state->source,
+              "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+              config->warp.lanes,config->thread.regs);
+      break;
+
+    case HSG_OP_TYPE_BC_KERNEL_PROTO:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        uint32_t const msb = msb_idx_u32(m->warps);
+
+        fprintf(target->state->embeds,
+                "#include \"hs_bc_%u.len.xxd\"\n,\n"
+                "#include \"hs_bc_%u.spv.xxd\"\n,\n",
+                msb,
+                msb);
+
+        char filename[] = { "hs_bc_123.comp" };
+        sprintf_s(filename,sizeof(filename),"hs_bc_%u.comp",msb);
+        fopen_s(&target->state->source,filename,"w+");
+
+        hsg_copyright(target->state->source);
+
+        hsg_macros(target->state->source);
+
+        if (m->warps > 1)
+          {
+            fprintf(target->state->source,
+                    "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+                    m->warps * config->warp.lanes,
+                    m->rows_bc);
+          }
+
+        fprintf(target->state->source,
+                "HS_BC_KERNEL_PROTO(%u,%u,%u)\n",
+                config->warp.lanes,m->warps,msb);
+      }
+      break;
+
+    case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
+      fprintf(target->state->source,
+              "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+              config->warp.lanes,config->thread.regs);
+      break;
+
+    case HSG_OP_TYPE_FM_KERNEL_PROTO:
+      {
+        fprintf(target->state->embeds,
+                "#include \"hs_fm_%u_%u.len.xxd\"\n,\n"
+                "#include \"hs_fm_%u_%u.spv.xxd\"\n,\n",
+                ops->a,ops->b,
+                ops->a,ops->b);
+
+        char filename[] = { "hs_fm_123_123.comp" };
+        sprintf_s(filename,sizeof(filename),"hs_fm_%u_%u.comp",ops->a,ops->b);
+        fopen_s(&target->state->source,filename,"w+");
+
+        hsg_copyright(target->state->source);
+
+        hsg_macros(target->state->source);
+
+        fprintf(target->state->source,
+                "HS_FM_KERNEL_PROTO(%u,%u)\n",
+                ops->a,ops->b);
+      }
+      break;
+
+    case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
+      fprintf(target->state->source,
+              "HS_FM_PREAMBLE(%u);\n",
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_HM_KERNEL_PROTO:
+      {
+        fprintf(target->state->embeds,
+                "#include \"hs_hm_%u_%u.len.xxd\"\n,\n"
+                "#include \"hs_hm_%u_%u.spv.xxd\"\n,\n",
+                ops->a,ops->b,
+                ops->a,ops->b);
+
+        char filename[] = { "hs_hm_123_123.comp" };
+        sprintf_s(filename,sizeof(filename),"hs_hm_%u_%u.comp",ops->a,ops->b);
+        fopen_s(&target->state->source,filename,"w+");
+
+        hsg_copyright(target->state->source);
+
+        hsg_macros(target->state->source);
+
+        fprintf(target->state->source,
+                "HS_HM_KERNEL_PROTO(%u)\n",
+                ops->a);
+      }
+      break;
+
+    case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
+      fprintf(target->state->source,
+              "HS_HM_PREAMBLE(%u);\n",
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
+      {
+        static char const * const vstr[] = { "vin", "vout" };
+
+        fprintf(target->state->source,
+                "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n",
+                ops->n,vstr[ops->v],config->warp.lanes,ops->n-1);
+      }
+      break;
+
+    case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
+      fprintf(target->state->source,
+              "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n",
+              config->warp.lanes,ops->n-1,ops->n);
+      break;
+
+    case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
+      fprintf(target->state->source,
+              "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+              ops->b,ops->a);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
+      fprintf(target->state->source,
+              "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+              ops->b,ops->a);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n",
+              ops->b,ops->a);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
+      fprintf(target->state->source,
+              "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
+      {
+        if (ops->a <= ops->b)
+          {
+            fprintf(target->state->source,
+                    "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n");
+          }
+        else if (ops->b > 1)
+          {
+            fprintf(target->state->source,
+                    "else if (fm_frac == %u)\n",
+                    ops->b);
+          }
+        else
+          {
+	    fprintf(target->state->source,
+		    "else\n");
+          }
+      }
+      break;
+
+    case HSG_OP_TYPE_SLAB_FLIP:
+      fprintf(target->state->source,
+              "HS_SLAB_FLIP_PREAMBLE(%u);\n",
+              ops->n-1);
+      break;
+
+    case HSG_OP_TYPE_SLAB_HALF:
+      fprintf(target->state->source,
+              "HS_SLAB_HALF_PREAMBLE(%u);\n",
+              ops->n / 2);
+      break;
+
+    case HSG_OP_TYPE_CMP_FLIP:
+      fprintf(target->state->source,
+              "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c);
+      break;
+
+    case HSG_OP_TYPE_CMP_HALF:
+      fprintf(target->state->source,
+              "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_CMP_XCHG:
+      if (ops->c == UINT32_MAX)
+        {
+          fprintf(target->state->source,
+                  "HS_CMP_XCHG(r%-3u,r%-3u);\n",
+                  ops->a,ops->b);
+        }
+      else
+        {
+          fprintf(target->state->source,
+                  "HS_CMP_XCHG(r%u_%u,r%u_%u);\n",
+                  ops->c,ops->a,ops->c,ops->b);
+        }
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
+      fprintf(target->state->source,
+              "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n",
+              merge[ops->a].warps,config->warp.lanes,ops->c,ops->b);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
+      fprintf(target->state->source,
+              "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+              ops->b,merge[ops->a].warps,config->warp.lanes,ops->c);
+      break;
+
+    case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+              ops->b,ops->a,config->warp.lanes,ops->c);
+      break;
+
+    case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
+      fprintf(target->state->source,
+              "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n",
+              ops->b * config->warp.lanes,
+              ops->c,
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
+      fprintf(target->state->source,
+              "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n",
+              ops->b * config->warp.lanes,
+              ops->c,
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n",
+              ops->c,
+              ops->a,
+              ops->b * config->warp.lanes);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n",
+              ops->c,
+              ops->a,
+              ops->b * config->warp.lanes);
+      break;
+
+    case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n",
+              ops->c,
+              ops->a,
+              config->warp.lanes,ops->b);
+      break;
+
+    case HSG_OP_TYPE_BLOCK_SYNC:
+      fprintf(target->state->source,
+              "HS_BLOCK_BARRIER();\n");
+      //
+      // FIXME - Named barriers to allow coordinating warps to proceed?
+      //
+      break;
+
+    case HSG_OP_TYPE_BS_FRAC_PRED:
+      {
+        if (ops->m == 0)
+          {
+            fprintf(target->state->source,
+                    "if (warp_idx < bs_full)\n");
+          }
+        else
+          {
+            fprintf(target->state->source,
+                    "else if (bs_frac == %u)\n",
+                    ops->w);
+          }
+      }
+      break;
+
+    case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        fprintf(target->state->source,
+                "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n",
+                config->warp.lanes,m->warps);
+      }
+      break;
+
+    case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        fprintf(target->state->source,
+                "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n",
+                config->warp.lanes,config->thread.regs,m->warps);
+      }
+      break;
+
+    case HSG_OP_TYPE_BX_MERGE_H_PRED:
+      fprintf(target->state->source,
+              "if (get_sub_group_id() < %u)\n",
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BS_ACTIVE_PRED:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        if (m->warps <= 32)
+          {
+            fprintf(target->state->source,
+                    "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
+                    m->levels[ops->b].active.b32a2[0]);
+          }
+        else
+          {
+            fprintf(target->state->source,
+                    "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
+                    m->levels[ops->b].active.b32a2[1],
+                    m->levels[ops->b].active.b32a2[0]);
+          }
+      }
+      break;
+
+    default:
+      fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]);
+      exit(EXIT_FAILURE);
+      break;
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/target_igp_genx.c b/src/compute/hs/gen/target_igp_genx.c
deleted file mode 100644
index 3d0f2bc1b8..0000000000
--- a/src/compute/hs/gen/target_igp_genx.c
+++ /dev/null
@@ -1,672 +0,0 @@
-/*
- * Copyright 2016 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-#include <stdio.h>
-
-//
-//
-//
-
-#include "gen.h"
-#include "util.h"
-#include "macros.h"
-#include "transpose.h"
-
-//
-//
-//
-
-static
-char
-hsg_transpose_reg_prefix(uint32_t const cols_log2)
-{
-  return 'a' + (('r' + cols_log2 - 'a') % 26);
-}
-
-static
-void
-hsg_transpose_blend(uint32_t const cols_log2,
-                    uint32_t const row_ll, // lower-left
-                    uint32_t const row_ur, // upper-right
-                    FILE *         file)
-{
-  // we're starting register names at '1' for now
-  fprintf(file,
-          "  HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
-          hsg_transpose_reg_prefix(cols_log2-1),
-          hsg_transpose_reg_prefix(cols_log2),
-          cols_log2,row_ll+1,row_ur+1);
-}
-
-static
-void
-hsg_transpose_remap(uint32_t const row_from,
-                    uint32_t const row_to,
-                    FILE *         file)
-{
-  // we're starting register names at '1' for now
-  fprintf(file,
-          "  HS_TRANSPOSE_REMAP( %c, %3u, %3u )        \\\n",
-          hsg_transpose_reg_prefix(msb_idx_u32(hsg_config.warp.lanes)),
-          row_from+1,row_to+1);
-}
-
-//
-//
-//
-
-void
-hsg_target_igp_genx(struct hsg_file        * const files,
-                    struct hsg_merge const * const merge,
-                    struct hsg_op    const * const ops,
-                    uint32_t                 const depth)
-{
-  switch (ops->type)
-    {
-    case HSG_OP_TYPE_END:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "}\n");
-      break;
-
-    case HSG_OP_TYPE_BEGIN:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "{\n");
-      break;
-
-    case HSG_OP_TYPE_ELSE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "else\n");
-      break;
-
-    case HSG_OP_TYPE_FILE_HEADER:
-      {
-        uint32_t const bc_max          = msb_idx_u32(pow2_rd_u32(merge->warps));
-        uint32_t const warp_lanes_log2 = msb_idx_u32(hsg_config.warp.lanes);
-
-        fprintf(files[HSG_FILE_TYPE_HEADER].file,
-                "//                                                            \n"
-                "// Copyright 2016 Google Inc.                                 \n"
-                "//                                                            \n"
-                "// Use of this source code is governed by a BSD-style         \n"
-                "// license that can be found in the LICENSE file.             \n"
-                "//                                                            \n"
-                "                                                              \n"
-                "#ifndef HS_CL_ONCE                                            \n"
-                "#define HS_CL_ONCE                                            \n"
-                "                                                              \n"
-                "#define HS_LANES_PER_WARP_LOG2  %u                            \n"
-                "#define HS_LANES_PER_WARP       (1 << HS_LANES_PER_WARP_LOG2) \n"
-                "#define HS_BS_WARPS             %u                            \n"
-                "#define HS_BS_WARPS_LOG2_RU     %u                            \n"
-                "#define HS_BC_WARPS_LOG2_MAX    %u                            \n"
-                "#define HS_FM_BLOCKS_LOG2_MIN   %u                            \n"
-                "#define HS_HM_BLOCKS_LOG2_MIN   %u                            \n"
-                "#define HS_KEYS_PER_LANE        %u                            \n"
-                "#define HS_REG_LAST(c)          c##%u                         \n"
-                "#define HS_KEY_WORDS            %u                            \n"
-                "#define HS_KEY_TYPE             %s                            \n"
-                "#define HS_EMPTY                                              \n"
-                "                                                              \n",
-                warp_lanes_log2,
-                merge->warps,
-                msb_idx_u32(pow2_ru_u32(merge->warps)),
-                bc_max,
-                hsg_config.merge.flip.lo,
-                hsg_config.merge.half.lo,
-                hsg_config.thread.regs,
-                hsg_config.thread.regs,
-                hsg_config.type.words,
-                (hsg_config.type.words == 2) ? "ulong" : "uint");
-
-        fprintf(files[HSG_FILE_TYPE_HEADER].file,
-                "#define HS_SLAB_ROWS()    \\\n");
-
-        for (uint32_t ii=1; ii<=hsg_config.thread.regs; ii++)
-          fprintf(files[HSG_FILE_TYPE_HEADER].file,
-                  "  HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
-
-        fprintf(files[HSG_FILE_TYPE_HEADER].file,
-                "  HS_EMPTY\n"
-                "          \n");
-
-        fprintf(files[HSG_FILE_TYPE_HEADER].file,
-                "#define HS_TRANSPOSE_SLAB()                \\\n");
-
-        for (uint32_t ii=1; ii<=warp_lanes_log2; ii++)
-          fprintf(files[HSG_FILE_TYPE_HEADER].file,
-                  "  HS_TRANSPOSE_STAGE( %u )                  \\\n",ii);
-
-        hsg_transpose(msb_idx_u32(hsg_config.warp.lanes),
-                      hsg_config.thread.regs,
-                      files[HSG_FILE_TYPE_HEADER].file,
-                      files[HSG_FILE_TYPE_HEADER].file,
-                      hsg_transpose_blend,
-                      hsg_transpose_remap);
-
-        fprintf(files[HSG_FILE_TYPE_HEADER].file,
-                "  HS_EMPTY\n"
-                "          \n");
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "//                                                      \n"
-                "// Copyright 2016 Google Inc.                           \n"
-                "//                                                      \n"
-                "// Use of this source code is governed by a BSD-style   \n"
-                "// license that can be found in the LICENSE file.       \n"
-                "//                                                      \n"
-                "                                                        \n"
-                "#include <%s_macros.h>                                  \n"
-                "                                                        \n"
-                "//                                                      \n"
-                "//                                                      \n"
-                "//                                                      \n",
-                files[HSG_FILE_TYPE_SOURCE].prefix);
-      }
-      break;
-
-    case HSG_OP_TYPE_FILE_FOOTER:
-      fprintf(files[HSG_FILE_TYPE_HEADER].file,
-              "                                \n"
-              "#endif                          \n"
-              "                                \n"
-              "//                              \n"
-              "//                              \n"
-              "//                              \n"
-              "                                \n");
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "                                \n"
-              "//                              \n"
-              "//                              \n"
-              "//                              \n"
-              "                                \n");
-      break;
-
-    case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
-      {
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                " \n"
-                "__kernel                                                             \n"
-                "__attribute__((intel_reqd_sub_group_size(%u)))                       \n"
-                "void hs_kernel_transpose(__global HS_KEY_TYPE * const restrict vout) \n",
-                hsg_config.warp.lanes);
-      }
-      break;
-
-    case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
-      {
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "uint const global_id = get_global_id(0);                         \n"
-                "uint const gmem_idx  = (global_id / %u) * %u + (global_id & %u); \n"
-                "                                                                 \n",
-                hsg_config.warp.lanes,
-                hsg_config.warp.lanes * hsg_config.thread.regs,
-                hsg_config.warp.lanes-1);
-      }
-      break;
-
-    case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
-      {
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "HS_TRANSPOSE_SLAB()\n");
-      }
-      break;
-
-    case HSG_OP_TYPE_BS_KERNEL_PROTO:
-      {
-        struct hsg_merge const * const m = merge + ops->a;
-
-        uint32_t const tpb = m->warps * hsg_config.warp.lanes;
-        uint32_t const bs  = pow2_ru_u32(m->warps);
-        uint32_t const msb = msb_idx_u32(bs);
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                " \n"
-                "__kernel                                                               \n"
-                "__attribute__((reqd_work_group_size(%u,1,1)))                          \n"
-                "__attribute__((intel_reqd_sub_group_size(%u)))                         \n"
-                "void hs_kernel_bs_%u(__global HS_KEY_TYPE const * const restrict vin,  \n"
-                "                     __global HS_KEY_TYPE       * const restrict vout) \n",
-                tpb,
-                hsg_config.warp.lanes,
-                msb);
-      }
-      break;
-
-    case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
-      {
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "__local union { \n");
-
-        struct hsg_merge const * const m = merge + ops->a;
-
-        if (m->warps > 1)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "  HS_KEY_TYPE m[%u * %u];\n",
-                    m->rows_bs,
-                    m->warps * hsg_config.warp.lanes);
-          }
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "} shared;          \n"
-                "                   \n");
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "uint const global_id = get_global_id(0);                         \n"
-                "uint const gmem_idx  = (global_id / %u) * %u + (global_id & %u); \n"
-                "                                                                 \n",
-                hsg_config.warp.lanes,
-                hsg_config.warp.lanes * hsg_config.thread.regs,
-                hsg_config.warp.lanes-1);
-      }
-      break;
-
-    case HSG_OP_TYPE_BC_KERNEL_PROTO:
-      {
-        uint32_t const bc_max = pow2_rd_u32(merge[0].warps);
-        uint32_t const tpb    = bc_max * hsg_config.warp.lanes;
-        uint32_t const msb    = msb_idx_u32(merge[ops->a].warps);
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "                                                                 \n"
-                "__kernel                                                         \n"
-                "__attribute__((intel_reqd_sub_group_size(%u)))                   \n"
-                "void hs_kernel_bc_%u(__global HS_KEY_TYPE * const restrict vout) \n",
-                hsg_config.warp.lanes,msb);
-      }
-      break;
-
-    case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
-      {
-        struct hsg_merge const * const m      = merge + ops->a;
-        uint32_t                 const bc_max = pow2_rd_u32(merge[0].warps);
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "__local union {    \n");
-
-        if (m->warps > 1)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "  HS_KEY_TYPE m[%-3u * %u];\n",
-                    m->rows_bc,
-                    m->warps * hsg_config.warp.lanes);
-          }
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "} shared;          \n"
-                "                   \n");
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "uint const global_id = get_global_id(0);                         \n"
-                "uint const gmem_idx  = (global_id / %u) * %u + (global_id & %u); \n"
-                "                                                                 \n",
-                hsg_config.warp.lanes,
-                hsg_config.warp.lanes * hsg_config.thread.regs,
-                hsg_config.warp.lanes-1);
-      }
-      break;
-
-    case HSG_OP_TYPE_FM_KERNEL_PROTO:
-      fprintf(files[HSG_FILE_TYPE_HEADER].file,
-              "#define HS_FM_BLOCKS_LOG2_%-2u   %u \n",
-              ops->a,ops->b);
-
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              " \n"
-              "__kernel                                                         \n"
-              "__attribute__((intel_reqd_sub_group_size(%u)))                   \n"
-              "void hs_kernel_fm_%u(__global HS_KEY_TYPE * const restrict vout, \n"
-              "                     uint const fm_full,                         \n"
-              "                     uint const fm_frac)                         \n",
-              hsg_config.warp.lanes,ops->a);
-      break;
-
-    case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "uint const    global_id     = (uint)get_global_id(0);                                   \n"
-              "uint const    warp_idx      = global_id / %u;                                           \n"
-              "uint const    warp_lane_idx = global_id & %u;                                           \n"
-              "                                                                                        \n"
-              "uint const    merge_idx     = warp_idx / %u >> %u;                                      \n"
-              "                                                                                        \n"
-              "uint const    merge_stride  = %u * %u << %u;                                            \n"
-              "uint const    merge_keys    = merge_stride * %u;                                        \n"
-              "                                                                                        \n"
-              "uint const    merge_base    = merge_idx * merge_keys;                                   \n"
-              "                                                                                        \n"
-              "uint const    merge_l_off   = (warp_idx - merge_idx * (%u << %u)) * %u + warp_lane_idx; \n"
-              "uint const    merge_l_end   = merge_stride * (%u / 2 - 1) + merge_l_off;                \n"
-              "                                                                                        \n"
-              "int  const    merge_r_off   = merge_keys - merge_l_end - 1;                             \n"
-              "                                                                                        \n"
-              "__global HS_KEY_TYPE * const restrict merge_l = vout + (merge_base + merge_l_off);      \n"
-              "__global HS_KEY_TYPE * const restrict merge_r = vout + (merge_base + merge_r_off);      \n"
-              "                                                                                        \n",
-              hsg_config.warp.lanes,
-              hsg_config.warp.lanes-1,
-              hsg_config.thread.regs,ops->b,
-              hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
-              ops->a,
-              hsg_config.thread.regs,ops->b,hsg_config.warp.lanes,
-              ops->a);
-      break;
-
-    case HSG_OP_TYPE_HM_KERNEL_PROTO:
-      {
-        uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps));
-
-        fprintf(files[HSG_FILE_TYPE_HEADER].file,
-                "#define HS_HM_BLOCKS_LOG2_%-2u   %u \n",
-                ops->a,ops->b);
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                " \n"
-                "__kernel                                                         \n"
-                "__attribute__((intel_reqd_sub_group_size(%u)))                   \n"
-                "void hs_kernel_hm_%u(__global HS_KEY_TYPE * const restrict vout) \n",
-                hsg_config.warp.lanes,ops->a);
-      }
-      break;
-
-    case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "uint const    global_id     = (uint)get_global_id(0);                          \n"
-              "uint const    warp_idx      = global_id / %u;                                  \n"
-              "uint const    warp_lane_idx = global_id & %u;                                  \n"
-              "                                                                               \n"
-              "uint const    merge_idx     = (warp_idx / %u) >> %u;                           \n"
-              "                                                                               \n"
-              "uint const    merge_stride  = %u * %u << %u;                                   \n"
-              "uint const    merge_keys    = merge_stride * %u;                               \n"
-              "                                                                               \n"
-              "uint const    merge_base    = merge_idx * merge_keys;                          \n"
-              "uint const    merge_off     = (warp_idx - merge_idx * (%u << %u)) * %u;        \n"
-              "                                                                               \n"
-              "__global HS_KEY_TYPE * const restrict merge_ptr = vout + (merge_base + merge_off + warp_lane_idx); \n"
-              "                                                                               \n",
-              hsg_config.warp.lanes,
-              hsg_config.warp.lanes-1,
-              hsg_config.thread.regs,ops->b,
-              hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
-              ops->a,
-              hsg_config.thread.regs,ops->b,hsg_config.warp.lanes);
-      break;
-
-    case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
-      {
-        static char const * const vstr[] = { "vin", "vout" };
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "HS_KEY_TYPE r%-3u = (%s + gmem_idx)[%-3u * %u]; \n",
-                ops->n,vstr[ops->v],ops->n-1,hsg_config.warp.lanes);
-      }
-      break;
-
-    case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "(vout + gmem_idx)[%-3u * %u] = r%u; \n",
-              ops->n-1,hsg_config.warp.lanes,ops->n);
-      break;
-
-    case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_KEY_TYPE r%-3u = merge_ptr[%-3u * merge_stride];\n",
-              ops->a,ops->b);
-      break;
-
-    case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "merge_ptr[%-3u * merge_stride] = r%u;\n",
-              ops->b,ops->a);
-      break;
-
-    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_KEY_TYPE r%-3u = merge_l[%-3u * merge_stride];\n",
-              ops->a,ops->b);
-      break;
-
-    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "merge_l[%-3u * merge_stride] = r%u;\n",
-              ops->b,ops->a);
-      break;
-
-    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_KEY_TYPE r%-3u = merge_r[%-3u * merge_stride];\n",
-              ops->a,ops->b);
-      break;
-
-    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "merge_r[%-3u * merge_stride] = r%u;\n",
-              ops->b,ops->a);
-      break;
-
-    case HSG_OP_TYPE_WARP_FLIP:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "uint const flip_lane_mask = %u;                                        \n"
-              "uint const flip_lane_idx  = get_sub_group_local_id() ^ flip_lane_mask; \n"
-              "int  const t_lt           = get_sub_group_local_id() < flip_lane_idx;  \n",
-              ops->n-1);
-      break;
-
-    case HSG_OP_TYPE_WARP_HALF:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "uint const half_lane_mask = %u;                                        \n"
-              "uint const half_lane_idx  = get_sub_group_local_id() ^ half_lane_mask; \n"
-              "int  const t_lt           = get_sub_group_local_id() < half_lane_idx;  \n",
-              ops->n / 2);
-      break;
-
-    case HSG_OP_TYPE_CMP_FLIP:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_CMP_FLIP(%-3u,r%-3u,r%-3u)\n",ops->a,ops->b,ops->c);
-      break;
-
-    case HSG_OP_TYPE_CMP_HALF:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_CMP_HALF(%-3u,r%-3u)\n",ops->a,ops->b);
-      break;
-
-    case HSG_OP_TYPE_CMP_XCHG:
-      if (ops->c == UINT32_MAX)
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "HS_CMP_XCHG(r%-3u,r%-3u)\n",
-                  ops->a,ops->b);
-        }
-      else
-        {
-          fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                  "HS_CMP_XCHG(r%u_%u,r%u_%u)\n",
-                  ops->c,ops->a,ops->c,ops->b);
-        }
-      break;
-
-    case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "(shared.m + get_local_id(0))[%-3u * %-2u * %-3u] = r%u;\n",
-              merge[ops->a].warps,hsg_config.warp.lanes,ops->c,ops->b);
-      break;
-
-    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "r%-3u = (shared.m + get_local_id(0))[%-3u * %-2u * %-3u];\n",
-              ops->b,merge[ops->a].warps,hsg_config.warp.lanes,ops->c);
-      break;
-
-    case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_KEY_TYPE r%-3u = (shared.m + get_local_id(0))[%-3u * %-2u * %-3u];\n",
-              ops->b,ops->a,hsg_config.warp.lanes,ops->c);
-      break;
-
-    case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "(shared.m + smem_l_idx)[%5u] = r%u_%u;\n",
-              ops->b * hsg_config.warp.lanes,
-              ops->c,
-              ops->a);
-      break;
-
-    case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "(shared.m + smem_r_idx)[%5u] = r%u_%u;\n",
-              ops->b * hsg_config.warp.lanes,
-              ops->c,
-              ops->a);
-      break;
-
-    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_KEY_TYPE r%u_%-3u = (shared.m + smem_l_idx)[%u];\n",
-              ops->c,
-              ops->a,
-              ops->b * hsg_config.warp.lanes);
-      break;
-
-    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_KEY_TYPE r%u_%-3u = (shared.m + smem_r_idx)[%u];\n",
-              ops->c,
-              ops->a,
-              ops->b * hsg_config.warp.lanes);
-      break;
-
-    case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "HS_KEY_TYPE r%u_%-3u = (vout + gmem_l_idx)[%u];\n",
-              ops->c,
-              ops->a,
-              ops->b * hsg_config.warp.lanes);
-      break;
-
-    case HSG_OP_TYPE_BLOCK_SYNC:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "barrier(CLK_LOCAL_MEM_FENCE);\n"); // OpenCL 2.0+: work_group_barrier
-      break;
-
-    case HSG_OP_TYPE_BS_FRAC_PRED:
-      {
-        if (ops->m == 0)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "if (warp_idx < bs_full)\n");
-          }
-        else
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "else if (bs_frac == %u)\n",
-                    ops->w);
-          }
-      }
-      break;
-
-    case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
-      {
-        struct hsg_merge const * const m = merge + ops->a;
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "uint const smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n"
-                "uint const smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n",
-                m->warps * hsg_config.warp.lanes,
-                m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1);
-#if 0
-        if (ops->b == true)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "uint smem_l_idx =  get_sub_group_id()      * %u +  get_sub_group_local_id();       \n"
-                    "uint smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n",
-                    m->warps * hsg_config.warp.lanes,
-                    m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1);
-          }
-        else // update
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "smem_l_idx =  get_sub_group_id()      * %u +  get_sub_group_local_id();       \n"
-                    "smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n",
-                    m->warps * hsg_config.warp.lanes,
-                    m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1);
-          }
-#endif
-      }
-      break;
-
-    case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
-      {
-        struct hsg_merge const * const m = merge + ops->a;
-        uint32_t                 const b = m->warps * hsg_config.warp.lanes;
-        uint32_t                 const k = b * hsg_config.thread.regs;
-
-        fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                "uint const gmem_l_idx = (global_id / %u) * %u + (global_id & %u);            \n"
-                "uint const smem_l_idx = get_sub_group_id() * %u  + get_sub_group_local_id(); \n",
-                b,k,b-1,
-                b);
-
-      }
-      break;
-
-    case HSG_OP_TYPE_BX_MERGE_H_PRED:
-      fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-              "if (get_sub_group_id() < %u)\n",
-              ops->a);
-      break;
-
-    case HSG_OP_TYPE_BS_ACTIVE_PRED:
-      {
-        struct hsg_merge const * const m = merge + ops->a;
-
-        if (m->warps <= 32)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
-                    m->levels[ops->b].active.b32a2[0]);
-          }
-        else
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
-                    m->levels[ops->b].active.b32a2[1],
-                    m->levels[ops->b].active.b32a2[0]);
-          }
-      }
-      break;
-
-    case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
-      {
-        if (ops->a == ops->b)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "if (merge_idx < fm_full) \n");
-          }
-        else if (ops->b > 1)
-          {
-            fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-                    "else if (fm_frac == %u) \n",
-                    ops->b);
-          }
-        else
-          {
-	    fprintf(files[HSG_FILE_TYPE_SOURCE].file,
-		    "else\n");
-          }
-      }
-      break;
-
-    default:
-      hsg_target_debug(files,merge,ops,depth);
-      break;
-    }
-}
-
-//
-//
-//
diff --git a/src/compute/hs/gen/target_opencl.c b/src/compute/hs/gen/target_opencl.c
new file mode 100644
index 0000000000..fe7343ba5d
--- /dev/null
+++ b/src/compute/hs/gen/target_opencl.c
@@ -0,0 +1,600 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+#include "transpose.h"
+
+#include "common/util.h"
+#include "common/macros.h"
+
+//
+//
+//
+
+struct hsg_transpose_state
+{
+  FILE                    * header;
+  struct hsg_config const * config;
+};
+
+static
+char
+hsg_transpose_reg_prefix(uint32_t const cols_log2)
+{
+  return 'a' + (('r' + cols_log2 - 'a') % 26);
+}
+
+static
+void
+hsg_transpose_blend(uint32_t                     const cols_log2,
+                    uint32_t                     const row_ll, // lower-left
+                    uint32_t                     const row_ur, // upper-right
+                    struct hsg_transpose_state * const state)
+{
+  // we're starting register names at '1' for now
+  fprintf(state->header,
+          "  HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
+          hsg_transpose_reg_prefix(cols_log2-1),
+          hsg_transpose_reg_prefix(cols_log2),
+          cols_log2,row_ll+1,row_ur+1);
+}
+
+static
+void
+hsg_transpose_remap(uint32_t                     const row_from,
+                    uint32_t                     const row_to,
+                    struct hsg_transpose_state * const state)
+{
+  // we're starting register names at '1' for now
+  fprintf(state->header,
+          "  HS_TRANSPOSE_REMAP( %c, %3u, %3u )        \\\n",
+          hsg_transpose_reg_prefix(state->config->warp.lanes_log2),
+          row_from+1,row_to+1);
+}
+
+//
+//
+//
+
+static
+void
+hsg_copyright(FILE * file)
+{
+  fprintf(file,
+          "//                                                    \n"
+          "// Copyright 2016 Google Inc.                         \n"
+          "//                                                    \n"
+          "// Use of this source code is governed by a BSD-style \n"
+          "// license that can be found in the LICENSE file.     \n"
+          "//                                                    \n"
+          "\n");
+}
+
+//
+//
+//
+
+struct hsg_target_state
+{
+  FILE * header;
+  FILE * source;
+};
+
+//
+//
+//
+
+void
+hsg_target_opencl(struct hsg_target       * const target,
+                  struct hsg_config const * const config,
+                  struct hsg_merge  const * const merge,
+                  struct hsg_op     const * const ops,
+                  uint32_t                  const depth)
+{
+  switch (ops->type)
+    {
+    case HSG_OP_TYPE_END:
+      fprintf(target->state->source,
+              "}\n");
+      break;
+
+    case HSG_OP_TYPE_BEGIN:
+      fprintf(target->state->source,
+              "{\n");
+      break;
+
+    case HSG_OP_TYPE_ELSE:
+      fprintf(target->state->source,
+              "else\n");
+      break;
+
+    case HSG_OP_TYPE_TARGET_BEGIN:
+      {
+        // allocate state
+        target->state = malloc(sizeof(*target->state));
+
+        // allocate files
+        fopen_s(&target->state->header,"hs_cl.h", "wb");
+        fopen_s(&target->state->source,"hs_cl.cl","wb");
+
+        // initialize header
+        uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
+
+        hsg_copyright(target->state->header);
+
+        fprintf(target->state->header,
+                "#ifndef HS_CL_ONCE                                              \n"
+                "#define HS_CL_ONCE                                              \n"
+                "                                                                \n"
+                "#define HS_SLAB_THREADS_LOG2    %u                              \n"
+                "#define HS_SLAB_THREADS         (1 << HS_SLAB_THREADS_LOG2)     \n"
+                "#define HS_SLAB_WIDTH_LOG2      %u                              \n"
+                "#define HS_SLAB_WIDTH           (1 << HS_SLAB_WIDTH_LOG2)       \n"
+                "#define HS_SLAB_HEIGHT          %u                              \n"
+                "#define HS_SLAB_KEYS            (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n"
+                "#define HS_REG_LAST(c)          c##%u                           \n"
+                "#define HS_KEY_TYPE             %s                              \n"
+                "#define HS_KEY_WORDS            %u                              \n"
+                "#define HS_VAL_WORDS            0                               \n"
+                "#define HS_BS_SLABS             %u                              \n"
+                "#define HS_BS_SLABS_LOG2_RU     %u                              \n"
+                "#define HS_BC_SLABS_LOG2_MAX    %u                              \n"
+                "#define HS_FM_SCALE_MIN         %u                              \n"
+                "#define HS_FM_SCALE_MAX         %u                              \n"
+                "#define HS_HM_SCALE_MIN         %u                              \n"
+                "#define HS_HM_SCALE_MAX         %u                              \n"
+                "#define HS_EMPTY                                                \n"
+                "                                                                \n",
+                config->warp.lanes_log2,
+                config->warp.lanes_log2,
+                config->thread.regs,
+                config->thread.regs,
+                (config->type.words == 2) ? "ulong" : "uint",
+                config->type.words,
+                merge->warps,
+                msb_idx_u32(pow2_ru_u32(merge->warps)),
+                bc_max,
+                config->merge.flip.lo,
+                config->merge.flip.hi,
+                config->merge.half.lo,
+                config->merge.half.hi);
+
+        fprintf(target->state->header,
+                "#define HS_SLAB_ROWS()    \\\n");
+
+        for (uint32_t ii=1; ii<=config->thread.regs; ii++)
+          fprintf(target->state->header,
+                  "  HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
+
+        fprintf(target->state->header,
+                "  HS_EMPTY\n"
+                "          \n");
+
+        fprintf(target->state->header,
+                "#define HS_TRANSPOSE_SLAB()                \\\n");
+
+        for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++)
+          fprintf(target->state->header,
+                  "  HS_TRANSPOSE_STAGE( %u )                  \\\n",ii);
+
+        struct hsg_transpose_state state[1] =
+          {
+           { .header = target->state->header,
+             .config = config
+           }
+          };
+
+        hsg_transpose(config->warp.lanes_log2,
+                      config->thread.regs,
+                      hsg_transpose_blend,state,
+                      hsg_transpose_remap,state);
+
+        fprintf(target->state->header,
+                "  HS_EMPTY\n"
+                "          \n");
+
+        hsg_copyright(target->state->source);
+
+        fprintf(target->state->source,
+                "#include \"hs_cl_macros.h\" \n"
+                "                            \n"
+                "//                          \n"
+                "//                          \n"
+                "//                          \n");
+      }
+      break;
+
+    case HSG_OP_TYPE_TARGET_END:
+      // decorate the files
+      fprintf(target->state->header,
+              "#endif  \n"
+              "        \n"
+              "//      \n"
+              "//      \n"
+              "//      \n"
+              "        \n");
+      fprintf(target->state->source,
+              "        \n"
+              "//      \n"
+              "//      \n"
+              "//      \n"
+              "        \n");
+
+      // close files
+      fclose(target->state->header);
+      fclose(target->state->source);
+
+      // free state
+      free(target->state);
+      break;
+
+    case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
+      {
+        fprintf(target->state->source,
+                "\nHS_TRANSPOSE_KERNEL_PROTO(%u)\n",
+                config->warp.lanes);
+      }
+      break;
+
+    case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
+      {
+        fprintf(target->state->source,
+                "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+                config->warp.lanes,config->thread.regs);
+      }
+      break;
+
+    case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
+      {
+        fprintf(target->state->source,
+                "HS_TRANSPOSE_SLAB()\n");
+      }
+      break;
+
+    case HSG_OP_TYPE_BS_KERNEL_PROTO:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        uint32_t const bs  = pow2_ru_u32(m->warps);
+        uint32_t const msb = msb_idx_u32(bs);
+
+        fprintf(target->state->source,
+                "\nHS_BS_KERNEL_PROTO(%u,%u,%u)\n",
+                config->warp.lanes,m->warps,msb);
+      }
+      break;
+
+    case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        if (m->warps > 1)
+          {
+            fprintf(target->state->source,
+                    "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+                    m->warps * config->warp.lanes,
+                    m->rows_bs);
+          }
+
+        fprintf(target->state->source,
+                "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+                config->warp.lanes,config->thread.regs);
+      }
+      break;
+
+    case HSG_OP_TYPE_BC_KERNEL_PROTO:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        uint32_t const msb = msb_idx_u32(m->warps);
+
+        fprintf(target->state->source,
+                "\nHS_BC_KERNEL_PROTO(%u,%u,%u)\n",
+                config->warp.lanes,m->warps,msb);
+      }
+      break;
+
+    case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        if (m->warps > 1)
+          {
+            fprintf(target->state->source,
+                    "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+                    m->warps * config->warp.lanes,
+                    m->rows_bc);
+          }
+
+        fprintf(target->state->source,
+                "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+                config->warp.lanes,config->thread.regs);
+      }
+      break;
+
+    case HSG_OP_TYPE_FM_KERNEL_PROTO:
+      fprintf(target->state->source,
+              "\nHS_FM_KERNEL_PROTO(%u,%u)\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
+      fprintf(target->state->source,
+              "HS_FM_PREAMBLE(%u);\n",
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_HM_KERNEL_PROTO:
+      {
+        fprintf(target->state->source,
+                "\nHS_HM_KERNEL_PROTO(%u)\n",
+                ops->a);
+      }
+      break;
+
+    case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
+      fprintf(target->state->source,
+              "HS_HM_PREAMBLE(%u);\n",
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
+      {
+        static char const * const vstr[] = { "vin", "vout" };
+
+        fprintf(target->state->source,
+                "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n",
+                ops->n,vstr[ops->v],config->warp.lanes,ops->n-1);
+      }
+      break;
+
+    case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
+      fprintf(target->state->source,
+              "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n",
+              config->warp.lanes,ops->n-1,ops->n);
+      break;
+
+    case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
+      fprintf(target->state->source,
+              "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+              ops->b,ops->a);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
+      fprintf(target->state->source,
+              "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+              ops->b,ops->a);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n",
+              ops->b,ops->a);
+      break;
+
+    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
+      fprintf(target->state->source,
+              "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n",
+              ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
+      {
+        if (ops->a <= ops->b)
+          {
+            fprintf(target->state->source,
+                    "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n");
+          }
+        else if (ops->b > 1)
+          {
+            fprintf(target->state->source,
+                    "else if (fm_frac == %u)\n",
+                    ops->b);
+          }
+        else
+          {
+	    fprintf(target->state->source,
+		    "else\n");
+          }
+      }
+      break;
+
+    case HSG_OP_TYPE_SLAB_FLIP:
+      fprintf(target->state->source,
+              "HS_SLAB_FLIP_PREAMBLE(%u);\n",
+              ops->n-1);
+      break;
+
+    case HSG_OP_TYPE_SLAB_HALF:
+      fprintf(target->state->source,
+              "HS_SLAB_HALF_PREAMBLE(%u);\n",
+              ops->n / 2);
+      break;
+
+    case HSG_OP_TYPE_CMP_FLIP:
+      fprintf(target->state->source,
+              "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c);
+      break;
+
+    case HSG_OP_TYPE_CMP_HALF:
+      fprintf(target->state->source,
+              "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b);
+      break;
+
+    case HSG_OP_TYPE_CMP_XCHG:
+      if (ops->c == UINT32_MAX)
+        {
+          fprintf(target->state->source,
+                  "HS_CMP_XCHG(r%-3u,r%-3u);\n",
+                  ops->a,ops->b);
+        }
+      else
+        {
+          fprintf(target->state->source,
+                  "HS_CMP_XCHG(r%u_%u,r%u_%u);\n",
+                  ops->c,ops->a,ops->c,ops->b);
+        }
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
+      fprintf(target->state->source,
+              "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n",
+              merge[ops->a].warps,config->warp.lanes,ops->c,ops->b);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
+      fprintf(target->state->source,
+              "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+              ops->b,merge[ops->a].warps,config->warp.lanes,ops->c);
+      break;
+
+    case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+              ops->b,ops->a,config->warp.lanes,ops->c);
+      break;
+
+    case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
+      fprintf(target->state->source,
+              "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n",
+              ops->b * config->warp.lanes,
+              ops->c,
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
+      fprintf(target->state->source,
+              "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n",
+              ops->b * config->warp.lanes,
+              ops->c,
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n",
+              ops->c,
+              ops->a,
+              ops->b * config->warp.lanes);
+      break;
+
+    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n",
+              ops->c,
+              ops->a,
+              ops->b * config->warp.lanes);
+      break;
+
+    case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
+      fprintf(target->state->source,
+              "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n",
+              ops->c,
+              ops->a,
+              config->warp.lanes,ops->b);
+      break;
+
+    case HSG_OP_TYPE_BLOCK_SYNC:
+      fprintf(target->state->source,
+              "HS_BLOCK_BARRIER();\n");
+      //
+      // FIXME - Named barriers to allow coordinating warps to proceed?
+      //
+      break;
+
+    case HSG_OP_TYPE_BS_FRAC_PRED:
+      {
+        if (ops->m == 0)
+          {
+            fprintf(target->state->source,
+                    "if (warp_idx < bs_full)\n");
+          }
+        else
+          {
+            fprintf(target->state->source,
+                    "else if (bs_frac == %u)\n",
+                    ops->w);
+          }
+      }
+      break;
+
+    case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        fprintf(target->state->source,
+                "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n",
+                config->warp.lanes,m->warps);
+      }
+      break;
+
+    case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        fprintf(target->state->source,
+                "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n",
+                config->warp.lanes,config->thread.regs,m->warps);
+      }
+      break;
+
+    case HSG_OP_TYPE_BX_MERGE_H_PRED:
+      fprintf(target->state->source,
+              "if (get_sub_group_id() < %u)\n",
+              ops->a);
+      break;
+
+    case HSG_OP_TYPE_BS_ACTIVE_PRED:
+      {
+        struct hsg_merge const * const m = merge + ops->a;
+
+        if (m->warps <= 32)
+          {
+            fprintf(target->state->source,
+                    "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
+                    m->levels[ops->b].active.b32a2[0]);
+          }
+        else
+          {
+            fprintf(target->state->source,
+                    "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
+                    m->levels[ops->b].active.b32a2[1],
+                    m->levels[ops->b].active.b32a2[0]);
+          }
+      }
+      break;
+
+    default:
+      fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]);
+      exit(EXIT_FAILURE);
+      break;
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/transpose.c b/src/compute/hs/gen/transpose.c
index de15c62631..095f53d330 100644
--- a/src/compute/hs/gen/transpose.c
+++ b/src/compute/hs/gen/transpose.c
@@ -11,7 +11,7 @@
 //
 
 #include "transpose.h"
-#include "macros.h"
+#include "common/macros.h"
 
 //
 // Rows must be an even number.  This is enforced elsewhere.
@@ -21,19 +21,19 @@
 void
 hsg_transpose(uint32_t                   const cols_log2,
               uint32_t                   const rows,
-              void *                           blend,
-              void *                           remap,
               void (*pfn_blend)(uint32_t const cols_log2,
                                 uint32_t const row_ll, // lower-left
                                 uint32_t const row_ur, // upper-right
                                 void *         blend),
+              void *                           blend,
               void (*pfn_remap)(uint32_t const row_from,
                                 uint32_t const row_to,
-                                void *         remap))
+                                void *         remap),
+              void *                           remap)
 {
   // get mapping array
-  uint32_t * map_curr = ALLOCA(rows * sizeof(*map_curr));
-  uint32_t * map_next = ALLOCA(rows * sizeof(*map_next));
+  uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr));
+  uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next));
 
   // init the mapping array
   for (uint32_t ii=0; ii<rows; ii++)
@@ -89,35 +89,35 @@ static uint32_t cols; // implicit on SIMD/GPU
 
 static
 void
-hsg_debug_remap(uint32_t   const row_from,
-                uint32_t   const row_to,
-                uint32_t * const r)
-{
-  fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);
-
-  r[row_to] = row_from;
-}
-
-static
-void
 hsg_debug_blend(uint32_t const cols_log2,
                 uint32_t const row_ll, // lower-left
                 uint32_t const row_ur, // upper-right
-                uint32_t *     m)
+                uint32_t *     b)
 {
   fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur);
 
-  uint32_t * const ll = ALLOCA(cols * sizeof(*m));
-  uint32_t * const ur = ALLOCA(cols * sizeof(*m));
+  uint32_t * const ll = ALLOCA(cols * sizeof(*b));
+  uint32_t * const ur = ALLOCA(cols * sizeof(*b));
 
-  memcpy(ll,m+row_ll*cols,cols * sizeof(*m));
-  memcpy(ur,m+row_ur*cols,cols * sizeof(*m));
+  memcpy(ll,b+row_ll*cols,cols * sizeof(*b));
+  memcpy(ur,b+row_ur*cols,cols * sizeof(*b));
 
   for (uint32_t ii=0; ii<cols; ii++)
-    m[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];
+    b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];
 
   for (uint32_t ii=0; ii<cols; ii++)
-    m[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
+    b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
+}
+
+static
+void
+hsg_debug_remap(uint32_t   const row_from,
+                uint32_t   const row_to,
+                uint32_t * const r)
+{
+  fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);
+
+  r[row_to] = row_from;
 }
 
 static
@@ -144,23 +144,22 @@ main(int argc, char * argv[])
 
   cols = 1 << cols_log2;
 
-  uint32_t * const m = ALLOCA(cols * rows * sizeof(*m));
+  uint32_t * const b = ALLOCA(cols * rows * sizeof(*b));
   uint32_t * const r = ALLOCA(       rows * sizeof(*r));
 
   for (uint32_t rr=0; rr<rows; rr++) {
     r[rr] = rr;
     for (uint32_t cc=0; cc<cols; cc++)
-      m[rr*cols+cc] = cc*rows+rr;
+      b[rr*cols+cc] = cc*rows+rr;
   }
 
-  hsg_debug_print(rows,m,r);
+  hsg_debug_print(rows,b,r);
 
   hsg_transpose(cols_log2,rows,
-                m,r,
-                hsg_debug_blend,
-                hsg_debug_remap);
+                hsg_debug_blend,b,
+                hsg_debug_remap,r);
 
-  hsg_debug_print(rows,m,r);
+  hsg_debug_print(rows,b,r);
 
   return 0;
 }
diff --git a/src/compute/hs/gen/transpose.h b/src/compute/hs/gen/transpose.h
index 83f6fc4e42..380210970d 100644
--- a/src/compute/hs/gen/transpose.h
+++ b/src/compute/hs/gen/transpose.h
@@ -23,15 +23,15 @@
 void
 hsg_transpose(uint32_t                   const cols_log2,
               uint32_t                   const rows,
-              void *                           blend,
-              void *                           remap,
               void (*pfn_blend)(uint32_t const cols_log2,
                                 uint32_t const row_ll, // lower-left
                                 uint32_t const row_ur, // upper-right
                                 void *         blend),
+              void *                           blend,
               void (*pfn_remap)(uint32_t const row_from,
                                 uint32_t const row_to,
-                                void *         remap));
+                                void *         remap),
+              void *                           remap);
 
 //
 //