aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/compute/hs/gen
diff options
context:
space:
mode:
Diffstat (limited to 'src/compute/hs/gen')
-rw-r--r--src/compute/hs/gen/gen.h112
-rw-r--r--src/compute/hs/gen/main.c532
-rw-r--r--src/compute/hs/gen/networks_merging.c4
-rw-r--r--src/compute/hs/gen/networks_sorting.c4
-rw-r--r--src/compute/hs/gen/target_cuda.c600
-rw-r--r--src/compute/hs/gen/target_cuda_sm3x.c776
-rw-r--r--src/compute/hs/gen/target_debug.c73
-rw-r--r--src/compute/hs/gen/target_glsl.c674
-rw-r--r--src/compute/hs/gen/target_igp_genx.c672
-rw-r--r--src/compute/hs/gen/target_opencl.c600
-rw-r--r--src/compute/hs/gen/transpose.c61
-rw-r--r--src/compute/hs/gen/transpose.h6
12 files changed, 2229 insertions, 1885 deletions
diff --git a/src/compute/hs/gen/gen.h b/src/compute/hs/gen/gen.h
index 4043a8df5c..3635d553cf 100644
--- a/src/compute/hs/gen/gen.h
+++ b/src/compute/hs/gen/gen.h
@@ -9,21 +9,20 @@
#pragma once
//
+// TODO:
//
+// Add Key-Val sorting support -- easy.
//
#include <stdio.h>
#include <stdint.h>
//
+// All code generation is driven by the specified architectural
+// details and host platform API.
//
-//
-
-#define MERGE_LEVELS_MAX_LOG2 7 // merge up to 128 warps
-#define MERGE_LEVELS_MAX_SIZE (1 << MERGE_LEVELS_MAX_LOG2) // ((1 << MERGE_MAX_LOG2) - 1) // incorrect debug error
-
-//
-//
+// In general, the warps-per-block and keys-per-thread are the
+// critical knobs for tuning performance.
//
struct hsg_config
@@ -58,6 +57,7 @@ struct hsg_config
struct {
uint32_t lanes;
+ uint32_t lanes_log2;
uint32_t skpw_bs;
} warp;
@@ -72,7 +72,7 @@ struct hsg_config
};
//
-//
+// HotSort can merge non-power-of-two blocks of warps
//
struct hsg_level
@@ -91,6 +91,16 @@ struct hsg_level
} active;
};
+//
+//
+//
+
+#define MERGE_LEVELS_MAX_LOG2 7 // merge up to 128 warps
+#define MERGE_LEVELS_MAX_SIZE (1 << MERGE_LEVELS_MAX_LOG2)
+
+//
+// This is computed
+//
struct hsg_merge
{
@@ -113,6 +123,8 @@ struct hsg_merge
//
//
+#if 0
+
#define HSG_FILE_NAME_SIZE 80
struct hsg_file
@@ -126,18 +138,6 @@ struct hsg_file
//
//
-typedef enum hsg_kernel_type {
-
- HSG_KERNEL_TYPE_SORT_BLOCK,
-
- HSG_KERNEL_TYPE_COUNT
-
-} hsg_kernel_type;
-
-//
-//
-//
-
typedef enum hsg_file_type {
HSG_FILE_TYPE_HEADER,
@@ -147,6 +147,8 @@ typedef enum hsg_file_type {
} hsg_file_type;
+#endif
+
//
//
//
@@ -158,10 +160,8 @@ typedef enum hsg_file_type {
HSG_OP_EXPAND_X(HSG_OP_TYPE_BEGIN) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_ELSE) \
\
- HSG_OP_EXPAND_X(HSG_OP_TYPE_FILE_HEADER) \
- HSG_OP_EXPAND_X(HSG_OP_TYPE_FILE_FOOTER) \
- \
- HSG_OP_EXPAND_X(HSG_OP_TYPE_DUMMY_KERNEL) \
+ HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_BEGIN) \
+ HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_END) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE) \
@@ -186,12 +186,13 @@ typedef enum hsg_file_type {
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT) \
+ HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_LOAD) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_STORE) \
\
- HSG_OP_EXPAND_X(HSG_OP_TYPE_WARP_FLIP) \
- HSG_OP_EXPAND_X(HSG_OP_TYPE_WARP_HALF) \
+ HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_FLIP) \
+ HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_HALF) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_FLIP) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_HALF) \
@@ -221,8 +222,6 @@ typedef enum hsg_file_type {
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_ACTIVE_PRED) \
\
- HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED) \
- \
HSG_OP_EXPAND_X(HSG_OP_TYPE_COUNT)
//
@@ -271,42 +270,63 @@ struct hsg_op
//
//
-typedef void (*hsg_target_pfn)(struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth);
+extern char const * const hsg_op_type_string[];
//
//
//
-extern struct hsg_config hsg_config;
-extern struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2];
+struct hsg_target
+{
+ struct hsg_target_state * state;
+};
//
+// All targets share this prototype
+//
+
+typedef
+void
+(*hsg_target_pfn)(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth);
//
//
+//
+
+extern
+void
+hsg_target_debug(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth);
extern
void
-hsg_target_debug (struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth);
+hsg_target_cuda(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth);
extern
void
-hsg_target_cuda_sm3x(struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth);
+hsg_target_opencl(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth);
extern
void
-hsg_target_igp_genx (struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth);
+hsg_target_glsl(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth);
//
//
//
diff --git a/src/compute/hs/gen/main.c b/src/compute/hs/gen/main.c
index 42f4518bfd..e06e23029b 100644
--- a/src/compute/hs/gen/main.c
+++ b/src/compute/hs/gen/main.c
@@ -20,14 +20,8 @@
//
#include "networks.h"
-#include "macros.h"
-#include "util.h"
-
-//
-//
-//
-
-#define HSG_INDENT 2
+#include "common/util.h"
+#include "common/macros.h"
//
//
@@ -36,7 +30,6 @@
#undef HSG_OP_EXPAND_X
#define HSG_OP_EXPAND_X(t) #t ,
-static
char const * const
hsg_op_type_string[] =
{
@@ -53,10 +46,8 @@ hsg_op_type_string[] =
#define BEGIN() (struct hsg_op){ HSG_OP_TYPE_BEGIN }
#define ELSE() (struct hsg_op){ HSG_OP_TYPE_ELSE }
-#define STORE_SLAB_EARLY_EXIT() (struct hsg_op){ HSG_OP_TYPE_STORE_SLAB_EARLY_EXIT }
-
-#define FILE_HEADER() (struct hsg_op){ HSG_OP_TYPE_FILE_HEADER }
-#define FILE_FOOTER() (struct hsg_op){ HSG_OP_TYPE_FILE_FOOTER }
+#define TARGET_BEGIN() (struct hsg_op){ HSG_OP_TYPE_TARGET_BEGIN }
+#define TARGET_END() (struct hsg_op){ HSG_OP_TYPE_TARGET_END }
#define TRANSPOSE_KERNEL_PROTO() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO }
#define TRANSPOSE_KERNEL_PREAMBLE() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE }
@@ -68,11 +59,11 @@ hsg_op_type_string[] =
#define BC_KERNEL_PROTO(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PROTO, { i } }
#define BC_KERNEL_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PREAMBLE, { i } }
-#define FM_KERNEL_PROTO(l,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO, { l, s } }
-#define FM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE, { w, s } }
+#define FM_KERNEL_PROTO(s,r) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO, { s, r } }
+#define FM_KERNEL_PREAMBLE(h) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE, { h } }
-#define HM_KERNEL_PROTO(d,w) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO, { d, w } }
-#define HM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE, { w, s } }
+#define HM_KERNEL_PROTO(s) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO, { s } }
+#define HM_KERNEL_PREAMBLE(h) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE, { h } }
#define BX_REG_GLOBAL_LOAD(n,v) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_LOAD, { n, v } }
#define BX_REG_GLOBAL_STORE(n) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_STORE, { n } }
@@ -81,12 +72,13 @@ hsg_op_type_string[] =
#define FM_REG_GLOBAL_STORE_LEFT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT, { n, i } }
#define FM_REG_GLOBAL_LOAD_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT, { n, i } }
#define FM_REG_GLOBAL_STORE_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT, { n, i } }
+#define FM_MERGE_RIGHT_PRED(n,s) (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED, { n, s } }
#define HM_REG_GLOBAL_LOAD(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_LOAD, { n, i } }
#define HM_REG_GLOBAL_STORE(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_STORE, { n, i } }
-#define WARP_FLIP(f) (struct hsg_op){ HSG_OP_TYPE_WARP_FLIP, { f } }
-#define WARP_HALF(h) (struct hsg_op){ HSG_OP_TYPE_WARP_HALF, { h } }
+#define SLAB_FLIP(f) (struct hsg_op){ HSG_OP_TYPE_SLAB_FLIP, { f } }
+#define SLAB_HALF(h) (struct hsg_op){ HSG_OP_TYPE_SLAB_HALF, { h } }
#define CMP_FLIP(a,b,c) (struct hsg_op){ HSG_OP_TYPE_CMP_FLIP, { a, b, c } }
#define CMP_HALF(a,b) (struct hsg_op){ HSG_OP_TYPE_CMP_HALF, { a, b } }
@@ -121,13 +113,12 @@ hsg_op_type_string[] =
#define BS_ACTIVE_PRED(m,l) (struct hsg_op){ HSG_OP_TYPE_BS_ACTIVE_PRED, { m, l } }
-#define FM_MERGE_RIGHT_PRED(n,s) (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED, { n, s } }
-
//
// DEFAULTS
//
-struct hsg_config hsg_config = // FIXME -- how useful is this?
+static
+struct hsg_config hsg_config =
{
.merge = {
.flip = {
@@ -138,8 +129,6 @@ struct hsg_config hsg_config = // FIXME -- how useful is this?
.lo = 1,
.hi = 1
},
-
- .max_log2 = 27 // 2^27th = 128m
},
.block = {
@@ -156,6 +145,7 @@ struct hsg_config hsg_config = // FIXME -- how useful is this?
.warp = {
.lanes = 32,
+ .lanes_log2 = 5,
},
.thread = {
@@ -172,45 +162,11 @@ struct hsg_config hsg_config = // FIXME -- how useful is this?
// ZERO HSG_MERGE STRUCT
//
+static
struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2] = { 0 };
//
-//
-//
-
-static const hsg_target_pfn hsg_target_pfns[] =
- {
- hsg_target_debug,
- hsg_target_cuda_sm3x,
- hsg_target_igp_genx,
- // hsg_target_adreno_5xx,
- // hsg_target_amd_gcn,
- // hsg_target_x86_sse,
- // hsg_target_x86_avx2,
- };
-
-static const char * hsg_target_pfn_string[] =
- {
- "hs_debug",
- "hs_cuda",
- "hs_cl"
- };
-
-static const char * hsg_file_type_string[][2] =
- {
- { ".h", ".txt" },
- { ".h", ".cu" },
- { ".h", ".cl" }
- };
-
-//
-//
-//
-
-#define HSG_TARGET_PFN_COUNT ARRAY_LENGTH(hsg_target_pfns)
-
-//
-//
+// STATS ON INSTRUCTIONS
//
static hsg_op_type hsg_op_type_counts[HSG_OP_TYPE_COUNT] = { 0 };
@@ -223,8 +179,18 @@ static
void
hsg_op_debug()
{
+ uint32_t total = 0;
+
for (hsg_op_type t=HSG_OP_TYPE_EXIT; t<HSG_OP_TYPE_COUNT; t++)
- fprintf(stderr,"%-37s : %u\n",hsg_op_type_string[t],hsg_op_type_counts[t]);
+ {
+ uint32_t const count = hsg_op_type_counts[t];
+
+ total += count;
+
+ fprintf(stderr,"%-37s : %u\n",hsg_op_type_string[t],count);
+ }
+
+ fprintf(stderr,"%-37s : %u\n\n\n","TOTAL",total);
}
//
@@ -268,7 +234,7 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge)
//
// The provided smem_bs size will be allocated for each sorting block.
//
- uint32_t const bs_threads = merge->warps * hsg_config.warp.lanes;
+ uint32_t const bs_threads = merge->warps << hsg_config.warp.lanes_log2;
uint32_t const bs_keys = hsg_config.block.smem_bs / (hsg_config.type.words * sizeof(uint32_t));
uint32_t const bs_kpt = bs_keys / bs_threads;
uint32_t const bs_kpt_mod = (bs_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod;
@@ -282,7 +248,7 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge)
}
// clamp to number of registers
- merge->rows_bs = min(bs_rows_even, hsg_config.thread.regs);
+ merge->rows_bs = MIN_MACRO(bs_rows_even, hsg_config.thread.regs);
}
//
@@ -297,19 +263,19 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge)
//
// if merge->warps is not pow2 then we're going to skip creating a bc elsewhere
//
- uint32_t const bc_warps_min = max(merge->warps,hsg_config.block.warps_min);
- uint32_t const bc_threads = bc_warps_min * hsg_config.warp.lanes;
+ uint32_t const bc_warps_min = MAX_MACRO(merge->warps,hsg_config.block.warps_min);
+ uint32_t const bc_threads = bc_warps_min << hsg_config.warp.lanes_log2;
uint32_t const bc_block_rd = (((hsg_config.block.smem_bc * bc_warps_min) / hsg_config.block.warps_max) /
hsg_config.block.smem_quantum) * hsg_config.block.smem_quantum;
- uint32_t const bc_block_max = max(bc_block_rd,hsg_config.block.smem_min);
- uint32_t const bc_block_smem = min(bc_block_max,hsg_config.block.smem_bs);
+ uint32_t const bc_block_max = MAX_MACRO(bc_block_rd,hsg_config.block.smem_min);
+ uint32_t const bc_block_smem = MIN_MACRO(bc_block_max,hsg_config.block.smem_bs);
// what is the max amount of shared in each possible bc block config?
uint32_t const bc_keys = bc_block_smem / (hsg_config.type.words * sizeof(uint32_t));
uint32_t const bc_kpt = bc_keys / bc_threads;
uint32_t const bc_kpt_mod = (bc_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod;
- merge->rows_bc = min(bc_kpt_mod, hsg_config.thread.regs);
+ merge->rows_bc = MIN_MACRO(bc_kpt_mod, hsg_config.thread.regs);
merge->skpw_bc = bc_keys / bc_warps_min;
}
}
@@ -441,7 +407,7 @@ hsg_merge_levels_hint(struct hsg_merge * const merge, bool const autotune)
for (uint32_t level=0; level<MERGE_LEVELS_MAX_LOG2; level++)
{
// max network
- uint32_t const n_max = max(merge->levels[level].networks[0],
+ uint32_t const n_max = MAX_MACRO(merge->levels[level].networks[0],
merge->levels[level].networks[1]);
if (n_max <= (merge->rows_bs + hsg_config.thread.xtra))
@@ -533,7 +499,7 @@ hsg_network_copy(struct hsg_op * ops,
for (uint32_t ii=0; ii<len; ii++)
{
- const struct hsg_op * const cx = cxa + ii;
+ struct hsg_op const * const cx = cxa + ii;
ops = hsg_op(ops,CMP_XCHG(cx->a,cx->b,prefix));
}
@@ -638,7 +604,7 @@ hsg_warp_half_downto(struct hsg_op * ops, uint32_t h)
{
ops = hsg_begin(ops);
- ops = hsg_op(ops,WARP_HALF(h));
+ ops = hsg_op(ops,SLAB_HALF(h));
ops = hsg_warp_half_network(ops);
ops = hsg_end(ops);
@@ -665,7 +631,7 @@ hsg_warp_flip(struct hsg_op * ops, uint32_t f)
{
ops = hsg_begin(ops);
- ops = hsg_op(ops,WARP_FLIP(f));
+ ops = hsg_op(ops,SLAB_FLIP(f));
ops = hsg_warp_flip_network(ops);
ops = hsg_end(ops);
@@ -782,7 +748,7 @@ hsg_bc_half_merge_level(struct hsg_op * ops,
uint32_t const net_even = merge->levels[0].networks[0];
// min of warps in block and remaining horizontal rows
- uint32_t const active = min(s_count, net_even);
+ uint32_t const active = MIN_MACRO(s_count, net_even);
// conditional on blockIdx.x
if (active < merge->warps)
@@ -834,7 +800,7 @@ hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
//
// will only be called with merge->warps >= 2
//
- uint32_t const warps = max(merge->warps,hsg_config.block.warps_min);
+ uint32_t const warps = MAX_MACRO(merge->warps,hsg_config.block.warps_min);
// guaranteed to be an even network
uint32_t const net_even = merge->levels[0].networks[0];
@@ -851,7 +817,7 @@ hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
{
// compute store count
uint32_t const r_rem = hsg_config.thread.regs + 1 - r_lo;
- uint32_t const s_count = min(s_max,r_rem);
+ uint32_t const s_count = MIN_MACRO(s_max,r_rem);
// block sync -- can skip if first
if (r_lo > 1)
@@ -1010,7 +976,7 @@ hsg_bs_flip_merge(struct hsg_op * ops, struct hsg_merge const * const merge)
uint32_t r_hi = hsg_config.thread.regs + 1 - r_lo;
// compute store count
- uint32_t const s_pairs = min(s_pairs_max,r_mid - r_lo);
+ uint32_t const s_pairs = MIN_MACRO(s_pairs_max,r_mid - r_lo);
// store rows to shared
for (uint32_t c=0; c<s_pairs; c++)
@@ -1082,7 +1048,7 @@ hsg_bs_flip_merge_all(struct hsg_op * ops, const struct hsg_merge * const merge)
static
struct hsg_op *
-hsg_bs_sort(struct hsg_op * ops, const struct hsg_merge * const merge)
+hsg_bs_sort(struct hsg_op * ops, struct hsg_merge const * const merge)
{
// func proto
ops = hsg_op(ops,BS_KERNEL_PROTO(merge->index));
@@ -1125,7 +1091,7 @@ hsg_bs_sort_all(struct hsg_op * ops)
{
for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++)
{
- const struct hsg_merge* const m = hsg_merge + merge_idx;
+ struct hsg_merge const * const m = hsg_merge + merge_idx;
if (m->warps == 0)
break;
@@ -1142,7 +1108,7 @@ hsg_bs_sort_all(struct hsg_op * ops)
static
struct hsg_op *
-hsg_bc_clean(struct hsg_op * ops, const struct hsg_merge * const merge)
+hsg_bc_clean(struct hsg_op * ops, struct hsg_merge const * const merge)
{
// func proto
ops = hsg_op(ops,BC_KERNEL_PROTO(merge->index));
@@ -1189,7 +1155,7 @@ hsg_bc_clean_all(struct hsg_op * ops)
{
for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++)
{
- const struct hsg_merge* const m = hsg_merge + merge_idx;
+ struct hsg_merge const * const m = hsg_merge + merge_idx;
if (m->warps == 0)
break;
@@ -1215,9 +1181,7 @@ static
struct hsg_op *
hsg_fm_thread_load_left(struct hsg_op * ops, uint32_t const n)
{
- uint32_t const mid = n/2;
-
- for (uint32_t r=1; r<=mid; r++)
+ for (uint32_t r=1; r<=n; r++)
ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_LEFT(r,r-1));
return ops;
@@ -1227,9 +1191,7 @@ static
struct hsg_op *
hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n)
{
- uint32_t const mid = n/2;
-
- for (uint32_t r=mid; r>=1; r--)
+ for (uint32_t r=1; r<=n; r++)
ops = hsg_op(ops,FM_REG_GLOBAL_STORE_LEFT(r,r-1));
return ops;
@@ -1237,53 +1199,60 @@ hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n)
static
struct hsg_op *
-hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
+hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const half_span, uint32_t const half_case)
{
- uint32_t const mid = n / 2;
- uint32_t const first = mid + 1;
- uint32_t const last = mid + span_right;
-
- for (uint32_t r=first; r<=last; r++)
- ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,r-first));
+ for (uint32_t r=0; r<half_case; r++)
+ ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,half_span+1+r));
return ops;
}
static
struct hsg_op *
-hsg_fm_thread_store_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
+hsg_fm_thread_store_right(struct hsg_op * ops, uint32_t const half_span, uint32_t const half_case)
{
- uint32_t const mid = n / 2;
- uint32_t const first = mid + 1;
- uint32_t const last = mid + span_right;
-
- for (uint32_t r=last; r>=first; r--)
- ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,r-first));
+ for (uint32_t r=0; r<half_case; r++)
+ ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,half_span+1+r));
return ops;
}
static
struct hsg_op *
-hsg_fm_thread_merge_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right)
+hsg_fm_merge(struct hsg_op * ops,
+ uint32_t const scale_log2,
+ uint32_t const span_left,
+ uint32_t const span_right)
{
- // conditional
- ops = hsg_op(ops,FM_MERGE_RIGHT_PRED(n/2,span_right));
+ // func proto
+ ops = hsg_op(ops,FM_KERNEL_PROTO(scale_log2,msb_idx_u32(pow2_ru_u32(span_right))));
// begin
ops = hsg_begin(ops);
- // load
- ops = hsg_fm_thread_load_right(ops,n,span_right);
+ // preamble for loading/storing
+ ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span_left));
+
+ // load left span
+ ops = hsg_fm_thread_load_left(ops,span_left);
+
+ // load right span
+ ops = hsg_fm_thread_load_right(ops,span_left,span_right);
// compare left and right
- ops = hsg_thread_merge_left_right(ops,n/2,span_right);
+ ops = hsg_thread_merge_left_right(ops,span_left,span_right);
+
+ // left merging network
+ ops = hsg_thread_merge(ops,span_left);
// right merging network
- ops = hsg_thread_merge_offset(ops,n/2,span_right);
+ ops = hsg_thread_merge_offset(ops,span_left,span_right);
+
+ // store
+ ops = hsg_fm_thread_store_left(ops,span_left);
// store
- ops = hsg_fm_thread_store_right(ops,n,span_right);
+ ops = hsg_fm_thread_store_right(ops,span_left,span_right);
// end
ops = hsg_end(ops);
@@ -1293,45 +1262,12 @@ hsg_fm_thread_merge_right(struct hsg_op * ops, uint32_t const n, uint32_t const
static
struct hsg_op *
-hsg_fm_thread_merge_right_all(struct hsg_op * ops, uint32_t const span)
+hsg_fm_merge_all(struct hsg_op * ops, uint32_t const scale_log2, uint32_t const warps)
{
- ops = hsg_fm_thread_merge_right(ops,span,span/2);
-
- for (uint32_t span_pow2 = pow2_ru_u32(span) / 4; span_pow2 >= 1; span_pow2 /= 2)
- {
- ops = hsg_fm_thread_merge_right(ops,span,span_pow2);
- }
-
- return ops;
-}
-
-static
-struct hsg_op *
-hsg_fm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const fm_scale)
-{
- // func proto
- ops = hsg_op(ops,FM_KERNEL_PROTO(level,fm_scale));
-
- // begin
- ops = hsg_begin(ops);
-
- // shared declare
- ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span,fm_scale));
-
- // load
- ops = hsg_fm_thread_load_left(ops,span);
-
- // right merging network
- ops = hsg_fm_thread_merge_right_all(ops,span);
-
- // left merging network
- ops = hsg_thread_merge(ops,span/2);
-
- // store
- ops = hsg_fm_thread_store_left(ops,span);
+ uint32_t const span_left = (warps << scale_log2) / 2;
- // end
- ops = hsg_end(ops);
+ for (uint32_t span_right=span_left; span_right >= 1; span_right=pow2_ru_u32(span_right)/2)
+ ops = hsg_fm_merge(ops,scale_log2,span_left,span_right);
return ops;
}
@@ -1354,7 +1290,7 @@ static
struct hsg_op *
hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n)
{
- for (uint32_t r=n; r>=1; r--)
+ for (uint32_t r=1; r<=n; r++)
ops = hsg_op(ops,HM_REG_GLOBAL_STORE(r,r-1));
return ops;
@@ -1362,16 +1298,18 @@ hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n)
static
struct hsg_op *
-hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const hm_scale)
+hsg_hm_merge(struct hsg_op * ops, uint32_t const scale_log2, uint32_t const warps_pow2)
{
+ uint32_t const span = warps_pow2 << scale_log2;
+
// func proto
- ops = hsg_op(ops,HM_KERNEL_PROTO(level,level-msb_idx_u32(span)));
+ ops = hsg_op(ops,HM_KERNEL_PROTO(scale_log2));
// begin
ops = hsg_begin(ops);
- // declarations
- ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span,hm_scale));
+ // preamble for loading/storing
+ ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span/2));
// load
ops = hsg_hm_thread_load(ops,span);
@@ -1389,55 +1327,6 @@ hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uin
}
//
-//
-//
-
-static
-struct hsg_op *
-hsg_fm_merge_level(struct hsg_op * ops, uint32_t const level)
-{
- uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps);
- uint32_t const bc_max_log2 = msb_idx_u32(bc_max);
-
- uint32_t const fm_level = (level <= bc_max_log2) ? hsg_config.merge.flip.lo : min(level - bc_max_log2,hsg_config.merge.flip.hi);
- uint32_t const fm_scale = level - fm_level;
-
- ops = hsg_fm_merge(ops,
- level,
- hsg_merge[0].warps * (1u << fm_level),
- fm_scale);
-
- return ops;
-}
-
-//
-//
-//
-
-static
-struct hsg_op *
-hsg_hm_merge_level(struct hsg_op * ops, uint32_t const level)
-{
- uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps);
- uint32_t const bc_max_log2 = msb_idx_u32(bc_max);
-
- uint32_t const fm_log2_max = bc_max_log2 + hsg_config.merge.flip.hi;
-
- if (level > fm_log2_max)
- {
- uint32_t const down_warps_log2 = level - fm_log2_max;
- uint32_t const hm_level = max(hsg_config.merge.half.lo,min(hsg_config.merge.half.hi,down_warps_log2));
-
- ops = hsg_hm_merge(ops,
- level - hsg_config.merge.flip.hi,
- bc_max * (1u << hm_level),
- down_warps_log2 - hm_level);
- }
-
- return ops;
-}
-
-//
// GENERATE MERGE KERNELS
//
@@ -1445,23 +1334,20 @@ static
struct hsg_op *
hsg_xm_merge_all(struct hsg_op * ops)
{
- uint32_t const keys_per_block = hsg_merge[0].warps * hsg_config.warp.lanes * hsg_config.thread.regs;
- uint32_t const blocks = ((1U << hsg_config.merge.max_log2) + keys_per_block - 1) / keys_per_block;
- uint32_t const blocks_ru = pow2_ru_u32(blocks);
- uint32_t const blocks_log2 = msb_idx_u32(blocks_ru);
+ uint32_t const warps = hsg_merge[0].warps;
+ uint32_t const warps_pow2 = pow2_rd_u32(warps);
- for (uint32_t level=1; level<=blocks_log2; level+=1)
- {
- //
- // GENERATE FLIP MERGE KERNELS
- //
- ops = hsg_fm_merge_level(ops,level);
+ //
+ // GENERATE FLIP MERGE KERNELS
+ //
+ for (uint32_t scale_log2=hsg_config.merge.flip.lo; scale_log2<=hsg_config.merge.flip.hi; scale_log2++)
+ ops = hsg_fm_merge_all(ops,scale_log2,warps);
- //
- // GENERATE HALF MERGE KERNELS
- //
- ops = hsg_hm_merge_level(ops,level);
- }
+ //
+ // GENERATE HALF MERGE KERNELS
+ //
+ for (uint32_t scale_log2=hsg_config.merge.half.lo; scale_log2<=hsg_config.merge.half.hi; scale_log2++)
+ ops = hsg_hm_merge(ops,scale_log2,warps_pow2);
return ops;
}
@@ -1470,93 +1356,30 @@ hsg_xm_merge_all(struct hsg_op * ops)
//
//
-void
-hsg_target_indent(struct hsg_file * const files, uint32_t const depth)
-{
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%*s",
- depth*HSG_INDENT,"");
-}
-
-void
-hsg_target_debug(struct hsg_file * const files,
- const struct hsg_merge * const merge,
- const struct hsg_op * const ops,
- uint32_t const depth)
-{
-
- hsg_target_indent(files,depth);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s\n",
- hsg_op_type_string[ops->type]);
-}
-
-//
-//
-//
-
-static
-struct hsg_file*
-hsg_files_open(const char * prefix, const char ** suffix)
-{
-#define STR_BUF_SIZE 80
-
- struct hsg_file * files = malloc(sizeof(struct hsg_file) * HSG_FILE_TYPE_COUNT);
-
- for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++)
- {
- char * name = files[ii].name;
-
- // save prefix
- files[ii].prefix = prefix;
-
- // build filename
- strcpy_s(name,STR_BUF_SIZE,prefix);
- strcat_s(name,STR_BUF_SIZE,suffix[ii]);
-
- // open file
- fopen_s(&files[ii].file,name,"w+");
- }
-
- return files;
-}
-
-static
-void
-hsg_files_close(struct hsg_file * files)
-{
- for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++)
- fclose(files[ii].file);
-}
-
-//
-//
-//
-
static
-const struct hsg_op *
-hsg_op_translate_depth(hsg_target_pfn target_pfn,
- struct hsg_file * const files,
- const struct hsg_merge * const merge,
- const struct hsg_op * ops,
- uint32_t const depth)
+struct hsg_op const *
+hsg_op_translate_depth(hsg_target_pfn target_pfn,
+ struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * ops,
+ uint32_t const depth)
{
while (ops->type != HSG_OP_TYPE_EXIT)
{
switch (ops->type)
{
case HSG_OP_TYPE_END:
- target_pfn(files,merge,ops,depth-1);
+ target_pfn(target,config,merge,ops,depth-1);
return ops + 1;
case HSG_OP_TYPE_BEGIN:
- target_pfn(files,merge,ops,depth);
- ops = hsg_op_translate_depth(target_pfn,files,merge,ops+1,depth+1);
+ target_pfn(target,config,merge,ops,depth);
+ ops = hsg_op_translate_depth(target_pfn,target,config,merge,ops+1,depth+1);
break;
default:
- target_pfn(files,merge,ops++,depth);
+ target_pfn(target,config,merge,ops++,depth);
}
}
@@ -1565,12 +1388,13 @@ hsg_op_translate_depth(hsg_target_pfn target_pfn,
static
void
-hsg_op_translate(hsg_target_pfn target_pfn,
- struct hsg_file * const files,
- const struct hsg_merge * const merge,
- const struct hsg_op * ops)
+hsg_op_translate(hsg_target_pfn target_pfn,
+ struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * ops)
{
- hsg_op_translate_depth(target_pfn,files,merge,ops,0);
+ hsg_op_translate_depth(target_pfn,target,config,merge,ops,0);
}
//
@@ -1581,37 +1405,27 @@ int
main(int argc, char * argv[])
{
//
- // INIT
- //
- for (uint32_t ii=0; ii<=MERGE_LEVELS_MAX_LOG2; ii++)
- {
- hsg_merge[ii].index = ii;
- hsg_merge[ii].warps = 32 / (1u << ii);
- }
-
- //
// PROCESS OPTIONS
//
- int32_t arch = 0;
- int32_t opt = 0;
-
- bool quiet = false;
- bool autotune = false;
+ int32_t opt = 0;
+ bool verbose = false;
+ bool autotune = false;
+ char const * arch = "undefined";
- while ((opt = getopt(argc,argv,"hqa:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF)
+ while ((opt = getopt(argc,argv,"hva:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF)
{
switch (opt)
{
case 'h':
fprintf(stderr,"Help goes here...\n");
- return -1;
+ return EXIT_FAILURE;
- case 'q':
- quiet = true;
+ case 'v':
+ verbose = true;
break;
case 'a':
- arch = atoi(optarg);
+ arch = optarg;
break;
case 'g':
@@ -1635,30 +1449,28 @@ main(int argc, char * argv[])
break;
case 'w':
- hsg_config.warp.lanes = atoi(optarg);
+ hsg_config.warp.lanes = atoi(optarg);
+ hsg_config.warp.lanes_log2 = msb_idx_u32(hsg_config.warp.lanes);
break;
case 'b':
// maximum warps in a workgroup / cta / thread block
{
- uint32_t const warps = atoi(optarg);
- uint32_t const warps_ru_pow2 = pow2_ru_u32(warps);
-
- // set warps_max if not already set
- if (hsg_config.block.warps_max == UINT32_MAX)
- hsg_config.block.warps_max = warps_ru_pow2;
+ uint32_t const warps = atoi(optarg);
// must always be even
- if ((warps&1) != 0)
+ if ((warps & 1) != 0)
{
fprintf(stderr,"Error: -b must be even.\n");
- exit(-1);
+ return EXIT_FAILURE;
}
+ hsg_merge[0].index = 0;
hsg_merge[0].warps = warps;
- for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++)
- hsg_merge[ii].warps = warps_ru_pow2 / (1u << ii);
+ // set warps_max if not already set
+ if (hsg_config.block.warps_max == UINT32_MAX)
+ hsg_config.block.warps_max = pow2_ru_u32(warps);
}
break;
@@ -1677,18 +1489,14 @@ main(int argc, char * argv[])
hsg_config.block.warps_mod = atoi(optarg);
break;
- case 'k':
- hsg_config.merge.max_log2 = atoi(optarg);
- break;
-
case 'r':
{
uint32_t const regs = atoi(optarg);
- if ((regs&1) != 0)
+ if ((regs & 1) != 0)
{
fprintf(stderr,"Error: -r must be even.\n");
- exit(-1);
+ return EXIT_FAILURE;
}
hsg_config.thread.regs = regs;
@@ -1726,17 +1534,39 @@ main(int argc, char * argv[])
}
//
- // WHICH ARCH TARGET?
+ // INIT MERGE
//
- hsg_target_pfn hsg_target_pfn = (arch < HSG_TARGET_PFN_COUNT) ? hsg_target_pfns[arch] : hsg_target_debug;
+ uint32_t const warps_ru_pow2 = pow2_ru_u32(hsg_merge[0].warps);
+
+ for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++)
+ {
+ hsg_merge[ii].index = ii;
+ hsg_merge[ii].warps = warps_ru_pow2 >> ii;
+ }
//
- // OPEN FILES
+ // WHICH ARCH TARGET?
//
- struct hsg_file * files = hsg_files_open(hsg_target_pfn_string[arch],hsg_file_type_string[arch]);
+ hsg_target_pfn hsg_target_pfn;
+
+ if (strcmp(arch,"debug") == 0)
+ hsg_target_pfn = hsg_target_debug;
+ else if (strcmp(arch,"cuda") == 0)
+ hsg_target_pfn = hsg_target_cuda;
+ else if (strcmp(arch,"opencl") == 0)
+ hsg_target_pfn = hsg_target_opencl;
+ else if (strcmp(arch,"glsl") == 0)
+ hsg_target_pfn = hsg_target_glsl;
+ else {
+ fprintf(stderr,"Invalid arch: %s\n",arch);
+ exit(EXIT_FAILURE);
+ }
+
+ if (verbose)
+ fprintf(stderr,"Target: %s\n",arch);
//
- // INIT F_KEYS
+ // INIT SMEM KEY ALLOCATION
//
hsg_config_init_shared();
@@ -1766,27 +1596,26 @@ main(int argc, char * argv[])
//
// THESE ARE FOR DEBUG/INSPECTION
//
-
- if (!quiet)
+ if (verbose)
{
hsg_merge_levels_debug(merge);
}
}
- if (!quiet)
+ if (verbose)
fprintf(stderr,"\n\n");
//
+ // GENERATE THE OPCODES
//
- //
- uint32_t const op_count = 1024*1024; // 2^20 ops for now!
- struct hsg_op * const ops_begin = malloc(op_count * sizeof(*ops_begin));
+ uint32_t const op_count = 1<<17;
+ struct hsg_op * const ops_begin = malloc(sizeof(*ops_begin) * op_count);
struct hsg_op * ops = ops_begin;
//
- // APPEND HEADER
+ // OPEN INITIAL FILES AND APPEND HEADER
//
- ops = hsg_op(ops,FILE_HEADER());
+ ops = hsg_op(ops,TARGET_BEGIN());
//
// GENERATE TRANSPOSE KERNEL
@@ -1809,9 +1638,9 @@ main(int argc, char * argv[])
ops = hsg_xm_merge_all(ops);
//
- // APPEND FOOTER
+ // APPEND FOOTER AND CLOSE INITIAL FILES
//
- ops = hsg_op(ops,FILE_FOOTER());
+ ops = hsg_op(ops,TARGET_END());
//
// ... WE'RE DONE!
@@ -1821,20 +1650,17 @@ main(int argc, char * argv[])
//
// APPLY TARGET TRANSLATOR TO ACCUMULATED OPS
//
- hsg_op_translate(hsg_target_pfn,files,hsg_merge,ops_begin);
+ struct hsg_target target;
- //
- //
- //
- if (!quiet)
- hsg_op_debug();
+ hsg_op_translate(hsg_target_pfn,&target,&hsg_config,hsg_merge,ops_begin);
//
+ // DUMP INSTRUCTION COUNTS
//
- //
- hsg_files_close(files);
+ if (verbose)
+ hsg_op_debug();
- return 0;
+ return EXIT_SUCCESS;
}
//
diff --git a/src/compute/hs/gen/networks_merging.c b/src/compute/hs/gen/networks_merging.c
index 90dca03c21..f93958c842 100644
--- a/src/compute/hs/gen/networks_merging.c
+++ b/src/compute/hs/gen/networks_merging.c
@@ -11,7 +11,7 @@
//
#include "networks.h"
-#include "macros.h"
+#include "common/macros.h"
//
//
@@ -24,7 +24,7 @@
//
//
-#define LM(n) { ARRAY_LENGTH(mn##n), mn##n }
+#define LM(n) { ARRAY_LENGTH_MACRO(mn##n), mn##n }
//
//
diff --git a/src/compute/hs/gen/networks_sorting.c b/src/compute/hs/gen/networks_sorting.c
index c7beb6b45e..3d8d364399 100644
--- a/src/compute/hs/gen/networks_sorting.c
+++ b/src/compute/hs/gen/networks_sorting.c
@@ -14,7 +14,7 @@
//
#include "networks.h"
-#include "macros.h"
+#include "common/macros.h"
//
//
@@ -27,7 +27,7 @@
//
//
-#define LS(n) { ARRAY_LENGTH(sn##n), sn##n }
+#define LS(n) { ARRAY_LENGTH_MACRO(sn##n), sn##n }
//
//
diff --git a/src/compute/hs/gen/target_cuda.c b/src/compute/hs/gen/target_cuda.c
new file mode 100644
index 0000000000..e140c4be4c
--- /dev/null
+++ b/src/compute/hs/gen/target_cuda.c
@@ -0,0 +1,600 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+#include "transpose.h"
+
+#include "common/util.h"
+#include "common/macros.h"
+
+//
+//
+//
+
+struct hsg_transpose_state
+{
+ FILE * header;
+ struct hsg_config const * config;
+};
+
+static
+char
+hsg_transpose_reg_prefix(uint32_t const cols_log2)
+{
+ return 'a' + (('r' + cols_log2 - 'a') % 26);
+}
+
+static
+void
+hsg_transpose_blend(uint32_t const cols_log2,
+ uint32_t const row_ll, // lower-left
+ uint32_t const row_ur, // upper-right
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(cols_log2-1),
+ hsg_transpose_reg_prefix(cols_log2),
+ cols_log2,row_ll+1,row_ur+1);
+}
+
+static
+void
+hsg_transpose_remap(uint32_t const row_from,
+ uint32_t const row_to,
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(state->config->warp.lanes_log2),
+ row_from+1,row_to+1);
+}
+
+//
+//
+//
+
+static
+void
+hsg_copyright(FILE * file)
+{
+ fprintf(file,
+ "// \n"
+ "// Copyright 2016 Google Inc. \n"
+ "// \n"
+ "// Use of this source code is governed by a BSD-style \n"
+ "// license that can be found in the LICENSE file. \n"
+ "// \n"
+ "\n");
+}
+
+//
+//
+//
+
+struct hsg_target_state
+{
+ FILE * header;
+ FILE * source;
+};
+
+//
+//
+//
+
+void
+hsg_target_cuda(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth)
+{
+ switch (ops->type)
+ {
+ case HSG_OP_TYPE_END:
+ fprintf(target->state->source,
+ "}\n");
+ break;
+
+ case HSG_OP_TYPE_BEGIN:
+ fprintf(target->state->source,
+ "{\n");
+ break;
+
+ case HSG_OP_TYPE_ELSE:
+ fprintf(target->state->source,
+ "else\n");
+ break;
+
+ case HSG_OP_TYPE_TARGET_BEGIN:
+ {
+ // allocate state
+ target->state = malloc(sizeof(*target->state));
+
+ // allocate files
+ fopen_s(&target->state->header,"hs_cuda.h", "wb");
+ fopen_s(&target->state->source,"hs_cuda.cu","wb");
+
+ // initialize header
+ uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
+
+ hsg_copyright(target->state->header);
+
+ fprintf(target->state->header,
+ "#ifndef HS_CUDA_ONCE \n"
+ "#define HS_CUDA_ONCE \n"
+ " \n"
+ "#define HS_SLAB_THREADS_LOG2 %u \n"
+ "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n"
+ "#define HS_SLAB_WIDTH_LOG2 %u \n"
+ "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n"
+ "#define HS_SLAB_HEIGHT %u \n"
+ "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n"
+ "#define HS_REG_LAST(c) c##%u \n"
+ "#define HS_KEY_TYPE %s \n"
+ "#define HS_KEY_WORDS %u \n"
+ "#define HS_VAL_WORDS 0 \n"
+ "#define HS_BS_SLABS %u \n"
+ "#define HS_BS_SLABS_LOG2_RU %u \n"
+ "#define HS_BC_SLABS_LOG2_MAX %u \n"
+ "#define HS_FM_SCALE_MIN %u \n"
+ "#define HS_FM_SCALE_MAX %u \n"
+ "#define HS_HM_SCALE_MIN %u \n"
+ "#define HS_HM_SCALE_MAX %u \n"
+ "#define HS_EMPTY \n"
+ " \n",
+ config->warp.lanes_log2,
+ config->warp.lanes_log2,
+ config->thread.regs,
+ config->thread.regs,
+ (config->type.words == 2) ? "ulong" : "uint",
+ config->type.words,
+ merge->warps,
+ msb_idx_u32(pow2_ru_u32(merge->warps)),
+ bc_max,
+ config->merge.flip.lo,
+ config->merge.flip.hi,
+ config->merge.half.lo,
+ config->merge.half.hi);
+
+ fprintf(target->state->header,
+ "#define HS_SLAB_ROWS() \\\n");
+
+ for (uint32_t ii=1; ii<=config->thread.regs; ii++)
+ fprintf(target->state->header,
+ " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+ fprintf(target->state->header,
+ "#define HS_TRANSPOSE_SLAB() \\\n");
+
+ for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++)
+ fprintf(target->state->header,
+ " HS_TRANSPOSE_STAGE( %u ) \\\n",ii);
+
+ struct hsg_transpose_state state[1] =
+ {
+ { .header = target->state->header,
+ .config = config
+ }
+ };
+
+ hsg_transpose(config->warp.lanes_log2,
+ config->thread.regs,
+ hsg_transpose_blend,state,
+ hsg_transpose_remap,state);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+ hsg_copyright(target->state->source);
+
+ fprintf(target->state->source,
+ "#include \"hs_cuda_macros.h\" \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n");
+ }
+ break;
+
+ case HSG_OP_TYPE_TARGET_END:
+ // decorate the files
+ fprintf(target->state->header,
+ "#endif \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ " \n");
+ fprintf(target->state->source,
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ " \n");
+
+ // close files
+ fclose(target->state->header);
+ fclose(target->state->source);
+
+ // free state
+ free(target->state);
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
+ {
+ fprintf(target->state->source,
+ "\nHS_TRANSPOSE_KERNEL_PROTO(%u)\n",
+ config->warp.lanes);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
+ {
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
+ {
+ fprintf(target->state->source,
+ "HS_TRANSPOSE_SLAB()\n");
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const bs = pow2_ru_u32(m->warps);
+ uint32_t const msb = msb_idx_u32(bs);
+
+ fprintf(target->state->source,
+ "\nHS_BS_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bs);
+ }
+
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const msb = msb_idx_u32(m->warps);
+
+ fprintf(target->state->source,
+ "\nHS_BC_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bc);
+ }
+
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PROTO:
+ fprintf(target->state->source,
+ "\nHS_FM_KERNEL_PROTO(%u,%u)\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_FM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PROTO:
+ {
+ fprintf(target->state->source,
+ "\nHS_HM_KERNEL_PROTO(%u)\n",
+ ops->a);
+ }
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_HM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
+ {
+ static char const * const vstr[] = { "vin", "vout" };
+
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n",
+ ops->n,vstr[ops->v],config->warp.lanes,ops->n-1);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n",
+ config->warp.lanes,ops->n-1,ops->n);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
+ {
+ if (ops->a <= ops->b)
+ {
+ fprintf(target->state->source,
+ "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n");
+ }
+ else if (ops->b > 1)
+ {
+ fprintf(target->state->source,
+ "else if (fm_frac == %u)\n",
+ ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else\n");
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_SLAB_FLIP:
+ fprintf(target->state->source,
+ "HS_SLAB_FLIP_PREAMBLE(%u);\n",
+ ops->n-1);
+ break;
+
+ case HSG_OP_TYPE_SLAB_HALF:
+ fprintf(target->state->source,
+ "HS_SLAB_HALF_PREAMBLE(%u);\n",
+ ops->n / 2);
+ break;
+
+ case HSG_OP_TYPE_CMP_FLIP:
+ fprintf(target->state->source,
+ "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c);
+ break;
+
+ case HSG_OP_TYPE_CMP_HALF:
+ fprintf(target->state->source,
+ "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_CMP_XCHG:
+ if (ops->c == UINT32_MAX)
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%-3u,r%-3u);\n",
+ ops->a,ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%u_%u,r%u_%u);\n",
+ ops->c,ops->a,ops->c,ops->b);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
+ fprintf(target->state->source,
+ "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n",
+ merge[ops->a].warps,config->warp.lanes,ops->c,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,merge[ops->a].warps,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,ops->a,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n",
+ ops->c,
+ ops->a,
+ config->warp.lanes,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BLOCK_SYNC:
+ fprintf(target->state->source,
+ "HS_BLOCK_BARRIER();\n");
+ //
+ // FIXME - Named barriers to allow coordinating warps to proceed?
+ //
+ break;
+
+ case HSG_OP_TYPE_BS_FRAC_PRED:
+ {
+ if (ops->m == 0)
+ {
+ fprintf(target->state->source,
+ "if (warp_idx < bs_full)\n");
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else if (bs_frac == %u)\n",
+ ops->w);
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n",
+ config->warp.lanes,config->thread.regs,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_MERGE_H_PRED:
+ fprintf(target->state->source,
+ "if (get_sub_group_id() < %u)\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_ACTIVE_PRED:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps <= 32)
+ {
+ fprintf(target->state->source,
+ "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
+ m->levels[ops->b].active.b32a2[1],
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ }
+ break;
+
+ default:
+ fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]);
+ exit(EXIT_FAILURE);
+ break;
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/target_cuda_sm3x.c b/src/compute/hs/gen/target_cuda_sm3x.c
deleted file mode 100644
index 6369aa33b0..0000000000
--- a/src/compute/hs/gen/target_cuda_sm3x.c
+++ /dev/null
@@ -1,776 +0,0 @@
-/*
- * Copyright 2016 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-#include <stdio.h>
-
-//
-//
-//
-
-#include "gen.h"
-#include "util.h"
-
-//
-//
-//
-
-void
-hsg_target_cuda_sm3x(struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth)
-{
- const char* const type = (hsg_config.type.words == 2) ? "uint64_t" : "uint32_t";
- const char* const type_max = (hsg_config.type.words == 2) ? "UINT64_MAX" : "UINT32_MAX";
-
- switch (ops->type)
- {
- case HSG_OP_TYPE_END:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "}\n");
- break;
-
- case HSG_OP_TYPE_BEGIN:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "{\n");
- break;
-
- case HSG_OP_TYPE_ELSE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else\n");
- break;
-
- case HSG_OP_TYPE_FILE_HEADER:
- {
- uint32_t const bc_min = msb_idx_u32(hsg_config.block.warps_min);
- uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps));
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "// \n"
- "// Copyright 2016 Google Inc. \n"
- "// \n"
- "// Use of this source code is governed by a BSD-style \n"
- "// license that can be found in the LICENSE file. \n"
- "// \n"
- " \n"
- "#pragma once \n"
- " \n"
- "#include <stdint.h> \n"
- " \n"
- "#define HS_LANES_PER_WARP %u \n"
- "#define HS_BS_WARPS_PER_BLOCK %u \n"
- "#define HS_BC_WARPS_LOG2_MIN %u \n"
- "#define HS_BC_WARPS_LOG2_MAX %u \n"
- "#define HS_KEYS_PER_THREAD %u \n"
- "#define HS_KEY_WORDS %u \n"
- "#define HS_KEY_TYPE %s \n"
- " \n"
- "#include <%s_args.h> \n"
- " \n",
- hsg_config.warp.lanes,
- merge->warps,
- bc_min,
- bc_max,
- hsg_config.thread.regs,
- hsg_config.type.words,
- type,
- files[HSG_FILE_TYPE_SOURCE].prefix);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "// -*- compile-command: \"nvcc -arch sm_52 -Xptxas=-v,-abi=no -cubin -I. %s\"; -*-\n",
- files[HSG_FILE_TYPE_SOURCE].name);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "// \n"
- "// Copyright 2016 Google Inc. \n"
- "// \n"
- "// Use of this source code is governed by a BSD-style \n"
- "// license that can be found in the LICENSE file. \n"
- "// \n"
- " \n"
- "#ifdef __cplusplus \n"
- "extern \"C\" { \n"
- "#endif \n"
- " \n"
- "#include \"%s_launcher.h\" \n"
- " \n"
- "#ifdef __cplusplus \n"
- "} \n"
- "#endif \n"
- " \n"
- "#include \"%s_launch_bounds.h\" \n"
- "#include <%s_finalize.inl> \n"
- " \n"
- "// \n"
- "// \n"
- "// \n",
- files[HSG_FILE_TYPE_HEADER].prefix,
- files[HSG_FILE_TYPE_SOURCE].prefix,
- files[HSG_FILE_TYPE_SOURCE].prefix);
- }
- break;
-
- case HSG_OP_TYPE_FILE_FOOTER:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "// \n"
- "// \n"
- "// \n"
- " \n"
- "#include \"%s_launcher.inl\" \n"
- " \n"
- "// \n"
- "// \n"
- "// \n",
- files[HSG_FILE_TYPE_SOURCE].prefix);
- break;
-
- case HSG_OP_TYPE_BS_KERNEL_PROTO:
- {
- const uint32_t tpb = merge->warps * hsg_config.warp.lanes;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "extern \"C\" \n"
- "__global__ \n"
- "__launch_bounds__(%u,%u) \n"
- "void \n"
- "hs_bs_kernel(const struct hs_args args) \n",
- tpb,1);
- }
- break;
-
- case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "__shared__ union { \n");
-
- for (uint32_t ii=0; ii<MERGE_LEVELS_MAX_LOG2; ii++)
- {
- const struct hsg_merge* const m = merge + ii;
-
- if (m->warps < 2)
- break;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " %s m%u[%u][%u];\n",
- type,
- ii,
- m->rows_bs,
- m->warps * hsg_config.warp.lanes);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " struct { \n"
- " %s f[%u][%u]; \n"
- " %s l[%u]; \n"
- " }; \n",
- type,
- merge[0].warps,
- hsg_config.warp.skpw_bs - 1,
- type,
- merge[0].warps);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "} shared; \n"
- " \n");
-
- const uint32_t kpw = hsg_config.warp.lanes * hsg_config.thread.regs;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t block_warp_idx = threadIdx.x / %u; \n"
- "const int32_t warp_lane_idx = threadIdx.x & %u; \n"
- "const int32_t warp_idx = blockIdx.x * %u + block_warp_idx; \n"
- "const int32_t warp_gmem_idx = warp_idx * %u + warp_lane_idx; \n"
- " \n"
- "%s const * const vin_ptr = args.vin + warp_gmem_idx; \n"
- "%s * const vout_ptr = args.vout + warp_gmem_idx; \n"
- " \n",
-
- hsg_config.warp.lanes,
- hsg_config.warp.lanes - 1,
- merge[0].warps,
- kpw,
- type,
- type);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (warp_idx >= args.bs.full + args.bs.frac) \n"
- " return; \n"
- " \n");
- }
- break;
-
- case HSG_OP_TYPE_BC_KERNEL_PROTO:
- {
- uint32_t const bc_warps = merge[ops->a].warps;
- uint32_t const tpb = bc_warps * hsg_config.warp.lanes;
- uint32_t const bpm = hsg_config.block.warps_max / bc_warps;
- uint32_t const msb = msb_idx_u32(bc_warps);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "extern \"C\" \n"
- "__global__ \n"
- "__launch_bounds__(%u,%u) \n"
- "void \n"
- "hs_bc_%u_kernel(const struct hs_args args) \n",
- tpb,bpm,
- msb);
- }
- break;
-
- case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
- {
- const struct hsg_merge* const m = merge + ops->a;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "__shared__ union { \n");
-
- if (m->warps >= 2)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " %s m%u[%u][%u]; \n",
- type,
- ops->a,
- m->rows_bc,
- m->warps * hsg_config.warp.lanes);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " struct { \n"
- " %s f[%u][%u]; \n"
- " %s l[%u]; \n"
- " }; \n"
- "} shared; \n"
- " \n",
- type,m->warps,m->skpw_bc - 1,
- type,m->warps);
-
- const uint32_t kpw = hsg_config.warp.lanes * hsg_config.thread.regs;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t block_warp_idx = threadIdx.x / %u; \n"
- "const int32_t warp_lane_idx = threadIdx.x & %u; \n"
- "const int32_t warp_gmem_base = blockIdx.x * %u * %u + warp_lane_idx; \n"
- "const int32_t warp_gmem_idx = warp_gmem_base + block_warp_idx * %u; \n"
- " \n"
- "%s * const vout_ptr = args.vout + warp_gmem_idx; \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes - 1,
- m->warps,kpw,
- kpw,
- type);
-
-#if 0
- //
- // NO LONGER NEED THIS TEST
- //
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (warp_idx >= args.bc.full) \n"
- " return; \n"
- " \n");
-#endif
- }
- break;
-
- case HSG_OP_TYPE_FM_KERNEL_PROTO:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "#define HS_FM_WARPS_LOG2_%u %u \n"
- "extern \"C\" \n"
- "__global__ \n"
- "HS_FM_LAUNCH_BOUNDS_%u \n"
- "void \n"
- "hs_fm_%u_kernel(const struct hs_args args) \n",
- ops->a,
- ops->b,
- ops->a - ops->b,
- ops->a);
- break;
-
- case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t warp_idx = (blockDim.x * blockIdx.x + threadIdx.x) / %u; \n"
- "const int32_t warp_lane_idx = threadIdx.x & %u; \n"
- " \n"
- "const int32_t merge_idx = warp_idx / %u >> %u; \n"
- " \n"
- "const int32_t merge_stride = %u * %u << %u; \n"
- "const int32_t merge_keys = merge_stride * %u; \n"
- " \n"
- "const int32_t merge_base = merge_idx * merge_keys; \n"
- " \n"
- "const int32_t merge_l_off = (warp_idx - merge_idx * (%u << %u)) * %u + warp_lane_idx; \n"
- "const int32_t merge_l_end = merge_l_off + merge_stride * (%u / 2 - 1); \n"
- "%s * const merge_l = args.vout + merge_base + merge_l_off; \n"
- " \n"
- "const int32_t merge_r_off = merge_keys - merge_l_end - 1; \n"
- "%s * const merge_r = args.vout + merge_base + merge_r_off; \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes-1,
- hsg_config.thread.regs,ops->b,
- hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
- ops->a,
- hsg_config.thread.regs,ops->b,hsg_config.warp.lanes,
- ops->a,
- type,
- type);
- break;
-
- case HSG_OP_TYPE_HM_KERNEL_PROTO:
- {
- const uint32_t bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps));
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "#define HS_HM_WARPS_LOG2_%u %u \n"
- "extern \"C\" \n"
- "__global__ \n"
- "HS_HM_LAUNCH_BOUNDS_%u \n"
- "void \n"
- "hs_hm_%u_kernel(const struct hs_args args) \n",
- ops->a,
- ops->b,
- ops->a - ops->b - bc_max,
- ops->a);
- }
- break;
-
- case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t warp_idx = (blockDim.x * blockIdx.x + threadIdx.x) / %u; \n"
- "const int32_t warp_lane_idx = threadIdx.x & %u; \n"
- " \n"
- "const int32_t merge_idx = (warp_idx / %u) >> %u; \n"
- " \n"
- "const int32_t merge_stride = %u * %u << %u; \n"
- "const int32_t merge_keys = merge_stride * %u; \n"
- " \n"
- "const int32_t merge_base = merge_idx * merge_keys; \n"
- " \n"
- "const int32_t merge_off = (warp_idx - merge_idx * (%u << %u)) * %u; \n"
- "%s * const merge_ptr = args.vout + merge_base + merge_off + warp_lane_idx; \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes-1,
- hsg_config.thread.regs,ops->b,
- hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
- ops->a,
- hsg_config.thread.regs,ops->b,hsg_config.warp.lanes,
- type);
- break;
-
- case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
- {
- static const char* const vstr[] = { "vin_ptr", "vout_ptr" };
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%-3u = %s[%-3u * %u]; \n",
- type,ops->n,vstr[ops->v],ops->n-1,hsg_config.warp.lanes);
- }
- break;
-
- case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "vout_ptr[%-3u * %u] = r%u; \n",
- ops->n-1,hsg_config.warp.lanes,ops->n);
- break;
-
-#if 0
- case HSG_OP_TYPE_BX_WARP_STORE_PRED:
- if (ops->a == 1)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (!args.is_final) \n");
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (blockIdx.x * %u + block_warp_idx >= args.bx.ru) \n"
- "{ \n"
- " return; \n"
- "} \n"
- "else if (!args.is_final) \n",
- ops->a);
- }
- break;
-#endif
-
- case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%-3u = merge_ptr[%-3u * merge_stride];\n",
- type,ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_ptr[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%-3u = merge_l[%-3u * merge_stride];\n",
- type,ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_l[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%-3u = merge_r[%-3u * merge_stride];\n",
- type,ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_r[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_WARP_FLIP:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t flip_lane_mask = %u; \n"
- "const int32_t flip_lane_idx = warp_lane_idx ^ flip_lane_mask; \n"
- "const bool t_lt = warp_lane_idx < flip_lane_idx; \n",
- ops->n-1);
- }
- break;
-
- case HSG_OP_TYPE_WARP_HALF:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const int32_t half_lane_mask = %u; \n"
- "const int32_t half_lane_idx = warp_lane_idx ^ half_lane_mask; \n"
- "const bool t_lt = warp_lane_idx < half_lane_idx; \n",
- ops->n / 2);
- }
- break;
-
- case HSG_OP_TYPE_CMP_FLIP:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_FLIP(r%-3u,r%-3u,r%-3u)\n",ops->a,ops->b,ops->c);
- break;
-
- case HSG_OP_TYPE_CMP_HALF:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_HALF(r%-3u,r%-3u)\n",ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_CMP_XCHG:
- if (ops->c == UINT32_MAX)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_XCHG(r%-3u,r%-3u)\n",
- ops->a,ops->b);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_XCHG(r%u_%u,r%u_%u)\n",
- ops->c,ops->a,ops->c,ops->b);
- }
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "smem_v[%-3u * %-2u * %-3u] = r%u;\n",
- ops->a,hsg_config.warp.lanes,ops->c,ops->b);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "r%-3u = smem_v[%-3u * %-2u * %-3u];\n",
- ops->b,ops->a,hsg_config.warp.lanes,ops->c);
- break;
-
- case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%-3u = smem_v[%-3u * %-2u * %-3u];\n",
- type,ops->b,ops->a,hsg_config.warp.lanes,ops->c);
- break;
-
- case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "smem_l[%5u] = r%u_%u;\n",
- ops->b * hsg_config.warp.lanes,
- ops->c,
- ops->a);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "smem_r[%5u] = r%u_%u;\n",
- ops->b * hsg_config.warp.lanes,
- ops->c,
- ops->a);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%u_%-3u = smem_l[%u];\n",
- type,
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%u_%-3u = smem_r[%u];\n",
- type,
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s r%u_%-3u = gmem_l[%u];\n",
- type,
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
-#if 0
- case HSG_OP_TYPE_REG_F_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s* const f_%u_smem_st_ptr = &shared.f[block_warp_idx]",
- type,
- ops->a);
-
- if (ops->a >= (int32_t)hsg_config.warp.lanes)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "[warp_lane_idx * %u];\n",
- (ops->a / hsg_config.warp.lanes) * hsg_config.warp.lanes + 1);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "[(warp_lane_idx & 0x%X) * %u + (warp_lane_idx & ~0x%X)];\n",
- ops->a-1,
- hsg_config.warp.lanes + 1,
- ops->a-1);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "const %s* const f_%u_smem_ld_ptr = &shared.f[block_warp_idx][warp_lane_idx];\n",
- type,
- ops->a);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s* const f_%u_gmem_st_ptr = args.vout + warp_gmem_idx",
- type,
- ops->a);
-
- if (ops->a >= (int32_t)hsg_config.warp.lanes)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,";\n");
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " - warp_lane_idx + (warp_lane_idx & ~0x%X) * %u + (warp_lane_idx & 0x%X);\n",
- ops->a-1,
- hsg_config.thread.regs,
- ops->a-1);
- }
- break;
-
- case HSG_OP_TYPE_REG_SHARED_STORE_F:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "f_%u_smem_st_ptr[%-3u] = r%u;\n",
- ops->c,
- ops->b,
- ops->a);
- break;
-
- case HSG_OP_TYPE_REG_SHARED_LOAD_F:
- if (ops->c >= (int32_t)hsg_config.warp.lanes)
- {
- uint32_t const adjacent = ops->c / hsg_config.warp.lanes;
- uint32_t const stride = adjacent * hsg_config.warp.lanes + 1;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "r%-3u = f_%u_smem_ld_ptr[%-3u];\n",
- ops->a,
- ops->c,
- (ops->b / adjacent) * stride + (ops->b % adjacent) * hsg_config.warp.lanes);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "r%-3u = f_%u_smem_ld_ptr[%-3u];\n",
- ops->a,
- ops->c,
- ops->b * (hsg_config.warp.lanes + 1));
- }
- break;
-
- case HSG_OP_TYPE_REG_GLOBAL_STORE_F:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "f_%u_gmem_st_ptr[%-3u * %u + %-3u] = r%u;\n",
- ops->c,
- ops->b,
- hsg_config.thread.regs, // hsg_config.warp.lanes,
- (ops->a - 1) & ~(ops->c - 1),
- ops->a);
- break;
-#endif
-
-#if 0
- case HSG_OP_TYPE_FINALIZE:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_FINALIZE(%s,args,shared.f[block_warp_idx],shared.l,\n"
- " block_warp_idx,warp_lane_idx,warp_gmem_idx,\n"
- " r%-3u",
- ops->a == 1 ? "true" : "false",
- 1);
-
-#define HS_WARP_FINALIZE_PRETTY_PRINT 8
-
- for (uint32_t r=2; r<=hsg_config.thread.regs; r++)
- {
- if (r % HS_WARP_FINALIZE_PRETTY_PRINT == 1)
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,",\n");
- else
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,",");
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,"r%-3u",r);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,");\n");
- }
- break;
-#endif
-
- case HSG_OP_TYPE_BLOCK_SYNC:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "__syncthreads();\n");
- break;
-
- case HSG_OP_TYPE_BS_FRAC_PRED:
- {
- if (ops->m == 0)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (warp_idx < args.bs.full)\n");
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else if (args.bs.frac == %u)\n",
- ops->w);
- }
- }
- break;
-
-#if 0 // DELETED
- case HSG_OP_TYPE_BX_MERGE_V_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s * const smem_v = shared.m%u[0] + threadIdx.x; \n",
- type,ops->a);
- break;
-#endif
-
- case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
- if (ops->c == 0)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s * smem_l = shared.m%u[block_warp_idx ] + warp_lane_idx; \n"
- "%s * smem_r = shared.m%u[block_warp_idx ^ 1] + (warp_lane_idx ^ %u); \n",
- type,ops->a,
- type,ops->a,hsg_config.warp.lanes-1);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "smem_l = shared.m%u[block_warp_idx ] + warp_lane_idx; \n"
- "smem_r = shared.m%u[block_warp_idx ^ 1] + (warp_lane_idx ^ %u); \n",
- ops->a,
- ops->a,hsg_config.warp.lanes-1);
- }
- break;
-
- case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "%s const * const gmem_l = args.vout + (warp_gmem_base + block_warp_idx * %u); \n"
- "%s * const smem_l = shared.m%u[block_warp_idx] + warp_lane_idx; \n"
- "%s * const smem_v = shared.m%u[0] + threadIdx.x; \n",
- type,hsg_config.warp.lanes,
- type,ops->a,
- type,ops->a);
- break;
-
- case HSG_OP_TYPE_BX_MERGE_H_PRED:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (threadIdx.x < %u)\n",
- ops->a * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BS_ACTIVE_PRED:
- {
- const struct hsg_merge* const m = merge + ops->a;
-
- if (m->warps <= 32)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (((1u << block_warp_idx) & 0x%08X) != 0)\n",
- m->levels[ops->b].active.b32a2[0]);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (((1UL << block_warp_idx) & 0x%08X%08XL) != 0L)\n",
- m->levels[ops->b].active.b32a2[1],
- m->levels[ops->b].active.b32a2[0]);
- }
- }
- break;
-
- case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
- {
- if (ops->a == ops->b)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (merge_idx < args.fm.full) \n");
- }
- else if (ops->b > 1)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else if (args.fm.frac == %u) \n",
- ops->b);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else\n");
- }
- }
- break;
-
- default:
- hsg_target_debug(files,merge,ops,depth);
- break;
- }
-}
-
-//
-//
-//
diff --git a/src/compute/hs/gen/target_debug.c b/src/compute/hs/gen/target_debug.c
new file mode 100644
index 0000000000..1481ca8041
--- /dev/null
+++ b/src/compute/hs/gen/target_debug.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+
+//
+//
+//
+
+#define HSG_INDENT 2
+
+//
+//
+//
+
+struct hsg_target_state
+{
+ FILE * txt;
+};
+
+//
+//
+//
+
+void
+hsg_target_indent(struct hsg_target * const target, uint32_t const depth)
+{
+ fprintf(target->state->txt,
+ "%*s",
+ depth*HSG_INDENT,"");
+}
+
+void
+hsg_target_debug(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth)
+{
+ if (ops->type == HSG_OP_TYPE_TARGET_BEGIN)
+ {
+ target->state = malloc(sizeof(*target->state));
+ fopen_s(&target->state->txt,"hs_debug.txt","wb");
+ }
+
+ hsg_target_indent(target,depth);
+
+ fprintf(target->state->txt,
+ "%s\n",
+ hsg_op_type_string[ops->type]);
+
+ if (ops->type == HSG_OP_TYPE_TARGET_END)
+ {
+ fclose(target->state->txt);
+ free(target->state);
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/target_glsl.c b/src/compute/hs/gen/target_glsl.c
new file mode 100644
index 0000000000..2bb75797ab
--- /dev/null
+++ b/src/compute/hs/gen/target_glsl.c
@@ -0,0 +1,674 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+#include "transpose.h"
+
+#include "common/util.h"
+#include "common/macros.h"
+
+//
+//
+//
+
+struct hsg_transpose_state
+{
+ FILE * header;
+ struct hsg_config const * config;
+};
+
+static
+char
+hsg_transpose_reg_prefix(uint32_t const cols_log2)
+{
+ return 'a' + (('r' + cols_log2 - 'a') % 26);
+}
+
+static
+void
+hsg_transpose_blend(uint32_t const cols_log2,
+ uint32_t const row_ll, // lower-left
+ uint32_t const row_ur, // upper-right
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(cols_log2-1),
+ hsg_transpose_reg_prefix(cols_log2),
+ cols_log2,row_ll+1,row_ur+1);
+}
+
+static
+void
+hsg_transpose_remap(uint32_t const row_from,
+ uint32_t const row_to,
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(state->config->warp.lanes_log2),
+ row_from+1,row_to+1);
+}
+
+//
+//
+//
+
+static
+void
+hsg_copyright(FILE * file)
+{
+ fprintf(file,
+ "// \n"
+ "// Copyright 2016 Google Inc. \n"
+ "// \n"
+ "// Use of this source code is governed by a BSD-style \n"
+ "// license that can be found in the LICENSE file. \n"
+ "// \n"
+ "\n");
+}
+
+static
+void
+hsg_macros(FILE * file)
+{
+ fprintf(file,
+ "#include \"hs_glsl_macros.h\" \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ "\n");
+}
+
+//
+//
+//
+
+struct hsg_target_state
+{
+ FILE * header;
+ FILE * embeds;
+ FILE * source;
+};
+
+//
+//
+//
+
+void
+hsg_target_glsl(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth)
+{
+ switch (ops->type)
+ {
+ case HSG_OP_TYPE_END:
+ fprintf(target->state->source,
+ "}\n");
+
+ if (depth == 0) {
+ fclose(target->state->source);
+ target->state->source = NULL;
+ }
+ break;
+
+ case HSG_OP_TYPE_BEGIN:
+ fprintf(target->state->source,
+ "{\n");
+ break;
+
+ case HSG_OP_TYPE_ELSE:
+ fprintf(target->state->source,
+ "else\n");
+ break;
+
+ case HSG_OP_TYPE_TARGET_BEGIN:
+ {
+ // allocate state
+ target->state = malloc(sizeof(*target->state));
+
+ // allocate files
+ fopen_s(&target->state->header,"hs_glsl.h", "wb");
+ fopen_s(&target->state->embeds,"hs_kernels.h","wb");
+
+ hsg_copyright(target->state->header);
+ hsg_copyright(target->state->embeds);
+
+ // initialize header
+ uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
+
+ fprintf(target->state->header,
+ "#ifndef HS_GLSL_ONCE \n"
+ "#define HS_GLSL_ONCE \n"
+ " \n"
+ "#define HS_SLAB_THREADS_LOG2 %u \n"
+ "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n"
+ "#define HS_SLAB_WIDTH_LOG2 %u \n"
+ "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n"
+ "#define HS_SLAB_HEIGHT %u \n"
+ "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n"
+ "#define HS_REG_LAST(c) c##%u \n"
+ "#define HS_KEY_TYPE %s \n"
+ "#define HS_KEY_WORDS %u \n"
+ "#define HS_VAL_WORDS 0 \n"
+ "#define HS_BS_SLABS %u \n"
+ "#define HS_BS_SLABS_LOG2_RU %u \n"
+ "#define HS_BC_SLABS_LOG2_MAX %u \n"
+ "#define HS_FM_SCALE_MIN %u \n"
+ "#define HS_FM_SCALE_MAX %u \n"
+ "#define HS_HM_SCALE_MIN %u \n"
+ "#define HS_HM_SCALE_MAX %u \n"
+ "#define HS_EMPTY \n"
+ " \n",
+ config->warp.lanes_log2,
+ config->warp.lanes_log2,
+ config->thread.regs,
+ config->thread.regs,
+ (config->type.words == 2) ? "uint64_t" : "uint32_t",
+ config->type.words,
+ merge->warps,
+ msb_idx_u32(pow2_ru_u32(merge->warps)),
+ bc_max,
+ config->merge.flip.lo,
+ config->merge.flip.hi,
+ config->merge.half.lo,
+ config->merge.half.hi);
+
+ fprintf(target->state->header,
+ "#define HS_SLAB_ROWS() \\\n");
+
+ for (uint32_t ii=1; ii<=config->thread.regs; ii++)
+ fprintf(target->state->header,
+ " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+ fprintf(target->state->header,
+ "#define HS_TRANSPOSE_SLAB() \\\n");
+
+ for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++)
+ fprintf(target->state->header,
+ " HS_TRANSPOSE_STAGE( %u ) \\\n",ii);
+
+ struct hsg_transpose_state state[1] =
+ {
+ { .header = target->state->header,
+ .config = config
+ }
+ };
+
+ hsg_transpose(config->warp.lanes_log2,
+ config->thread.regs,
+ hsg_transpose_blend,state,
+ hsg_transpose_remap,state);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+#if 0
+ fprintf(target->state->source,
+ "#include <hs_glsl_macros.h> \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n");
+#endif
+ }
+ break;
+
+ case HSG_OP_TYPE_TARGET_END:
+ // decorate the files
+ fprintf(target->state->header,
+ "#endif \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ " \n");
+
+ // close files
+ fclose(target->state->header);
+ fclose(target->state->embeds);
+
+ // free state
+ free(target->state);
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
+ {
+ fprintf(target->state->embeds,
+ "#include \"hs_transpose.len.xxd\"\n,\n"
+ "#include \"hs_transpose.spv.xxd\"\n,\n");
+
+ fopen_s(&target->state->source,"hs_transpose.comp","w+");
+
+ hsg_copyright(target->state->source);
+
+ hsg_macros(target->state->source);
+
+ fprintf(target->state->source,
+ "HS_TRANSPOSE_KERNEL_PROTO(%u)\n",
+ config->warp.lanes);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
+ {
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
+ {
+ fprintf(target->state->source,
+ "HS_TRANSPOSE_SLAB()\n");
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const bs = pow2_ru_u32(m->warps);
+ uint32_t const msb = msb_idx_u32(bs);
+
+ fprintf(target->state->embeds,
+ "#include \"hs_bs_%u.len.xxd\"\n,\n"
+ "#include \"hs_bs_%u.spv.xxd\"\n,\n",
+ msb,
+ msb);
+
+ char filename[] = { "hs_bs_123.comp" };
+ sprintf_s(filename,sizeof(filename),"hs_bs_%u.comp",msb);
+ fopen_s(&target->state->source,filename,"w+");
+
+ hsg_copyright(target->state->source);
+
+ hsg_macros(target->state->source);
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bs);
+ }
+
+ fprintf(target->state->source,
+ "HS_BS_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const msb = msb_idx_u32(m->warps);
+
+ fprintf(target->state->embeds,
+ "#include \"hs_bc_%u.len.xxd\"\n,\n"
+ "#include \"hs_bc_%u.spv.xxd\"\n,\n",
+ msb,
+ msb);
+
+ char filename[] = { "hs_bc_123.comp" };
+ sprintf_s(filename,sizeof(filename),"hs_bc_%u.comp",msb);
+ fopen_s(&target->state->source,filename,"w+");
+
+ hsg_copyright(target->state->source);
+
+ hsg_macros(target->state->source);
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bc);
+ }
+
+ fprintf(target->state->source,
+ "HS_BC_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PROTO:
+ {
+ fprintf(target->state->embeds,
+ "#include \"hs_fm_%u_%u.len.xxd\"\n,\n"
+ "#include \"hs_fm_%u_%u.spv.xxd\"\n,\n",
+ ops->a,ops->b,
+ ops->a,ops->b);
+
+ char filename[] = { "hs_fm_123_123.comp" };
+ sprintf_s(filename,sizeof(filename),"hs_fm_%u_%u.comp",ops->a,ops->b);
+ fopen_s(&target->state->source,filename,"w+");
+
+ hsg_copyright(target->state->source);
+
+ hsg_macros(target->state->source);
+
+ fprintf(target->state->source,
+ "HS_FM_KERNEL_PROTO(%u,%u)\n",
+ ops->a,ops->b);
+ }
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_FM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PROTO:
+ {
+ fprintf(target->state->embeds,
+ "#include \"hs_hm_%u_%u.len.xxd\"\n,\n"
+ "#include \"hs_hm_%u_%u.spv.xxd\"\n,\n",
+ ops->a,ops->b,
+ ops->a,ops->b);
+
+ char filename[] = { "hs_hm_123_123.comp" };
+ sprintf_s(filename,sizeof(filename),"hs_hm_%u_%u.comp",ops->a,ops->b);
+ fopen_s(&target->state->source,filename,"w+");
+
+ hsg_copyright(target->state->source);
+
+ hsg_macros(target->state->source);
+
+ fprintf(target->state->source,
+ "HS_HM_KERNEL_PROTO(%u)\n",
+ ops->a);
+ }
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_HM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
+ {
+ static char const * const vstr[] = { "vin", "vout" };
+
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n",
+ ops->n,vstr[ops->v],config->warp.lanes,ops->n-1);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n",
+ config->warp.lanes,ops->n-1,ops->n);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
+ {
+ if (ops->a <= ops->b)
+ {
+ fprintf(target->state->source,
+ "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n");
+ }
+ else if (ops->b > 1)
+ {
+ fprintf(target->state->source,
+ "else if (fm_frac == %u)\n",
+ ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else\n");
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_SLAB_FLIP:
+ fprintf(target->state->source,
+ "HS_SLAB_FLIP_PREAMBLE(%u);\n",
+ ops->n-1);
+ break;
+
+ case HSG_OP_TYPE_SLAB_HALF:
+ fprintf(target->state->source,
+ "HS_SLAB_HALF_PREAMBLE(%u);\n",
+ ops->n / 2);
+ break;
+
+ case HSG_OP_TYPE_CMP_FLIP:
+ fprintf(target->state->source,
+ "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c);
+ break;
+
+ case HSG_OP_TYPE_CMP_HALF:
+ fprintf(target->state->source,
+ "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_CMP_XCHG:
+ if (ops->c == UINT32_MAX)
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%-3u,r%-3u);\n",
+ ops->a,ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%u_%u,r%u_%u);\n",
+ ops->c,ops->a,ops->c,ops->b);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
+ fprintf(target->state->source,
+ "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n",
+ merge[ops->a].warps,config->warp.lanes,ops->c,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,merge[ops->a].warps,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,ops->a,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n",
+ ops->c,
+ ops->a,
+ config->warp.lanes,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BLOCK_SYNC:
+ fprintf(target->state->source,
+ "HS_BLOCK_BARRIER();\n");
+ //
+ // FIXME - Named barriers to allow coordinating warps to proceed?
+ //
+ break;
+
+ case HSG_OP_TYPE_BS_FRAC_PRED:
+ {
+ if (ops->m == 0)
+ {
+ fprintf(target->state->source,
+ "if (warp_idx < bs_full)\n");
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else if (bs_frac == %u)\n",
+ ops->w);
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n",
+ config->warp.lanes,config->thread.regs,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_MERGE_H_PRED:
+ fprintf(target->state->source,
+ "if (get_sub_group_id() < %u)\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_ACTIVE_PRED:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps <= 32)
+ {
+ fprintf(target->state->source,
+ "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
+ m->levels[ops->b].active.b32a2[1],
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ }
+ break;
+
+ default:
+ fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]);
+ exit(EXIT_FAILURE);
+ break;
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/target_igp_genx.c b/src/compute/hs/gen/target_igp_genx.c
deleted file mode 100644
index 3d0f2bc1b8..0000000000
--- a/src/compute/hs/gen/target_igp_genx.c
+++ /dev/null
@@ -1,672 +0,0 @@
-/*
- * Copyright 2016 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-#include <stdio.h>
-
-//
-//
-//
-
-#include "gen.h"
-#include "util.h"
-#include "macros.h"
-#include "transpose.h"
-
-//
-//
-//
-
-static
-char
-hsg_transpose_reg_prefix(uint32_t const cols_log2)
-{
- return 'a' + (('r' + cols_log2 - 'a') % 26);
-}
-
-static
-void
-hsg_transpose_blend(uint32_t const cols_log2,
- uint32_t const row_ll, // lower-left
- uint32_t const row_ur, // upper-right
- FILE * file)
-{
- // we're starting register names at '1' for now
- fprintf(file,
- " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
- hsg_transpose_reg_prefix(cols_log2-1),
- hsg_transpose_reg_prefix(cols_log2),
- cols_log2,row_ll+1,row_ur+1);
-}
-
-static
-void
-hsg_transpose_remap(uint32_t const row_from,
- uint32_t const row_to,
- FILE * file)
-{
- // we're starting register names at '1' for now
- fprintf(file,
- " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n",
- hsg_transpose_reg_prefix(msb_idx_u32(hsg_config.warp.lanes)),
- row_from+1,row_to+1);
-}
-
-//
-//
-//
-
-void
-hsg_target_igp_genx(struct hsg_file * const files,
- struct hsg_merge const * const merge,
- struct hsg_op const * const ops,
- uint32_t const depth)
-{
- switch (ops->type)
- {
- case HSG_OP_TYPE_END:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "}\n");
- break;
-
- case HSG_OP_TYPE_BEGIN:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "{\n");
- break;
-
- case HSG_OP_TYPE_ELSE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else\n");
- break;
-
- case HSG_OP_TYPE_FILE_HEADER:
- {
- uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
- uint32_t const warp_lanes_log2 = msb_idx_u32(hsg_config.warp.lanes);
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "// \n"
- "// Copyright 2016 Google Inc. \n"
- "// \n"
- "// Use of this source code is governed by a BSD-style \n"
- "// license that can be found in the LICENSE file. \n"
- "// \n"
- " \n"
- "#ifndef HS_CL_ONCE \n"
- "#define HS_CL_ONCE \n"
- " \n"
- "#define HS_LANES_PER_WARP_LOG2 %u \n"
- "#define HS_LANES_PER_WARP (1 << HS_LANES_PER_WARP_LOG2) \n"
- "#define HS_BS_WARPS %u \n"
- "#define HS_BS_WARPS_LOG2_RU %u \n"
- "#define HS_BC_WARPS_LOG2_MAX %u \n"
- "#define HS_FM_BLOCKS_LOG2_MIN %u \n"
- "#define HS_HM_BLOCKS_LOG2_MIN %u \n"
- "#define HS_KEYS_PER_LANE %u \n"
- "#define HS_REG_LAST(c) c##%u \n"
- "#define HS_KEY_WORDS %u \n"
- "#define HS_KEY_TYPE %s \n"
- "#define HS_EMPTY \n"
- " \n",
- warp_lanes_log2,
- merge->warps,
- msb_idx_u32(pow2_ru_u32(merge->warps)),
- bc_max,
- hsg_config.merge.flip.lo,
- hsg_config.merge.half.lo,
- hsg_config.thread.regs,
- hsg_config.thread.regs,
- hsg_config.type.words,
- (hsg_config.type.words == 2) ? "ulong" : "uint");
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "#define HS_SLAB_ROWS() \\\n");
-
- for (uint32_t ii=1; ii<=hsg_config.thread.regs; ii++)
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- " HS_EMPTY\n"
- " \n");
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "#define HS_TRANSPOSE_SLAB() \\\n");
-
- for (uint32_t ii=1; ii<=warp_lanes_log2; ii++)
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- " HS_TRANSPOSE_STAGE( %u ) \\\n",ii);
-
- hsg_transpose(msb_idx_u32(hsg_config.warp.lanes),
- hsg_config.thread.regs,
- files[HSG_FILE_TYPE_HEADER].file,
- files[HSG_FILE_TYPE_HEADER].file,
- hsg_transpose_blend,
- hsg_transpose_remap);
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- " HS_EMPTY\n"
- " \n");
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "// \n"
- "// Copyright 2016 Google Inc. \n"
- "// \n"
- "// Use of this source code is governed by a BSD-style \n"
- "// license that can be found in the LICENSE file. \n"
- "// \n"
- " \n"
- "#include <%s_macros.h> \n"
- " \n"
- "// \n"
- "// \n"
- "// \n",
- files[HSG_FILE_TYPE_SOURCE].prefix);
- }
- break;
-
- case HSG_OP_TYPE_FILE_FOOTER:
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- " \n"
- "#endif \n"
- " \n"
- "// \n"
- "// \n"
- "// \n"
- " \n");
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "// \n"
- "// \n"
- "// \n"
- " \n");
- break;
-
- case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "__kernel \n"
- "__attribute__((intel_reqd_sub_group_size(%u))) \n"
- "void hs_kernel_transpose(__global HS_KEY_TYPE * const restrict vout) \n",
- hsg_config.warp.lanes);
- }
- break;
-
- case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const global_id = get_global_id(0); \n"
- "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes * hsg_config.thread.regs,
- hsg_config.warp.lanes-1);
- }
- break;
-
- case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_TRANSPOSE_SLAB()\n");
- }
- break;
-
- case HSG_OP_TYPE_BS_KERNEL_PROTO:
- {
- struct hsg_merge const * const m = merge + ops->a;
-
- uint32_t const tpb = m->warps * hsg_config.warp.lanes;
- uint32_t const bs = pow2_ru_u32(m->warps);
- uint32_t const msb = msb_idx_u32(bs);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "__kernel \n"
- "__attribute__((reqd_work_group_size(%u,1,1))) \n"
- "__attribute__((intel_reqd_sub_group_size(%u))) \n"
- "void hs_kernel_bs_%u(__global HS_KEY_TYPE const * const restrict vin, \n"
- " __global HS_KEY_TYPE * const restrict vout) \n",
- tpb,
- hsg_config.warp.lanes,
- msb);
- }
- break;
-
- case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "__local union { \n");
-
- struct hsg_merge const * const m = merge + ops->a;
-
- if (m->warps > 1)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " HS_KEY_TYPE m[%u * %u];\n",
- m->rows_bs,
- m->warps * hsg_config.warp.lanes);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "} shared; \n"
- " \n");
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const global_id = get_global_id(0); \n"
- "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes * hsg_config.thread.regs,
- hsg_config.warp.lanes-1);
- }
- break;
-
- case HSG_OP_TYPE_BC_KERNEL_PROTO:
- {
- uint32_t const bc_max = pow2_rd_u32(merge[0].warps);
- uint32_t const tpb = bc_max * hsg_config.warp.lanes;
- uint32_t const msb = msb_idx_u32(merge[ops->a].warps);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "__kernel \n"
- "__attribute__((intel_reqd_sub_group_size(%u))) \n"
- "void hs_kernel_bc_%u(__global HS_KEY_TYPE * const restrict vout) \n",
- hsg_config.warp.lanes,msb);
- }
- break;
-
- case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
- {
- struct hsg_merge const * const m = merge + ops->a;
- uint32_t const bc_max = pow2_rd_u32(merge[0].warps);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "__local union { \n");
-
- if (m->warps > 1)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " HS_KEY_TYPE m[%-3u * %u];\n",
- m->rows_bc,
- m->warps * hsg_config.warp.lanes);
- }
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "} shared; \n"
- " \n");
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const global_id = get_global_id(0); \n"
- "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes * hsg_config.thread.regs,
- hsg_config.warp.lanes-1);
- }
- break;
-
- case HSG_OP_TYPE_FM_KERNEL_PROTO:
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "#define HS_FM_BLOCKS_LOG2_%-2u %u \n",
- ops->a,ops->b);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "__kernel \n"
- "__attribute__((intel_reqd_sub_group_size(%u))) \n"
- "void hs_kernel_fm_%u(__global HS_KEY_TYPE * const restrict vout, \n"
- " uint const fm_full, \n"
- " uint const fm_frac) \n",
- hsg_config.warp.lanes,ops->a);
- break;
-
- case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const global_id = (uint)get_global_id(0); \n"
- "uint const warp_idx = global_id / %u; \n"
- "uint const warp_lane_idx = global_id & %u; \n"
- " \n"
- "uint const merge_idx = warp_idx / %u >> %u; \n"
- " \n"
- "uint const merge_stride = %u * %u << %u; \n"
- "uint const merge_keys = merge_stride * %u; \n"
- " \n"
- "uint const merge_base = merge_idx * merge_keys; \n"
- " \n"
- "uint const merge_l_off = (warp_idx - merge_idx * (%u << %u)) * %u + warp_lane_idx; \n"
- "uint const merge_l_end = merge_stride * (%u / 2 - 1) + merge_l_off; \n"
- " \n"
- "int const merge_r_off = merge_keys - merge_l_end - 1; \n"
- " \n"
- "__global HS_KEY_TYPE * const restrict merge_l = vout + (merge_base + merge_l_off); \n"
- "__global HS_KEY_TYPE * const restrict merge_r = vout + (merge_base + merge_r_off); \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes-1,
- hsg_config.thread.regs,ops->b,
- hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
- ops->a,
- hsg_config.thread.regs,ops->b,hsg_config.warp.lanes,
- ops->a);
- break;
-
- case HSG_OP_TYPE_HM_KERNEL_PROTO:
- {
- uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps));
-
- fprintf(files[HSG_FILE_TYPE_HEADER].file,
- "#define HS_HM_BLOCKS_LOG2_%-2u %u \n",
- ops->a,ops->b);
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- " \n"
- "__kernel \n"
- "__attribute__((intel_reqd_sub_group_size(%u))) \n"
- "void hs_kernel_hm_%u(__global HS_KEY_TYPE * const restrict vout) \n",
- hsg_config.warp.lanes,ops->a);
- }
- break;
-
- case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const global_id = (uint)get_global_id(0); \n"
- "uint const warp_idx = global_id / %u; \n"
- "uint const warp_lane_idx = global_id & %u; \n"
- " \n"
- "uint const merge_idx = (warp_idx / %u) >> %u; \n"
- " \n"
- "uint const merge_stride = %u * %u << %u; \n"
- "uint const merge_keys = merge_stride * %u; \n"
- " \n"
- "uint const merge_base = merge_idx * merge_keys; \n"
- "uint const merge_off = (warp_idx - merge_idx * (%u << %u)) * %u; \n"
- " \n"
- "__global HS_KEY_TYPE * const restrict merge_ptr = vout + (merge_base + merge_off + warp_lane_idx); \n"
- " \n",
- hsg_config.warp.lanes,
- hsg_config.warp.lanes-1,
- hsg_config.thread.regs,ops->b,
- hsg_config.thread.regs,hsg_config.warp.lanes,ops->b,
- ops->a,
- hsg_config.thread.regs,ops->b,hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
- {
- static char const * const vstr[] = { "vin", "vout" };
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%-3u = (%s + gmem_idx)[%-3u * %u]; \n",
- ops->n,vstr[ops->v],ops->n-1,hsg_config.warp.lanes);
- }
- break;
-
- case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "(vout + gmem_idx)[%-3u * %u] = r%u; \n",
- ops->n-1,hsg_config.warp.lanes,ops->n);
- break;
-
- case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%-3u = merge_ptr[%-3u * merge_stride];\n",
- ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_ptr[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%-3u = merge_l[%-3u * merge_stride];\n",
- ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_l[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%-3u = merge_r[%-3u * merge_stride];\n",
- ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "merge_r[%-3u * merge_stride] = r%u;\n",
- ops->b,ops->a);
- break;
-
- case HSG_OP_TYPE_WARP_FLIP:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const flip_lane_mask = %u; \n"
- "uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; \n"
- "int const t_lt = get_sub_group_local_id() < flip_lane_idx; \n",
- ops->n-1);
- break;
-
- case HSG_OP_TYPE_WARP_HALF:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const half_lane_mask = %u; \n"
- "uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; \n"
- "int const t_lt = get_sub_group_local_id() < half_lane_idx; \n",
- ops->n / 2);
- break;
-
- case HSG_OP_TYPE_CMP_FLIP:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_FLIP(%-3u,r%-3u,r%-3u)\n",ops->a,ops->b,ops->c);
- break;
-
- case HSG_OP_TYPE_CMP_HALF:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_HALF(%-3u,r%-3u)\n",ops->a,ops->b);
- break;
-
- case HSG_OP_TYPE_CMP_XCHG:
- if (ops->c == UINT32_MAX)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_XCHG(r%-3u,r%-3u)\n",
- ops->a,ops->b);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_CMP_XCHG(r%u_%u,r%u_%u)\n",
- ops->c,ops->a,ops->c,ops->b);
- }
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "(shared.m + get_local_id(0))[%-3u * %-2u * %-3u] = r%u;\n",
- merge[ops->a].warps,hsg_config.warp.lanes,ops->c,ops->b);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "r%-3u = (shared.m + get_local_id(0))[%-3u * %-2u * %-3u];\n",
- ops->b,merge[ops->a].warps,hsg_config.warp.lanes,ops->c);
- break;
-
- case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%-3u = (shared.m + get_local_id(0))[%-3u * %-2u * %-3u];\n",
- ops->b,ops->a,hsg_config.warp.lanes,ops->c);
- break;
-
- case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "(shared.m + smem_l_idx)[%5u] = r%u_%u;\n",
- ops->b * hsg_config.warp.lanes,
- ops->c,
- ops->a);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "(shared.m + smem_r_idx)[%5u] = r%u_%u;\n",
- ops->b * hsg_config.warp.lanes,
- ops->c,
- ops->a);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%u_%-3u = (shared.m + smem_l_idx)[%u];\n",
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%u_%-3u = (shared.m + smem_r_idx)[%u];\n",
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "HS_KEY_TYPE r%u_%-3u = (vout + gmem_l_idx)[%u];\n",
- ops->c,
- ops->a,
- ops->b * hsg_config.warp.lanes);
- break;
-
- case HSG_OP_TYPE_BLOCK_SYNC:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "barrier(CLK_LOCAL_MEM_FENCE);\n"); // OpenCL 2.0+: work_group_barrier
- break;
-
- case HSG_OP_TYPE_BS_FRAC_PRED:
- {
- if (ops->m == 0)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (warp_idx < bs_full)\n");
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else if (bs_frac == %u)\n",
- ops->w);
- }
- }
- break;
-
- case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
- {
- struct hsg_merge const * const m = merge + ops->a;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n"
- "uint const smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n",
- m->warps * hsg_config.warp.lanes,
- m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1);
-#if 0
- if (ops->b == true)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n"
- "uint smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n",
- m->warps * hsg_config.warp.lanes,
- m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1);
- }
- else // update
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n"
- "smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n",
- m->warps * hsg_config.warp.lanes,
- m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1);
- }
-#endif
- }
- break;
-
- case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
- {
- struct hsg_merge const * const m = merge + ops->a;
- uint32_t const b = m->warps * hsg_config.warp.lanes;
- uint32_t const k = b * hsg_config.thread.regs;
-
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "uint const gmem_l_idx = (global_id / %u) * %u + (global_id & %u); \n"
- "uint const smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n",
- b,k,b-1,
- b);
-
- }
- break;
-
- case HSG_OP_TYPE_BX_MERGE_H_PRED:
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (get_sub_group_id() < %u)\n",
- ops->a);
- break;
-
- case HSG_OP_TYPE_BS_ACTIVE_PRED:
- {
- struct hsg_merge const * const m = merge + ops->a;
-
- if (m->warps <= 32)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
- m->levels[ops->b].active.b32a2[0]);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
- m->levels[ops->b].active.b32a2[1],
- m->levels[ops->b].active.b32a2[0]);
- }
- }
- break;
-
- case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
- {
- if (ops->a == ops->b)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "if (merge_idx < fm_full) \n");
- }
- else if (ops->b > 1)
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else if (fm_frac == %u) \n",
- ops->b);
- }
- else
- {
- fprintf(files[HSG_FILE_TYPE_SOURCE].file,
- "else\n");
- }
- }
- break;
-
- default:
- hsg_target_debug(files,merge,ops,depth);
- break;
- }
-}
-
-//
-//
-//
diff --git a/src/compute/hs/gen/target_opencl.c b/src/compute/hs/gen/target_opencl.c
new file mode 100644
index 0000000000..fe7343ba5d
--- /dev/null
+++ b/src/compute/hs/gen/target_opencl.c
@@ -0,0 +1,600 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "gen.h"
+#include "transpose.h"
+
+#include "common/util.h"
+#include "common/macros.h"
+
+//
+//
+//
+
+struct hsg_transpose_state
+{
+ FILE * header;
+ struct hsg_config const * config;
+};
+
+static
+char
+hsg_transpose_reg_prefix(uint32_t const cols_log2)
+{
+ return 'a' + (('r' + cols_log2 - 'a') % 26);
+}
+
+static
+void
+hsg_transpose_blend(uint32_t const cols_log2,
+ uint32_t const row_ll, // lower-left
+ uint32_t const row_ur, // upper-right
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(cols_log2-1),
+ hsg_transpose_reg_prefix(cols_log2),
+ cols_log2,row_ll+1,row_ur+1);
+}
+
+static
+void
+hsg_transpose_remap(uint32_t const row_from,
+ uint32_t const row_to,
+ struct hsg_transpose_state * const state)
+{
+ // we're starting register names at '1' for now
+ fprintf(state->header,
+ " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n",
+ hsg_transpose_reg_prefix(state->config->warp.lanes_log2),
+ row_from+1,row_to+1);
+}
+
+//
+//
+//
+
+static
+void
+hsg_copyright(FILE * file)
+{
+ fprintf(file,
+ "// \n"
+ "// Copyright 2016 Google Inc. \n"
+ "// \n"
+ "// Use of this source code is governed by a BSD-style \n"
+ "// license that can be found in the LICENSE file. \n"
+ "// \n"
+ "\n");
+}
+
+//
+//
+//
+
+struct hsg_target_state
+{
+ FILE * header;
+ FILE * source;
+};
+
+//
+//
+//
+
+void
+hsg_target_opencl(struct hsg_target * const target,
+ struct hsg_config const * const config,
+ struct hsg_merge const * const merge,
+ struct hsg_op const * const ops,
+ uint32_t const depth)
+{
+ switch (ops->type)
+ {
+ case HSG_OP_TYPE_END:
+ fprintf(target->state->source,
+ "}\n");
+ break;
+
+ case HSG_OP_TYPE_BEGIN:
+ fprintf(target->state->source,
+ "{\n");
+ break;
+
+ case HSG_OP_TYPE_ELSE:
+ fprintf(target->state->source,
+ "else\n");
+ break;
+
+ case HSG_OP_TYPE_TARGET_BEGIN:
+ {
+ // allocate state
+ target->state = malloc(sizeof(*target->state));
+
+ // allocate files
+ fopen_s(&target->state->header,"hs_cl.h", "wb");
+ fopen_s(&target->state->source,"hs_cl.cl","wb");
+
+ // initialize header
+ uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));
+
+ hsg_copyright(target->state->header);
+
+ fprintf(target->state->header,
+ "#ifndef HS_CL_ONCE \n"
+ "#define HS_CL_ONCE \n"
+ " \n"
+ "#define HS_SLAB_THREADS_LOG2 %u \n"
+ "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n"
+ "#define HS_SLAB_WIDTH_LOG2 %u \n"
+ "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n"
+ "#define HS_SLAB_HEIGHT %u \n"
+ "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n"
+ "#define HS_REG_LAST(c) c##%u \n"
+ "#define HS_KEY_TYPE %s \n"
+ "#define HS_KEY_WORDS %u \n"
+ "#define HS_VAL_WORDS 0 \n"
+ "#define HS_BS_SLABS %u \n"
+ "#define HS_BS_SLABS_LOG2_RU %u \n"
+ "#define HS_BC_SLABS_LOG2_MAX %u \n"
+ "#define HS_FM_SCALE_MIN %u \n"
+ "#define HS_FM_SCALE_MAX %u \n"
+ "#define HS_HM_SCALE_MIN %u \n"
+ "#define HS_HM_SCALE_MAX %u \n"
+ "#define HS_EMPTY \n"
+ " \n",
+ config->warp.lanes_log2,
+ config->warp.lanes_log2,
+ config->thread.regs,
+ config->thread.regs,
+ (config->type.words == 2) ? "ulong" : "uint",
+ config->type.words,
+ merge->warps,
+ msb_idx_u32(pow2_ru_u32(merge->warps)),
+ bc_max,
+ config->merge.flip.lo,
+ config->merge.flip.hi,
+ config->merge.half.lo,
+ config->merge.half.hi);
+
+ fprintf(target->state->header,
+ "#define HS_SLAB_ROWS() \\\n");
+
+ for (uint32_t ii=1; ii<=config->thread.regs; ii++)
+ fprintf(target->state->header,
+ " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+ fprintf(target->state->header,
+ "#define HS_TRANSPOSE_SLAB() \\\n");
+
+ for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++)
+ fprintf(target->state->header,
+ " HS_TRANSPOSE_STAGE( %u ) \\\n",ii);
+
+ struct hsg_transpose_state state[1] =
+ {
+ { .header = target->state->header,
+ .config = config
+ }
+ };
+
+ hsg_transpose(config->warp.lanes_log2,
+ config->thread.regs,
+ hsg_transpose_blend,state,
+ hsg_transpose_remap,state);
+
+ fprintf(target->state->header,
+ " HS_EMPTY\n"
+ " \n");
+
+ hsg_copyright(target->state->source);
+
+ fprintf(target->state->source,
+ "#include \"hs_cl_macros.h\" \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n");
+ }
+ break;
+
+ case HSG_OP_TYPE_TARGET_END:
+ // decorate the files
+ fprintf(target->state->header,
+ "#endif \n"
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ " \n");
+ fprintf(target->state->source,
+ " \n"
+ "// \n"
+ "// \n"
+ "// \n"
+ " \n");
+
+ // close files
+ fclose(target->state->header);
+ fclose(target->state->source);
+
+ // free state
+ free(target->state);
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
+ {
+ fprintf(target->state->source,
+ "\nHS_TRANSPOSE_KERNEL_PROTO(%u)\n",
+ config->warp.lanes);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
+ {
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
+ {
+ fprintf(target->state->source,
+ "HS_TRANSPOSE_SLAB()\n");
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const bs = pow2_ru_u32(m->warps);
+ uint32_t const msb = msb_idx_u32(bs);
+
+ fprintf(target->state->source,
+ "\nHS_BS_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bs);
+ }
+
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PROTO:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ uint32_t const msb = msb_idx_u32(m->warps);
+
+ fprintf(target->state->source,
+ "\nHS_BC_KERNEL_PROTO(%u,%u,%u)\n",
+ config->warp.lanes,m->warps,msb);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps > 1)
+ {
+ fprintf(target->state->source,
+ "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
+ m->warps * config->warp.lanes,
+ m->rows_bc);
+ }
+
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,config->thread.regs);
+ }
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PROTO:
+ fprintf(target->state->source,
+ "\nHS_FM_KERNEL_PROTO(%u,%u)\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_FM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PROTO:
+ {
+ fprintf(target->state->source,
+ "\nHS_HM_KERNEL_PROTO(%u)\n",
+ ops->a);
+ }
+ break;
+
+ case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
+ fprintf(target->state->source,
+ "HS_HM_PREAMBLE(%u);\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
+ {
+ static char const * const vstr[] = { "vin", "vout" };
+
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n",
+ ops->n,vstr[ops->v],config->warp.lanes,ops->n-1);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n",
+ config->warp.lanes,ops->n-1,ops->n);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n",
+ ops->b,ops->a);
+ break;
+
+ case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n",
+ ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
+ {
+ if (ops->a <= ops->b)
+ {
+ fprintf(target->state->source,
+ "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n");
+ }
+ else if (ops->b > 1)
+ {
+ fprintf(target->state->source,
+ "else if (fm_frac == %u)\n",
+ ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else\n");
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_SLAB_FLIP:
+ fprintf(target->state->source,
+ "HS_SLAB_FLIP_PREAMBLE(%u);\n",
+ ops->n-1);
+ break;
+
+ case HSG_OP_TYPE_SLAB_HALF:
+ fprintf(target->state->source,
+ "HS_SLAB_HALF_PREAMBLE(%u);\n",
+ ops->n / 2);
+ break;
+
+ case HSG_OP_TYPE_CMP_FLIP:
+ fprintf(target->state->source,
+ "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c);
+ break;
+
+ case HSG_OP_TYPE_CMP_HALF:
+ fprintf(target->state->source,
+ "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b);
+ break;
+
+ case HSG_OP_TYPE_CMP_XCHG:
+ if (ops->c == UINT32_MAX)
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%-3u,r%-3u);\n",
+ ops->a,ops->b);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "HS_CMP_XCHG(r%u_%u,r%u_%u);\n",
+ ops->c,ops->a,ops->c,ops->b);
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
+ fprintf(target->state->source,
+ "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n",
+ merge[ops->a].warps,config->warp.lanes,ops->c,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,merge[ops->a].warps,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n",
+ ops->b,ops->a,config->warp.lanes,ops->c);
+ break;
+
+ case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
+ fprintf(target->state->source,
+ "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n",
+ ops->b * config->warp.lanes,
+ ops->c,
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n",
+ ops->c,
+ ops->a,
+ ops->b * config->warp.lanes);
+ break;
+
+ case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
+ fprintf(target->state->source,
+ "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n",
+ ops->c,
+ ops->a,
+ config->warp.lanes,ops->b);
+ break;
+
+ case HSG_OP_TYPE_BLOCK_SYNC:
+ fprintf(target->state->source,
+ "HS_BLOCK_BARRIER();\n");
+ //
+ // FIXME - Named barriers to allow coordinating warps to proceed?
+ //
+ break;
+
+ case HSG_OP_TYPE_BS_FRAC_PRED:
+ {
+ if (ops->m == 0)
+ {
+ fprintf(target->state->source,
+ "if (warp_idx < bs_full)\n");
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "else if (bs_frac == %u)\n",
+ ops->w);
+ }
+ }
+ break;
+
+ case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n",
+ config->warp.lanes,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ fprintf(target->state->source,
+ "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n",
+ config->warp.lanes,config->thread.regs,m->warps);
+ }
+ break;
+
+ case HSG_OP_TYPE_BX_MERGE_H_PRED:
+ fprintf(target->state->source,
+ "if (get_sub_group_id() < %u)\n",
+ ops->a);
+ break;
+
+ case HSG_OP_TYPE_BS_ACTIVE_PRED:
+ {
+ struct hsg_merge const * const m = merge + ops->a;
+
+ if (m->warps <= 32)
+ {
+ fprintf(target->state->source,
+ "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n",
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ else
+ {
+ fprintf(target->state->source,
+ "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n",
+ m->levels[ops->b].active.b32a2[1],
+ m->levels[ops->b].active.b32a2[0]);
+ }
+ }
+ break;
+
+ default:
+ fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]);
+ exit(EXIT_FAILURE);
+ break;
+ }
+}
+
+//
+//
+//
diff --git a/src/compute/hs/gen/transpose.c b/src/compute/hs/gen/transpose.c
index de15c62631..095f53d330 100644
--- a/src/compute/hs/gen/transpose.c
+++ b/src/compute/hs/gen/transpose.c
@@ -11,7 +11,7 @@
//
#include "transpose.h"
-#include "macros.h"
+#include "common/macros.h"
//
// Rows must be an even number. This is enforced elsewhere.
@@ -21,19 +21,19 @@
void
hsg_transpose(uint32_t const cols_log2,
uint32_t const rows,
- void * blend,
- void * remap,
void (*pfn_blend)(uint32_t const cols_log2,
uint32_t const row_ll, // lower-left
uint32_t const row_ur, // upper-right
void * blend),
+ void * blend,
void (*pfn_remap)(uint32_t const row_from,
uint32_t const row_to,
- void * remap))
+ void * remap),
+ void * remap)
{
// get mapping array
- uint32_t * map_curr = ALLOCA(rows * sizeof(*map_curr));
- uint32_t * map_next = ALLOCA(rows * sizeof(*map_next));
+ uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr));
+ uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next));
// init the mapping array
for (uint32_t ii=0; ii<rows; ii++)
@@ -89,35 +89,35 @@ static uint32_t cols; // implicit on SIMD/GPU
static
void
-hsg_debug_remap(uint32_t const row_from,
- uint32_t const row_to,
- uint32_t * const r)
-{
- fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);
-
- r[row_to] = row_from;
-}
-
-static
-void
hsg_debug_blend(uint32_t const cols_log2,
uint32_t const row_ll, // lower-left
uint32_t const row_ur, // upper-right
- uint32_t * m)
+ uint32_t * b)
{
fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur);
- uint32_t * const ll = ALLOCA(cols * sizeof(*m));
- uint32_t * const ur = ALLOCA(cols * sizeof(*m));
+ uint32_t * const ll = ALLOCA(cols * sizeof(*b));
+ uint32_t * const ur = ALLOCA(cols * sizeof(*b));
- memcpy(ll,m+row_ll*cols,cols * sizeof(*m));
- memcpy(ur,m+row_ur*cols,cols * sizeof(*m));
+ memcpy(ll,b+row_ll*cols,cols * sizeof(*b));
+ memcpy(ur,b+row_ur*cols,cols * sizeof(*b));
for (uint32_t ii=0; ii<cols; ii++)
- m[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];
+ b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];
for (uint32_t ii=0; ii<cols; ii++)
- m[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
+ b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
+}
+
+static
+void
+hsg_debug_remap(uint32_t const row_from,
+ uint32_t const row_to,
+ uint32_t * const r)
+{
+ fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);
+
+ r[row_to] = row_from;
}
static
@@ -144,23 +144,22 @@ main(int argc, char * argv[])
cols = 1 << cols_log2;
- uint32_t * const m = ALLOCA(cols * rows * sizeof(*m));
+ uint32_t * const b = ALLOCA(cols * rows * sizeof(*b));
uint32_t * const r = ALLOCA( rows * sizeof(*r));
for (uint32_t rr=0; rr<rows; rr++) {
r[rr] = rr;
for (uint32_t cc=0; cc<cols; cc++)
- m[rr*cols+cc] = cc*rows+rr;
+ b[rr*cols+cc] = cc*rows+rr;
}
- hsg_debug_print(rows,m,r);
+ hsg_debug_print(rows,b,r);
hsg_transpose(cols_log2,rows,
- m,r,
- hsg_debug_blend,
- hsg_debug_remap);
+ hsg_debug_blend,b,
+ hsg_debug_remap,r);
- hsg_debug_print(rows,m,r);
+ hsg_debug_print(rows,b,r);
return 0;
}
diff --git a/src/compute/hs/gen/transpose.h b/src/compute/hs/gen/transpose.h
index 83f6fc4e42..380210970d 100644
--- a/src/compute/hs/gen/transpose.h
+++ b/src/compute/hs/gen/transpose.h
@@ -23,15 +23,15 @@
void
hsg_transpose(uint32_t const cols_log2,
uint32_t const rows,
- void * blend,
- void * remap,
void (*pfn_blend)(uint32_t const cols_log2,
uint32_t const row_ll, // lower-left
uint32_t const row_ur, // upper-right
void * blend),
+ void * blend,
void (*pfn_remap)(uint32_t const row_from,
uint32_t const row_to,
- void * remap));
+ void * remap),
+ void * remap);
//
//