diff options
Diffstat (limited to 'src/compute/hs/gen')
-rw-r--r-- | src/compute/hs/gen/gen.h | 112 | ||||
-rw-r--r-- | src/compute/hs/gen/main.c | 532 | ||||
-rw-r--r-- | src/compute/hs/gen/networks_merging.c | 4 | ||||
-rw-r--r-- | src/compute/hs/gen/networks_sorting.c | 4 | ||||
-rw-r--r-- | src/compute/hs/gen/target_cuda.c | 600 | ||||
-rw-r--r-- | src/compute/hs/gen/target_cuda_sm3x.c | 776 | ||||
-rw-r--r-- | src/compute/hs/gen/target_debug.c | 73 | ||||
-rw-r--r-- | src/compute/hs/gen/target_glsl.c | 674 | ||||
-rw-r--r-- | src/compute/hs/gen/target_igp_genx.c | 672 | ||||
-rw-r--r-- | src/compute/hs/gen/target_opencl.c | 600 | ||||
-rw-r--r-- | src/compute/hs/gen/transpose.c | 61 | ||||
-rw-r--r-- | src/compute/hs/gen/transpose.h | 6 |
12 files changed, 2229 insertions, 1885 deletions
diff --git a/src/compute/hs/gen/gen.h b/src/compute/hs/gen/gen.h index 4043a8df5c..3635d553cf 100644 --- a/src/compute/hs/gen/gen.h +++ b/src/compute/hs/gen/gen.h @@ -9,21 +9,20 @@ #pragma once // +// TODO: // +// Add Key-Val sorting support -- easy. // #include <stdio.h> #include <stdint.h> // +// All code generation is driven by the specified architectural +// details and host platform API. // -// - -#define MERGE_LEVELS_MAX_LOG2 7 // merge up to 128 warps -#define MERGE_LEVELS_MAX_SIZE (1 << MERGE_LEVELS_MAX_LOG2) // ((1 << MERGE_MAX_LOG2) - 1) // incorrect debug error - -// -// +// In general, the warps-per-block and keys-per-thread are the +// critical knobs for tuning performance. // struct hsg_config @@ -58,6 +57,7 @@ struct hsg_config struct { uint32_t lanes; + uint32_t lanes_log2; uint32_t skpw_bs; } warp; @@ -72,7 +72,7 @@ struct hsg_config }; // -// +// HotSort can merge non-power-of-two blocks of warps // struct hsg_level @@ -91,6 +91,16 @@ struct hsg_level } active; }; +// +// +// + +#define MERGE_LEVELS_MAX_LOG2 7 // merge up to 128 warps +#define MERGE_LEVELS_MAX_SIZE (1 << MERGE_LEVELS_MAX_LOG2) + +// +// This is computed +// struct hsg_merge { @@ -113,6 +123,8 @@ struct hsg_merge // // +#if 0 + #define HSG_FILE_NAME_SIZE 80 struct hsg_file @@ -126,18 +138,6 @@ struct hsg_file // // -typedef enum hsg_kernel_type { - - HSG_KERNEL_TYPE_SORT_BLOCK, - - HSG_KERNEL_TYPE_COUNT - -} hsg_kernel_type; - -// -// -// - typedef enum hsg_file_type { HSG_FILE_TYPE_HEADER, @@ -147,6 +147,8 @@ typedef enum hsg_file_type { } hsg_file_type; +#endif + // // // @@ -158,10 +160,8 @@ typedef enum hsg_file_type { HSG_OP_EXPAND_X(HSG_OP_TYPE_BEGIN) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_ELSE) \ \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_FILE_HEADER) \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_FILE_FOOTER) \ - \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_DUMMY_KERNEL) \ + HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_BEGIN) \ + HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_END) \ \ HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE) \ @@ -186,12 +186,13 @@ typedef enum hsg_file_type { HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT) \ + HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED) \ \ HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_LOAD) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_STORE) \ \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_WARP_FLIP) \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_WARP_HALF) \ + HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_FLIP) \ + HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_HALF) \ \ HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_FLIP) \ HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_HALF) \ @@ -221,8 +222,6 @@ typedef enum hsg_file_type { \ HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_ACTIVE_PRED) \ \ - HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED) \ - \ HSG_OP_EXPAND_X(HSG_OP_TYPE_COUNT) // @@ -271,42 +270,63 @@ struct hsg_op // // -typedef void (*hsg_target_pfn)(struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth); +extern char const * const hsg_op_type_string[]; // // // -extern struct hsg_config hsg_config; -extern struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2]; +struct hsg_target +{ + struct hsg_target_state * state; +}; // +// All targets share this prototype +// + +typedef +void +(*hsg_target_pfn)(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth); // // +// + +extern +void +hsg_target_debug(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth); extern void -hsg_target_debug (struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth); +hsg_target_cuda(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth); extern void -hsg_target_cuda_sm3x(struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth); +hsg_target_opencl(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth); extern void -hsg_target_igp_genx (struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth); +hsg_target_glsl(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth); // // // diff --git a/src/compute/hs/gen/main.c b/src/compute/hs/gen/main.c index 42f4518bfd..e06e23029b 100644 --- a/src/compute/hs/gen/main.c +++ b/src/compute/hs/gen/main.c @@ -20,14 +20,8 @@ // #include "networks.h" -#include "macros.h" -#include "util.h" - -// -// -// - -#define HSG_INDENT 2 +#include "common/util.h" +#include "common/macros.h" // // @@ -36,7 +30,6 @@ #undef HSG_OP_EXPAND_X #define HSG_OP_EXPAND_X(t) #t , -static char const * const hsg_op_type_string[] = { @@ -53,10 +46,8 @@ hsg_op_type_string[] = #define BEGIN() (struct hsg_op){ HSG_OP_TYPE_BEGIN } #define ELSE() (struct hsg_op){ HSG_OP_TYPE_ELSE } -#define STORE_SLAB_EARLY_EXIT() (struct hsg_op){ HSG_OP_TYPE_STORE_SLAB_EARLY_EXIT } - -#define FILE_HEADER() (struct hsg_op){ HSG_OP_TYPE_FILE_HEADER } -#define FILE_FOOTER() (struct hsg_op){ HSG_OP_TYPE_FILE_FOOTER } +#define TARGET_BEGIN() (struct hsg_op){ HSG_OP_TYPE_TARGET_BEGIN } +#define TARGET_END() (struct hsg_op){ HSG_OP_TYPE_TARGET_END } #define TRANSPOSE_KERNEL_PROTO() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO } #define TRANSPOSE_KERNEL_PREAMBLE() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE } @@ -68,11 +59,11 @@ hsg_op_type_string[] = #define BC_KERNEL_PROTO(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PROTO, { i } } #define BC_KERNEL_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PREAMBLE, { i } } -#define FM_KERNEL_PROTO(l,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO, { l, s } } -#define FM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE, { w, s } } +#define FM_KERNEL_PROTO(s,r) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO, { s, r } } +#define FM_KERNEL_PREAMBLE(h) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE, { h } } -#define HM_KERNEL_PROTO(d,w) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO, { d, w } } -#define HM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE, { w, s } } +#define HM_KERNEL_PROTO(s) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO, { s } } +#define HM_KERNEL_PREAMBLE(h) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE, { h } } #define BX_REG_GLOBAL_LOAD(n,v) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_LOAD, { n, v } } #define BX_REG_GLOBAL_STORE(n) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_STORE, { n } } @@ -81,12 +72,13 @@ hsg_op_type_string[] = #define FM_REG_GLOBAL_STORE_LEFT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT, { n, i } } #define FM_REG_GLOBAL_LOAD_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT, { n, i } } #define FM_REG_GLOBAL_STORE_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT, { n, i } } +#define FM_MERGE_RIGHT_PRED(n,s) (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED, { n, s } } #define HM_REG_GLOBAL_LOAD(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_LOAD, { n, i } } #define HM_REG_GLOBAL_STORE(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_STORE, { n, i } } -#define WARP_FLIP(f) (struct hsg_op){ HSG_OP_TYPE_WARP_FLIP, { f } } -#define WARP_HALF(h) (struct hsg_op){ HSG_OP_TYPE_WARP_HALF, { h } } +#define SLAB_FLIP(f) (struct hsg_op){ HSG_OP_TYPE_SLAB_FLIP, { f } } +#define SLAB_HALF(h) (struct hsg_op){ HSG_OP_TYPE_SLAB_HALF, { h } } #define CMP_FLIP(a,b,c) (struct hsg_op){ HSG_OP_TYPE_CMP_FLIP, { a, b, c } } #define CMP_HALF(a,b) (struct hsg_op){ HSG_OP_TYPE_CMP_HALF, { a, b } } @@ -121,13 +113,12 @@ hsg_op_type_string[] = #define BS_ACTIVE_PRED(m,l) (struct hsg_op){ HSG_OP_TYPE_BS_ACTIVE_PRED, { m, l } } -#define FM_MERGE_RIGHT_PRED(n,s) (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED, { n, s } } - // // DEFAULTS // -struct hsg_config hsg_config = // FIXME -- how useful is this? +static +struct hsg_config hsg_config = { .merge = { .flip = { @@ -138,8 +129,6 @@ struct hsg_config hsg_config = // FIXME -- how useful is this? .lo = 1, .hi = 1 }, - - .max_log2 = 27 // 2^27th = 128m }, .block = { @@ -156,6 +145,7 @@ struct hsg_config hsg_config = // FIXME -- how useful is this? .warp = { .lanes = 32, + .lanes_log2 = 5, }, .thread = { @@ -172,45 +162,11 @@ struct hsg_config hsg_config = // FIXME -- how useful is this? // ZERO HSG_MERGE STRUCT // +static struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2] = { 0 }; // -// -// - -static const hsg_target_pfn hsg_target_pfns[] = - { - hsg_target_debug, - hsg_target_cuda_sm3x, - hsg_target_igp_genx, - // hsg_target_adreno_5xx, - // hsg_target_amd_gcn, - // hsg_target_x86_sse, - // hsg_target_x86_avx2, - }; - -static const char * hsg_target_pfn_string[] = - { - "hs_debug", - "hs_cuda", - "hs_cl" - }; - -static const char * hsg_file_type_string[][2] = - { - { ".h", ".txt" }, - { ".h", ".cu" }, - { ".h", ".cl" } - }; - -// -// -// - -#define HSG_TARGET_PFN_COUNT ARRAY_LENGTH(hsg_target_pfns) - -// -// +// STATS ON INSTRUCTIONS // static hsg_op_type hsg_op_type_counts[HSG_OP_TYPE_COUNT] = { 0 }; @@ -223,8 +179,18 @@ static void hsg_op_debug() { + uint32_t total = 0; + for (hsg_op_type t=HSG_OP_TYPE_EXIT; t<HSG_OP_TYPE_COUNT; t++) - fprintf(stderr,"%-37s : %u\n",hsg_op_type_string[t],hsg_op_type_counts[t]); + { + uint32_t const count = hsg_op_type_counts[t]; + + total += count; + + fprintf(stderr,"%-37s : %u\n",hsg_op_type_string[t],count); + } + + fprintf(stderr,"%-37s : %u\n\n\n","TOTAL",total); } // @@ -268,7 +234,7 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge) // // The provided smem_bs size will be allocated for each sorting block. // - uint32_t const bs_threads = merge->warps * hsg_config.warp.lanes; + uint32_t const bs_threads = merge->warps << hsg_config.warp.lanes_log2; uint32_t const bs_keys = hsg_config.block.smem_bs / (hsg_config.type.words * sizeof(uint32_t)); uint32_t const bs_kpt = bs_keys / bs_threads; uint32_t const bs_kpt_mod = (bs_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod; @@ -282,7 +248,7 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge) } // clamp to number of registers - merge->rows_bs = min(bs_rows_even, hsg_config.thread.regs); + merge->rows_bs = MIN_MACRO(bs_rows_even, hsg_config.thread.regs); } // @@ -297,19 +263,19 @@ hsg_merge_levels_init_shared(struct hsg_merge * const merge) // // if merge->warps is not pow2 then we're going to skip creating a bc elsewhere // - uint32_t const bc_warps_min = max(merge->warps,hsg_config.block.warps_min); - uint32_t const bc_threads = bc_warps_min * hsg_config.warp.lanes; + uint32_t const bc_warps_min = MAX_MACRO(merge->warps,hsg_config.block.warps_min); + uint32_t const bc_threads = bc_warps_min << hsg_config.warp.lanes_log2; uint32_t const bc_block_rd = (((hsg_config.block.smem_bc * bc_warps_min) / hsg_config.block.warps_max) / hsg_config.block.smem_quantum) * hsg_config.block.smem_quantum; - uint32_t const bc_block_max = max(bc_block_rd,hsg_config.block.smem_min); - uint32_t const bc_block_smem = min(bc_block_max,hsg_config.block.smem_bs); + uint32_t const bc_block_max = MAX_MACRO(bc_block_rd,hsg_config.block.smem_min); + uint32_t const bc_block_smem = MIN_MACRO(bc_block_max,hsg_config.block.smem_bs); // what is the max amount of shared in each possible bc block config? uint32_t const bc_keys = bc_block_smem / (hsg_config.type.words * sizeof(uint32_t)); uint32_t const bc_kpt = bc_keys / bc_threads; uint32_t const bc_kpt_mod = (bc_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod; - merge->rows_bc = min(bc_kpt_mod, hsg_config.thread.regs); + merge->rows_bc = MIN_MACRO(bc_kpt_mod, hsg_config.thread.regs); merge->skpw_bc = bc_keys / bc_warps_min; } } @@ -441,7 +407,7 @@ hsg_merge_levels_hint(struct hsg_merge * const merge, bool const autotune) for (uint32_t level=0; level<MERGE_LEVELS_MAX_LOG2; level++) { // max network - uint32_t const n_max = max(merge->levels[level].networks[0], + uint32_t const n_max = MAX_MACRO(merge->levels[level].networks[0], merge->levels[level].networks[1]); if (n_max <= (merge->rows_bs + hsg_config.thread.xtra)) @@ -533,7 +499,7 @@ hsg_network_copy(struct hsg_op * ops, for (uint32_t ii=0; ii<len; ii++) { - const struct hsg_op * const cx = cxa + ii; + struct hsg_op const * const cx = cxa + ii; ops = hsg_op(ops,CMP_XCHG(cx->a,cx->b,prefix)); } @@ -638,7 +604,7 @@ hsg_warp_half_downto(struct hsg_op * ops, uint32_t h) { ops = hsg_begin(ops); - ops = hsg_op(ops,WARP_HALF(h)); + ops = hsg_op(ops,SLAB_HALF(h)); ops = hsg_warp_half_network(ops); ops = hsg_end(ops); @@ -665,7 +631,7 @@ hsg_warp_flip(struct hsg_op * ops, uint32_t f) { ops = hsg_begin(ops); - ops = hsg_op(ops,WARP_FLIP(f)); + ops = hsg_op(ops,SLAB_FLIP(f)); ops = hsg_warp_flip_network(ops); ops = hsg_end(ops); @@ -782,7 +748,7 @@ hsg_bc_half_merge_level(struct hsg_op * ops, uint32_t const net_even = merge->levels[0].networks[0]; // min of warps in block and remaining horizontal rows - uint32_t const active = min(s_count, net_even); + uint32_t const active = MIN_MACRO(s_count, net_even); // conditional on blockIdx.x if (active < merge->warps) @@ -834,7 +800,7 @@ hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge) // // will only be called with merge->warps >= 2 // - uint32_t const warps = max(merge->warps,hsg_config.block.warps_min); + uint32_t const warps = MAX_MACRO(merge->warps,hsg_config.block.warps_min); // guaranteed to be an even network uint32_t const net_even = merge->levels[0].networks[0]; @@ -851,7 +817,7 @@ hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge) { // compute store count uint32_t const r_rem = hsg_config.thread.regs + 1 - r_lo; - uint32_t const s_count = min(s_max,r_rem); + uint32_t const s_count = MIN_MACRO(s_max,r_rem); // block sync -- can skip if first if (r_lo > 1) @@ -1010,7 +976,7 @@ hsg_bs_flip_merge(struct hsg_op * ops, struct hsg_merge const * const merge) uint32_t r_hi = hsg_config.thread.regs + 1 - r_lo; // compute store count - uint32_t const s_pairs = min(s_pairs_max,r_mid - r_lo); + uint32_t const s_pairs = MIN_MACRO(s_pairs_max,r_mid - r_lo); // store rows to shared for (uint32_t c=0; c<s_pairs; c++) @@ -1082,7 +1048,7 @@ hsg_bs_flip_merge_all(struct hsg_op * ops, const struct hsg_merge * const merge) static struct hsg_op * -hsg_bs_sort(struct hsg_op * ops, const struct hsg_merge * const merge) +hsg_bs_sort(struct hsg_op * ops, struct hsg_merge const * const merge) { // func proto ops = hsg_op(ops,BS_KERNEL_PROTO(merge->index)); @@ -1125,7 +1091,7 @@ hsg_bs_sort_all(struct hsg_op * ops) { for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++) { - const struct hsg_merge* const m = hsg_merge + merge_idx; + struct hsg_merge const * const m = hsg_merge + merge_idx; if (m->warps == 0) break; @@ -1142,7 +1108,7 @@ hsg_bs_sort_all(struct hsg_op * ops) static struct hsg_op * -hsg_bc_clean(struct hsg_op * ops, const struct hsg_merge * const merge) +hsg_bc_clean(struct hsg_op * ops, struct hsg_merge const * const merge) { // func proto ops = hsg_op(ops,BC_KERNEL_PROTO(merge->index)); @@ -1189,7 +1155,7 @@ hsg_bc_clean_all(struct hsg_op * ops) { for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++) { - const struct hsg_merge* const m = hsg_merge + merge_idx; + struct hsg_merge const * const m = hsg_merge + merge_idx; if (m->warps == 0) break; @@ -1215,9 +1181,7 @@ static struct hsg_op * hsg_fm_thread_load_left(struct hsg_op * ops, uint32_t const n) { - uint32_t const mid = n/2; - - for (uint32_t r=1; r<=mid; r++) + for (uint32_t r=1; r<=n; r++) ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_LEFT(r,r-1)); return ops; @@ -1227,9 +1191,7 @@ static struct hsg_op * hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n) { - uint32_t const mid = n/2; - - for (uint32_t r=mid; r>=1; r--) + for (uint32_t r=1; r<=n; r++) ops = hsg_op(ops,FM_REG_GLOBAL_STORE_LEFT(r,r-1)); return ops; @@ -1237,53 +1199,60 @@ hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n) static struct hsg_op * -hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right) +hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const half_span, uint32_t const half_case) { - uint32_t const mid = n / 2; - uint32_t const first = mid + 1; - uint32_t const last = mid + span_right; - - for (uint32_t r=first; r<=last; r++) - ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,r-first)); + for (uint32_t r=0; r<half_case; r++) + ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,half_span+1+r)); return ops; } static struct hsg_op * -hsg_fm_thread_store_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right) +hsg_fm_thread_store_right(struct hsg_op * ops, uint32_t const half_span, uint32_t const half_case) { - uint32_t const mid = n / 2; - uint32_t const first = mid + 1; - uint32_t const last = mid + span_right; - - for (uint32_t r=last; r>=first; r--) - ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,r-first)); + for (uint32_t r=0; r<half_case; r++) + ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,half_span+1+r)); return ops; } static struct hsg_op * -hsg_fm_thread_merge_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right) +hsg_fm_merge(struct hsg_op * ops, + uint32_t const scale_log2, + uint32_t const span_left, + uint32_t const span_right) { - // conditional - ops = hsg_op(ops,FM_MERGE_RIGHT_PRED(n/2,span_right)); + // func proto + ops = hsg_op(ops,FM_KERNEL_PROTO(scale_log2,msb_idx_u32(pow2_ru_u32(span_right)))); // begin ops = hsg_begin(ops); - // load - ops = hsg_fm_thread_load_right(ops,n,span_right); + // preamble for loading/storing + ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span_left)); + + // load left span + ops = hsg_fm_thread_load_left(ops,span_left); + + // load right span + ops = hsg_fm_thread_load_right(ops,span_left,span_right); // compare left and right - ops = hsg_thread_merge_left_right(ops,n/2,span_right); + ops = hsg_thread_merge_left_right(ops,span_left,span_right); + + // left merging network + ops = hsg_thread_merge(ops,span_left); // right merging network - ops = hsg_thread_merge_offset(ops,n/2,span_right); + ops = hsg_thread_merge_offset(ops,span_left,span_right); + + // store + ops = hsg_fm_thread_store_left(ops,span_left); // store - ops = hsg_fm_thread_store_right(ops,n,span_right); + ops = hsg_fm_thread_store_right(ops,span_left,span_right); // end ops = hsg_end(ops); @@ -1293,45 +1262,12 @@ hsg_fm_thread_merge_right(struct hsg_op * ops, uint32_t const n, uint32_t const static struct hsg_op * -hsg_fm_thread_merge_right_all(struct hsg_op * ops, uint32_t const span) +hsg_fm_merge_all(struct hsg_op * ops, uint32_t const scale_log2, uint32_t const warps) { - ops = hsg_fm_thread_merge_right(ops,span,span/2); - - for (uint32_t span_pow2 = pow2_ru_u32(span) / 4; span_pow2 >= 1; span_pow2 /= 2) - { - ops = hsg_fm_thread_merge_right(ops,span,span_pow2); - } - - return ops; -} - -static -struct hsg_op * -hsg_fm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const fm_scale) -{ - // func proto - ops = hsg_op(ops,FM_KERNEL_PROTO(level,fm_scale)); - - // begin - ops = hsg_begin(ops); - - // shared declare - ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span,fm_scale)); - - // load - ops = hsg_fm_thread_load_left(ops,span); - - // right merging network - ops = hsg_fm_thread_merge_right_all(ops,span); - - // left merging network - ops = hsg_thread_merge(ops,span/2); - - // store - ops = hsg_fm_thread_store_left(ops,span); + uint32_t const span_left = (warps << scale_log2) / 2; - // end - ops = hsg_end(ops); + for (uint32_t span_right=span_left; span_right >= 1; span_right=pow2_ru_u32(span_right)/2) + ops = hsg_fm_merge(ops,scale_log2,span_left,span_right); return ops; } @@ -1354,7 +1290,7 @@ static struct hsg_op * hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n) { - for (uint32_t r=n; r>=1; r--) + for (uint32_t r=1; r<=n; r++) ops = hsg_op(ops,HM_REG_GLOBAL_STORE(r,r-1)); return ops; @@ -1362,16 +1298,18 @@ hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n) static struct hsg_op * -hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const hm_scale) +hsg_hm_merge(struct hsg_op * ops, uint32_t const scale_log2, uint32_t const warps_pow2) { + uint32_t const span = warps_pow2 << scale_log2; + // func proto - ops = hsg_op(ops,HM_KERNEL_PROTO(level,level-msb_idx_u32(span))); + ops = hsg_op(ops,HM_KERNEL_PROTO(scale_log2)); // begin ops = hsg_begin(ops); - // declarations - ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span,hm_scale)); + // preamble for loading/storing + ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span/2)); // load ops = hsg_hm_thread_load(ops,span); @@ -1389,55 +1327,6 @@ hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uin } // -// -// - -static -struct hsg_op * -hsg_fm_merge_level(struct hsg_op * ops, uint32_t const level) -{ - uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps); - uint32_t const bc_max_log2 = msb_idx_u32(bc_max); - - uint32_t const fm_level = (level <= bc_max_log2) ? hsg_config.merge.flip.lo : min(level - bc_max_log2,hsg_config.merge.flip.hi); - uint32_t const fm_scale = level - fm_level; - - ops = hsg_fm_merge(ops, - level, - hsg_merge[0].warps * (1u << fm_level), - fm_scale); - - return ops; -} - -// -// -// - -static -struct hsg_op * -hsg_hm_merge_level(struct hsg_op * ops, uint32_t const level) -{ - uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps); - uint32_t const bc_max_log2 = msb_idx_u32(bc_max); - - uint32_t const fm_log2_max = bc_max_log2 + hsg_config.merge.flip.hi; - - if (level > fm_log2_max) - { - uint32_t const down_warps_log2 = level - fm_log2_max; - uint32_t const hm_level = max(hsg_config.merge.half.lo,min(hsg_config.merge.half.hi,down_warps_log2)); - - ops = hsg_hm_merge(ops, - level - hsg_config.merge.flip.hi, - bc_max * (1u << hm_level), - down_warps_log2 - hm_level); - } - - return ops; -} - -// // GENERATE MERGE KERNELS // @@ -1445,23 +1334,20 @@ static struct hsg_op * hsg_xm_merge_all(struct hsg_op * ops) { - uint32_t const keys_per_block = hsg_merge[0].warps * hsg_config.warp.lanes * hsg_config.thread.regs; - uint32_t const blocks = ((1U << hsg_config.merge.max_log2) + keys_per_block - 1) / keys_per_block; - uint32_t const blocks_ru = pow2_ru_u32(blocks); - uint32_t const blocks_log2 = msb_idx_u32(blocks_ru); + uint32_t const warps = hsg_merge[0].warps; + uint32_t const warps_pow2 = pow2_rd_u32(warps); - for (uint32_t level=1; level<=blocks_log2; level+=1) - { - // - // GENERATE FLIP MERGE KERNELS - // - ops = hsg_fm_merge_level(ops,level); + // + // GENERATE FLIP MERGE KERNELS + // + for (uint32_t scale_log2=hsg_config.merge.flip.lo; scale_log2<=hsg_config.merge.flip.hi; scale_log2++) + ops = hsg_fm_merge_all(ops,scale_log2,warps); - // - // GENERATE HALF MERGE KERNELS - // - ops = hsg_hm_merge_level(ops,level); - } + // + // GENERATE HALF MERGE KERNELS + // + for (uint32_t scale_log2=hsg_config.merge.half.lo; scale_log2<=hsg_config.merge.half.hi; scale_log2++) + ops = hsg_hm_merge(ops,scale_log2,warps_pow2); return ops; } @@ -1470,93 +1356,30 @@ hsg_xm_merge_all(struct hsg_op * ops) // // -void -hsg_target_indent(struct hsg_file * const files, uint32_t const depth) -{ - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%*s", - depth*HSG_INDENT,""); -} - -void -hsg_target_debug(struct hsg_file * const files, - const struct hsg_merge * const merge, - const struct hsg_op * const ops, - uint32_t const depth) -{ - - hsg_target_indent(files,depth); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s\n", - hsg_op_type_string[ops->type]); -} - -// -// -// - -static -struct hsg_file* -hsg_files_open(const char * prefix, const char ** suffix) -{ -#define STR_BUF_SIZE 80 - - struct hsg_file * files = malloc(sizeof(struct hsg_file) * HSG_FILE_TYPE_COUNT); - - for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++) - { - char * name = files[ii].name; - - // save prefix - files[ii].prefix = prefix; - - // build filename - strcpy_s(name,STR_BUF_SIZE,prefix); - strcat_s(name,STR_BUF_SIZE,suffix[ii]); - - // open file - fopen_s(&files[ii].file,name,"w+"); - } - - return files; -} - -static -void -hsg_files_close(struct hsg_file * files) -{ - for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++) - fclose(files[ii].file); -} - -// -// -// - static -const struct hsg_op * -hsg_op_translate_depth(hsg_target_pfn target_pfn, - struct hsg_file * const files, - const struct hsg_merge * const merge, - const struct hsg_op * ops, - uint32_t const depth) +struct hsg_op const * +hsg_op_translate_depth(hsg_target_pfn target_pfn, + struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * ops, + uint32_t const depth) { while (ops->type != HSG_OP_TYPE_EXIT) { switch (ops->type) { case HSG_OP_TYPE_END: - target_pfn(files,merge,ops,depth-1); + target_pfn(target,config,merge,ops,depth-1); return ops + 1; case HSG_OP_TYPE_BEGIN: - target_pfn(files,merge,ops,depth); - ops = hsg_op_translate_depth(target_pfn,files,merge,ops+1,depth+1); + target_pfn(target,config,merge,ops,depth); + ops = hsg_op_translate_depth(target_pfn,target,config,merge,ops+1,depth+1); break; default: - target_pfn(files,merge,ops++,depth); + target_pfn(target,config,merge,ops++,depth); } } @@ -1565,12 +1388,13 @@ hsg_op_translate_depth(hsg_target_pfn target_pfn, static void -hsg_op_translate(hsg_target_pfn target_pfn, - struct hsg_file * const files, - const struct hsg_merge * const merge, - const struct hsg_op * ops) +hsg_op_translate(hsg_target_pfn target_pfn, + struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * ops) { - hsg_op_translate_depth(target_pfn,files,merge,ops,0); + hsg_op_translate_depth(target_pfn,target,config,merge,ops,0); } // @@ -1581,37 +1405,27 @@ int main(int argc, char * argv[]) { // - // INIT - // - for (uint32_t ii=0; ii<=MERGE_LEVELS_MAX_LOG2; ii++) - { - hsg_merge[ii].index = ii; - hsg_merge[ii].warps = 32 / (1u << ii); - } - - // // PROCESS OPTIONS // - int32_t arch = 0; - int32_t opt = 0; - - bool quiet = false; - bool autotune = false; + int32_t opt = 0; + bool verbose = false; + bool autotune = false; + char const * arch = "undefined"; - while ((opt = getopt(argc,argv,"hqa:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF) + while ((opt = getopt(argc,argv,"hva:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF) { switch (opt) { case 'h': fprintf(stderr,"Help goes here...\n"); - return -1; + return EXIT_FAILURE; - case 'q': - quiet = true; + case 'v': + verbose = true; break; case 'a': - arch = atoi(optarg); + arch = optarg; break; case 'g': @@ -1635,30 +1449,28 @@ main(int argc, char * argv[]) break; case 'w': - hsg_config.warp.lanes = atoi(optarg); + hsg_config.warp.lanes = atoi(optarg); + hsg_config.warp.lanes_log2 = msb_idx_u32(hsg_config.warp.lanes); break; case 'b': // maximum warps in a workgroup / cta / thread block { - uint32_t const warps = atoi(optarg); - uint32_t const warps_ru_pow2 = pow2_ru_u32(warps); - - // set warps_max if not already set - if (hsg_config.block.warps_max == UINT32_MAX) - hsg_config.block.warps_max = warps_ru_pow2; + uint32_t const warps = atoi(optarg); // must always be even - if ((warps&1) != 0) + if ((warps & 1) != 0) { fprintf(stderr,"Error: -b must be even.\n"); - exit(-1); + return EXIT_FAILURE; } + hsg_merge[0].index = 0; hsg_merge[0].warps = warps; - for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++) - hsg_merge[ii].warps = warps_ru_pow2 / (1u << ii); + // set warps_max if not already set + if (hsg_config.block.warps_max == UINT32_MAX) + hsg_config.block.warps_max = pow2_ru_u32(warps); } break; @@ -1677,18 +1489,14 @@ main(int argc, char * argv[]) hsg_config.block.warps_mod = atoi(optarg); break; - case 'k': - hsg_config.merge.max_log2 = atoi(optarg); - break; - case 'r': { uint32_t const regs = atoi(optarg); - if ((regs&1) != 0) + if ((regs & 1) != 0) { fprintf(stderr,"Error: -r must be even.\n"); - exit(-1); + return EXIT_FAILURE; } hsg_config.thread.regs = regs; @@ -1726,17 +1534,39 @@ main(int argc, char * argv[]) } // - // WHICH ARCH TARGET? + // INIT MERGE // - hsg_target_pfn hsg_target_pfn = (arch < HSG_TARGET_PFN_COUNT) ? hsg_target_pfns[arch] : hsg_target_debug; + uint32_t const warps_ru_pow2 = pow2_ru_u32(hsg_merge[0].warps); + + for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++) + { + hsg_merge[ii].index = ii; + hsg_merge[ii].warps = warps_ru_pow2 >> ii; + } // - // OPEN FILES + // WHICH ARCH TARGET? // - struct hsg_file * files = hsg_files_open(hsg_target_pfn_string[arch],hsg_file_type_string[arch]); + hsg_target_pfn hsg_target_pfn; + + if (strcmp(arch,"debug") == 0) + hsg_target_pfn = hsg_target_debug; + else if (strcmp(arch,"cuda") == 0) + hsg_target_pfn = hsg_target_cuda; + else if (strcmp(arch,"opencl") == 0) + hsg_target_pfn = hsg_target_opencl; + else if (strcmp(arch,"glsl") == 0) + hsg_target_pfn = hsg_target_glsl; + else { + fprintf(stderr,"Invalid arch: %s\n",arch); + exit(EXIT_FAILURE); + } + + if (verbose) + fprintf(stderr,"Target: %s\n",arch); // - // INIT F_KEYS + // INIT SMEM KEY ALLOCATION // hsg_config_init_shared(); @@ -1766,27 +1596,26 @@ main(int argc, char * argv[]) // // THESE ARE FOR DEBUG/INSPECTION // - - if (!quiet) + if (verbose) { hsg_merge_levels_debug(merge); } } - if (!quiet) + if (verbose) fprintf(stderr,"\n\n"); // + // GENERATE THE OPCODES // - // - uint32_t const op_count = 1024*1024; // 2^20 ops for now! - struct hsg_op * const ops_begin = malloc(op_count * sizeof(*ops_begin)); + uint32_t const op_count = 1<<17; + struct hsg_op * const ops_begin = malloc(sizeof(*ops_begin) * op_count); struct hsg_op * ops = ops_begin; // - // APPEND HEADER + // OPEN INITIAL FILES AND APPEND HEADER // - ops = hsg_op(ops,FILE_HEADER()); + ops = hsg_op(ops,TARGET_BEGIN()); // // GENERATE TRANSPOSE KERNEL @@ -1809,9 +1638,9 @@ main(int argc, char * argv[]) ops = hsg_xm_merge_all(ops); // - // APPEND FOOTER + // APPEND FOOTER AND CLOSE INITIAL FILES // - ops = hsg_op(ops,FILE_FOOTER()); + ops = hsg_op(ops,TARGET_END()); // // ... WE'RE DONE! @@ -1821,20 +1650,17 @@ main(int argc, char * argv[]) // // APPLY TARGET TRANSLATOR TO ACCUMULATED OPS // - hsg_op_translate(hsg_target_pfn,files,hsg_merge,ops_begin); + struct hsg_target target; - // - // - // - if (!quiet) - hsg_op_debug(); + hsg_op_translate(hsg_target_pfn,&target,&hsg_config,hsg_merge,ops_begin); // + // DUMP INSTRUCTION COUNTS // - // - hsg_files_close(files); + if (verbose) + hsg_op_debug(); - return 0; + return EXIT_SUCCESS; } // diff --git a/src/compute/hs/gen/networks_merging.c b/src/compute/hs/gen/networks_merging.c index 90dca03c21..f93958c842 100644 --- a/src/compute/hs/gen/networks_merging.c +++ b/src/compute/hs/gen/networks_merging.c @@ -11,7 +11,7 @@ // #include "networks.h" -#include "macros.h" +#include "common/macros.h" // // @@ -24,7 +24,7 @@ // // -#define LM(n) { ARRAY_LENGTH(mn##n), mn##n } +#define LM(n) { ARRAY_LENGTH_MACRO(mn##n), mn##n } // // diff --git a/src/compute/hs/gen/networks_sorting.c b/src/compute/hs/gen/networks_sorting.c index c7beb6b45e..3d8d364399 100644 --- a/src/compute/hs/gen/networks_sorting.c +++ b/src/compute/hs/gen/networks_sorting.c @@ -14,7 +14,7 @@ // #include "networks.h" -#include "macros.h" +#include "common/macros.h" // // @@ -27,7 +27,7 @@ // // -#define LS(n) { ARRAY_LENGTH(sn##n), sn##n } +#define LS(n) { ARRAY_LENGTH_MACRO(sn##n), sn##n } // // diff --git a/src/compute/hs/gen/target_cuda.c b/src/compute/hs/gen/target_cuda.c new file mode 100644 index 0000000000..e140c4be4c --- /dev/null +++ b/src/compute/hs/gen/target_cuda.c @@ -0,0 +1,600 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#include <stdio.h> +#include <stdlib.h> + +// +// +// + +#include "gen.h" +#include "transpose.h" + +#include "common/util.h" +#include "common/macros.h" + +// +// +// + +struct hsg_transpose_state +{ + FILE * header; + struct hsg_config const * config; +}; + +static +char +hsg_transpose_reg_prefix(uint32_t const cols_log2) +{ + return 'a' + (('r' + cols_log2 - 'a') % 26); +} + +static +void +hsg_transpose_blend(uint32_t const cols_log2, + uint32_t const row_ll, // lower-left + uint32_t const row_ur, // upper-right + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(cols_log2-1), + hsg_transpose_reg_prefix(cols_log2), + cols_log2,row_ll+1,row_ur+1); +} + +static +void +hsg_transpose_remap(uint32_t const row_from, + uint32_t const row_to, + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(state->config->warp.lanes_log2), + row_from+1,row_to+1); +} + +// +// +// + +static +void +hsg_copyright(FILE * file) +{ + fprintf(file, + "// \n" + "// Copyright 2016 Google Inc. \n" + "// \n" + "// Use of this source code is governed by a BSD-style \n" + "// license that can be found in the LICENSE file. \n" + "// \n" + "\n"); +} + +// +// +// + +struct hsg_target_state +{ + FILE * header; + FILE * source; +}; + +// +// +// + +void +hsg_target_cuda(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth) +{ + switch (ops->type) + { + case HSG_OP_TYPE_END: + fprintf(target->state->source, + "}\n"); + break; + + case HSG_OP_TYPE_BEGIN: + fprintf(target->state->source, + "{\n"); + break; + + case HSG_OP_TYPE_ELSE: + fprintf(target->state->source, + "else\n"); + break; + + case HSG_OP_TYPE_TARGET_BEGIN: + { + // allocate state + target->state = malloc(sizeof(*target->state)); + + // allocate files + fopen_s(&target->state->header,"hs_cuda.h", "wb"); + fopen_s(&target->state->source,"hs_cuda.cu","wb"); + + // initialize header + uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps)); + + hsg_copyright(target->state->header); + + fprintf(target->state->header, + "#ifndef HS_CUDA_ONCE \n" + "#define HS_CUDA_ONCE \n" + " \n" + "#define HS_SLAB_THREADS_LOG2 %u \n" + "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n" + "#define HS_SLAB_WIDTH_LOG2 %u \n" + "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n" + "#define HS_SLAB_HEIGHT %u \n" + "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n" + "#define HS_REG_LAST(c) c##%u \n" + "#define HS_KEY_TYPE %s \n" + "#define HS_KEY_WORDS %u \n" + "#define HS_VAL_WORDS 0 \n" + "#define HS_BS_SLABS %u \n" + "#define HS_BS_SLABS_LOG2_RU %u \n" + "#define HS_BC_SLABS_LOG2_MAX %u \n" + "#define HS_FM_SCALE_MIN %u \n" + "#define HS_FM_SCALE_MAX %u \n" + "#define HS_HM_SCALE_MIN %u \n" + "#define HS_HM_SCALE_MAX %u \n" + "#define HS_EMPTY \n" + " \n", + config->warp.lanes_log2, + config->warp.lanes_log2, + config->thread.regs, + config->thread.regs, + (config->type.words == 2) ? "ulong" : "uint", + config->type.words, + merge->warps, + msb_idx_u32(pow2_ru_u32(merge->warps)), + bc_max, + config->merge.flip.lo, + config->merge.flip.hi, + config->merge.half.lo, + config->merge.half.hi); + + fprintf(target->state->header, + "#define HS_SLAB_ROWS() \\\n"); + + for (uint32_t ii=1; ii<=config->thread.regs; ii++) + fprintf(target->state->header, + " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + + fprintf(target->state->header, + "#define HS_TRANSPOSE_SLAB() \\\n"); + + for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++) + fprintf(target->state->header, + " HS_TRANSPOSE_STAGE( %u ) \\\n",ii); + + struct hsg_transpose_state state[1] = + { + { .header = target->state->header, + .config = config + } + }; + + hsg_transpose(config->warp.lanes_log2, + config->thread.regs, + hsg_transpose_blend,state, + hsg_transpose_remap,state); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + + hsg_copyright(target->state->source); + + fprintf(target->state->source, + "#include \"hs_cuda_macros.h\" \n" + " \n" + "// \n" + "// \n" + "// \n"); + } + break; + + case HSG_OP_TYPE_TARGET_END: + // decorate the files + fprintf(target->state->header, + "#endif \n" + " \n" + "// \n" + "// \n" + "// \n" + " \n"); + fprintf(target->state->source, + " \n" + "// \n" + "// \n" + "// \n" + " \n"); + + // close files + fclose(target->state->header); + fclose(target->state->source); + + // free state + free(target->state); + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO: + { + fprintf(target->state->source, + "\nHS_TRANSPOSE_KERNEL_PROTO(%u)\n", + config->warp.lanes); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE: + { + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY: + { + fprintf(target->state->source, + "HS_TRANSPOSE_SLAB()\n"); + } + break; + + case HSG_OP_TYPE_BS_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const bs = pow2_ru_u32(m->warps); + uint32_t const msb = msb_idx_u32(bs); + + fprintf(target->state->source, + "\nHS_BS_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BS_KERNEL_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bs); + } + + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_BC_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const msb = msb_idx_u32(m->warps); + + fprintf(target->state->source, + "\nHS_BC_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BC_KERNEL_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bc); + } + + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_FM_KERNEL_PROTO: + fprintf(target->state->source, + "\nHS_FM_KERNEL_PROTO(%u,%u)\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_FM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_HM_KERNEL_PROTO: + { + fprintf(target->state->source, + "\nHS_HM_KERNEL_PROTO(%u)\n", + ops->a); + } + break; + + case HSG_OP_TYPE_HM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_HM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD: + { + static char const * const vstr[] = { "vin", "vout" }; + + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n", + ops->n,vstr[ops->v],config->warp.lanes,ops->n-1); + } + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n", + config->warp.lanes,ops->n-1,ops->n); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT: + fprintf(target->state->source, + "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED: + { + if (ops->a <= ops->b) + { + fprintf(target->state->source, + "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n"); + } + else if (ops->b > 1) + { + fprintf(target->state->source, + "else if (fm_frac == %u)\n", + ops->b); + } + else + { + fprintf(target->state->source, + "else\n"); + } + } + break; + + case HSG_OP_TYPE_SLAB_FLIP: + fprintf(target->state->source, + "HS_SLAB_FLIP_PREAMBLE(%u);\n", + ops->n-1); + break; + + case HSG_OP_TYPE_SLAB_HALF: + fprintf(target->state->source, + "HS_SLAB_HALF_PREAMBLE(%u);\n", + ops->n / 2); + break; + + case HSG_OP_TYPE_CMP_FLIP: + fprintf(target->state->source, + "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c); + break; + + case HSG_OP_TYPE_CMP_HALF: + fprintf(target->state->source, + "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b); + break; + + case HSG_OP_TYPE_CMP_XCHG: + if (ops->c == UINT32_MAX) + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%-3u,r%-3u);\n", + ops->a,ops->b); + } + else + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%u_%u,r%u_%u);\n", + ops->c,ops->a,ops->c,ops->b); + } + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_V: + fprintf(target->state->source, + "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n", + merge[ops->a].warps,config->warp.lanes,ops->c,ops->b); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,merge[ops->a].warps,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,ops->a,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n", + ops->c, + ops->a, + config->warp.lanes,ops->b); + break; + + case HSG_OP_TYPE_BLOCK_SYNC: + fprintf(target->state->source, + "HS_BLOCK_BARRIER();\n"); + // + // FIXME - Named barriers to allow coordinating warps to proceed? + // + break; + + case HSG_OP_TYPE_BS_FRAC_PRED: + { + if (ops->m == 0) + { + fprintf(target->state->source, + "if (warp_idx < bs_full)\n"); + } + else + { + fprintf(target->state->source, + "else if (bs_frac == %u)\n", + ops->w); + } + } + break; + + case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n", + config->warp.lanes,m->warps); + } + break; + + case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n", + config->warp.lanes,config->thread.regs,m->warps); + } + break; + + case HSG_OP_TYPE_BX_MERGE_H_PRED: + fprintf(target->state->source, + "if (get_sub_group_id() < %u)\n", + ops->a); + break; + + case HSG_OP_TYPE_BS_ACTIVE_PRED: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps <= 32) + { + fprintf(target->state->source, + "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n", + m->levels[ops->b].active.b32a2[0]); + } + else + { + fprintf(target->state->source, + "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n", + m->levels[ops->b].active.b32a2[1], + m->levels[ops->b].active.b32a2[0]); + } + } + break; + + default: + fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]); + exit(EXIT_FAILURE); + break; + } +} + +// +// +// diff --git a/src/compute/hs/gen/target_cuda_sm3x.c b/src/compute/hs/gen/target_cuda_sm3x.c deleted file mode 100644 index 6369aa33b0..0000000000 --- a/src/compute/hs/gen/target_cuda_sm3x.c +++ /dev/null @@ -1,776 +0,0 @@ -/* - * Copyright 2016 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#include <stdio.h> - -// -// -// - -#include "gen.h" -#include "util.h" - -// -// -// - -void -hsg_target_cuda_sm3x(struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth) -{ - const char* const type = (hsg_config.type.words == 2) ? "uint64_t" : "uint32_t"; - const char* const type_max = (hsg_config.type.words == 2) ? "UINT64_MAX" : "UINT32_MAX"; - - switch (ops->type) - { - case HSG_OP_TYPE_END: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "}\n"); - break; - - case HSG_OP_TYPE_BEGIN: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "{\n"); - break; - - case HSG_OP_TYPE_ELSE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else\n"); - break; - - case HSG_OP_TYPE_FILE_HEADER: - { - uint32_t const bc_min = msb_idx_u32(hsg_config.block.warps_min); - uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps)); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "// \n" - "// Copyright 2016 Google Inc. \n" - "// \n" - "// Use of this source code is governed by a BSD-style \n" - "// license that can be found in the LICENSE file. \n" - "// \n" - " \n" - "#pragma once \n" - " \n" - "#include <stdint.h> \n" - " \n" - "#define HS_LANES_PER_WARP %u \n" - "#define HS_BS_WARPS_PER_BLOCK %u \n" - "#define HS_BC_WARPS_LOG2_MIN %u \n" - "#define HS_BC_WARPS_LOG2_MAX %u \n" - "#define HS_KEYS_PER_THREAD %u \n" - "#define HS_KEY_WORDS %u \n" - "#define HS_KEY_TYPE %s \n" - " \n" - "#include <%s_args.h> \n" - " \n", - hsg_config.warp.lanes, - merge->warps, - bc_min, - bc_max, - hsg_config.thread.regs, - hsg_config.type.words, - type, - files[HSG_FILE_TYPE_SOURCE].prefix); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "// -*- compile-command: \"nvcc -arch sm_52 -Xptxas=-v,-abi=no -cubin -I. %s\"; -*-\n", - files[HSG_FILE_TYPE_SOURCE].name); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "// \n" - "// Copyright 2016 Google Inc. \n" - "// \n" - "// Use of this source code is governed by a BSD-style \n" - "// license that can be found in the LICENSE file. \n" - "// \n" - " \n" - "#ifdef __cplusplus \n" - "extern \"C\" { \n" - "#endif \n" - " \n" - "#include \"%s_launcher.h\" \n" - " \n" - "#ifdef __cplusplus \n" - "} \n" - "#endif \n" - " \n" - "#include \"%s_launch_bounds.h\" \n" - "#include <%s_finalize.inl> \n" - " \n" - "// \n" - "// \n" - "// \n", - files[HSG_FILE_TYPE_HEADER].prefix, - files[HSG_FILE_TYPE_SOURCE].prefix, - files[HSG_FILE_TYPE_SOURCE].prefix); - } - break; - - case HSG_OP_TYPE_FILE_FOOTER: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "// \n" - "// \n" - "// \n" - " \n" - "#include \"%s_launcher.inl\" \n" - " \n" - "// \n" - "// \n" - "// \n", - files[HSG_FILE_TYPE_SOURCE].prefix); - break; - - case HSG_OP_TYPE_BS_KERNEL_PROTO: - { - const uint32_t tpb = merge->warps * hsg_config.warp.lanes; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "extern \"C\" \n" - "__global__ \n" - "__launch_bounds__(%u,%u) \n" - "void \n" - "hs_bs_kernel(const struct hs_args args) \n", - tpb,1); - } - break; - - case HSG_OP_TYPE_BS_KERNEL_PREAMBLE: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "__shared__ union { \n"); - - for (uint32_t ii=0; ii<MERGE_LEVELS_MAX_LOG2; ii++) - { - const struct hsg_merge* const m = merge + ii; - - if (m->warps < 2) - break; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " %s m%u[%u][%u];\n", - type, - ii, - m->rows_bs, - m->warps * hsg_config.warp.lanes); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " struct { \n" - " %s f[%u][%u]; \n" - " %s l[%u]; \n" - " }; \n", - type, - merge[0].warps, - hsg_config.warp.skpw_bs - 1, - type, - merge[0].warps); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "} shared; \n" - " \n"); - - const uint32_t kpw = hsg_config.warp.lanes * hsg_config.thread.regs; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t block_warp_idx = threadIdx.x / %u; \n" - "const int32_t warp_lane_idx = threadIdx.x & %u; \n" - "const int32_t warp_idx = blockIdx.x * %u + block_warp_idx; \n" - "const int32_t warp_gmem_idx = warp_idx * %u + warp_lane_idx; \n" - " \n" - "%s const * const vin_ptr = args.vin + warp_gmem_idx; \n" - "%s * const vout_ptr = args.vout + warp_gmem_idx; \n" - " \n", - - hsg_config.warp.lanes, - hsg_config.warp.lanes - 1, - merge[0].warps, - kpw, - type, - type); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (warp_idx >= args.bs.full + args.bs.frac) \n" - " return; \n" - " \n"); - } - break; - - case HSG_OP_TYPE_BC_KERNEL_PROTO: - { - uint32_t const bc_warps = merge[ops->a].warps; - uint32_t const tpb = bc_warps * hsg_config.warp.lanes; - uint32_t const bpm = hsg_config.block.warps_max / bc_warps; - uint32_t const msb = msb_idx_u32(bc_warps); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "extern \"C\" \n" - "__global__ \n" - "__launch_bounds__(%u,%u) \n" - "void \n" - "hs_bc_%u_kernel(const struct hs_args args) \n", - tpb,bpm, - msb); - } - break; - - case HSG_OP_TYPE_BC_KERNEL_PREAMBLE: - { - const struct hsg_merge* const m = merge + ops->a; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "__shared__ union { \n"); - - if (m->warps >= 2) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " %s m%u[%u][%u]; \n", - type, - ops->a, - m->rows_bc, - m->warps * hsg_config.warp.lanes); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " struct { \n" - " %s f[%u][%u]; \n" - " %s l[%u]; \n" - " }; \n" - "} shared; \n" - " \n", - type,m->warps,m->skpw_bc - 1, - type,m->warps); - - const uint32_t kpw = hsg_config.warp.lanes * hsg_config.thread.regs; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t block_warp_idx = threadIdx.x / %u; \n" - "const int32_t warp_lane_idx = threadIdx.x & %u; \n" - "const int32_t warp_gmem_base = blockIdx.x * %u * %u + warp_lane_idx; \n" - "const int32_t warp_gmem_idx = warp_gmem_base + block_warp_idx * %u; \n" - " \n" - "%s * const vout_ptr = args.vout + warp_gmem_idx; \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes - 1, - m->warps,kpw, - kpw, - type); - -#if 0 - // - // NO LONGER NEED THIS TEST - // - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (warp_idx >= args.bc.full) \n" - " return; \n" - " \n"); -#endif - } - break; - - case HSG_OP_TYPE_FM_KERNEL_PROTO: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "#define HS_FM_WARPS_LOG2_%u %u \n" - "extern \"C\" \n" - "__global__ \n" - "HS_FM_LAUNCH_BOUNDS_%u \n" - "void \n" - "hs_fm_%u_kernel(const struct hs_args args) \n", - ops->a, - ops->b, - ops->a - ops->b, - ops->a); - break; - - case HSG_OP_TYPE_FM_KERNEL_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t warp_idx = (blockDim.x * blockIdx.x + threadIdx.x) / %u; \n" - "const int32_t warp_lane_idx = threadIdx.x & %u; \n" - " \n" - "const int32_t merge_idx = warp_idx / %u >> %u; \n" - " \n" - "const int32_t merge_stride = %u * %u << %u; \n" - "const int32_t merge_keys = merge_stride * %u; \n" - " \n" - "const int32_t merge_base = merge_idx * merge_keys; \n" - " \n" - "const int32_t merge_l_off = (warp_idx - merge_idx * (%u << %u)) * %u + warp_lane_idx; \n" - "const int32_t merge_l_end = merge_l_off + merge_stride * (%u / 2 - 1); \n" - "%s * const merge_l = args.vout + merge_base + merge_l_off; \n" - " \n" - "const int32_t merge_r_off = merge_keys - merge_l_end - 1; \n" - "%s * const merge_r = args.vout + merge_base + merge_r_off; \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes-1, - hsg_config.thread.regs,ops->b, - hsg_config.thread.regs,hsg_config.warp.lanes,ops->b, - ops->a, - hsg_config.thread.regs,ops->b,hsg_config.warp.lanes, - ops->a, - type, - type); - break; - - case HSG_OP_TYPE_HM_KERNEL_PROTO: - { - const uint32_t bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps)); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "#define HS_HM_WARPS_LOG2_%u %u \n" - "extern \"C\" \n" - "__global__ \n" - "HS_HM_LAUNCH_BOUNDS_%u \n" - "void \n" - "hs_hm_%u_kernel(const struct hs_args args) \n", - ops->a, - ops->b, - ops->a - ops->b - bc_max, - ops->a); - } - break; - - case HSG_OP_TYPE_HM_KERNEL_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t warp_idx = (blockDim.x * blockIdx.x + threadIdx.x) / %u; \n" - "const int32_t warp_lane_idx = threadIdx.x & %u; \n" - " \n" - "const int32_t merge_idx = (warp_idx / %u) >> %u; \n" - " \n" - "const int32_t merge_stride = %u * %u << %u; \n" - "const int32_t merge_keys = merge_stride * %u; \n" - " \n" - "const int32_t merge_base = merge_idx * merge_keys; \n" - " \n" - "const int32_t merge_off = (warp_idx - merge_idx * (%u << %u)) * %u; \n" - "%s * const merge_ptr = args.vout + merge_base + merge_off + warp_lane_idx; \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes-1, - hsg_config.thread.regs,ops->b, - hsg_config.thread.regs,hsg_config.warp.lanes,ops->b, - ops->a, - hsg_config.thread.regs,ops->b,hsg_config.warp.lanes, - type); - break; - - case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD: - { - static const char* const vstr[] = { "vin_ptr", "vout_ptr" }; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%-3u = %s[%-3u * %u]; \n", - type,ops->n,vstr[ops->v],ops->n-1,hsg_config.warp.lanes); - } - break; - - case HSG_OP_TYPE_BX_REG_GLOBAL_STORE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "vout_ptr[%-3u * %u] = r%u; \n", - ops->n-1,hsg_config.warp.lanes,ops->n); - break; - -#if 0 - case HSG_OP_TYPE_BX_WARP_STORE_PRED: - if (ops->a == 1) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (!args.is_final) \n"); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (blockIdx.x * %u + block_warp_idx >= args.bx.ru) \n" - "{ \n" - " return; \n" - "} \n" - "else if (!args.is_final) \n", - ops->a); - } - break; -#endif - - case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%-3u = merge_ptr[%-3u * merge_stride];\n", - type,ops->a,ops->b); - break; - - case HSG_OP_TYPE_HM_REG_GLOBAL_STORE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_ptr[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%-3u = merge_l[%-3u * merge_stride];\n", - type,ops->a,ops->b); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_l[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%-3u = merge_r[%-3u * merge_stride];\n", - type,ops->a,ops->b); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_r[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case HSG_OP_TYPE_WARP_FLIP: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t flip_lane_mask = %u; \n" - "const int32_t flip_lane_idx = warp_lane_idx ^ flip_lane_mask; \n" - "const bool t_lt = warp_lane_idx < flip_lane_idx; \n", - ops->n-1); - } - break; - - case HSG_OP_TYPE_WARP_HALF: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const int32_t half_lane_mask = %u; \n" - "const int32_t half_lane_idx = warp_lane_idx ^ half_lane_mask; \n" - "const bool t_lt = warp_lane_idx < half_lane_idx; \n", - ops->n / 2); - } - break; - - case HSG_OP_TYPE_CMP_FLIP: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_FLIP(r%-3u,r%-3u,r%-3u)\n",ops->a,ops->b,ops->c); - break; - - case HSG_OP_TYPE_CMP_HALF: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_HALF(r%-3u,r%-3u)\n",ops->a,ops->b); - break; - - case HSG_OP_TYPE_CMP_XCHG: - if (ops->c == UINT32_MAX) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_XCHG(r%-3u,r%-3u)\n", - ops->a,ops->b); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_XCHG(r%u_%u,r%u_%u)\n", - ops->c,ops->a,ops->c,ops->b); - } - break; - - case HSG_OP_TYPE_BS_REG_SHARED_STORE_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "smem_v[%-3u * %-2u * %-3u] = r%u;\n", - ops->a,hsg_config.warp.lanes,ops->c,ops->b); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "r%-3u = smem_v[%-3u * %-2u * %-3u];\n", - ops->b,ops->a,hsg_config.warp.lanes,ops->c); - break; - - case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%-3u = smem_v[%-3u * %-2u * %-3u];\n", - type,ops->b,ops->a,hsg_config.warp.lanes,ops->c); - break; - - case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "smem_l[%5u] = r%u_%u;\n", - ops->b * hsg_config.warp.lanes, - ops->c, - ops->a); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "smem_r[%5u] = r%u_%u;\n", - ops->b * hsg_config.warp.lanes, - ops->c, - ops->a); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%u_%-3u = smem_l[%u];\n", - type, - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%u_%-3u = smem_r[%u];\n", - type, - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s r%u_%-3u = gmem_l[%u];\n", - type, - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - -#if 0 - case HSG_OP_TYPE_REG_F_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s* const f_%u_smem_st_ptr = &shared.f[block_warp_idx]", - type, - ops->a); - - if (ops->a >= (int32_t)hsg_config.warp.lanes) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "[warp_lane_idx * %u];\n", - (ops->a / hsg_config.warp.lanes) * hsg_config.warp.lanes + 1); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "[(warp_lane_idx & 0x%X) * %u + (warp_lane_idx & ~0x%X)];\n", - ops->a-1, - hsg_config.warp.lanes + 1, - ops->a-1); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "const %s* const f_%u_smem_ld_ptr = &shared.f[block_warp_idx][warp_lane_idx];\n", - type, - ops->a); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s* const f_%u_gmem_st_ptr = args.vout + warp_gmem_idx", - type, - ops->a); - - if (ops->a >= (int32_t)hsg_config.warp.lanes) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file,";\n"); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " - warp_lane_idx + (warp_lane_idx & ~0x%X) * %u + (warp_lane_idx & 0x%X);\n", - ops->a-1, - hsg_config.thread.regs, - ops->a-1); - } - break; - - case HSG_OP_TYPE_REG_SHARED_STORE_F: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "f_%u_smem_st_ptr[%-3u] = r%u;\n", - ops->c, - ops->b, - ops->a); - break; - - case HSG_OP_TYPE_REG_SHARED_LOAD_F: - if (ops->c >= (int32_t)hsg_config.warp.lanes) - { - uint32_t const adjacent = ops->c / hsg_config.warp.lanes; - uint32_t const stride = adjacent * hsg_config.warp.lanes + 1; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "r%-3u = f_%u_smem_ld_ptr[%-3u];\n", - ops->a, - ops->c, - (ops->b / adjacent) * stride + (ops->b % adjacent) * hsg_config.warp.lanes); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "r%-3u = f_%u_smem_ld_ptr[%-3u];\n", - ops->a, - ops->c, - ops->b * (hsg_config.warp.lanes + 1)); - } - break; - - case HSG_OP_TYPE_REG_GLOBAL_STORE_F: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "f_%u_gmem_st_ptr[%-3u * %u + %-3u] = r%u;\n", - ops->c, - ops->b, - hsg_config.thread.regs, // hsg_config.warp.lanes, - (ops->a - 1) & ~(ops->c - 1), - ops->a); - break; -#endif - -#if 0 - case HSG_OP_TYPE_FINALIZE: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_FINALIZE(%s,args,shared.f[block_warp_idx],shared.l,\n" - " block_warp_idx,warp_lane_idx,warp_gmem_idx,\n" - " r%-3u", - ops->a == 1 ? "true" : "false", - 1); - -#define HS_WARP_FINALIZE_PRETTY_PRINT 8 - - for (uint32_t r=2; r<=hsg_config.thread.regs; r++) - { - if (r % HS_WARP_FINALIZE_PRETTY_PRINT == 1) - fprintf(files[HSG_FILE_TYPE_SOURCE].file,",\n"); - else - fprintf(files[HSG_FILE_TYPE_SOURCE].file,","); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file,"r%-3u",r); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file,");\n"); - } - break; -#endif - - case HSG_OP_TYPE_BLOCK_SYNC: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "__syncthreads();\n"); - break; - - case HSG_OP_TYPE_BS_FRAC_PRED: - { - if (ops->m == 0) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (warp_idx < args.bs.full)\n"); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else if (args.bs.frac == %u)\n", - ops->w); - } - } - break; - -#if 0 // DELETED - case HSG_OP_TYPE_BX_MERGE_V_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s * const smem_v = shared.m%u[0] + threadIdx.x; \n", - type,ops->a); - break; -#endif - - case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE: - if (ops->c == 0) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s * smem_l = shared.m%u[block_warp_idx ] + warp_lane_idx; \n" - "%s * smem_r = shared.m%u[block_warp_idx ^ 1] + (warp_lane_idx ^ %u); \n", - type,ops->a, - type,ops->a,hsg_config.warp.lanes-1); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "smem_l = shared.m%u[block_warp_idx ] + warp_lane_idx; \n" - "smem_r = shared.m%u[block_warp_idx ^ 1] + (warp_lane_idx ^ %u); \n", - ops->a, - ops->a,hsg_config.warp.lanes-1); - } - break; - - case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "%s const * const gmem_l = args.vout + (warp_gmem_base + block_warp_idx * %u); \n" - "%s * const smem_l = shared.m%u[block_warp_idx] + warp_lane_idx; \n" - "%s * const smem_v = shared.m%u[0] + threadIdx.x; \n", - type,hsg_config.warp.lanes, - type,ops->a, - type,ops->a); - break; - - case HSG_OP_TYPE_BX_MERGE_H_PRED: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (threadIdx.x < %u)\n", - ops->a * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BS_ACTIVE_PRED: - { - const struct hsg_merge* const m = merge + ops->a; - - if (m->warps <= 32) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (((1u << block_warp_idx) & 0x%08X) != 0)\n", - m->levels[ops->b].active.b32a2[0]); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (((1UL << block_warp_idx) & 0x%08X%08XL) != 0L)\n", - m->levels[ops->b].active.b32a2[1], - m->levels[ops->b].active.b32a2[0]); - } - } - break; - - case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED: - { - if (ops->a == ops->b) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (merge_idx < args.fm.full) \n"); - } - else if (ops->b > 1) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else if (args.fm.frac == %u) \n", - ops->b); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else\n"); - } - } - break; - - default: - hsg_target_debug(files,merge,ops,depth); - break; - } -} - -// -// -// diff --git a/src/compute/hs/gen/target_debug.c b/src/compute/hs/gen/target_debug.c new file mode 100644 index 0000000000..1481ca8041 --- /dev/null +++ b/src/compute/hs/gen/target_debug.c @@ -0,0 +1,73 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#include <stdio.h> +#include <stdlib.h> + +// +// +// + +#include "gen.h" + +// +// +// + +#define HSG_INDENT 2 + +// +// +// + +struct hsg_target_state +{ + FILE * txt; +}; + +// +// +// + +void +hsg_target_indent(struct hsg_target * const target, uint32_t const depth) +{ + fprintf(target->state->txt, + "%*s", + depth*HSG_INDENT,""); +} + +void +hsg_target_debug(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth) +{ + if (ops->type == HSG_OP_TYPE_TARGET_BEGIN) + { + target->state = malloc(sizeof(*target->state)); + fopen_s(&target->state->txt,"hs_debug.txt","wb"); + } + + hsg_target_indent(target,depth); + + fprintf(target->state->txt, + "%s\n", + hsg_op_type_string[ops->type]); + + if (ops->type == HSG_OP_TYPE_TARGET_END) + { + fclose(target->state->txt); + free(target->state); + } +} + +// +// +// diff --git a/src/compute/hs/gen/target_glsl.c b/src/compute/hs/gen/target_glsl.c new file mode 100644 index 0000000000..2bb75797ab --- /dev/null +++ b/src/compute/hs/gen/target_glsl.c @@ -0,0 +1,674 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#include <stdio.h> +#include <stdlib.h> + +// +// +// + +#include "gen.h" +#include "transpose.h" + +#include "common/util.h" +#include "common/macros.h" + +// +// +// + +struct hsg_transpose_state +{ + FILE * header; + struct hsg_config const * config; +}; + +static +char +hsg_transpose_reg_prefix(uint32_t const cols_log2) +{ + return 'a' + (('r' + cols_log2 - 'a') % 26); +} + +static +void +hsg_transpose_blend(uint32_t const cols_log2, + uint32_t const row_ll, // lower-left + uint32_t const row_ur, // upper-right + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(cols_log2-1), + hsg_transpose_reg_prefix(cols_log2), + cols_log2,row_ll+1,row_ur+1); +} + +static +void +hsg_transpose_remap(uint32_t const row_from, + uint32_t const row_to, + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(state->config->warp.lanes_log2), + row_from+1,row_to+1); +} + +// +// +// + +static +void +hsg_copyright(FILE * file) +{ + fprintf(file, + "// \n" + "// Copyright 2016 Google Inc. \n" + "// \n" + "// Use of this source code is governed by a BSD-style \n" + "// license that can be found in the LICENSE file. \n" + "// \n" + "\n"); +} + +static +void +hsg_macros(FILE * file) +{ + fprintf(file, + "#include \"hs_glsl_macros.h\" \n" + " \n" + "// \n" + "// \n" + "// \n" + "\n"); +} + +// +// +// + +struct hsg_target_state +{ + FILE * header; + FILE * embeds; + FILE * source; +}; + +// +// +// + +void +hsg_target_glsl(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth) +{ + switch (ops->type) + { + case HSG_OP_TYPE_END: + fprintf(target->state->source, + "}\n"); + + if (depth == 0) { + fclose(target->state->source); + target->state->source = NULL; + } + break; + + case HSG_OP_TYPE_BEGIN: + fprintf(target->state->source, + "{\n"); + break; + + case HSG_OP_TYPE_ELSE: + fprintf(target->state->source, + "else\n"); + break; + + case HSG_OP_TYPE_TARGET_BEGIN: + { + // allocate state + target->state = malloc(sizeof(*target->state)); + + // allocate files + fopen_s(&target->state->header,"hs_glsl.h", "wb"); + fopen_s(&target->state->embeds,"hs_kernels.h","wb"); + + hsg_copyright(target->state->header); + hsg_copyright(target->state->embeds); + + // initialize header + uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps)); + + fprintf(target->state->header, + "#ifndef HS_GLSL_ONCE \n" + "#define HS_GLSL_ONCE \n" + " \n" + "#define HS_SLAB_THREADS_LOG2 %u \n" + "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n" + "#define HS_SLAB_WIDTH_LOG2 %u \n" + "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n" + "#define HS_SLAB_HEIGHT %u \n" + "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n" + "#define HS_REG_LAST(c) c##%u \n" + "#define HS_KEY_TYPE %s \n" + "#define HS_KEY_WORDS %u \n" + "#define HS_VAL_WORDS 0 \n" + "#define HS_BS_SLABS %u \n" + "#define HS_BS_SLABS_LOG2_RU %u \n" + "#define HS_BC_SLABS_LOG2_MAX %u \n" + "#define HS_FM_SCALE_MIN %u \n" + "#define HS_FM_SCALE_MAX %u \n" + "#define HS_HM_SCALE_MIN %u \n" + "#define HS_HM_SCALE_MAX %u \n" + "#define HS_EMPTY \n" + " \n", + config->warp.lanes_log2, + config->warp.lanes_log2, + config->thread.regs, + config->thread.regs, + (config->type.words == 2) ? "uint64_t" : "uint32_t", + config->type.words, + merge->warps, + msb_idx_u32(pow2_ru_u32(merge->warps)), + bc_max, + config->merge.flip.lo, + config->merge.flip.hi, + config->merge.half.lo, + config->merge.half.hi); + + fprintf(target->state->header, + "#define HS_SLAB_ROWS() \\\n"); + + for (uint32_t ii=1; ii<=config->thread.regs; ii++) + fprintf(target->state->header, + " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + + fprintf(target->state->header, + "#define HS_TRANSPOSE_SLAB() \\\n"); + + for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++) + fprintf(target->state->header, + " HS_TRANSPOSE_STAGE( %u ) \\\n",ii); + + struct hsg_transpose_state state[1] = + { + { .header = target->state->header, + .config = config + } + }; + + hsg_transpose(config->warp.lanes_log2, + config->thread.regs, + hsg_transpose_blend,state, + hsg_transpose_remap,state); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + +#if 0 + fprintf(target->state->source, + "#include <hs_glsl_macros.h> \n" + " \n" + "// \n" + "// \n" + "// \n"); +#endif + } + break; + + case HSG_OP_TYPE_TARGET_END: + // decorate the files + fprintf(target->state->header, + "#endif \n" + " \n" + "// \n" + "// \n" + "// \n" + " \n"); + + // close files + fclose(target->state->header); + fclose(target->state->embeds); + + // free state + free(target->state); + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO: + { + fprintf(target->state->embeds, + "#include \"hs_transpose.len.xxd\"\n,\n" + "#include \"hs_transpose.spv.xxd\"\n,\n"); + + fopen_s(&target->state->source,"hs_transpose.comp","w+"); + + hsg_copyright(target->state->source); + + hsg_macros(target->state->source); + + fprintf(target->state->source, + "HS_TRANSPOSE_KERNEL_PROTO(%u)\n", + config->warp.lanes); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE: + { + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY: + { + fprintf(target->state->source, + "HS_TRANSPOSE_SLAB()\n"); + } + break; + + case HSG_OP_TYPE_BS_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const bs = pow2_ru_u32(m->warps); + uint32_t const msb = msb_idx_u32(bs); + + fprintf(target->state->embeds, + "#include \"hs_bs_%u.len.xxd\"\n,\n" + "#include \"hs_bs_%u.spv.xxd\"\n,\n", + msb, + msb); + + char filename[] = { "hs_bs_123.comp" }; + sprintf_s(filename,sizeof(filename),"hs_bs_%u.comp",msb); + fopen_s(&target->state->source,filename,"w+"); + + hsg_copyright(target->state->source); + + hsg_macros(target->state->source); + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bs); + } + + fprintf(target->state->source, + "HS_BS_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BS_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + break; + + case HSG_OP_TYPE_BC_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const msb = msb_idx_u32(m->warps); + + fprintf(target->state->embeds, + "#include \"hs_bc_%u.len.xxd\"\n,\n" + "#include \"hs_bc_%u.spv.xxd\"\n,\n", + msb, + msb); + + char filename[] = { "hs_bc_123.comp" }; + sprintf_s(filename,sizeof(filename),"hs_bc_%u.comp",msb); + fopen_s(&target->state->source,filename,"w+"); + + hsg_copyright(target->state->source); + + hsg_macros(target->state->source); + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bc); + } + + fprintf(target->state->source, + "HS_BC_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BC_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + break; + + case HSG_OP_TYPE_FM_KERNEL_PROTO: + { + fprintf(target->state->embeds, + "#include \"hs_fm_%u_%u.len.xxd\"\n,\n" + "#include \"hs_fm_%u_%u.spv.xxd\"\n,\n", + ops->a,ops->b, + ops->a,ops->b); + + char filename[] = { "hs_fm_123_123.comp" }; + sprintf_s(filename,sizeof(filename),"hs_fm_%u_%u.comp",ops->a,ops->b); + fopen_s(&target->state->source,filename,"w+"); + + hsg_copyright(target->state->source); + + hsg_macros(target->state->source); + + fprintf(target->state->source, + "HS_FM_KERNEL_PROTO(%u,%u)\n", + ops->a,ops->b); + } + break; + + case HSG_OP_TYPE_FM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_FM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_HM_KERNEL_PROTO: + { + fprintf(target->state->embeds, + "#include \"hs_hm_%u_%u.len.xxd\"\n,\n" + "#include \"hs_hm_%u_%u.spv.xxd\"\n,\n", + ops->a,ops->b, + ops->a,ops->b); + + char filename[] = { "hs_hm_123_123.comp" }; + sprintf_s(filename,sizeof(filename),"hs_hm_%u_%u.comp",ops->a,ops->b); + fopen_s(&target->state->source,filename,"w+"); + + hsg_copyright(target->state->source); + + hsg_macros(target->state->source); + + fprintf(target->state->source, + "HS_HM_KERNEL_PROTO(%u)\n", + ops->a); + } + break; + + case HSG_OP_TYPE_HM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_HM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD: + { + static char const * const vstr[] = { "vin", "vout" }; + + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n", + ops->n,vstr[ops->v],config->warp.lanes,ops->n-1); + } + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n", + config->warp.lanes,ops->n-1,ops->n); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT: + fprintf(target->state->source, + "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED: + { + if (ops->a <= ops->b) + { + fprintf(target->state->source, + "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n"); + } + else if (ops->b > 1) + { + fprintf(target->state->source, + "else if (fm_frac == %u)\n", + ops->b); + } + else + { + fprintf(target->state->source, + "else\n"); + } + } + break; + + case HSG_OP_TYPE_SLAB_FLIP: + fprintf(target->state->source, + "HS_SLAB_FLIP_PREAMBLE(%u);\n", + ops->n-1); + break; + + case HSG_OP_TYPE_SLAB_HALF: + fprintf(target->state->source, + "HS_SLAB_HALF_PREAMBLE(%u);\n", + ops->n / 2); + break; + + case HSG_OP_TYPE_CMP_FLIP: + fprintf(target->state->source, + "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c); + break; + + case HSG_OP_TYPE_CMP_HALF: + fprintf(target->state->source, + "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b); + break; + + case HSG_OP_TYPE_CMP_XCHG: + if (ops->c == UINT32_MAX) + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%-3u,r%-3u);\n", + ops->a,ops->b); + } + else + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%u_%u,r%u_%u);\n", + ops->c,ops->a,ops->c,ops->b); + } + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_V: + fprintf(target->state->source, + "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n", + merge[ops->a].warps,config->warp.lanes,ops->c,ops->b); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,merge[ops->a].warps,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,ops->a,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n", + ops->c, + ops->a, + config->warp.lanes,ops->b); + break; + + case HSG_OP_TYPE_BLOCK_SYNC: + fprintf(target->state->source, + "HS_BLOCK_BARRIER();\n"); + // + // FIXME - Named barriers to allow coordinating warps to proceed? + // + break; + + case HSG_OP_TYPE_BS_FRAC_PRED: + { + if (ops->m == 0) + { + fprintf(target->state->source, + "if (warp_idx < bs_full)\n"); + } + else + { + fprintf(target->state->source, + "else if (bs_frac == %u)\n", + ops->w); + } + } + break; + + case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n", + config->warp.lanes,m->warps); + } + break; + + case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n", + config->warp.lanes,config->thread.regs,m->warps); + } + break; + + case HSG_OP_TYPE_BX_MERGE_H_PRED: + fprintf(target->state->source, + "if (get_sub_group_id() < %u)\n", + ops->a); + break; + + case HSG_OP_TYPE_BS_ACTIVE_PRED: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps <= 32) + { + fprintf(target->state->source, + "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n", + m->levels[ops->b].active.b32a2[0]); + } + else + { + fprintf(target->state->source, + "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n", + m->levels[ops->b].active.b32a2[1], + m->levels[ops->b].active.b32a2[0]); + } + } + break; + + default: + fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]); + exit(EXIT_FAILURE); + break; + } +} + +// +// +// diff --git a/src/compute/hs/gen/target_igp_genx.c b/src/compute/hs/gen/target_igp_genx.c deleted file mode 100644 index 3d0f2bc1b8..0000000000 --- a/src/compute/hs/gen/target_igp_genx.c +++ /dev/null @@ -1,672 +0,0 @@ -/* - * Copyright 2016 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#include <stdio.h> - -// -// -// - -#include "gen.h" -#include "util.h" -#include "macros.h" -#include "transpose.h" - -// -// -// - -static -char -hsg_transpose_reg_prefix(uint32_t const cols_log2) -{ - return 'a' + (('r' + cols_log2 - 'a') % 26); -} - -static -void -hsg_transpose_blend(uint32_t const cols_log2, - uint32_t const row_ll, // lower-left - uint32_t const row_ur, // upper-right - FILE * file) -{ - // we're starting register names at '1' for now - fprintf(file, - " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n", - hsg_transpose_reg_prefix(cols_log2-1), - hsg_transpose_reg_prefix(cols_log2), - cols_log2,row_ll+1,row_ur+1); -} - -static -void -hsg_transpose_remap(uint32_t const row_from, - uint32_t const row_to, - FILE * file) -{ - // we're starting register names at '1' for now - fprintf(file, - " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n", - hsg_transpose_reg_prefix(msb_idx_u32(hsg_config.warp.lanes)), - row_from+1,row_to+1); -} - -// -// -// - -void -hsg_target_igp_genx(struct hsg_file * const files, - struct hsg_merge const * const merge, - struct hsg_op const * const ops, - uint32_t const depth) -{ - switch (ops->type) - { - case HSG_OP_TYPE_END: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "}\n"); - break; - - case HSG_OP_TYPE_BEGIN: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "{\n"); - break; - - case HSG_OP_TYPE_ELSE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else\n"); - break; - - case HSG_OP_TYPE_FILE_HEADER: - { - uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps)); - uint32_t const warp_lanes_log2 = msb_idx_u32(hsg_config.warp.lanes); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "// \n" - "// Copyright 2016 Google Inc. \n" - "// \n" - "// Use of this source code is governed by a BSD-style \n" - "// license that can be found in the LICENSE file. \n" - "// \n" - " \n" - "#ifndef HS_CL_ONCE \n" - "#define HS_CL_ONCE \n" - " \n" - "#define HS_LANES_PER_WARP_LOG2 %u \n" - "#define HS_LANES_PER_WARP (1 << HS_LANES_PER_WARP_LOG2) \n" - "#define HS_BS_WARPS %u \n" - "#define HS_BS_WARPS_LOG2_RU %u \n" - "#define HS_BC_WARPS_LOG2_MAX %u \n" - "#define HS_FM_BLOCKS_LOG2_MIN %u \n" - "#define HS_HM_BLOCKS_LOG2_MIN %u \n" - "#define HS_KEYS_PER_LANE %u \n" - "#define HS_REG_LAST(c) c##%u \n" - "#define HS_KEY_WORDS %u \n" - "#define HS_KEY_TYPE %s \n" - "#define HS_EMPTY \n" - " \n", - warp_lanes_log2, - merge->warps, - msb_idx_u32(pow2_ru_u32(merge->warps)), - bc_max, - hsg_config.merge.flip.lo, - hsg_config.merge.half.lo, - hsg_config.thread.regs, - hsg_config.thread.regs, - hsg_config.type.words, - (hsg_config.type.words == 2) ? "ulong" : "uint"); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "#define HS_SLAB_ROWS() \\\n"); - - for (uint32_t ii=1; ii<=hsg_config.thread.regs; ii++) - fprintf(files[HSG_FILE_TYPE_HEADER].file, - " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - " HS_EMPTY\n" - " \n"); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "#define HS_TRANSPOSE_SLAB() \\\n"); - - for (uint32_t ii=1; ii<=warp_lanes_log2; ii++) - fprintf(files[HSG_FILE_TYPE_HEADER].file, - " HS_TRANSPOSE_STAGE( %u ) \\\n",ii); - - hsg_transpose(msb_idx_u32(hsg_config.warp.lanes), - hsg_config.thread.regs, - files[HSG_FILE_TYPE_HEADER].file, - files[HSG_FILE_TYPE_HEADER].file, - hsg_transpose_blend, - hsg_transpose_remap); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - " HS_EMPTY\n" - " \n"); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "// \n" - "// Copyright 2016 Google Inc. \n" - "// \n" - "// Use of this source code is governed by a BSD-style \n" - "// license that can be found in the LICENSE file. \n" - "// \n" - " \n" - "#include <%s_macros.h> \n" - " \n" - "// \n" - "// \n" - "// \n", - files[HSG_FILE_TYPE_SOURCE].prefix); - } - break; - - case HSG_OP_TYPE_FILE_FOOTER: - fprintf(files[HSG_FILE_TYPE_HEADER].file, - " \n" - "#endif \n" - " \n" - "// \n" - "// \n" - "// \n" - " \n"); - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "// \n" - "// \n" - "// \n" - " \n"); - break; - - case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "__kernel \n" - "__attribute__((intel_reqd_sub_group_size(%u))) \n" - "void hs_kernel_transpose(__global HS_KEY_TYPE * const restrict vout) \n", - hsg_config.warp.lanes); - } - break; - - case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const global_id = get_global_id(0); \n" - "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes * hsg_config.thread.regs, - hsg_config.warp.lanes-1); - } - break; - - case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_TRANSPOSE_SLAB()\n"); - } - break; - - case HSG_OP_TYPE_BS_KERNEL_PROTO: - { - struct hsg_merge const * const m = merge + ops->a; - - uint32_t const tpb = m->warps * hsg_config.warp.lanes; - uint32_t const bs = pow2_ru_u32(m->warps); - uint32_t const msb = msb_idx_u32(bs); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "__kernel \n" - "__attribute__((reqd_work_group_size(%u,1,1))) \n" - "__attribute__((intel_reqd_sub_group_size(%u))) \n" - "void hs_kernel_bs_%u(__global HS_KEY_TYPE const * const restrict vin, \n" - " __global HS_KEY_TYPE * const restrict vout) \n", - tpb, - hsg_config.warp.lanes, - msb); - } - break; - - case HSG_OP_TYPE_BS_KERNEL_PREAMBLE: - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "__local union { \n"); - - struct hsg_merge const * const m = merge + ops->a; - - if (m->warps > 1) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " HS_KEY_TYPE m[%u * %u];\n", - m->rows_bs, - m->warps * hsg_config.warp.lanes); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "} shared; \n" - " \n"); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const global_id = get_global_id(0); \n" - "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes * hsg_config.thread.regs, - hsg_config.warp.lanes-1); - } - break; - - case HSG_OP_TYPE_BC_KERNEL_PROTO: - { - uint32_t const bc_max = pow2_rd_u32(merge[0].warps); - uint32_t const tpb = bc_max * hsg_config.warp.lanes; - uint32_t const msb = msb_idx_u32(merge[ops->a].warps); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "__kernel \n" - "__attribute__((intel_reqd_sub_group_size(%u))) \n" - "void hs_kernel_bc_%u(__global HS_KEY_TYPE * const restrict vout) \n", - hsg_config.warp.lanes,msb); - } - break; - - case HSG_OP_TYPE_BC_KERNEL_PREAMBLE: - { - struct hsg_merge const * const m = merge + ops->a; - uint32_t const bc_max = pow2_rd_u32(merge[0].warps); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "__local union { \n"); - - if (m->warps > 1) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " HS_KEY_TYPE m[%-3u * %u];\n", - m->rows_bc, - m->warps * hsg_config.warp.lanes); - } - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "} shared; \n" - " \n"); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const global_id = get_global_id(0); \n" - "uint const gmem_idx = (global_id / %u) * %u + (global_id & %u); \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes * hsg_config.thread.regs, - hsg_config.warp.lanes-1); - } - break; - - case HSG_OP_TYPE_FM_KERNEL_PROTO: - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "#define HS_FM_BLOCKS_LOG2_%-2u %u \n", - ops->a,ops->b); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "__kernel \n" - "__attribute__((intel_reqd_sub_group_size(%u))) \n" - "void hs_kernel_fm_%u(__global HS_KEY_TYPE * const restrict vout, \n" - " uint const fm_full, \n" - " uint const fm_frac) \n", - hsg_config.warp.lanes,ops->a); - break; - - case HSG_OP_TYPE_FM_KERNEL_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const global_id = (uint)get_global_id(0); \n" - "uint const warp_idx = global_id / %u; \n" - "uint const warp_lane_idx = global_id & %u; \n" - " \n" - "uint const merge_idx = warp_idx / %u >> %u; \n" - " \n" - "uint const merge_stride = %u * %u << %u; \n" - "uint const merge_keys = merge_stride * %u; \n" - " \n" - "uint const merge_base = merge_idx * merge_keys; \n" - " \n" - "uint const merge_l_off = (warp_idx - merge_idx * (%u << %u)) * %u + warp_lane_idx; \n" - "uint const merge_l_end = merge_stride * (%u / 2 - 1) + merge_l_off; \n" - " \n" - "int const merge_r_off = merge_keys - merge_l_end - 1; \n" - " \n" - "__global HS_KEY_TYPE * const restrict merge_l = vout + (merge_base + merge_l_off); \n" - "__global HS_KEY_TYPE * const restrict merge_r = vout + (merge_base + merge_r_off); \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes-1, - hsg_config.thread.regs,ops->b, - hsg_config.thread.regs,hsg_config.warp.lanes,ops->b, - ops->a, - hsg_config.thread.regs,ops->b,hsg_config.warp.lanes, - ops->a); - break; - - case HSG_OP_TYPE_HM_KERNEL_PROTO: - { - uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge[0].warps)); - - fprintf(files[HSG_FILE_TYPE_HEADER].file, - "#define HS_HM_BLOCKS_LOG2_%-2u %u \n", - ops->a,ops->b); - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - " \n" - "__kernel \n" - "__attribute__((intel_reqd_sub_group_size(%u))) \n" - "void hs_kernel_hm_%u(__global HS_KEY_TYPE * const restrict vout) \n", - hsg_config.warp.lanes,ops->a); - } - break; - - case HSG_OP_TYPE_HM_KERNEL_PREAMBLE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const global_id = (uint)get_global_id(0); \n" - "uint const warp_idx = global_id / %u; \n" - "uint const warp_lane_idx = global_id & %u; \n" - " \n" - "uint const merge_idx = (warp_idx / %u) >> %u; \n" - " \n" - "uint const merge_stride = %u * %u << %u; \n" - "uint const merge_keys = merge_stride * %u; \n" - " \n" - "uint const merge_base = merge_idx * merge_keys; \n" - "uint const merge_off = (warp_idx - merge_idx * (%u << %u)) * %u; \n" - " \n" - "__global HS_KEY_TYPE * const restrict merge_ptr = vout + (merge_base + merge_off + warp_lane_idx); \n" - " \n", - hsg_config.warp.lanes, - hsg_config.warp.lanes-1, - hsg_config.thread.regs,ops->b, - hsg_config.thread.regs,hsg_config.warp.lanes,ops->b, - ops->a, - hsg_config.thread.regs,ops->b,hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD: - { - static char const * const vstr[] = { "vin", "vout" }; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%-3u = (%s + gmem_idx)[%-3u * %u]; \n", - ops->n,vstr[ops->v],ops->n-1,hsg_config.warp.lanes); - } - break; - - case HSG_OP_TYPE_BX_REG_GLOBAL_STORE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "(vout + gmem_idx)[%-3u * %u] = r%u; \n", - ops->n-1,hsg_config.warp.lanes,ops->n); - break; - - case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%-3u = merge_ptr[%-3u * merge_stride];\n", - ops->a,ops->b); - break; - - case HSG_OP_TYPE_HM_REG_GLOBAL_STORE: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_ptr[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%-3u = merge_l[%-3u * merge_stride];\n", - ops->a,ops->b); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_l[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%-3u = merge_r[%-3u * merge_stride];\n", - ops->a,ops->b); - break; - - case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "merge_r[%-3u * merge_stride] = r%u;\n", - ops->b,ops->a); - break; - - case HSG_OP_TYPE_WARP_FLIP: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const flip_lane_mask = %u; \n" - "uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask; \n" - "int const t_lt = get_sub_group_local_id() < flip_lane_idx; \n", - ops->n-1); - break; - - case HSG_OP_TYPE_WARP_HALF: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const half_lane_mask = %u; \n" - "uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask; \n" - "int const t_lt = get_sub_group_local_id() < half_lane_idx; \n", - ops->n / 2); - break; - - case HSG_OP_TYPE_CMP_FLIP: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_FLIP(%-3u,r%-3u,r%-3u)\n",ops->a,ops->b,ops->c); - break; - - case HSG_OP_TYPE_CMP_HALF: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_HALF(%-3u,r%-3u)\n",ops->a,ops->b); - break; - - case HSG_OP_TYPE_CMP_XCHG: - if (ops->c == UINT32_MAX) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_XCHG(r%-3u,r%-3u)\n", - ops->a,ops->b); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_CMP_XCHG(r%u_%u,r%u_%u)\n", - ops->c,ops->a,ops->c,ops->b); - } - break; - - case HSG_OP_TYPE_BS_REG_SHARED_STORE_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "(shared.m + get_local_id(0))[%-3u * %-2u * %-3u] = r%u;\n", - merge[ops->a].warps,hsg_config.warp.lanes,ops->c,ops->b); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "r%-3u = (shared.m + get_local_id(0))[%-3u * %-2u * %-3u];\n", - ops->b,merge[ops->a].warps,hsg_config.warp.lanes,ops->c); - break; - - case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%-3u = (shared.m + get_local_id(0))[%-3u * %-2u * %-3u];\n", - ops->b,ops->a,hsg_config.warp.lanes,ops->c); - break; - - case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "(shared.m + smem_l_idx)[%5u] = r%u_%u;\n", - ops->b * hsg_config.warp.lanes, - ops->c, - ops->a); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "(shared.m + smem_r_idx)[%5u] = r%u_%u;\n", - ops->b * hsg_config.warp.lanes, - ops->c, - ops->a); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%u_%-3u = (shared.m + smem_l_idx)[%u];\n", - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%u_%-3u = (shared.m + smem_r_idx)[%u];\n", - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "HS_KEY_TYPE r%u_%-3u = (vout + gmem_l_idx)[%u];\n", - ops->c, - ops->a, - ops->b * hsg_config.warp.lanes); - break; - - case HSG_OP_TYPE_BLOCK_SYNC: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "barrier(CLK_LOCAL_MEM_FENCE);\n"); // OpenCL 2.0+: work_group_barrier - break; - - case HSG_OP_TYPE_BS_FRAC_PRED: - { - if (ops->m == 0) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (warp_idx < bs_full)\n"); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else if (bs_frac == %u)\n", - ops->w); - } - } - break; - - case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE: - { - struct hsg_merge const * const m = merge + ops->a; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n" - "uint const smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n", - m->warps * hsg_config.warp.lanes, - m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1); -#if 0 - if (ops->b == true) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n" - "uint smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n", - m->warps * hsg_config.warp.lanes, - m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1); - } - else // update - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n" - "smem_r_idx = (get_sub_group_id() ^ 1) * %u + (get_sub_group_local_id() ^ %u); \n", - m->warps * hsg_config.warp.lanes, - m->warps * hsg_config.warp.lanes, hsg_config.warp.lanes-1); - } -#endif - } - break; - - case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE: - { - struct hsg_merge const * const m = merge + ops->a; - uint32_t const b = m->warps * hsg_config.warp.lanes; - uint32_t const k = b * hsg_config.thread.regs; - - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "uint const gmem_l_idx = (global_id / %u) * %u + (global_id & %u); \n" - "uint const smem_l_idx = get_sub_group_id() * %u + get_sub_group_local_id(); \n", - b,k,b-1, - b); - - } - break; - - case HSG_OP_TYPE_BX_MERGE_H_PRED: - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (get_sub_group_id() < %u)\n", - ops->a); - break; - - case HSG_OP_TYPE_BS_ACTIVE_PRED: - { - struct hsg_merge const * const m = merge + ops->a; - - if (m->warps <= 32) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n", - m->levels[ops->b].active.b32a2[0]); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n", - m->levels[ops->b].active.b32a2[1], - m->levels[ops->b].active.b32a2[0]); - } - } - break; - - case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED: - { - if (ops->a == ops->b) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "if (merge_idx < fm_full) \n"); - } - else if (ops->b > 1) - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else if (fm_frac == %u) \n", - ops->b); - } - else - { - fprintf(files[HSG_FILE_TYPE_SOURCE].file, - "else\n"); - } - } - break; - - default: - hsg_target_debug(files,merge,ops,depth); - break; - } -} - -// -// -// diff --git a/src/compute/hs/gen/target_opencl.c b/src/compute/hs/gen/target_opencl.c new file mode 100644 index 0000000000..fe7343ba5d --- /dev/null +++ b/src/compute/hs/gen/target_opencl.c @@ -0,0 +1,600 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#include <stdio.h> +#include <stdlib.h> + +// +// +// + +#include "gen.h" +#include "transpose.h" + +#include "common/util.h" +#include "common/macros.h" + +// +// +// + +struct hsg_transpose_state +{ + FILE * header; + struct hsg_config const * config; +}; + +static +char +hsg_transpose_reg_prefix(uint32_t const cols_log2) +{ + return 'a' + (('r' + cols_log2 - 'a') % 26); +} + +static +void +hsg_transpose_blend(uint32_t const cols_log2, + uint32_t const row_ll, // lower-left + uint32_t const row_ur, // upper-right + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(cols_log2-1), + hsg_transpose_reg_prefix(cols_log2), + cols_log2,row_ll+1,row_ur+1); +} + +static +void +hsg_transpose_remap(uint32_t const row_from, + uint32_t const row_to, + struct hsg_transpose_state * const state) +{ + // we're starting register names at '1' for now + fprintf(state->header, + " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n", + hsg_transpose_reg_prefix(state->config->warp.lanes_log2), + row_from+1,row_to+1); +} + +// +// +// + +static +void +hsg_copyright(FILE * file) +{ + fprintf(file, + "// \n" + "// Copyright 2016 Google Inc. \n" + "// \n" + "// Use of this source code is governed by a BSD-style \n" + "// license that can be found in the LICENSE file. \n" + "// \n" + "\n"); +} + +// +// +// + +struct hsg_target_state +{ + FILE * header; + FILE * source; +}; + +// +// +// + +void +hsg_target_opencl(struct hsg_target * const target, + struct hsg_config const * const config, + struct hsg_merge const * const merge, + struct hsg_op const * const ops, + uint32_t const depth) +{ + switch (ops->type) + { + case HSG_OP_TYPE_END: + fprintf(target->state->source, + "}\n"); + break; + + case HSG_OP_TYPE_BEGIN: + fprintf(target->state->source, + "{\n"); + break; + + case HSG_OP_TYPE_ELSE: + fprintf(target->state->source, + "else\n"); + break; + + case HSG_OP_TYPE_TARGET_BEGIN: + { + // allocate state + target->state = malloc(sizeof(*target->state)); + + // allocate files + fopen_s(&target->state->header,"hs_cl.h", "wb"); + fopen_s(&target->state->source,"hs_cl.cl","wb"); + + // initialize header + uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps)); + + hsg_copyright(target->state->header); + + fprintf(target->state->header, + "#ifndef HS_CL_ONCE \n" + "#define HS_CL_ONCE \n" + " \n" + "#define HS_SLAB_THREADS_LOG2 %u \n" + "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n" + "#define HS_SLAB_WIDTH_LOG2 %u \n" + "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n" + "#define HS_SLAB_HEIGHT %u \n" + "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n" + "#define HS_REG_LAST(c) c##%u \n" + "#define HS_KEY_TYPE %s \n" + "#define HS_KEY_WORDS %u \n" + "#define HS_VAL_WORDS 0 \n" + "#define HS_BS_SLABS %u \n" + "#define HS_BS_SLABS_LOG2_RU %u \n" + "#define HS_BC_SLABS_LOG2_MAX %u \n" + "#define HS_FM_SCALE_MIN %u \n" + "#define HS_FM_SCALE_MAX %u \n" + "#define HS_HM_SCALE_MIN %u \n" + "#define HS_HM_SCALE_MAX %u \n" + "#define HS_EMPTY \n" + " \n", + config->warp.lanes_log2, + config->warp.lanes_log2, + config->thread.regs, + config->thread.regs, + (config->type.words == 2) ? "ulong" : "uint", + config->type.words, + merge->warps, + msb_idx_u32(pow2_ru_u32(merge->warps)), + bc_max, + config->merge.flip.lo, + config->merge.flip.hi, + config->merge.half.lo, + config->merge.half.hi); + + fprintf(target->state->header, + "#define HS_SLAB_ROWS() \\\n"); + + for (uint32_t ii=1; ii<=config->thread.regs; ii++) + fprintf(target->state->header, + " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + + fprintf(target->state->header, + "#define HS_TRANSPOSE_SLAB() \\\n"); + + for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++) + fprintf(target->state->header, + " HS_TRANSPOSE_STAGE( %u ) \\\n",ii); + + struct hsg_transpose_state state[1] = + { + { .header = target->state->header, + .config = config + } + }; + + hsg_transpose(config->warp.lanes_log2, + config->thread.regs, + hsg_transpose_blend,state, + hsg_transpose_remap,state); + + fprintf(target->state->header, + " HS_EMPTY\n" + " \n"); + + hsg_copyright(target->state->source); + + fprintf(target->state->source, + "#include \"hs_cl_macros.h\" \n" + " \n" + "// \n" + "// \n" + "// \n"); + } + break; + + case HSG_OP_TYPE_TARGET_END: + // decorate the files + fprintf(target->state->header, + "#endif \n" + " \n" + "// \n" + "// \n" + "// \n" + " \n"); + fprintf(target->state->source, + " \n" + "// \n" + "// \n" + "// \n" + " \n"); + + // close files + fclose(target->state->header); + fclose(target->state->source); + + // free state + free(target->state); + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO: + { + fprintf(target->state->source, + "\nHS_TRANSPOSE_KERNEL_PROTO(%u)\n", + config->warp.lanes); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE: + { + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY: + { + fprintf(target->state->source, + "HS_TRANSPOSE_SLAB()\n"); + } + break; + + case HSG_OP_TYPE_BS_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const bs = pow2_ru_u32(m->warps); + uint32_t const msb = msb_idx_u32(bs); + + fprintf(target->state->source, + "\nHS_BS_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BS_KERNEL_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bs); + } + + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_BC_KERNEL_PROTO: + { + struct hsg_merge const * const m = merge + ops->a; + + uint32_t const msb = msb_idx_u32(m->warps); + + fprintf(target->state->source, + "\nHS_BC_KERNEL_PROTO(%u,%u,%u)\n", + config->warp.lanes,m->warps,msb); + } + break; + + case HSG_OP_TYPE_BC_KERNEL_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps > 1) + { + fprintf(target->state->source, + "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n", + m->warps * config->warp.lanes, + m->rows_bc); + } + + fprintf(target->state->source, + "HS_SLAB_GLOBAL_PREAMBLE(%u,%u);\n", + config->warp.lanes,config->thread.regs); + } + break; + + case HSG_OP_TYPE_FM_KERNEL_PROTO: + fprintf(target->state->source, + "\nHS_FM_KERNEL_PROTO(%u,%u)\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_FM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_HM_KERNEL_PROTO: + { + fprintf(target->state->source, + "\nHS_HM_KERNEL_PROTO(%u)\n", + ops->a); + } + break; + + case HSG_OP_TYPE_HM_KERNEL_PREAMBLE: + fprintf(target->state->source, + "HS_HM_PREAMBLE(%u);\n", + ops->a); + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD: + { + static char const * const vstr[] = { "vin", "vout" }; + + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u,%u);\n", + ops->n,vstr[ops->v],config->warp.lanes,ops->n-1); + } + break; + + case HSG_OP_TYPE_BX_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_SLAB_GLOBAL_STORE(%u,%u,r%u);\n", + config->warp.lanes,ops->n-1,ops->n); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_HM_REG_GLOBAL_STORE: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT: + fprintf(target->state->source, + "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n", + ops->b,ops->a); + break; + + case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT: + fprintf(target->state->source, + "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n", + ops->a,ops->b); + break; + + case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED: + { + if (ops->a <= ops->b) + { + fprintf(target->state->source, + "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n"); + } + else if (ops->b > 1) + { + fprintf(target->state->source, + "else if (fm_frac == %u)\n", + ops->b); + } + else + { + fprintf(target->state->source, + "else\n"); + } + } + break; + + case HSG_OP_TYPE_SLAB_FLIP: + fprintf(target->state->source, + "HS_SLAB_FLIP_PREAMBLE(%u);\n", + ops->n-1); + break; + + case HSG_OP_TYPE_SLAB_HALF: + fprintf(target->state->source, + "HS_SLAB_HALF_PREAMBLE(%u);\n", + ops->n / 2); + break; + + case HSG_OP_TYPE_CMP_FLIP: + fprintf(target->state->source, + "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c); + break; + + case HSG_OP_TYPE_CMP_HALF: + fprintf(target->state->source, + "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b); + break; + + case HSG_OP_TYPE_CMP_XCHG: + if (ops->c == UINT32_MAX) + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%-3u,r%-3u);\n", + ops->a,ops->b); + } + else + { + fprintf(target->state->source, + "HS_CMP_XCHG(r%u_%u,r%u_%u);\n", + ops->c,ops->a,ops->c,ops->b); + } + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_V: + fprintf(target->state->source, + "HS_BX_LOCAL_V(%-3u * %-2u * %-3u) = r%u;\n", + merge[ops->a].warps,config->warp.lanes,ops->c,ops->b); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,merge[ops->a].warps,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V: + fprintf(target->state->source, + "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * %-2u * %-3u);\n", + ops->b,ops->a,config->warp.lanes,ops->c); + break; + + case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT: + fprintf(target->state->source, + "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n", + ops->b * config->warp.lanes, + ops->c, + ops->a); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n", + ops->c, + ops->a, + ops->b * config->warp.lanes); + break; + + case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT: + fprintf(target->state->source, + "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u,%u);\n", + ops->c, + ops->a, + config->warp.lanes,ops->b); + break; + + case HSG_OP_TYPE_BLOCK_SYNC: + fprintf(target->state->source, + "HS_BLOCK_BARRIER();\n"); + // + // FIXME - Named barriers to allow coordinating warps to proceed? + // + break; + + case HSG_OP_TYPE_BS_FRAC_PRED: + { + if (ops->m == 0) + { + fprintf(target->state->source, + "if (warp_idx < bs_full)\n"); + } + else + { + fprintf(target->state->source, + "else if (bs_frac == %u)\n", + ops->w); + } + } + break; + + case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BS_MERGE_H_PREAMBLE(%u,%u);\n", + config->warp.lanes,m->warps); + } + break; + + case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE: + { + struct hsg_merge const * const m = merge + ops->a; + + fprintf(target->state->source, + "HS_BC_MERGE_H_PREAMBLE(%u,%u,%u);\n", + config->warp.lanes,config->thread.regs,m->warps); + } + break; + + case HSG_OP_TYPE_BX_MERGE_H_PRED: + fprintf(target->state->source, + "if (get_sub_group_id() < %u)\n", + ops->a); + break; + + case HSG_OP_TYPE_BS_ACTIVE_PRED: + { + struct hsg_merge const * const m = merge + ops->a; + + if (m->warps <= 32) + { + fprintf(target->state->source, + "if (((1u << get_sub_group_id()) & 0x%08X) != 0)\n", + m->levels[ops->b].active.b32a2[0]); + } + else + { + fprintf(target->state->source, + "if (((1UL << get_sub_group_id()) & 0x%08X%08XL) != 0L)\n", + m->levels[ops->b].active.b32a2[1], + m->levels[ops->b].active.b32a2[0]); + } + } + break; + + default: + fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]); + exit(EXIT_FAILURE); + break; + } +} + +// +// +// diff --git a/src/compute/hs/gen/transpose.c b/src/compute/hs/gen/transpose.c index de15c62631..095f53d330 100644 --- a/src/compute/hs/gen/transpose.c +++ b/src/compute/hs/gen/transpose.c @@ -11,7 +11,7 @@ // #include "transpose.h" -#include "macros.h" +#include "common/macros.h" // // Rows must be an even number. This is enforced elsewhere. @@ -21,19 +21,19 @@ void hsg_transpose(uint32_t const cols_log2, uint32_t const rows, - void * blend, - void * remap, void (*pfn_blend)(uint32_t const cols_log2, uint32_t const row_ll, // lower-left uint32_t const row_ur, // upper-right void * blend), + void * blend, void (*pfn_remap)(uint32_t const row_from, uint32_t const row_to, - void * remap)) + void * remap), + void * remap) { // get mapping array - uint32_t * map_curr = ALLOCA(rows * sizeof(*map_curr)); - uint32_t * map_next = ALLOCA(rows * sizeof(*map_next)); + uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr)); + uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next)); // init the mapping array for (uint32_t ii=0; ii<rows; ii++) @@ -89,35 +89,35 @@ static uint32_t cols; // implicit on SIMD/GPU static void -hsg_debug_remap(uint32_t const row_from, - uint32_t const row_to, - uint32_t * const r) -{ - fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to); - - r[row_to] = row_from; -} - -static -void hsg_debug_blend(uint32_t const cols_log2, uint32_t const row_ll, // lower-left uint32_t const row_ur, // upper-right - uint32_t * m) + uint32_t * b) { fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur); - uint32_t * const ll = ALLOCA(cols * sizeof(*m)); - uint32_t * const ur = ALLOCA(cols * sizeof(*m)); + uint32_t * const ll = ALLOCA(cols * sizeof(*b)); + uint32_t * const ur = ALLOCA(cols * sizeof(*b)); - memcpy(ll,m+row_ll*cols,cols * sizeof(*m)); - memcpy(ur,m+row_ur*cols,cols * sizeof(*m)); + memcpy(ll,b+row_ll*cols,cols * sizeof(*b)); + memcpy(ur,b+row_ur*cols,cols * sizeof(*b)); for (uint32_t ii=0; ii<cols; ii++) - m[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)]; + b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)]; for (uint32_t ii=0; ii<cols; ii++) - m[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii]; + b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii]; +} + +static +void +hsg_debug_remap(uint32_t const row_from, + uint32_t const row_to, + uint32_t * const r) +{ + fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to); + + r[row_to] = row_from; } static @@ -144,23 +144,22 @@ main(int argc, char * argv[]) cols = 1 << cols_log2; - uint32_t * const m = ALLOCA(cols * rows * sizeof(*m)); + uint32_t * const b = ALLOCA(cols * rows * sizeof(*b)); uint32_t * const r = ALLOCA( rows * sizeof(*r)); for (uint32_t rr=0; rr<rows; rr++) { r[rr] = rr; for (uint32_t cc=0; cc<cols; cc++) - m[rr*cols+cc] = cc*rows+rr; + b[rr*cols+cc] = cc*rows+rr; } - hsg_debug_print(rows,m,r); + hsg_debug_print(rows,b,r); hsg_transpose(cols_log2,rows, - m,r, - hsg_debug_blend, - hsg_debug_remap); + hsg_debug_blend,b, + hsg_debug_remap,r); - hsg_debug_print(rows,m,r); + hsg_debug_print(rows,b,r); return 0; } diff --git a/src/compute/hs/gen/transpose.h b/src/compute/hs/gen/transpose.h index 83f6fc4e42..380210970d 100644 --- a/src/compute/hs/gen/transpose.h +++ b/src/compute/hs/gen/transpose.h @@ -23,15 +23,15 @@ void hsg_transpose(uint32_t const cols_log2, uint32_t const rows, - void * blend, - void * remap, void (*pfn_blend)(uint32_t const cols_log2, uint32_t const row_ll, // lower-left uint32_t const row_ur, // upper-right void * blend), + void * blend, void (*pfn_remap)(uint32_t const row_from, uint32_t const row_to, - void * remap)); + void * remap), + void * remap); // // |