aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/compute/hs/cl/bench/main.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/compute/hs/cl/bench/main.c')
-rw-r--r--src/compute/hs/cl/bench/main.c262
1 files changed, 164 insertions, 98 deletions
diff --git a/src/compute/hs/cl/bench/main.c b/src/compute/hs/cl/bench/main.c
index 3b9ef6e1c7..bfa7c1da38 100644
--- a/src/compute/hs/cl/bench/main.c
+++ b/src/compute/hs/cl/bench/main.c
@@ -32,9 +32,10 @@
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
-#include "macros.h"
-#include "assert_cl.h"
-#include "find_cl.h"
+#include "common/macros.h"
+#include "common/cl/assert_cl.h"
+#include "common/cl/find_cl.h"
+
#include "hs_cl_launcher.h"
//
@@ -90,10 +91,10 @@ char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count);
static
char const *
-hs_cpu_sort(void * sorted_h,
- uint32_t const count,
- struct hs_info const * const info,
- double * const cpu_ns)
+hs_cpu_sort(uint32_t const hs_words,
+ void * sorted_h,
+ uint32_t const count,
+ double * const cpu_ns)
{
char const * algo;
@@ -101,7 +102,7 @@ hs_cpu_sort(void * sorted_h,
QueryPerformanceCounter(&t0);
- if (info->words == 1)
+ if (hs_words == 1)
algo = hs_cpu_sort_u32(sorted_h,count);
else
algo = hs_cpu_sort_u64(sorted_h,count);
@@ -117,27 +118,34 @@ hs_cpu_sort(void * sorted_h,
static
bool
-hs_verify_linear(void * sorted_h, void * vout_h, const uint32_t count, struct hs_info const * const info)
+hs_verify_linear(uint32_t const hs_words,
+ void * sorted_h,
+ void * vout_h,
+ uint32_t const count)
{
- return memcmp(sorted_h, vout_h, sizeof(uint32_t) * info->words * count) == 0;
+ return memcmp(sorted_h, vout_h, sizeof(uint32_t) * hs_words * count) == 0;
}
static
void
-hs_transpose_slabs_u32(uint32_t * vout_h, const uint32_t count, struct hs_info const * const info)
+hs_transpose_slabs_u32(uint32_t const hs_words,
+ uint32_t const hs_width,
+ uint32_t const hs_height,
+ uint32_t * vout_h,
+ uint32_t const count)
{
- uint32_t const slab_keys = info->keys * info->lanes;
- size_t const slab_size = sizeof(uint32_t) * info->words * slab_keys;
- uint32_t * const slab = _alloca(slab_size);
+ uint32_t const slab_keys = hs_width * hs_height;
+ size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
+ uint32_t * const slab = ALLOCA_MACRO(slab_size);
uint32_t slab_count = count / slab_keys;
while (slab_count-- > 0)
{
memcpy(slab,vout_h,slab_size);
- for (uint32_t row=0; row<info->keys; row++)
- for (uint32_t col=0; col<info->lanes; col++)
- vout_h[col * info->keys + row] = slab[row * info->lanes + col];
+ for (uint32_t row=0; row<hs_height; row++)
+ for (uint32_t col=0; col<hs_width; col++)
+ vout_h[col * hs_height + row] = slab[row * hs_width + col];
vout_h += slab_keys;
}
@@ -145,20 +153,24 @@ hs_transpose_slabs_u32(uint32_t * vout_h, const uint32_t count, struct hs_info c
static
void
-hs_transpose_slabs_u64(uint64_t * vout_h, const uint32_t count, struct hs_info const * const info)
+hs_transpose_slabs_u64(uint32_t const hs_words,
+ uint32_t const hs_width,
+ uint32_t const hs_height,
+ uint64_t * vout_h,
+ uint32_t const count)
{
- uint32_t const slab_keys = info->keys * info->lanes;
- size_t const slab_size = sizeof(uint32_t) * info->words * slab_keys;
- uint64_t * const slab = _alloca(slab_size);
+ uint32_t const slab_keys = hs_width * hs_height;
+ size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
+ uint64_t * const slab = ALLOCA_MACRO(slab_size);
uint32_t slab_count = count / slab_keys;
while (slab_count-- > 0)
{
memcpy(slab,vout_h,slab_size);
- for (uint32_t row=0; row<info->keys; row++)
- for (uint32_t col=0; col<info->lanes; col++)
- vout_h[col * info->keys + row] = slab[row * info->lanes + col];
+ for (uint32_t row=0; row<hs_height; row++)
+ for (uint32_t col=0; col<hs_width; col++)
+ vout_h[col * hs_height + row] = slab[row * hs_width + col];
vout_h += slab_keys;
}
@@ -166,12 +178,16 @@ hs_transpose_slabs_u64(uint64_t * vout_h, const uint32_t count, struct hs_info c
static
void
-hs_transpose_slabs(void * vout_h, const uint32_t count, struct hs_info const * const info)
+hs_transpose_slabs(uint32_t const hs_words,
+ uint32_t const hs_width,
+ uint32_t const hs_height,
+ void * vout_h,
+ uint32_t const count)
{
- if (info->words == 1)
- hs_transpose_slabs_u32(vout_h,count,info);
+ if (hs_words == 1)
+ hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
else
- hs_transpose_slabs_u64(vout_h,count,info);
+ hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
}
//
@@ -180,18 +196,18 @@ hs_transpose_slabs(void * vout_h, const uint32_t count, struct hs_info const * c
static
void
-hs_debug_u32(
- uint32_t const * vout_h,
- uint32_t const count,
- struct hs_info const * const info)
+hs_debug_u32(uint32_t const hs_width,
+ uint32_t const hs_height,
+ uint32_t const * vout_h,
+ uint32_t const count)
{
- uint32_t const slab = info->keys * info->lanes;
- uint32_t const slabs = (count + slab - 1) / slab;
+ uint32_t const slab_keys = hs_width * hs_height;
+ uint32_t const slabs = (count + slab_keys - 1) / slab_keys;
for (uint32_t ss=0; ss<slabs; ss++) {
fprintf(stderr,"%u\n",ss);
- for (uint32_t cc=0; cc<info->keys; cc++) {
- for (uint32_t rr=0; rr<info->lanes; rr++)
+ for (uint32_t cc=0; cc<hs_height; cc++) {
+ for (uint32_t rr=0; rr<hs_width; rr++)
fprintf(stderr,"%8X ",*vout_h++);
fprintf(stderr,"\n");
}
@@ -200,17 +216,18 @@ hs_debug_u32(
static
void
-hs_debug_u64(uint64_t const * vout_h,
- uint32_t const count,
- struct hs_info const * const info)
+hs_debug_u64(uint32_t const hs_width,
+ uint32_t const hs_height,
+ uint64_t const * vout_h,
+ uint32_t const count)
{
- uint32_t const slab = info->keys * info->lanes;
- uint32_t const slabs = (count + slab - 1) / slab;
+ uint32_t const slab_keys = hs_width * hs_height;
+ uint32_t const slabs = (count + slab_keys - 1) / slab_keys;
for (uint32_t ss=0; ss<slabs; ss++) {
fprintf(stderr,"%u\n",ss);
- for (uint32_t cc=0; cc<info->keys; cc++) {
- for (uint32_t rr=0; rr<info->lanes; rr++)
+ for (uint32_t cc=0; cc<hs_height; cc++) {
+ for (uint32_t rr=0; rr<hs_width; rr++)
fprintf(stderr,"%16llX ",*vout_h++);
fprintf(stderr,"\n");
}
@@ -275,7 +292,10 @@ hs_dummy_kernel_release()
static
void
-hs_dummy_kernel_enqueue(cl_command_queue cq, cl_event * const event)
+hs_dummy_kernel_enqueue(cl_command_queue cq,
+ uint32_t wait_list_size,
+ cl_event const * wait_list,
+ cl_event * event)
{
size_t const global_work_size = 1;
@@ -285,8 +305,8 @@ hs_dummy_kernel_enqueue(cl_command_queue cq, cl_event * const event)
NULL,
&global_work_size,
NULL,
- 0,
- NULL,
+ wait_list_size,
+ wait_list,
event));
}
@@ -298,8 +318,12 @@ static
void
hs_bench(cl_context context,
cl_command_queue cq,
+ cl_command_queue cq_profile,
char const * const device_name,
- struct hs_info const * const info,
+ uint32_t const hs_words,
+ uint32_t const hs_width,
+ uint32_t const hs_height,
+ struct hs_cl const * const hs,
uint32_t const count_lo,
uint32_t const count_hi,
uint32_t const count_step,
@@ -318,14 +342,13 @@ hs_bench(cl_context context,
//
uint32_t count_hi_padded_in, count_hi_padded_out;
- hs_pad(count_hi,&count_hi_padded_in,&count_hi_padded_out);
+ hs_cl_pad(hs,count_hi,&count_hi_padded_in,&count_hi_padded_out);
//
// SIZE
//
- size_t const key_size = sizeof(uint32_t) * info->words;
+ size_t const key_size = sizeof(uint32_t) * hs_words;
- size_t const size_hi = count_hi * key_size;
size_t const size_hi_in = count_hi_padded_in * key_size;
size_t const size_hi_out = count_hi_padded_out * key_size;
@@ -363,7 +386,7 @@ hs_bench(cl_context context,
&cl_err); cl_ok(cl_err);
// fill with random numbers
- hs_fill_rand(random_h,count_hi,info->words);
+ hs_fill_rand(random_h,count_hi,hs_words);
//
// UNMAP
@@ -379,16 +402,14 @@ hs_bench(cl_context context,
// compute padding before sorting
uint32_t count_padded_in, count_padded_out;
- hs_pad(count,&count_padded_in,&count_padded_out);
+ hs_cl_pad(hs,count,&count_padded_in,&count_padded_out);
cl_ulong elapsed_ns_min = ULONG_MAX;
cl_ulong elapsed_ns_max = 0;
cl_ulong elapsed_ns_sum = 0;
-#if 1
- cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
- cl(Finish(cq));
-#endif
+ cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
+ cl(Finish(cq));
for (uint32_t ii=0; ii<warmup+loops; ii++)
{
@@ -410,23 +431,23 @@ hs_bench(cl_context context,
//
// sort vin
//
- cl_event start, end;
+ cl_event start, complete, end;
- hs_dummy_kernel_enqueue(cq,&start);
-
- cl(EnqueueBarrierWithWaitList(cq,0,NULL,NULL));
+ hs_dummy_kernel_enqueue(cq_profile,0,NULL,&start);
// note hs_sort enqueues a final barrier
- hs_sort(cq,
- vin,vout,
- count,
- count_padded_in,
- count_padded_out,
- linearize);
+ hs_cl_sort(hs,
+ cq,
+ 1,&start,&complete,
+ vin,vout,
+ count,
+ count_padded_in,
+ count_padded_out,
+ linearize);
- hs_dummy_kernel_enqueue(cq,&end);
+ hs_dummy_kernel_enqueue(cq_profile,1,&complete,&end);
- cl(Finish(cq));
+ cl(Finish(cq_profile));
//
// measure duration
@@ -439,7 +460,6 @@ hs_bench(cl_context context,
sizeof(cl_ulong),
&t_start,
NULL));
- cl(ReleaseEvent(start));
// end
cl(GetEventProfilingInfo(end,
@@ -447,13 +467,16 @@ hs_bench(cl_context context,
sizeof(cl_ulong),
&t_end,
NULL));
- cl(ReleaseEvent(end));
cl_ulong const t = t_end - t_start;
elapsed_ns_min = MIN_MACRO(elapsed_ns_min,t);
elapsed_ns_max = MAX_MACRO(elapsed_ns_max,t);
elapsed_ns_sum += t;
+
+ cl(ReleaseEvent(start));
+ cl(ReleaseEvent(complete));
+ cl(ReleaseEvent(end));
}
//
@@ -485,27 +508,27 @@ hs_bench(cl_context context,
double cpu_ns;
- char const * const algo = hs_cpu_sort(sorted_h,count_padded_in,info,&cpu_ns);
+ char const * const algo = hs_cpu_sort(hs_words,sorted_h,count_padded_in,&cpu_ns);
//
// EXPLICITLY TRANSPOSE THE CPU SORTED SLABS IF NOT LINEARIZING
//
if (!linearize) {
- hs_transpose_slabs(vout_h,count_padded_in,info);
+ hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
}
//
// VERIFY
//
- bool const verified = hs_verify_linear(sorted_h,vout_h,count_padded_in,info);
+ bool const verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in);
#ifndef NDEBUG
if (!verified)
{
- if (info->words == 1)
- hs_debug_u32(vout_h,count,info);
+ if (hs_words == 1)
+ hs_debug_u32(hs_width,hs_height,vout_h,count);
else // ulong
- hs_debug_u64(vout_h,count,info);
+ hs_debug_u64(hs_width,hs_height,vout_h,count);
}
#endif
@@ -519,7 +542,7 @@ hs_bench(cl_context context,
//
fprintf(stdout,"%s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
device_name,
- (info->words == 1) ? "uint" : "ulong",
+ (hs_words == 1) ? "uint" : "ulong",
linearize ? "linear" : "slab",
verified ? " OK " : "*FAIL*",
count,
@@ -555,8 +578,15 @@ hs_bench(cl_context context,
//
//
+#define HS_TARGET_NAME hs_target
+#include "intel/gen8/u64/hs_target.h"
+
+//
+//
+//
+
int
-main(int argc, char** argv)
+main(int argc, char const * argv[])
{
char const * const target_platform_substring = "Intel";
char const * const target_device_substring = "Graphics";
@@ -601,43 +631,64 @@ main(int argc, char** argv)
//
// create command queue
//
- cl_command_queue_properties const props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
-
#if 0 // OPENCL 2.0
- cl_queue_properties queue_properties[] =
- {
- CL_QUEUE_PROPERTIES, (cl_queue_properties)props,
- 0
- };
+
+ cl_queue_properties props[] = {
+ CL_QUEUE_PROPERTIES,
+ (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+#ifndef NDEBUG
+ (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
+#endif
+ 0
+ };
+
+ cl_queue_properties props_profile[] = {
+ CL_QUEUE_PROPERTIES,
+ (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
+ 0
+ };
cl_command_queue cq = clCreateCommandQueueWithProperties(context,
device_id,
- queue_properties,
+ props,
&cl_err); cl_ok(cl_err);
+
+ cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context,
+ device_id,
+ props_profile,
+ &cl_err); cl_ok(cl_err);
#else // OPENCL 1.2
+
cl_command_queue cq = clCreateCommandQueue(context,
device_id,
- props,
+#ifndef NDEBUG
+ CL_QUEUE_PROFILING_ENABLE |
+#endif
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
&cl_err); cl_ok(cl_err);
+
+ cl_command_queue cq_profile = clCreateCommandQueue(context,
+ device_id,
+ CL_QUEUE_PROFILING_ENABLE,
+ &cl_err); cl_ok(cl_err);
#endif
//
+ // Intel GEN workaround -- create dummy kernel for semi-accurate
+ // profiling on an out-of-order queue.
+ //
+ hs_dummy_kernel_create(context,device_id);
+
+ //
// create kernels
//
fprintf(stdout,"Creating... ");
- struct hs_info info;
-
- hs_create(context,device_id,&info);
+ struct hs_cl * const hs = hs_cl_create(&hs_target,context,device_id);
fprintf(stdout,"done.\n");
//
- // create dummy kernel for profiling
- //
- hs_dummy_kernel_create(context,device_id);
-
- //
//
//
#ifdef NDEBUG
@@ -651,7 +702,7 @@ main(int argc, char** argv)
//
// sort sizes and loops
//
- uint32_t const kpb = info.keys * info.lanes;
+ uint32_t const kpb = hs_target.config.slab.height << hs_target.config.slab.width_log2;
uint32_t const count_lo = (argc <= 1) ? kpb : strtoul(argv[1],NULL,0);
uint32_t const count_hi = (argc <= 2) ? count_lo : strtoul(argv[2],NULL,0);
@@ -663,15 +714,30 @@ main(int argc, char** argv)
//
// benchmark
//
- hs_bench(context,cq,device_name,&info,count_lo,count_hi,count_step,loops,warmup,linearize);
+ hs_bench(context,
+ cq,cq_profile,
+ device_name,
+ hs_target.config.words.key + hs_target.config.words.val,
+ 1 << hs_target.config.slab.width_log2,
+ hs_target.config.slab.height,
+ hs,
+ count_lo,
+ count_hi,
+ count_step,
+ loops,
+ warmup,
+ linearize);
//
// release everything
//
+ hs_cl_release(hs);
+
hs_dummy_kernel_release();
- hs_release();
cl(ReleaseCommandQueue(cq));
+ cl(ReleaseCommandQueue(cq_profile));
+
cl(ReleaseContext(context));
return 0;