diff options
Diffstat (limited to 'src/compute/hs/cl/bench/main.c')
-rw-r--r-- | src/compute/hs/cl/bench/main.c | 262 |
1 files changed, 164 insertions, 98 deletions
diff --git a/src/compute/hs/cl/bench/main.c b/src/compute/hs/cl/bench/main.c index 3b9ef6e1c7..bfa7c1da38 100644 --- a/src/compute/hs/cl/bench/main.c +++ b/src/compute/hs/cl/bench/main.c @@ -32,9 +32,10 @@ #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #endif -#include "macros.h" -#include "assert_cl.h" -#include "find_cl.h" +#include "common/macros.h" +#include "common/cl/assert_cl.h" +#include "common/cl/find_cl.h" + #include "hs_cl_launcher.h" // @@ -90,10 +91,10 @@ char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count); static char const * -hs_cpu_sort(void * sorted_h, - uint32_t const count, - struct hs_info const * const info, - double * const cpu_ns) +hs_cpu_sort(uint32_t const hs_words, + void * sorted_h, + uint32_t const count, + double * const cpu_ns) { char const * algo; @@ -101,7 +102,7 @@ hs_cpu_sort(void * sorted_h, QueryPerformanceCounter(&t0); - if (info->words == 1) + if (hs_words == 1) algo = hs_cpu_sort_u32(sorted_h,count); else algo = hs_cpu_sort_u64(sorted_h,count); @@ -117,27 +118,34 @@ hs_cpu_sort(void * sorted_h, static bool -hs_verify_linear(void * sorted_h, void * vout_h, const uint32_t count, struct hs_info const * const info) +hs_verify_linear(uint32_t const hs_words, + void * sorted_h, + void * vout_h, + uint32_t const count) { - return memcmp(sorted_h, vout_h, sizeof(uint32_t) * info->words * count) == 0; + return memcmp(sorted_h, vout_h, sizeof(uint32_t) * hs_words * count) == 0; } static void -hs_transpose_slabs_u32(uint32_t * vout_h, const uint32_t count, struct hs_info const * const info) +hs_transpose_slabs_u32(uint32_t const hs_words, + uint32_t const hs_width, + uint32_t const hs_height, + uint32_t * vout_h, + uint32_t const count) { - uint32_t const slab_keys = info->keys * info->lanes; - size_t const slab_size = sizeof(uint32_t) * info->words * slab_keys; - uint32_t * const slab = _alloca(slab_size); + uint32_t const slab_keys = hs_width * hs_height; + size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys; + uint32_t * const slab = ALLOCA_MACRO(slab_size); uint32_t slab_count = count / slab_keys; while (slab_count-- > 0) { memcpy(slab,vout_h,slab_size); - for (uint32_t row=0; row<info->keys; row++) - for (uint32_t col=0; col<info->lanes; col++) - vout_h[col * info->keys + row] = slab[row * info->lanes + col]; + for (uint32_t row=0; row<hs_height; row++) + for (uint32_t col=0; col<hs_width; col++) + vout_h[col * hs_height + row] = slab[row * hs_width + col]; vout_h += slab_keys; } @@ -145,20 +153,24 @@ hs_transpose_slabs_u32(uint32_t * vout_h, const uint32_t count, struct hs_info c static void -hs_transpose_slabs_u64(uint64_t * vout_h, const uint32_t count, struct hs_info const * const info) +hs_transpose_slabs_u64(uint32_t const hs_words, + uint32_t const hs_width, + uint32_t const hs_height, + uint64_t * vout_h, + uint32_t const count) { - uint32_t const slab_keys = info->keys * info->lanes; - size_t const slab_size = sizeof(uint32_t) * info->words * slab_keys; - uint64_t * const slab = _alloca(slab_size); + uint32_t const slab_keys = hs_width * hs_height; + size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys; + uint64_t * const slab = ALLOCA_MACRO(slab_size); uint32_t slab_count = count / slab_keys; while (slab_count-- > 0) { memcpy(slab,vout_h,slab_size); - for (uint32_t row=0; row<info->keys; row++) - for (uint32_t col=0; col<info->lanes; col++) - vout_h[col * info->keys + row] = slab[row * info->lanes + col]; + for (uint32_t row=0; row<hs_height; row++) + for (uint32_t col=0; col<hs_width; col++) + vout_h[col * hs_height + row] = slab[row * hs_width + col]; vout_h += slab_keys; } @@ -166,12 +178,16 @@ hs_transpose_slabs_u64(uint64_t * vout_h, const uint32_t count, struct hs_info c static void -hs_transpose_slabs(void * vout_h, const uint32_t count, struct hs_info const * const info) +hs_transpose_slabs(uint32_t const hs_words, + uint32_t const hs_width, + uint32_t const hs_height, + void * vout_h, + uint32_t const count) { - if (info->words == 1) - hs_transpose_slabs_u32(vout_h,count,info); + if (hs_words == 1) + hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count); else - hs_transpose_slabs_u64(vout_h,count,info); + hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count); } // @@ -180,18 +196,18 @@ hs_transpose_slabs(void * vout_h, const uint32_t count, struct hs_info const * c static void -hs_debug_u32( - uint32_t const * vout_h, - uint32_t const count, - struct hs_info const * const info) +hs_debug_u32(uint32_t const hs_width, + uint32_t const hs_height, + uint32_t const * vout_h, + uint32_t const count) { - uint32_t const slab = info->keys * info->lanes; - uint32_t const slabs = (count + slab - 1) / slab; + uint32_t const slab_keys = hs_width * hs_height; + uint32_t const slabs = (count + slab_keys - 1) / slab_keys; for (uint32_t ss=0; ss<slabs; ss++) { fprintf(stderr,"%u\n",ss); - for (uint32_t cc=0; cc<info->keys; cc++) { - for (uint32_t rr=0; rr<info->lanes; rr++) + for (uint32_t cc=0; cc<hs_height; cc++) { + for (uint32_t rr=0; rr<hs_width; rr++) fprintf(stderr,"%8X ",*vout_h++); fprintf(stderr,"\n"); } @@ -200,17 +216,18 @@ hs_debug_u32( static void -hs_debug_u64(uint64_t const * vout_h, - uint32_t const count, - struct hs_info const * const info) +hs_debug_u64(uint32_t const hs_width, + uint32_t const hs_height, + uint64_t const * vout_h, + uint32_t const count) { - uint32_t const slab = info->keys * info->lanes; - uint32_t const slabs = (count + slab - 1) / slab; + uint32_t const slab_keys = hs_width * hs_height; + uint32_t const slabs = (count + slab_keys - 1) / slab_keys; for (uint32_t ss=0; ss<slabs; ss++) { fprintf(stderr,"%u\n",ss); - for (uint32_t cc=0; cc<info->keys; cc++) { - for (uint32_t rr=0; rr<info->lanes; rr++) + for (uint32_t cc=0; cc<hs_height; cc++) { + for (uint32_t rr=0; rr<hs_width; rr++) fprintf(stderr,"%16llX ",*vout_h++); fprintf(stderr,"\n"); } @@ -275,7 +292,10 @@ hs_dummy_kernel_release() static void -hs_dummy_kernel_enqueue(cl_command_queue cq, cl_event * const event) +hs_dummy_kernel_enqueue(cl_command_queue cq, + uint32_t wait_list_size, + cl_event const * wait_list, + cl_event * event) { size_t const global_work_size = 1; @@ -285,8 +305,8 @@ hs_dummy_kernel_enqueue(cl_command_queue cq, cl_event * const event) NULL, &global_work_size, NULL, - 0, - NULL, + wait_list_size, + wait_list, event)); } @@ -298,8 +318,12 @@ static void hs_bench(cl_context context, cl_command_queue cq, + cl_command_queue cq_profile, char const * const device_name, - struct hs_info const * const info, + uint32_t const hs_words, + uint32_t const hs_width, + uint32_t const hs_height, + struct hs_cl const * const hs, uint32_t const count_lo, uint32_t const count_hi, uint32_t const count_step, @@ -318,14 +342,13 @@ hs_bench(cl_context context, // uint32_t count_hi_padded_in, count_hi_padded_out; - hs_pad(count_hi,&count_hi_padded_in,&count_hi_padded_out); + hs_cl_pad(hs,count_hi,&count_hi_padded_in,&count_hi_padded_out); // // SIZE // - size_t const key_size = sizeof(uint32_t) * info->words; + size_t const key_size = sizeof(uint32_t) * hs_words; - size_t const size_hi = count_hi * key_size; size_t const size_hi_in = count_hi_padded_in * key_size; size_t const size_hi_out = count_hi_padded_out * key_size; @@ -363,7 +386,7 @@ hs_bench(cl_context context, &cl_err); cl_ok(cl_err); // fill with random numbers - hs_fill_rand(random_h,count_hi,info->words); + hs_fill_rand(random_h,count_hi,hs_words); // // UNMAP @@ -379,16 +402,14 @@ hs_bench(cl_context context, // compute padding before sorting uint32_t count_padded_in, count_padded_out; - hs_pad(count,&count_padded_in,&count_padded_out); + hs_cl_pad(hs,count,&count_padded_in,&count_padded_out); cl_ulong elapsed_ns_min = ULONG_MAX; cl_ulong elapsed_ns_max = 0; cl_ulong elapsed_ns_sum = 0; -#if 1 - cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL)); - cl(Finish(cq)); -#endif + cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL)); + cl(Finish(cq)); for (uint32_t ii=0; ii<warmup+loops; ii++) { @@ -410,23 +431,23 @@ hs_bench(cl_context context, // // sort vin // - cl_event start, end; + cl_event start, complete, end; - hs_dummy_kernel_enqueue(cq,&start); - - cl(EnqueueBarrierWithWaitList(cq,0,NULL,NULL)); + hs_dummy_kernel_enqueue(cq_profile,0,NULL,&start); // note hs_sort enqueues a final barrier - hs_sort(cq, - vin,vout, - count, - count_padded_in, - count_padded_out, - linearize); + hs_cl_sort(hs, + cq, + 1,&start,&complete, + vin,vout, + count, + count_padded_in, + count_padded_out, + linearize); - hs_dummy_kernel_enqueue(cq,&end); + hs_dummy_kernel_enqueue(cq_profile,1,&complete,&end); - cl(Finish(cq)); + cl(Finish(cq_profile)); // // measure duration @@ -439,7 +460,6 @@ hs_bench(cl_context context, sizeof(cl_ulong), &t_start, NULL)); - cl(ReleaseEvent(start)); // end cl(GetEventProfilingInfo(end, @@ -447,13 +467,16 @@ hs_bench(cl_context context, sizeof(cl_ulong), &t_end, NULL)); - cl(ReleaseEvent(end)); cl_ulong const t = t_end - t_start; elapsed_ns_min = MIN_MACRO(elapsed_ns_min,t); elapsed_ns_max = MAX_MACRO(elapsed_ns_max,t); elapsed_ns_sum += t; + + cl(ReleaseEvent(start)); + cl(ReleaseEvent(complete)); + cl(ReleaseEvent(end)); } // @@ -485,27 +508,27 @@ hs_bench(cl_context context, double cpu_ns; - char const * const algo = hs_cpu_sort(sorted_h,count_padded_in,info,&cpu_ns); + char const * const algo = hs_cpu_sort(hs_words,sorted_h,count_padded_in,&cpu_ns); // // EXPLICITLY TRANSPOSE THE CPU SORTED SLABS IF NOT LINEARIZING // if (!linearize) { - hs_transpose_slabs(vout_h,count_padded_in,info); + hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in); } // // VERIFY // - bool const verified = hs_verify_linear(sorted_h,vout_h,count_padded_in,info); + bool const verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in); #ifndef NDEBUG if (!verified) { - if (info->words == 1) - hs_debug_u32(vout_h,count,info); + if (hs_words == 1) + hs_debug_u32(hs_width,hs_height,vout_h,count); else // ulong - hs_debug_u64(vout_h,count,info); + hs_debug_u64(hs_width,hs_height,vout_h,count); } #endif @@ -519,7 +542,7 @@ hs_bench(cl_context context, // fprintf(stdout,"%s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n", device_name, - (info->words == 1) ? "uint" : "ulong", + (hs_words == 1) ? "uint" : "ulong", linearize ? "linear" : "slab", verified ? " OK " : "*FAIL*", count, @@ -555,8 +578,15 @@ hs_bench(cl_context context, // // +#define HS_TARGET_NAME hs_target +#include "intel/gen8/u64/hs_target.h" + +// +// +// + int -main(int argc, char** argv) +main(int argc, char const * argv[]) { char const * const target_platform_substring = "Intel"; char const * const target_device_substring = "Graphics"; @@ -601,43 +631,64 @@ main(int argc, char** argv) // // create command queue // - cl_command_queue_properties const props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE; - #if 0 // OPENCL 2.0 - cl_queue_properties queue_properties[] = - { - CL_QUEUE_PROPERTIES, (cl_queue_properties)props, - 0 - }; + + cl_queue_properties props[] = { + CL_QUEUE_PROPERTIES, + (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, +#ifndef NDEBUG + (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, +#endif + 0 + }; + + cl_queue_properties props_profile[] = { + CL_QUEUE_PROPERTIES, + (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, + 0 + }; cl_command_queue cq = clCreateCommandQueueWithProperties(context, device_id, - queue_properties, + props, &cl_err); cl_ok(cl_err); + + cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context, + device_id, + props_profile, + &cl_err); cl_ok(cl_err); #else // OPENCL 1.2 + cl_command_queue cq = clCreateCommandQueue(context, device_id, - props, +#ifndef NDEBUG + CL_QUEUE_PROFILING_ENABLE | +#endif + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &cl_err); cl_ok(cl_err); + + cl_command_queue cq_profile = clCreateCommandQueue(context, + device_id, + CL_QUEUE_PROFILING_ENABLE, + &cl_err); cl_ok(cl_err); #endif // + // Intel GEN workaround -- create dummy kernel for semi-accurate + // profiling on an out-of-order queue. + // + hs_dummy_kernel_create(context,device_id); + + // // create kernels // fprintf(stdout,"Creating... "); - struct hs_info info; - - hs_create(context,device_id,&info); + struct hs_cl * const hs = hs_cl_create(&hs_target,context,device_id); fprintf(stdout,"done.\n"); // - // create dummy kernel for profiling - // - hs_dummy_kernel_create(context,device_id); - - // // // #ifdef NDEBUG @@ -651,7 +702,7 @@ main(int argc, char** argv) // // sort sizes and loops // - uint32_t const kpb = info.keys * info.lanes; + uint32_t const kpb = hs_target.config.slab.height << hs_target.config.slab.width_log2; uint32_t const count_lo = (argc <= 1) ? kpb : strtoul(argv[1],NULL,0); uint32_t const count_hi = (argc <= 2) ? count_lo : strtoul(argv[2],NULL,0); @@ -663,15 +714,30 @@ main(int argc, char** argv) // // benchmark // - hs_bench(context,cq,device_name,&info,count_lo,count_hi,count_step,loops,warmup,linearize); + hs_bench(context, + cq,cq_profile, + device_name, + hs_target.config.words.key + hs_target.config.words.val, + 1 << hs_target.config.slab.width_log2, + hs_target.config.slab.height, + hs, + count_lo, + count_hi, + count_step, + loops, + warmup, + linearize); // // release everything // + hs_cl_release(hs); + hs_dummy_kernel_release(); - hs_release(); cl(ReleaseCommandQueue(cq)); + cl(ReleaseCommandQueue(cq_profile)); + cl(ReleaseContext(context)); return 0; |